Commit 2894096d authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Implemented copySparseMatrix function for conversion between sparse matrix formats

parent 574c7901
Loading
Loading
Loading
Loading
+2 −5
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@
#include <TNL/String.h>
#include <TNL/Containers/Vector.h>
#include <TNL/Timer.h>
#include <TNL/Matrices/MatrixReader.h>

namespace TNL {
namespace Matrices {   
@@ -405,11 +406,7 @@ class MatrixReaderDeviceDependentCode< Devices::Cuda >
      if( ! MatrixReader< HostMatrixType >::readMtxFileHostMatrix( file, hostMatrix, rowLengthsVector, verbose ) )
         return false;

      typename Matrix::CompressedRowLengthsVector cudaCompressedRowLengthsVector;
      cudaCompressedRowLengthsVector.setLike( rowLengthsVector );
      cudaCompressedRowLengthsVector = rowLengthsVector;
      if( ! matrix.copyFrom( hostMatrix, cudaCompressedRowLengthsVector ) )
         return false;
      matrix = hostMatrix;
      return true;
   }
};
+7 −0
Original line number Diff line number Diff line
@@ -65,6 +65,13 @@ class Sparse : public Matrix< Real, Device, Index >
   Index maxRowLength;
};


// This cannot be a method of the Sparse class, because the implementation uses
// methods (marked with __cuda_callable__) which are defined only on the
// subclasses, but are not virtual methods of Sparse.
template< typename Matrix1, typename Matrix2 >
void copySparseMatrix( Matrix1& A, const Matrix2& B );

} // namespace Matrices
} // namespace TNL

+143 −1
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@
#pragma once

#include "Sparse.h"
#include <TNL/DevicePointer.h>

namespace TNL {
namespace Matrices {
@@ -129,6 +130,147 @@ template< typename Real,
          typename Index >
void Sparse< Real, Device, Index >::printStructure( std::ostream& str ) const
{
   TNL_ASSERT_TRUE( false, "Not implemented yet." );
}


#ifdef HAVE_CUDA
template< typename Vector, typename Matrix >
__global__ void
SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
                                       const Matrix* matrix,
                                       typename Matrix::IndexType rows,
                                       typename Matrix::IndexType cols )
{
   using IndexType = typename Matrix::IndexType;

   IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   const IndexType gridSize = blockDim.x * gridDim.x;

   while( rowIdx < rows ) {
      const auto max_length = matrix->getRowLengthFast( rowIdx );
      const auto row = matrix->getRow( rowIdx );
      IndexType length = 0;
      for( IndexType c_j = 0; c_j < max_length; c_j++ )
         if( row.getElementColumn( c_j ) < cols )
            length++;
         else
            break;
      rowLengths[ rowIdx ] = length;
      rowIdx += gridSize;
   }
}

template< typename Matrix1, typename Matrix2 >
__global__ void
SparseMatrixCopyKernel( Matrix1* A,
                        const Matrix2* B,
                        const typename Matrix2::IndexType* rowLengths,
                        typename Matrix2::IndexType rows )
{
   using IndexType = typename Matrix2::IndexType;

   IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   const IndexType gridSize = blockDim.x * gridDim.x;

   while( rowIdx < rows ) {
      const auto length = rowLengths[ rowIdx ];
      const auto rowB = B->getRow( rowIdx );
      auto rowA = A->getRow( rowIdx );
      for( IndexType c = 0; c < length; c++ )
         rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
      rowIdx += gridSize;
   }
}
#endif

template< typename Matrix1, typename Matrix2 >
void
copySparseMatrix( Matrix1& A, const Matrix2& B )
{
   static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value,
                  "The matrices must have the same RealType." );
   static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value,
                  "The matrices must be allocated on the same device." );
   static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value,
                  "The matrices must have the same IndexType." );

   using RealType = typename Matrix1::RealType;
   using DeviceType = typename Matrix1::DeviceType;
   using IndexType = typename Matrix1::IndexType;

   const IndexType rows = B.getRows();
   const IndexType cols = B.getColumns();

   A.setDimensions( rows, cols );

   if( std::is_same< DeviceType, Devices::Host >::value ) {
      // set row lengths
      typename Matrix1::CompressedRowLengthsVector rowLengths;
      rowLengths.setSize( rows );
#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
      for( IndexType i = 0; i < rows; i++ ) {
         const auto max_length = B.getRowLength( i );
         const auto row = B.getRow( i );
         IndexType length = 0;
         for( IndexType c_j = 0; c_j < max_length; c_j++ )
            if( row.getElementColumn( c_j ) < cols )
               length++;
            else
               break;
         rowLengths[ i ] = length;
      }
      A.setCompressedRowLengths( rowLengths );

#ifdef HAVE_OPENMP
#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
#endif
      for( IndexType i = 0; i < rows; i++ ) {
         const auto length = rowLengths[ i ];
         const auto rowB = B.getRow( i );
         auto rowA = A.getRow( i );
         for( IndexType c = 0; c < length; c++ )
            rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
      }
   }

   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
#ifdef HAVE_CUDA
      dim3 blockSize( 256 );
      dim3 gridSize;
      const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
      gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );

      typename Matrix1::CompressedRowLengthsVector rowLengths;
      rowLengths.setSize( rows );

      DevicePointer< Matrix1 > Apointer( A );
      const DevicePointer< const Matrix2 > Bpointer( B );

      // set row lengths
      Devices::Cuda::synchronizeDevice();
      SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
            rowLengths.getData(),
            &Bpointer.template getData< TNL::Devices::Cuda >(),
            rows,
            cols );
      TNL_CHECK_CUDA_DEVICE;
      Apointer->setCompressedRowLengths( rowLengths );

      // copy rows
      Devices::Cuda::synchronizeDevice();
      SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
            &Apointer.template modifyData< TNL::Devices::Cuda >(),
            &Bpointer.template getData< TNL::Devices::Cuda >(),
            rowLengths.getData(),
            rows );
      TNL_CHECK_CUDA_DEVICE;
#else
      throw Exceptions::CudaSupportMissing();
#endif
   }
}

} // namespace Matrices
+89 −13
Original line number Diff line number Diff line
@@ -111,7 +111,7 @@ void checkMatrix( Matrix& m )
}

template< typename Matrix1, typename Matrix2 >
void testMatrixCopy()
void testCopyAssignment()
{
   Matrix1 m1;
   setupMatrix( m1 );
@@ -122,72 +122,148 @@ void testMatrixCopy()
   checkMatrix( m2 );
}

template< typename Matrix1, typename Matrix2 >
void testConversion()
{
   Matrix1 m1;
   setupMatrix( m1 );
   checkMatrix( m1 );

   Matrix2 m2;
   TNL::Matrices::copySparseMatrix( m2, m1 );
   checkMatrix( m2 );
}


TEST( SparseMatrixCopyTest, CSR_HostToHost )
{
   testMatrixCopy< CSR_host, CSR_host >();
   testCopyAssignment< CSR_host, CSR_host >();
}

#ifdef HAVE_CUDA
TEST( SparseMatrixCopyTest, CSR_HostToCuda )
{
   testMatrixCopy< CSR_host, CSR_cuda >();
   testCopyAssignment< CSR_host, CSR_cuda >();
}

TEST( SparseMatrixCopyTest, CSR_CudaToHost )
{
   testMatrixCopy< CSR_cuda, CSR_host >();
   testCopyAssignment< CSR_cuda, CSR_host >();
}

TEST( SparseMatrixCopyTest, CSR_CudaToCuda )
{
   testMatrixCopy< CSR_cuda, CSR_cuda >();
   testCopyAssignment< CSR_cuda, CSR_cuda >();
}
#endif


TEST( SparseMatrixCopyTest, Ellpack_HostToHost )
{
   testMatrixCopy< E_host, E_host >();
   testCopyAssignment< E_host, E_host >();
}

#ifdef HAVE_CUDA
TEST( SparseMatrixCopyTest, Ellpack_HostToCuda )
{
   testMatrixCopy< E_host, E_cuda >();
   testCopyAssignment< E_host, E_cuda >();
}

TEST( SparseMatrixCopyTest, Ellpack_CudaToHost )
{
   testMatrixCopy< E_cuda, E_host >();
   testCopyAssignment< E_cuda, E_host >();
}

TEST( SparseMatrixCopyTest, Ellpack_CudaToCuda )
{
   testMatrixCopy< E_cuda, E_cuda >();
   testCopyAssignment< E_cuda, E_cuda >();
}
#endif


TEST( SparseMatrixCopyTest, SlicedEllpack_HostToHost )
{
   testMatrixCopy< SE_host, SE_host >();
   testCopyAssignment< SE_host, SE_host >();
}

#ifdef HAVE_CUDA
TEST( SparseMatrixCopyTest, SlicedEllpack_HostToCuda )
{
   testMatrixCopy< SE_host, SE_cuda >();
   testCopyAssignment< SE_host, SE_cuda >();
}

TEST( SparseMatrixCopyTest, SlicedEllpack_CudaToHost )
{
   testMatrixCopy< SE_cuda, SE_host >();
   testCopyAssignment< SE_cuda, SE_host >();
}

TEST( SparseMatrixCopyTest, SlicedEllpack_CudaToCuda )
{
   testMatrixCopy< SE_cuda, SE_cuda >();
   testCopyAssignment< SE_cuda, SE_cuda >();
}
#endif


// test conversion between formats
TEST( SparseMatrixCopyTest, CSR_to_Ellpack_host )
{
   testConversion< CSR_host, E_host >();
}

TEST( SparseMatrixCopyTest, Ellpack_to_CSR_host )
{
   testConversion< E_host, CSR_host >();
}

TEST( SparseMatrixCopyTest, CSR_to_SlicedEllpack_host )
{
   testConversion< CSR_host, SE_host >();
}

TEST( SparseMatrixCopyTest, SlicedEllpack_to_CSR_host )
{
   testConversion< SE_host, CSR_host >();
}

TEST( SparseMatrixCopyTest, Ellpack_to_SlicedEllpack_host )
{
   testConversion< E_host, SE_host >();
}

TEST( SparseMatrixCopyTest, SlicedEllpack_to_Ellpack_host )
{
   testConversion< SE_host, E_host >();
}

#ifdef HAVE_CUDA
TEST( SparseMatrixCopyTest, CSR_to_Ellpack_cuda )
{
   testConversion< CSR_cuda, E_cuda >();
}

TEST( SparseMatrixCopyTest, Ellpack_to_CSR_cuda )
{
   testConversion< E_cuda, CSR_cuda >();
}

TEST( SparseMatrixCopyTest, CSR_to_SlicedEllpack_cuda )
{
   testConversion< CSR_cuda, SE_cuda >();
}

TEST( SparseMatrixCopyTest, SlicedEllpack_to_CSR_cuda )
{
   testConversion< SE_cuda, CSR_cuda >();
}

TEST( SparseMatrixCopyTest, Ellpack_to_SlicedEllpack_cuda )
{
   testConversion< E_cuda, SE_cuda >();
}

TEST( SparseMatrixCopyTest, SlicedEllpack_to_Ellpack_cuda )
{
   testConversion< SE_cuda, E_cuda >();
}
#endif

+233 −274
Original line number Diff line number Diff line
@@ -326,7 +326,6 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
         return false;
      }
      const int rows = csrMatrix.getRows();
      const int columns = csrMatrix.getColumns();
      const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements();
      Containers::Vector< int, Devices::Host, int > rowLengthsHost;
      rowLengthsHost.setSize( rows );
@@ -363,13 +362,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
      typedef CSR< Real, Devices::Cuda, int > CSRCudaType;
      CSRCudaType cudaCSR;
      //cout << "Copying matrix to GPU... ";
      if( ! cudaCSR.copyFrom( csrMatrix, rowLengthsCuda ) )
      {
         std::cerr << "I am not able to transfer the matrix on GPU." << std::endl;
         writeTestFailed( logFile, 21 );
      }
      else
      {
      cudaCSR = csrMatrix;
      ::tnlCusparseCSR< Real > cusparseCSR;
      cusparseCSR.init( cudaCSR, &cusparseHandle );
      benchmarkMatrix( cusparseCSR,
@@ -516,7 +509,6 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
                       baseline,
                       verbose,
                       logFile );*/
      }
      cudaCSR.reset();
#endif

@@ -524,10 +516,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
      double padding;
      typedef Ellpack< Real, Devices::Host, int > EllpackType;
      EllpackType ellpackMatrix;
      if( ! ellpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) )
         writeTestFailed( logFile, 7 );
      else
      {
      Matrices::copySparseMatrix( ellpackMatrix, csrMatrix );
      allocatedElements = ellpackMatrix.getNumberOfMatrixElements();
      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
      logFile << "    " << padding << std::endl;
@@ -544,13 +533,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
      typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType;
      EllpackCudaType cudaEllpack;
      std::cout << "Copying matrix to GPU... ";
         if( ! cudaEllpack.copyFrom( ellpackMatrix, rowLengthsCuda ) )
         {
            std::cerr << "I am not able to transfer the matrix on GPU." << std::endl;
            writeTestFailed( logFile, 3 );
         }
         else
         {
      cudaEllpack = ellpackMatrix;
      std::cout << " done.   \r";
      benchmarkMatrix( cudaEllpack,
                       cudaX,
@@ -561,18 +544,13 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
                       baseline,
                       verbose,
                       logFile );
         }
      cudaEllpack.reset();
#endif
      ellpackMatrix.reset();
      }

      typedef SlicedEllpack< Real, Devices::Host, int > SlicedEllpackType;
      SlicedEllpackType slicedEllpack;
      if( ! slicedEllpack.copyFrom( csrMatrix, rowLengthsHost ) )
         writeTestFailed( logFile, 7 );
      else
      {
      Matrices::copySparseMatrix( slicedEllpack, csrMatrix );
      allocatedElements = slicedEllpack.getNumberOfMatrixElements();
      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
      logFile << "    " << padding << std::endl;
@@ -589,13 +567,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
      typedef SlicedEllpack< Real, Devices::Cuda, int > SlicedEllpackCudaType;
      SlicedEllpackCudaType cudaSlicedEllpack;
      std::cout << "Copying matrix to GPU... ";
         if( ! cudaSlicedEllpack.copyFrom( slicedEllpack, rowLengthsCuda ) )
         {
            std::cerr << "I am not able to transfer the matrix on GPU." << std::endl;
            writeTestFailed( logFile, 3 );
         }
         else
         {
      cudaSlicedEllpack = slicedEllpack;
      std::cout << " done.   \r";
      benchmarkMatrix( cudaSlicedEllpack,
                       cudaX,
@@ -606,18 +578,13 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
                       baseline,
                       verbose,
                       logFile );
         }
      cudaSlicedEllpack.reset();
#endif
      slicedEllpack.reset();
      }

      typedef ChunkedEllpack< Real, Devices::Host, int > ChunkedEllpackType;
      ChunkedEllpackType chunkedEllpack;
      if( ! chunkedEllpack.copyFrom( csrMatrix, rowLengthsHost ) )
         writeTestFailed( logFile, 7 );
      else
      {
      Matrices::copySparseMatrix( chunkedEllpack, csrMatrix );
      allocatedElements = chunkedEllpack.getNumberOfMatrixElements();
      padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
      logFile << "    " << padding << std::endl;
@@ -634,13 +601,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
      typedef ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType;
      ChunkedEllpackCudaType cudaChunkedEllpack;
      std::cout << "Copying matrix to GPU... ";
         if( ! cudaChunkedEllpack.copyFrom( chunkedEllpack, rowLengthsCuda ) )
         {
            std::cerr << "I am not able to transfer the matrix on GPU." << std::endl;
            writeTestFailed( logFile, 3 );
         }
         else
         {
      cudaChunkedEllpack = chunkedEllpack;
      std::cout << " done.    \r";
      benchmarkMatrix( cudaChunkedEllpack,
                       cudaX,
@@ -651,12 +612,10 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
                       baseline,
                       verbose,
                       logFile );
         }
      cudaChunkedEllpack.reset();
#endif
      chunkedEllpack.reset();
   }
   }
   return true;
}