Loading src/TNL/Matrices/MatrixReader_impl.h +2 −5 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ #include <TNL/String.h> #include <TNL/Containers/Vector.h> #include <TNL/Timer.h> #include <TNL/Matrices/MatrixReader.h> namespace TNL { namespace Matrices { Loading Loading @@ -405,11 +406,7 @@ class MatrixReaderDeviceDependentCode< Devices::Cuda > if( ! MatrixReader< HostMatrixType >::readMtxFileHostMatrix( file, hostMatrix, rowLengthsVector, verbose ) ) return false; typename Matrix::CompressedRowLengthsVector cudaCompressedRowLengthsVector; cudaCompressedRowLengthsVector.setLike( rowLengthsVector ); cudaCompressedRowLengthsVector = rowLengthsVector; if( ! matrix.copyFrom( hostMatrix, cudaCompressedRowLengthsVector ) ) return false; matrix = hostMatrix; return true; } }; Loading src/TNL/Matrices/Sparse.h +7 −0 Original line number Diff line number Diff line Loading @@ -65,6 +65,13 @@ class Sparse : public Matrix< Real, Device, Index > Index maxRowLength; }; // This cannot be a method of the Sparse class, because the implementation uses // methods (marked with __cuda_callable__) which are defined only on the // subclasses, but are not virtual methods of Sparse. template< typename Matrix1, typename Matrix2 > void copySparseMatrix( Matrix1& A, const Matrix2& B ); } // namespace Matrices } // namespace TNL Loading src/TNL/Matrices/Sparse_impl.h +143 −1 Original line number Diff line number Diff line Loading @@ -11,6 +11,7 @@ #pragma once #include "Sparse.h" #include <TNL/DevicePointer.h> namespace TNL { namespace Matrices { Loading Loading @@ -129,6 +130,147 @@ template< typename Real, typename Index > void Sparse< Real, Device, Index >::printStructure( std::ostream& str ) const { TNL_ASSERT_TRUE( false, "Not implemented yet." ); } #ifdef HAVE_CUDA template< typename Vector, typename Matrix > __global__ void SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths, const Matrix* matrix, typename Matrix::IndexType rows, typename Matrix::IndexType cols ) { using IndexType = typename Matrix::IndexType; IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x; const IndexType gridSize = blockDim.x * gridDim.x; while( rowIdx < rows ) { const auto max_length = matrix->getRowLengthFast( rowIdx ); const auto row = matrix->getRow( rowIdx ); IndexType length = 0; for( IndexType c_j = 0; c_j < max_length; c_j++ ) if( row.getElementColumn( c_j ) < cols ) length++; else break; rowLengths[ rowIdx ] = length; rowIdx += gridSize; } } template< typename Matrix1, typename Matrix2 > __global__ void SparseMatrixCopyKernel( Matrix1* A, const Matrix2* B, const typename Matrix2::IndexType* rowLengths, typename Matrix2::IndexType rows ) { using IndexType = typename Matrix2::IndexType; IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x; const IndexType gridSize = blockDim.x * gridDim.x; while( rowIdx < rows ) { const auto length = rowLengths[ rowIdx ]; const auto rowB = B->getRow( rowIdx ); auto rowA = A->getRow( rowIdx ); for( IndexType c = 0; c < length; c++ ) rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) ); rowIdx += gridSize; } } #endif template< typename Matrix1, typename Matrix2 > void copySparseMatrix( Matrix1& A, const Matrix2& B ) { static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value, "The matrices must have the same RealType." ); static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value, "The matrices must be allocated on the same device." ); static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value, "The matrices must have the same IndexType." ); using RealType = typename Matrix1::RealType; using DeviceType = typename Matrix1::DeviceType; using IndexType = typename Matrix1::IndexType; const IndexType rows = B.getRows(); const IndexType cols = B.getColumns(); A.setDimensions( rows, cols ); if( std::is_same< DeviceType, Devices::Host >::value ) { // set row lengths typename Matrix1::CompressedRowLengthsVector rowLengths; rowLengths.setSize( rows ); #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() ) #endif for( IndexType i = 0; i < rows; i++ ) { const auto max_length = B.getRowLength( i ); const auto row = B.getRow( i ); IndexType length = 0; for( IndexType c_j = 0; c_j < max_length; c_j++ ) if( row.getElementColumn( c_j ) < cols ) length++; else break; rowLengths[ i ] = length; } A.setCompressedRowLengths( rowLengths ); #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() ) #endif for( IndexType i = 0; i < rows; i++ ) { const auto length = rowLengths[ i ]; const auto rowB = B.getRow( i ); auto rowA = A.getRow( i ); for( IndexType c = 0; c < length; c++ ) rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) ); } } if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA dim3 blockSize( 256 ); dim3 gridSize; const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) ); typename Matrix1::CompressedRowLengthsVector rowLengths; rowLengths.setSize( rows ); DevicePointer< Matrix1 > Apointer( A ); const DevicePointer< const Matrix2 > Bpointer( B ); // set row lengths Devices::Cuda::synchronizeDevice(); SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>( rowLengths.getData(), &Bpointer.template getData< TNL::Devices::Cuda >(), rows, cols ); TNL_CHECK_CUDA_DEVICE; Apointer->setCompressedRowLengths( rowLengths ); // copy rows Devices::Cuda::synchronizeDevice(); SparseMatrixCopyKernel<<< gridSize, blockSize >>>( &Apointer.template modifyData< TNL::Devices::Cuda >(), &Bpointer.template getData< TNL::Devices::Cuda >(), rowLengths.getData(), rows ); TNL_CHECK_CUDA_DEVICE; #else throw Exceptions::CudaSupportMissing(); #endif } } } // namespace Matrices Loading src/UnitTests/Matrices/SparseMatrixCopyTest.h +89 −13 Original line number Diff line number Diff line Loading @@ -111,7 +111,7 @@ void checkMatrix( Matrix& m ) } template< typename Matrix1, typename Matrix2 > void testMatrixCopy() void testCopyAssignment() { Matrix1 m1; setupMatrix( m1 ); Loading @@ -122,72 +122,148 @@ void testMatrixCopy() checkMatrix( m2 ); } template< typename Matrix1, typename Matrix2 > void testConversion() { Matrix1 m1; setupMatrix( m1 ); checkMatrix( m1 ); Matrix2 m2; TNL::Matrices::copySparseMatrix( m2, m1 ); checkMatrix( m2 ); } TEST( SparseMatrixCopyTest, CSR_HostToHost ) { testMatrixCopy< CSR_host, CSR_host >(); testCopyAssignment< CSR_host, CSR_host >(); } #ifdef HAVE_CUDA TEST( SparseMatrixCopyTest, CSR_HostToCuda ) { testMatrixCopy< CSR_host, CSR_cuda >(); testCopyAssignment< CSR_host, CSR_cuda >(); } TEST( SparseMatrixCopyTest, CSR_CudaToHost ) { testMatrixCopy< CSR_cuda, CSR_host >(); testCopyAssignment< CSR_cuda, CSR_host >(); } TEST( SparseMatrixCopyTest, CSR_CudaToCuda ) { testMatrixCopy< CSR_cuda, CSR_cuda >(); testCopyAssignment< CSR_cuda, CSR_cuda >(); } #endif TEST( SparseMatrixCopyTest, Ellpack_HostToHost ) { testMatrixCopy< E_host, E_host >(); testCopyAssignment< E_host, E_host >(); } #ifdef HAVE_CUDA TEST( SparseMatrixCopyTest, Ellpack_HostToCuda ) { testMatrixCopy< E_host, E_cuda >(); testCopyAssignment< E_host, E_cuda >(); } TEST( SparseMatrixCopyTest, Ellpack_CudaToHost ) { testMatrixCopy< E_cuda, E_host >(); testCopyAssignment< E_cuda, E_host >(); } TEST( SparseMatrixCopyTest, Ellpack_CudaToCuda ) { testMatrixCopy< E_cuda, E_cuda >(); testCopyAssignment< E_cuda, E_cuda >(); } #endif TEST( SparseMatrixCopyTest, SlicedEllpack_HostToHost ) { testMatrixCopy< SE_host, SE_host >(); testCopyAssignment< SE_host, SE_host >(); } #ifdef HAVE_CUDA TEST( SparseMatrixCopyTest, SlicedEllpack_HostToCuda ) { testMatrixCopy< SE_host, SE_cuda >(); testCopyAssignment< SE_host, SE_cuda >(); } TEST( SparseMatrixCopyTest, SlicedEllpack_CudaToHost ) { testMatrixCopy< SE_cuda, SE_host >(); testCopyAssignment< SE_cuda, SE_host >(); } TEST( SparseMatrixCopyTest, SlicedEllpack_CudaToCuda ) { testMatrixCopy< SE_cuda, SE_cuda >(); testCopyAssignment< SE_cuda, SE_cuda >(); } #endif // test conversion between formats TEST( SparseMatrixCopyTest, CSR_to_Ellpack_host ) { testConversion< CSR_host, E_host >(); } TEST( SparseMatrixCopyTest, Ellpack_to_CSR_host ) { testConversion< E_host, CSR_host >(); } TEST( SparseMatrixCopyTest, CSR_to_SlicedEllpack_host ) { testConversion< CSR_host, SE_host >(); } TEST( SparseMatrixCopyTest, SlicedEllpack_to_CSR_host ) { testConversion< SE_host, CSR_host >(); } TEST( SparseMatrixCopyTest, Ellpack_to_SlicedEllpack_host ) { testConversion< E_host, SE_host >(); } TEST( SparseMatrixCopyTest, SlicedEllpack_to_Ellpack_host ) { testConversion< SE_host, E_host >(); } #ifdef HAVE_CUDA TEST( SparseMatrixCopyTest, CSR_to_Ellpack_cuda ) { testConversion< CSR_cuda, E_cuda >(); } TEST( SparseMatrixCopyTest, Ellpack_to_CSR_cuda ) { testConversion< E_cuda, CSR_cuda >(); } TEST( SparseMatrixCopyTest, CSR_to_SlicedEllpack_cuda ) { testConversion< CSR_cuda, SE_cuda >(); } TEST( SparseMatrixCopyTest, SlicedEllpack_to_CSR_cuda ) { testConversion< SE_cuda, CSR_cuda >(); } TEST( SparseMatrixCopyTest, Ellpack_to_SlicedEllpack_cuda ) { testConversion< E_cuda, SE_cuda >(); } TEST( SparseMatrixCopyTest, SlicedEllpack_to_Ellpack_cuda ) { testConversion< SE_cuda, E_cuda >(); } #endif Loading tests/benchmarks/tnl-benchmark-spmv.h +233 −274 Original line number Diff line number Diff line Loading @@ -326,7 +326,6 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) return false; } const int rows = csrMatrix.getRows(); const int columns = csrMatrix.getColumns(); const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements(); Containers::Vector< int, Devices::Host, int > rowLengthsHost; rowLengthsHost.setSize( rows ); Loading Loading @@ -363,13 +362,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) typedef CSR< Real, Devices::Cuda, int > CSRCudaType; CSRCudaType cudaCSR; //cout << "Copying matrix to GPU... "; if( ! cudaCSR.copyFrom( csrMatrix, rowLengthsCuda ) ) { std::cerr << "I am not able to transfer the matrix on GPU." << std::endl; writeTestFailed( logFile, 21 ); } else { cudaCSR = csrMatrix; ::tnlCusparseCSR< Real > cusparseCSR; cusparseCSR.init( cudaCSR, &cusparseHandle ); benchmarkMatrix( cusparseCSR, Loading Loading @@ -516,7 +509,6 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) baseline, verbose, logFile );*/ } cudaCSR.reset(); #endif Loading @@ -524,10 +516,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) double padding; typedef Ellpack< Real, Devices::Host, int > EllpackType; EllpackType ellpackMatrix; if( ! ellpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) ) writeTestFailed( logFile, 7 ); else { Matrices::copySparseMatrix( ellpackMatrix, csrMatrix ); allocatedElements = ellpackMatrix.getNumberOfMatrixElements(); padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; logFile << " " << padding << std::endl; Loading @@ -544,13 +533,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType; EllpackCudaType cudaEllpack; std::cout << "Copying matrix to GPU... "; if( ! cudaEllpack.copyFrom( ellpackMatrix, rowLengthsCuda ) ) { std::cerr << "I am not able to transfer the matrix on GPU." << std::endl; writeTestFailed( logFile, 3 ); } else { cudaEllpack = ellpackMatrix; std::cout << " done. \r"; benchmarkMatrix( cudaEllpack, cudaX, Loading @@ -561,18 +544,13 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) baseline, verbose, logFile ); } cudaEllpack.reset(); #endif ellpackMatrix.reset(); } typedef SlicedEllpack< Real, Devices::Host, int > SlicedEllpackType; SlicedEllpackType slicedEllpack; if( ! slicedEllpack.copyFrom( csrMatrix, rowLengthsHost ) ) writeTestFailed( logFile, 7 ); else { Matrices::copySparseMatrix( slicedEllpack, csrMatrix ); allocatedElements = slicedEllpack.getNumberOfMatrixElements(); padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; logFile << " " << padding << std::endl; Loading @@ -589,13 +567,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) typedef SlicedEllpack< Real, Devices::Cuda, int > SlicedEllpackCudaType; SlicedEllpackCudaType cudaSlicedEllpack; std::cout << "Copying matrix to GPU... "; if( ! cudaSlicedEllpack.copyFrom( slicedEllpack, rowLengthsCuda ) ) { std::cerr << "I am not able to transfer the matrix on GPU." << std::endl; writeTestFailed( logFile, 3 ); } else { cudaSlicedEllpack = slicedEllpack; std::cout << " done. \r"; benchmarkMatrix( cudaSlicedEllpack, cudaX, Loading @@ -606,18 +578,13 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) baseline, verbose, logFile ); } cudaSlicedEllpack.reset(); #endif slicedEllpack.reset(); } typedef ChunkedEllpack< Real, Devices::Host, int > ChunkedEllpackType; ChunkedEllpackType chunkedEllpack; if( ! chunkedEllpack.copyFrom( csrMatrix, rowLengthsHost ) ) writeTestFailed( logFile, 7 ); else { Matrices::copySparseMatrix( chunkedEllpack, csrMatrix ); allocatedElements = chunkedEllpack.getNumberOfMatrixElements(); padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; logFile << " " << padding << std::endl; Loading @@ -634,13 +601,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) typedef ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType; ChunkedEllpackCudaType cudaChunkedEllpack; std::cout << "Copying matrix to GPU... "; if( ! cudaChunkedEllpack.copyFrom( chunkedEllpack, rowLengthsCuda ) ) { std::cerr << "I am not able to transfer the matrix on GPU." << std::endl; writeTestFailed( logFile, 3 ); } else { cudaChunkedEllpack = chunkedEllpack; std::cout << " done. \r"; benchmarkMatrix( cudaChunkedEllpack, cudaX, Loading @@ -651,12 +612,10 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) baseline, verbose, logFile ); } cudaChunkedEllpack.reset(); #endif chunkedEllpack.reset(); } } return true; } Loading Loading
src/TNL/Matrices/MatrixReader_impl.h +2 −5 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ #include <TNL/String.h> #include <TNL/Containers/Vector.h> #include <TNL/Timer.h> #include <TNL/Matrices/MatrixReader.h> namespace TNL { namespace Matrices { Loading Loading @@ -405,11 +406,7 @@ class MatrixReaderDeviceDependentCode< Devices::Cuda > if( ! MatrixReader< HostMatrixType >::readMtxFileHostMatrix( file, hostMatrix, rowLengthsVector, verbose ) ) return false; typename Matrix::CompressedRowLengthsVector cudaCompressedRowLengthsVector; cudaCompressedRowLengthsVector.setLike( rowLengthsVector ); cudaCompressedRowLengthsVector = rowLengthsVector; if( ! matrix.copyFrom( hostMatrix, cudaCompressedRowLengthsVector ) ) return false; matrix = hostMatrix; return true; } }; Loading
src/TNL/Matrices/Sparse.h +7 −0 Original line number Diff line number Diff line Loading @@ -65,6 +65,13 @@ class Sparse : public Matrix< Real, Device, Index > Index maxRowLength; }; // This cannot be a method of the Sparse class, because the implementation uses // methods (marked with __cuda_callable__) which are defined only on the // subclasses, but are not virtual methods of Sparse. template< typename Matrix1, typename Matrix2 > void copySparseMatrix( Matrix1& A, const Matrix2& B ); } // namespace Matrices } // namespace TNL Loading
src/TNL/Matrices/Sparse_impl.h +143 −1 Original line number Diff line number Diff line Loading @@ -11,6 +11,7 @@ #pragma once #include "Sparse.h" #include <TNL/DevicePointer.h> namespace TNL { namespace Matrices { Loading Loading @@ -129,6 +130,147 @@ template< typename Real, typename Index > void Sparse< Real, Device, Index >::printStructure( std::ostream& str ) const { TNL_ASSERT_TRUE( false, "Not implemented yet." ); } #ifdef HAVE_CUDA template< typename Vector, typename Matrix > __global__ void SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths, const Matrix* matrix, typename Matrix::IndexType rows, typename Matrix::IndexType cols ) { using IndexType = typename Matrix::IndexType; IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x; const IndexType gridSize = blockDim.x * gridDim.x; while( rowIdx < rows ) { const auto max_length = matrix->getRowLengthFast( rowIdx ); const auto row = matrix->getRow( rowIdx ); IndexType length = 0; for( IndexType c_j = 0; c_j < max_length; c_j++ ) if( row.getElementColumn( c_j ) < cols ) length++; else break; rowLengths[ rowIdx ] = length; rowIdx += gridSize; } } template< typename Matrix1, typename Matrix2 > __global__ void SparseMatrixCopyKernel( Matrix1* A, const Matrix2* B, const typename Matrix2::IndexType* rowLengths, typename Matrix2::IndexType rows ) { using IndexType = typename Matrix2::IndexType; IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x; const IndexType gridSize = blockDim.x * gridDim.x; while( rowIdx < rows ) { const auto length = rowLengths[ rowIdx ]; const auto rowB = B->getRow( rowIdx ); auto rowA = A->getRow( rowIdx ); for( IndexType c = 0; c < length; c++ ) rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) ); rowIdx += gridSize; } } #endif template< typename Matrix1, typename Matrix2 > void copySparseMatrix( Matrix1& A, const Matrix2& B ) { static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value, "The matrices must have the same RealType." ); static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value, "The matrices must be allocated on the same device." ); static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value, "The matrices must have the same IndexType." ); using RealType = typename Matrix1::RealType; using DeviceType = typename Matrix1::DeviceType; using IndexType = typename Matrix1::IndexType; const IndexType rows = B.getRows(); const IndexType cols = B.getColumns(); A.setDimensions( rows, cols ); if( std::is_same< DeviceType, Devices::Host >::value ) { // set row lengths typename Matrix1::CompressedRowLengthsVector rowLengths; rowLengths.setSize( rows ); #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() ) #endif for( IndexType i = 0; i < rows; i++ ) { const auto max_length = B.getRowLength( i ); const auto row = B.getRow( i ); IndexType length = 0; for( IndexType c_j = 0; c_j < max_length; c_j++ ) if( row.getElementColumn( c_j ) < cols ) length++; else break; rowLengths[ i ] = length; } A.setCompressedRowLengths( rowLengths ); #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() ) #endif for( IndexType i = 0; i < rows; i++ ) { const auto length = rowLengths[ i ]; const auto rowB = B.getRow( i ); auto rowA = A.getRow( i ); for( IndexType c = 0; c < length; c++ ) rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) ); } } if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA dim3 blockSize( 256 ); dim3 gridSize; const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) ); typename Matrix1::CompressedRowLengthsVector rowLengths; rowLengths.setSize( rows ); DevicePointer< Matrix1 > Apointer( A ); const DevicePointer< const Matrix2 > Bpointer( B ); // set row lengths Devices::Cuda::synchronizeDevice(); SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>( rowLengths.getData(), &Bpointer.template getData< TNL::Devices::Cuda >(), rows, cols ); TNL_CHECK_CUDA_DEVICE; Apointer->setCompressedRowLengths( rowLengths ); // copy rows Devices::Cuda::synchronizeDevice(); SparseMatrixCopyKernel<<< gridSize, blockSize >>>( &Apointer.template modifyData< TNL::Devices::Cuda >(), &Bpointer.template getData< TNL::Devices::Cuda >(), rowLengths.getData(), rows ); TNL_CHECK_CUDA_DEVICE; #else throw Exceptions::CudaSupportMissing(); #endif } } } // namespace Matrices Loading
src/UnitTests/Matrices/SparseMatrixCopyTest.h +89 −13 Original line number Diff line number Diff line Loading @@ -111,7 +111,7 @@ void checkMatrix( Matrix& m ) } template< typename Matrix1, typename Matrix2 > void testMatrixCopy() void testCopyAssignment() { Matrix1 m1; setupMatrix( m1 ); Loading @@ -122,72 +122,148 @@ void testMatrixCopy() checkMatrix( m2 ); } template< typename Matrix1, typename Matrix2 > void testConversion() { Matrix1 m1; setupMatrix( m1 ); checkMatrix( m1 ); Matrix2 m2; TNL::Matrices::copySparseMatrix( m2, m1 ); checkMatrix( m2 ); } TEST( SparseMatrixCopyTest, CSR_HostToHost ) { testMatrixCopy< CSR_host, CSR_host >(); testCopyAssignment< CSR_host, CSR_host >(); } #ifdef HAVE_CUDA TEST( SparseMatrixCopyTest, CSR_HostToCuda ) { testMatrixCopy< CSR_host, CSR_cuda >(); testCopyAssignment< CSR_host, CSR_cuda >(); } TEST( SparseMatrixCopyTest, CSR_CudaToHost ) { testMatrixCopy< CSR_cuda, CSR_host >(); testCopyAssignment< CSR_cuda, CSR_host >(); } TEST( SparseMatrixCopyTest, CSR_CudaToCuda ) { testMatrixCopy< CSR_cuda, CSR_cuda >(); testCopyAssignment< CSR_cuda, CSR_cuda >(); } #endif TEST( SparseMatrixCopyTest, Ellpack_HostToHost ) { testMatrixCopy< E_host, E_host >(); testCopyAssignment< E_host, E_host >(); } #ifdef HAVE_CUDA TEST( SparseMatrixCopyTest, Ellpack_HostToCuda ) { testMatrixCopy< E_host, E_cuda >(); testCopyAssignment< E_host, E_cuda >(); } TEST( SparseMatrixCopyTest, Ellpack_CudaToHost ) { testMatrixCopy< E_cuda, E_host >(); testCopyAssignment< E_cuda, E_host >(); } TEST( SparseMatrixCopyTest, Ellpack_CudaToCuda ) { testMatrixCopy< E_cuda, E_cuda >(); testCopyAssignment< E_cuda, E_cuda >(); } #endif TEST( SparseMatrixCopyTest, SlicedEllpack_HostToHost ) { testMatrixCopy< SE_host, SE_host >(); testCopyAssignment< SE_host, SE_host >(); } #ifdef HAVE_CUDA TEST( SparseMatrixCopyTest, SlicedEllpack_HostToCuda ) { testMatrixCopy< SE_host, SE_cuda >(); testCopyAssignment< SE_host, SE_cuda >(); } TEST( SparseMatrixCopyTest, SlicedEllpack_CudaToHost ) { testMatrixCopy< SE_cuda, SE_host >(); testCopyAssignment< SE_cuda, SE_host >(); } TEST( SparseMatrixCopyTest, SlicedEllpack_CudaToCuda ) { testMatrixCopy< SE_cuda, SE_cuda >(); testCopyAssignment< SE_cuda, SE_cuda >(); } #endif // test conversion between formats TEST( SparseMatrixCopyTest, CSR_to_Ellpack_host ) { testConversion< CSR_host, E_host >(); } TEST( SparseMatrixCopyTest, Ellpack_to_CSR_host ) { testConversion< E_host, CSR_host >(); } TEST( SparseMatrixCopyTest, CSR_to_SlicedEllpack_host ) { testConversion< CSR_host, SE_host >(); } TEST( SparseMatrixCopyTest, SlicedEllpack_to_CSR_host ) { testConversion< SE_host, CSR_host >(); } TEST( SparseMatrixCopyTest, Ellpack_to_SlicedEllpack_host ) { testConversion< E_host, SE_host >(); } TEST( SparseMatrixCopyTest, SlicedEllpack_to_Ellpack_host ) { testConversion< SE_host, E_host >(); } #ifdef HAVE_CUDA TEST( SparseMatrixCopyTest, CSR_to_Ellpack_cuda ) { testConversion< CSR_cuda, E_cuda >(); } TEST( SparseMatrixCopyTest, Ellpack_to_CSR_cuda ) { testConversion< E_cuda, CSR_cuda >(); } TEST( SparseMatrixCopyTest, CSR_to_SlicedEllpack_cuda ) { testConversion< CSR_cuda, SE_cuda >(); } TEST( SparseMatrixCopyTest, SlicedEllpack_to_CSR_cuda ) { testConversion< SE_cuda, CSR_cuda >(); } TEST( SparseMatrixCopyTest, Ellpack_to_SlicedEllpack_cuda ) { testConversion< E_cuda, SE_cuda >(); } TEST( SparseMatrixCopyTest, SlicedEllpack_to_Ellpack_cuda ) { testConversion< SE_cuda, E_cuda >(); } #endif Loading
tests/benchmarks/tnl-benchmark-spmv.h +233 −274 Original line number Diff line number Diff line Loading @@ -326,7 +326,6 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) return false; } const int rows = csrMatrix.getRows(); const int columns = csrMatrix.getColumns(); const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements(); Containers::Vector< int, Devices::Host, int > rowLengthsHost; rowLengthsHost.setSize( rows ); Loading Loading @@ -363,13 +362,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) typedef CSR< Real, Devices::Cuda, int > CSRCudaType; CSRCudaType cudaCSR; //cout << "Copying matrix to GPU... "; if( ! cudaCSR.copyFrom( csrMatrix, rowLengthsCuda ) ) { std::cerr << "I am not able to transfer the matrix on GPU." << std::endl; writeTestFailed( logFile, 21 ); } else { cudaCSR = csrMatrix; ::tnlCusparseCSR< Real > cusparseCSR; cusparseCSR.init( cudaCSR, &cusparseHandle ); benchmarkMatrix( cusparseCSR, Loading Loading @@ -516,7 +509,6 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) baseline, verbose, logFile );*/ } cudaCSR.reset(); #endif Loading @@ -524,10 +516,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) double padding; typedef Ellpack< Real, Devices::Host, int > EllpackType; EllpackType ellpackMatrix; if( ! ellpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) ) writeTestFailed( logFile, 7 ); else { Matrices::copySparseMatrix( ellpackMatrix, csrMatrix ); allocatedElements = ellpackMatrix.getNumberOfMatrixElements(); padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; logFile << " " << padding << std::endl; Loading @@ -544,13 +533,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType; EllpackCudaType cudaEllpack; std::cout << "Copying matrix to GPU... "; if( ! cudaEllpack.copyFrom( ellpackMatrix, rowLengthsCuda ) ) { std::cerr << "I am not able to transfer the matrix on GPU." << std::endl; writeTestFailed( logFile, 3 ); } else { cudaEllpack = ellpackMatrix; std::cout << " done. \r"; benchmarkMatrix( cudaEllpack, cudaX, Loading @@ -561,18 +544,13 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) baseline, verbose, logFile ); } cudaEllpack.reset(); #endif ellpackMatrix.reset(); } typedef SlicedEllpack< Real, Devices::Host, int > SlicedEllpackType; SlicedEllpackType slicedEllpack; if( ! slicedEllpack.copyFrom( csrMatrix, rowLengthsHost ) ) writeTestFailed( logFile, 7 ); else { Matrices::copySparseMatrix( slicedEllpack, csrMatrix ); allocatedElements = slicedEllpack.getNumberOfMatrixElements(); padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; logFile << " " << padding << std::endl; Loading @@ -589,13 +567,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) typedef SlicedEllpack< Real, Devices::Cuda, int > SlicedEllpackCudaType; SlicedEllpackCudaType cudaSlicedEllpack; std::cout << "Copying matrix to GPU... "; if( ! cudaSlicedEllpack.copyFrom( slicedEllpack, rowLengthsCuda ) ) { std::cerr << "I am not able to transfer the matrix on GPU." << std::endl; writeTestFailed( logFile, 3 ); } else { cudaSlicedEllpack = slicedEllpack; std::cout << " done. \r"; benchmarkMatrix( cudaSlicedEllpack, cudaX, Loading @@ -606,18 +578,13 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) baseline, verbose, logFile ); } cudaSlicedEllpack.reset(); #endif slicedEllpack.reset(); } typedef ChunkedEllpack< Real, Devices::Host, int > ChunkedEllpackType; ChunkedEllpackType chunkedEllpack; if( ! chunkedEllpack.copyFrom( csrMatrix, rowLengthsHost ) ) writeTestFailed( logFile, 7 ); else { Matrices::copySparseMatrix( chunkedEllpack, csrMatrix ); allocatedElements = chunkedEllpack.getNumberOfMatrixElements(); padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; logFile << " " << padding << std::endl; Loading @@ -634,13 +601,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) typedef ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType; ChunkedEllpackCudaType cudaChunkedEllpack; std::cout << "Copying matrix to GPU... "; if( ! cudaChunkedEllpack.copyFrom( chunkedEllpack, rowLengthsCuda ) ) { std::cerr << "I am not able to transfer the matrix on GPU." << std::endl; writeTestFailed( logFile, 3 ); } else { cudaChunkedEllpack = chunkedEllpack; std::cout << " done. \r"; benchmarkMatrix( cudaChunkedEllpack, cudaX, Loading @@ -651,12 +612,10 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) baseline, verbose, logFile ); } cudaChunkedEllpack.reset(); #endif chunkedEllpack.reset(); } } return true; } Loading