Implemented copySparseMatrix function for conversion between sparse matrix formats (2894096d) · Commits · TNL / tnl-dev

src/TNL/Matrices/MatrixReader_impl.h

+2 −5

Original line number	Diff line number	Diff line
		@@ -15,6 +15,7 @@
		#include <TNL/String.h>
		#include <TNL/Containers/Vector.h>
		#include <TNL/Timer.h>
		#include <TNL/Matrices/MatrixReader.h>

		namespace TNL {
		namespace Matrices {
		@@ -405,11 +406,7 @@ class MatrixReaderDeviceDependentCode< Devices::Cuda >
		if( ! MatrixReader< HostMatrixType >::readMtxFileHostMatrix( file, hostMatrix, rowLengthsVector, verbose ) )
		return false;

		typename Matrix::CompressedRowLengthsVector cudaCompressedRowLengthsVector;
		cudaCompressedRowLengthsVector.setLike( rowLengthsVector );
		cudaCompressedRowLengthsVector = rowLengthsVector;
		if( ! matrix.copyFrom( hostMatrix, cudaCompressedRowLengthsVector ) )
		return false;
		matrix = hostMatrix;
		return true;
		}
		};

src/TNL/Matrices/Sparse.h

+7 −0

Original line number	Diff line number	Diff line
		@@ -65,6 +65,13 @@ class Sparse : public Matrix< Real, Device, Index >
		Index maxRowLength;
		};


		// This cannot be a method of the Sparse class, because the implementation uses
		// methods (marked with __cuda_callable__) which are defined only on the
		// subclasses, but are not virtual methods of Sparse.
		template< typename Matrix1, typename Matrix2 >
		void copySparseMatrix( Matrix1& A, const Matrix2& B );

		} // namespace Matrices
		} // namespace TNL

src/TNL/Matrices/Sparse_impl.h

+143 −1

Original line number	Diff line number	Diff line
		@@ -11,6 +11,7 @@
		#pragma once

		#include "Sparse.h"
		#include <TNL/DevicePointer.h>

		namespace TNL {
		namespace Matrices {
		@@ -129,6 +130,147 @@ template< typename Real,
		typename Index >
		void Sparse< Real, Device, Index >::printStructure( std::ostream& str ) const
		{
		TNL_ASSERT_TRUE( false, "Not implemented yet." );
		}


		#ifdef HAVE_CUDA
		template< typename Vector, typename Matrix >
		__global__ void
		SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
		const Matrix* matrix,
		typename Matrix::IndexType rows,
		typename Matrix::IndexType cols )
		{
		using IndexType = typename Matrix::IndexType;

		IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
		const IndexType gridSize = blockDim.x * gridDim.x;

		while( rowIdx < rows ) {
		const auto max_length = matrix->getRowLengthFast( rowIdx );
		const auto row = matrix->getRow( rowIdx );
		IndexType length = 0;
		for( IndexType c_j = 0; c_j < max_length; c_j++ )
		if( row.getElementColumn( c_j ) < cols )
		length++;
		else
		break;
		rowLengths[ rowIdx ] = length;
		rowIdx += gridSize;
		}
		}

		template< typename Matrix1, typename Matrix2 >
		__global__ void
		SparseMatrixCopyKernel( Matrix1* A,
		const Matrix2* B,
		const typename Matrix2::IndexType* rowLengths,
		typename Matrix2::IndexType rows )
		{
		using IndexType = typename Matrix2::IndexType;

		IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
		const IndexType gridSize = blockDim.x * gridDim.x;

		while( rowIdx < rows ) {
		const auto length = rowLengths[ rowIdx ];
		const auto rowB = B->getRow( rowIdx );
		auto rowA = A->getRow( rowIdx );
		for( IndexType c = 0; c < length; c++ )
		rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
		rowIdx += gridSize;
		}
		}
		#endif

		template< typename Matrix1, typename Matrix2 >
		void
		copySparseMatrix( Matrix1& A, const Matrix2& B )
		{
		static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value,
		"The matrices must have the same RealType." );
		static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value,
		"The matrices must be allocated on the same device." );
		static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value,
		"The matrices must have the same IndexType." );

		using RealType = typename Matrix1::RealType;
		using DeviceType = typename Matrix1::DeviceType;
		using IndexType = typename Matrix1::IndexType;

		const IndexType rows = B.getRows();
		const IndexType cols = B.getColumns();

		A.setDimensions( rows, cols );

		if( std::is_same< DeviceType, Devices::Host >::value ) {
		// set row lengths
		typename Matrix1::CompressedRowLengthsVector rowLengths;
		rowLengths.setSize( rows );
		#ifdef HAVE_OPENMP
		#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
		#endif
		for( IndexType i = 0; i < rows; i++ ) {
		const auto max_length = B.getRowLength( i );
		const auto row = B.getRow( i );
		IndexType length = 0;
		for( IndexType c_j = 0; c_j < max_length; c_j++ )
		if( row.getElementColumn( c_j ) < cols )
		length++;
		else
		break;
		rowLengths[ i ] = length;
		}
		A.setCompressedRowLengths( rowLengths );

		#ifdef HAVE_OPENMP
		#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
		#endif
		for( IndexType i = 0; i < rows; i++ ) {
		const auto length = rowLengths[ i ];
		const auto rowB = B.getRow( i );
		auto rowA = A.getRow( i );
		for( IndexType c = 0; c < length; c++ )
		rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
		}
		}

		if( std::is_same< DeviceType, Devices::Cuda >::value ) {
		#ifdef HAVE_CUDA
		dim3 blockSize( 256 );
		dim3 gridSize;
		const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
		gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );

		typename Matrix1::CompressedRowLengthsVector rowLengths;
		rowLengths.setSize( rows );

		DevicePointer< Matrix1 > Apointer( A );
		const DevicePointer< const Matrix2 > Bpointer( B );

		// set row lengths
		Devices::Cuda::synchronizeDevice();
		SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
		rowLengths.getData(),
		&Bpointer.template getData< TNL::Devices::Cuda >(),
		rows,
		cols );
		TNL_CHECK_CUDA_DEVICE;
		Apointer->setCompressedRowLengths( rowLengths );

		// copy rows
		Devices::Cuda::synchronizeDevice();
		SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
		&Apointer.template modifyData< TNL::Devices::Cuda >(),
		&Bpointer.template getData< TNL::Devices::Cuda >(),
		rowLengths.getData(),
		rows );
		TNL_CHECK_CUDA_DEVICE;
		#else
		throw Exceptions::CudaSupportMissing();
		#endif
		}
		}

		} // namespace Matrices

src/UnitTests/Matrices/SparseMatrixCopyTest.h

+89 −13

Original line number	Diff line number	Diff line
		@@ -111,7 +111,7 @@ void checkMatrix( Matrix& m )
		}

		template< typename Matrix1, typename Matrix2 >
		void testMatrixCopy()
		void testCopyAssignment()
		{
		Matrix1 m1;
		setupMatrix( m1 );
		@@ -122,72 +122,148 @@ void testMatrixCopy()
		checkMatrix( m2 );
		}

		template< typename Matrix1, typename Matrix2 >
		void testConversion()
		{
		Matrix1 m1;
		setupMatrix( m1 );
		checkMatrix( m1 );

		Matrix2 m2;
		TNL::Matrices::copySparseMatrix( m2, m1 );
		checkMatrix( m2 );
		}


		TEST( SparseMatrixCopyTest, CSR_HostToHost )
		{
		testMatrixCopy< CSR_host, CSR_host >();
		testCopyAssignment< CSR_host, CSR_host >();
		}

		#ifdef HAVE_CUDA
		TEST( SparseMatrixCopyTest, CSR_HostToCuda )
		{
		testMatrixCopy< CSR_host, CSR_cuda >();
		testCopyAssignment< CSR_host, CSR_cuda >();
		}

		TEST( SparseMatrixCopyTest, CSR_CudaToHost )
		{
		testMatrixCopy< CSR_cuda, CSR_host >();
		testCopyAssignment< CSR_cuda, CSR_host >();
		}

		TEST( SparseMatrixCopyTest, CSR_CudaToCuda )
		{
		testMatrixCopy< CSR_cuda, CSR_cuda >();
		testCopyAssignment< CSR_cuda, CSR_cuda >();
		}
		#endif


		TEST( SparseMatrixCopyTest, Ellpack_HostToHost )
		{
		testMatrixCopy< E_host, E_host >();
		testCopyAssignment< E_host, E_host >();
		}

		#ifdef HAVE_CUDA
		TEST( SparseMatrixCopyTest, Ellpack_HostToCuda )
		{
		testMatrixCopy< E_host, E_cuda >();
		testCopyAssignment< E_host, E_cuda >();
		}

		TEST( SparseMatrixCopyTest, Ellpack_CudaToHost )
		{
		testMatrixCopy< E_cuda, E_host >();
		testCopyAssignment< E_cuda, E_host >();
		}

		TEST( SparseMatrixCopyTest, Ellpack_CudaToCuda )
		{
		testMatrixCopy< E_cuda, E_cuda >();
		testCopyAssignment< E_cuda, E_cuda >();
		}
		#endif


		TEST( SparseMatrixCopyTest, SlicedEllpack_HostToHost )
		{
		testMatrixCopy< SE_host, SE_host >();
		testCopyAssignment< SE_host, SE_host >();
		}

		#ifdef HAVE_CUDA
		TEST( SparseMatrixCopyTest, SlicedEllpack_HostToCuda )
		{
		testMatrixCopy< SE_host, SE_cuda >();
		testCopyAssignment< SE_host, SE_cuda >();
		}

		TEST( SparseMatrixCopyTest, SlicedEllpack_CudaToHost )
		{
		testMatrixCopy< SE_cuda, SE_host >();
		testCopyAssignment< SE_cuda, SE_host >();
		}

		TEST( SparseMatrixCopyTest, SlicedEllpack_CudaToCuda )
		{
		testMatrixCopy< SE_cuda, SE_cuda >();
		testCopyAssignment< SE_cuda, SE_cuda >();
		}
		#endif


		// test conversion between formats
		TEST( SparseMatrixCopyTest, CSR_to_Ellpack_host )
		{
		testConversion< CSR_host, E_host >();
		}

		TEST( SparseMatrixCopyTest, Ellpack_to_CSR_host )
		{
		testConversion< E_host, CSR_host >();
		}

		TEST( SparseMatrixCopyTest, CSR_to_SlicedEllpack_host )
		{
		testConversion< CSR_host, SE_host >();
		}

		TEST( SparseMatrixCopyTest, SlicedEllpack_to_CSR_host )
		{
		testConversion< SE_host, CSR_host >();
		}

		TEST( SparseMatrixCopyTest, Ellpack_to_SlicedEllpack_host )
		{
		testConversion< E_host, SE_host >();
		}

		TEST( SparseMatrixCopyTest, SlicedEllpack_to_Ellpack_host )
		{
		testConversion< SE_host, E_host >();
		}

		#ifdef HAVE_CUDA
		TEST( SparseMatrixCopyTest, CSR_to_Ellpack_cuda )
		{
		testConversion< CSR_cuda, E_cuda >();
		}

		TEST( SparseMatrixCopyTest, Ellpack_to_CSR_cuda )
		{
		testConversion< E_cuda, CSR_cuda >();
		}

		TEST( SparseMatrixCopyTest, CSR_to_SlicedEllpack_cuda )
		{
		testConversion< CSR_cuda, SE_cuda >();
		}

		TEST( SparseMatrixCopyTest, SlicedEllpack_to_CSR_cuda )
		{
		testConversion< SE_cuda, CSR_cuda >();
		}

		TEST( SparseMatrixCopyTest, Ellpack_to_SlicedEllpack_cuda )
		{
		testConversion< E_cuda, SE_cuda >();
		}

		TEST( SparseMatrixCopyTest, SlicedEllpack_to_Ellpack_cuda )
		{
		testConversion< SE_cuda, E_cuda >();
		}
		#endif

tests/benchmarks/tnl-benchmark-spmv.h

+233 −274

Original line number	Diff line number	Diff line
		@@ -326,7 +326,6 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
		return false;
		}
		const int rows = csrMatrix.getRows();
		const int columns = csrMatrix.getColumns();
		const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements();
		Containers::Vector< int, Devices::Host, int > rowLengthsHost;
		rowLengthsHost.setSize( rows );
		@@ -363,13 +362,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
		typedef CSR< Real, Devices::Cuda, int > CSRCudaType;
		CSRCudaType cudaCSR;
		//cout << "Copying matrix to GPU... ";
		if( ! cudaCSR.copyFrom( csrMatrix, rowLengthsCuda ) )
		{
		std::cerr << "I am not able to transfer the matrix on GPU." << std::endl;
		writeTestFailed( logFile, 21 );
		}
		else
		{
		cudaCSR = csrMatrix;
		::tnlCusparseCSR< Real > cusparseCSR;
		cusparseCSR.init( cudaCSR, &cusparseHandle );
		benchmarkMatrix( cusparseCSR,
		@@ -516,7 +509,6 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
		baseline,
		verbose,
		logFile );*/
		}
		cudaCSR.reset();
		#endif

		@@ -524,10 +516,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
		double padding;
		typedef Ellpack< Real, Devices::Host, int > EllpackType;
		EllpackType ellpackMatrix;
		if( ! ellpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) )
		writeTestFailed( logFile, 7 );
		else
		{
		Matrices::copySparseMatrix( ellpackMatrix, csrMatrix );
		allocatedElements = ellpackMatrix.getNumberOfMatrixElements();
		padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
		logFile << " " << padding << std::endl;
		@@ -544,13 +533,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
		typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType;
		EllpackCudaType cudaEllpack;
		std::cout << "Copying matrix to GPU... ";
		if( ! cudaEllpack.copyFrom( ellpackMatrix, rowLengthsCuda ) )
		{
		std::cerr << "I am not able to transfer the matrix on GPU." << std::endl;
		writeTestFailed( logFile, 3 );
		}
		else
		{
		cudaEllpack = ellpackMatrix;
		std::cout << " done. \r";
		benchmarkMatrix( cudaEllpack,
		cudaX,
		@@ -561,18 +544,13 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
		baseline,
		verbose,
		logFile );
		}
		cudaEllpack.reset();
		#endif
		ellpackMatrix.reset();
		}

		typedef SlicedEllpack< Real, Devices::Host, int > SlicedEllpackType;
		SlicedEllpackType slicedEllpack;
		if( ! slicedEllpack.copyFrom( csrMatrix, rowLengthsHost ) )
		writeTestFailed( logFile, 7 );
		else
		{
		Matrices::copySparseMatrix( slicedEllpack, csrMatrix );
		allocatedElements = slicedEllpack.getNumberOfMatrixElements();
		padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
		logFile << " " << padding << std::endl;
		@@ -589,13 +567,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
		typedef SlicedEllpack< Real, Devices::Cuda, int > SlicedEllpackCudaType;
		SlicedEllpackCudaType cudaSlicedEllpack;
		std::cout << "Copying matrix to GPU... ";
		if( ! cudaSlicedEllpack.copyFrom( slicedEllpack, rowLengthsCuda ) )
		{
		std::cerr << "I am not able to transfer the matrix on GPU." << std::endl;
		writeTestFailed( logFile, 3 );
		}
		else
		{
		cudaSlicedEllpack = slicedEllpack;
		std::cout << " done. \r";
		benchmarkMatrix( cudaSlicedEllpack,
		cudaX,
		@@ -606,18 +578,13 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
		baseline,
		verbose,
		logFile );
		}
		cudaSlicedEllpack.reset();
		#endif
		slicedEllpack.reset();
		}

		typedef ChunkedEllpack< Real, Devices::Host, int > ChunkedEllpackType;
		ChunkedEllpackType chunkedEllpack;
		if( ! chunkedEllpack.copyFrom( csrMatrix, rowLengthsHost ) )
		writeTestFailed( logFile, 7 );
		else
		{
		Matrices::copySparseMatrix( chunkedEllpack, csrMatrix );
		allocatedElements = chunkedEllpack.getNumberOfMatrixElements();
		padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0;
		logFile << " " << padding << std::endl;
		@@ -634,13 +601,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
		typedef ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType;
		ChunkedEllpackCudaType cudaChunkedEllpack;
		std::cout << "Copying matrix to GPU... ";
		if( ! cudaChunkedEllpack.copyFrom( chunkedEllpack, rowLengthsCuda ) )
		{
		std::cerr << "I am not able to transfer the matrix on GPU." << std::endl;
		writeTestFailed( logFile, 3 );
		}
		else
		{
		cudaChunkedEllpack = chunkedEllpack;
		std::cout << " done. \r";
		benchmarkMatrix( cudaChunkedEllpack,
		cudaX,
		@@ -651,12 +612,10 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
		baseline,
		verbose,
		logFile );
		}
		cudaChunkedEllpack.reset();
		#endif
		chunkedEllpack.reset();
		}
		}
		return true;
		}