Refactoring: added SparseOperations.h containing the copySparseMatrix function (a813195e) · Commits · TNL / tnl-dev

src/TNL/Matrices/CMakeLists.txt

+2 −0

Original line number	Diff line number	Diff line
		@@ -14,6 +14,8 @@ SET( headers AdEllpack.h
		Multidiagonal_impl.h
		Sparse.h
		Sparse_impl.h
		SparseOperations.h
		SparseOperations_impl.h
		Ellpack.h
		Ellpack_impl.h
		SlicedEllpack.h

src/TNL/Matrices/Sparse.h

+2 −8

Original line number	Diff line number	Diff line
		@@ -62,14 +62,8 @@ class Sparse : public Matrix< Real, Device, Index >
		Index maxRowLength;
		};


		// This cannot be a method of the Sparse class, because the implementation uses
		// methods (marked with __cuda_callable__) which are defined only on the
		// subclasses, but are not virtual methods of Sparse.
		template< typename Matrix1, typename Matrix2 >
		void copySparseMatrix( Matrix1& A, const Matrix2& B );

		} // namespace Matrices
		} // namespace TNL

		#include <TNL/Matrices/Sparse_impl.h>
		#include <TNL/Matrices/SparseOperations.h>

src/TNL/Matrices/SparseOperations.h

0 → 100644

+28 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		SparseOperations.h - description
		-------------------
		begin : Oct 4, 2018
		copyright : (C) 2018 by Tomas Oberhuber et al.
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		// Implemented by: Jakub Klinkovský

		// Note that these functions cannot be methods of the Sparse class, because
		// their implementation uses methods (marked with __cuda_callable__) which are
		// defined only on the subclasses, but are not virtual methods of Sparse.

		#pragma once

		namespace TNL {
		namespace Matrices {

		template< typename Matrix1, typename Matrix2 >
		void copySparseMatrix( Matrix1& A, const Matrix2& B );

		} // namespace Matrices
		} // namespace TNL

		#include "SparseOperations_impl.h"

src/TNL/Matrices/SparseOperations_impl.h

0 → 100644

+161 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		SparseOperations_impl.h - description
		-------------------
		begin : Oct 4, 2018
		copyright : (C) 2018 by Tomas Oberhuber et al.
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		// Implemented by: Jakub Klinkovský

		#pragma once

		#include <TNL/Pointers/DevicePointer.h>

		namespace TNL {
		namespace Matrices {

		#ifdef HAVE_CUDA
		template< typename Vector, typename Matrix >
		__global__ void
		SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
		const Matrix* matrix,
		typename Matrix::IndexType rows,
		typename Matrix::IndexType cols )
		{
		using IndexType = typename Matrix::IndexType;

		IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
		const IndexType gridSize = blockDim.x * gridDim.x;

		while( rowIdx < rows ) {
		const auto max_length = matrix->getRowLengthFast( rowIdx );
		const auto row = matrix->getRow( rowIdx );
		IndexType length = 0;
		for( IndexType c_j = 0; c_j < max_length; c_j++ )
		if( row.getElementColumn( c_j ) < cols )
		length++;
		else
		break;
		rowLengths[ rowIdx ] = length;
		rowIdx += gridSize;
		}
		}

		template< typename Matrix1, typename Matrix2 >
		__global__ void
		SparseMatrixCopyKernel( Matrix1* A,
		const Matrix2* B,
		const typename Matrix2::IndexType* rowLengths,
		typename Matrix2::IndexType rows )
		{
		using IndexType = typename Matrix2::IndexType;

		IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
		const IndexType gridSize = blockDim.x * gridDim.x;

		while( rowIdx < rows ) {
		const auto length = rowLengths[ rowIdx ];
		const auto rowB = B->getRow( rowIdx );
		auto rowA = A->getRow( rowIdx );
		for( IndexType c = 0; c < length; c++ )
		rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
		rowIdx += gridSize;
		}
		}
		#endif


		template< typename Matrix1, typename Matrix2 >
		void
		copySparseMatrix( Matrix1& A, const Matrix2& B )
		{
		static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value,
		"The matrices must have the same RealType." );
		static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value,
		"The matrices must be allocated on the same device." );
		static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value,
		"The matrices must have the same IndexType." );

		using RealType = typename Matrix1::RealType;
		using DeviceType = typename Matrix1::DeviceType;
		using IndexType = typename Matrix1::IndexType;

		const IndexType rows = B.getRows();
		const IndexType cols = B.getColumns();

		A.setDimensions( rows, cols );

		if( std::is_same< DeviceType, Devices::Host >::value ) {
		// set row lengths
		typename Matrix1::CompressedRowLengthsVector rowLengths;
		rowLengths.setSize( rows );
		#ifdef HAVE_OPENMP
		#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
		#endif
		for( IndexType i = 0; i < rows; i++ ) {
		const auto max_length = B.getRowLength( i );
		const auto row = B.getRow( i );
		IndexType length = 0;
		for( IndexType c_j = 0; c_j < max_length; c_j++ )
		if( row.getElementColumn( c_j ) < cols )
		length++;
		else
		break;
		rowLengths[ i ] = length;
		}
		A.setCompressedRowLengths( rowLengths );

		#ifdef HAVE_OPENMP
		#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
		#endif
		for( IndexType i = 0; i < rows; i++ ) {
		const auto length = rowLengths[ i ];
		const auto rowB = B.getRow( i );
		auto rowA = A.getRow( i );
		for( IndexType c = 0; c < length; c++ )
		rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
		}
		}

		if( std::is_same< DeviceType, Devices::Cuda >::value ) {
		#ifdef HAVE_CUDA
		dim3 blockSize( 256 );
		dim3 gridSize;
		const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
		gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );

		typename Matrix1::CompressedRowLengthsVector rowLengths;
		rowLengths.setSize( rows );

		Pointers::DevicePointer< Matrix1 > Apointer( A );
		const Pointers::DevicePointer< const Matrix2 > Bpointer( B );

		// set row lengths
		Devices::Cuda::synchronizeDevice();
		SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
		rowLengths.getData(),
		&Bpointer.template getData< TNL::Devices::Cuda >(),
		rows,
		cols );
		TNL_CHECK_CUDA_DEVICE;
		Apointer->setCompressedRowLengths( rowLengths );

		// copy rows
		Devices::Cuda::synchronizeDevice();
		SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
		&Apointer.template modifyData< TNL::Devices::Cuda >(),
		&Bpointer.template getData< TNL::Devices::Cuda >(),
		rowLengths.getData(),
		rows );
		TNL_CHECK_CUDA_DEVICE;
		#else
		throw Exceptions::CudaSupportMissing();
		#endif
		}
		}

		} // namespace Matrices
		} // namespace TNL

src/TNL/Matrices/Sparse_impl.h

+0 −141

Original line number	Diff line number	Diff line
		@@ -11,7 +11,6 @@
		#pragma once

		#include "Sparse.h"
		#include <TNL/Pointers/DevicePointer.h>

		namespace TNL {
		namespace Matrices {
		@@ -134,145 +133,5 @@ void Sparse< Real, Device, Index >::printStructure( std::ostream& str ) const
		TNL_ASSERT_TRUE( false, "Not implemented yet." );
		}


		#ifdef HAVE_CUDA
		template< typename Vector, typename Matrix >
		__global__ void
		SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
		const Matrix* matrix,
		typename Matrix::IndexType rows,
		typename Matrix::IndexType cols )
		{
		using IndexType = typename Matrix::IndexType;

		IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
		const IndexType gridSize = blockDim.x * gridDim.x;

		while( rowIdx < rows ) {
		const auto max_length = matrix->getRowLengthFast( rowIdx );
		const auto row = matrix->getRow( rowIdx );
		IndexType length = 0;
		for( IndexType c_j = 0; c_j < max_length; c_j++ )
		if( row.getElementColumn( c_j ) < cols )
		length++;
		else
		break;
		rowLengths[ rowIdx ] = length;
		rowIdx += gridSize;
		}
		}

		template< typename Matrix1, typename Matrix2 >
		__global__ void
		SparseMatrixCopyKernel( Matrix1* A,
		const Matrix2* B,
		const typename Matrix2::IndexType* rowLengths,
		typename Matrix2::IndexType rows )
		{
		using IndexType = typename Matrix2::IndexType;

		IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
		const IndexType gridSize = blockDim.x * gridDim.x;

		while( rowIdx < rows ) {
		const auto length = rowLengths[ rowIdx ];
		const auto rowB = B->getRow( rowIdx );
		auto rowA = A->getRow( rowIdx );
		for( IndexType c = 0; c < length; c++ )
		rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
		rowIdx += gridSize;
		}
		}
		#endif

		template< typename Matrix1, typename Matrix2 >
		void
		copySparseMatrix( Matrix1& A, const Matrix2& B )
		{
		static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value,
		"The matrices must have the same RealType." );
		static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value,
		"The matrices must be allocated on the same device." );
		static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value,
		"The matrices must have the same IndexType." );

		using RealType = typename Matrix1::RealType;
		using DeviceType = typename Matrix1::DeviceType;
		using IndexType = typename Matrix1::IndexType;

		const IndexType rows = B.getRows();
		const IndexType cols = B.getColumns();

		A.setDimensions( rows, cols );

		if( std::is_same< DeviceType, Devices::Host >::value ) {
		// set row lengths
		typename Matrix1::CompressedRowLengthsVector rowLengths;
		rowLengths.setSize( rows );
		#ifdef HAVE_OPENMP
		#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
		#endif
		for( IndexType i = 0; i < rows; i++ ) {
		const auto max_length = B.getRowLength( i );
		const auto row = B.getRow( i );
		IndexType length = 0;
		for( IndexType c_j = 0; c_j < max_length; c_j++ )
		if( row.getElementColumn( c_j ) < cols )
		length++;
		else
		break;
		rowLengths[ i ] = length;
		}
		A.setCompressedRowLengths( rowLengths );

		#ifdef HAVE_OPENMP
		#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
		#endif
		for( IndexType i = 0; i < rows; i++ ) {
		const auto length = rowLengths[ i ];
		const auto rowB = B.getRow( i );
		auto rowA = A.getRow( i );
		for( IndexType c = 0; c < length; c++ )
		rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
		}
		}

		if( std::is_same< DeviceType, Devices::Cuda >::value ) {
		#ifdef HAVE_CUDA
		dim3 blockSize( 256 );
		dim3 gridSize;
		const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
		gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );

		typename Matrix1::CompressedRowLengthsVector rowLengths;
		rowLengths.setSize( rows );

		Pointers::DevicePointer< Matrix1 > Apointer( A );
		const Pointers::DevicePointer< const Matrix2 > Bpointer( B );

		// set row lengths
		Devices::Cuda::synchronizeDevice();
		SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
		rowLengths.getData(),
		&Bpointer.template getData< TNL::Devices::Cuda >(),
		rows,
		cols );
		TNL_CHECK_CUDA_DEVICE;
		Apointer->setCompressedRowLengths( rowLengths );

		// copy rows
		Devices::Cuda::synchronizeDevice();
		SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
		&Apointer.template modifyData< TNL::Devices::Cuda >(),
		&Bpointer.template getData< TNL::Devices::Cuda >(),
		rowLengths.getData(),
		rows );
		TNL_CHECK_CUDA_DEVICE;
		#else
		throw Exceptions::CudaSupportMissing();
		#endif
		}
		}

		} // namespace Matrices
		} // namespace TNL