diff --git a/src/TNL/Matrices/CMakeLists.txt b/src/TNL/Matrices/CMakeLists.txt
index 82c8924bf762b89b8b3378c52acbb51e1ae1772f..f60a43c87d951152cda5491c23e6626a25d5bd2e 100644
--- a/src/TNL/Matrices/CMakeLists.txt
+++ b/src/TNL/Matrices/CMakeLists.txt
@@ -14,6 +14,8 @@ SET( headers AdEllpack.h
              Multidiagonal_impl.h
              Sparse.h
              Sparse_impl.h
+             SparseOperations.h
+             SparseOperations_impl.h
              Ellpack.h
              Ellpack_impl.h
              SlicedEllpack.h
diff --git a/src/TNL/Matrices/Sparse.h b/src/TNL/Matrices/Sparse.h
index 110dd7a40a57372c9fc65ea94c6bac1adf3c0f54..2ee49219ee2d8fa8be6662c417dd68c3c3a6c690 100644
--- a/src/TNL/Matrices/Sparse.h
+++ b/src/TNL/Matrices/Sparse.h
@@ -14,7 +14,7 @@
 #include <TNL/Matrices/SparseRow.h>
 
 namespace TNL {
-namespace Matrices { 
+namespace Matrices {
 template< typename Real,
           typename Device,
           typename Index >
@@ -62,14 +62,8 @@ class Sparse : public Matrix< Real, Device, Index >
    Index maxRowLength;
 };
 
-
-// This cannot be a method of the Sparse class, because the implementation uses
-// methods (marked with __cuda_callable__) which are defined only on the
-// subclasses, but are not virtual methods of Sparse.
-template< typename Matrix1, typename Matrix2 >
-void copySparseMatrix( Matrix1& A, const Matrix2& B );
-
 } // namespace Matrices
 } // namespace TNL
 
 #include <TNL/Matrices/Sparse_impl.h>
+#include <TNL/Matrices/SparseOperations.h>
diff --git a/src/TNL/Matrices/SparseOperations.h b/src/TNL/Matrices/SparseOperations.h
new file mode 100644
index 0000000000000000000000000000000000000000..e375d596ad439520289bbf6d5e3a0006dc34d2c6
--- /dev/null
+++ b/src/TNL/Matrices/SparseOperations.h
@@ -0,0 +1,28 @@
+/***************************************************************************
+                          SparseOperations.h  -  description
+                             -------------------
+    begin                : Oct 4, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovský
+
+// Note that these functions cannot be methods of the Sparse class, because
+// their implementation uses methods (marked with __cuda_callable__) which are
+// defined only on the subclasses, but are not virtual methods of Sparse.
+
+#pragma once
+
+namespace TNL {
+namespace Matrices {
+
+template< typename Matrix1, typename Matrix2 >
+void copySparseMatrix( Matrix1& A, const Matrix2& B );
+
+} // namespace Matrices
+} // namespace TNL
+
+#include "SparseOperations_impl.h"
diff --git a/src/TNL/Matrices/SparseOperations_impl.h b/src/TNL/Matrices/SparseOperations_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..0fbb02005d1fe070087d5eceb180efee0daea166
--- /dev/null
+++ b/src/TNL/Matrices/SparseOperations_impl.h
@@ -0,0 +1,161 @@
+/***************************************************************************
+                          SparseOperations_impl.h  -  description
+                             -------------------
+    begin                : Oct 4, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovský
+
+#pragma once
+
+#include <TNL/Pointers/DevicePointer.h>
+
+namespace TNL {
+namespace Matrices {
+
+#ifdef HAVE_CUDA
+template< typename Vector, typename Matrix >
+__global__ void
+SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
+                                       const Matrix* matrix,
+                                       typename Matrix::IndexType rows,
+                                       typename Matrix::IndexType cols )
+{
+   using IndexType = typename Matrix::IndexType;
+
+   IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+   const IndexType gridSize = blockDim.x * gridDim.x;
+
+   while( rowIdx < rows ) {
+      const auto max_length = matrix->getRowLengthFast( rowIdx );
+      const auto row = matrix->getRow( rowIdx );
+      IndexType length = 0;
+      for( IndexType c_j = 0; c_j < max_length; c_j++ )
+         if( row.getElementColumn( c_j ) < cols )
+            length++;
+         else
+            break;
+      rowLengths[ rowIdx ] = length;
+      rowIdx += gridSize;
+   }
+}
+
+template< typename Matrix1, typename Matrix2 >
+__global__ void
+SparseMatrixCopyKernel( Matrix1* A,
+                        const Matrix2* B,
+                        const typename Matrix2::IndexType* rowLengths,
+                        typename Matrix2::IndexType rows )
+{
+   using IndexType = typename Matrix2::IndexType;
+
+   IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+   const IndexType gridSize = blockDim.x * gridDim.x;
+
+   while( rowIdx < rows ) {
+      const auto length = rowLengths[ rowIdx ];
+      const auto rowB = B->getRow( rowIdx );
+      auto rowA = A->getRow( rowIdx );
+      for( IndexType c = 0; c < length; c++ )
+         rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
+      rowIdx += gridSize;
+   }
+}
+#endif
+
+
+template< typename Matrix1, typename Matrix2 >
+void
+copySparseMatrix( Matrix1& A, const Matrix2& B )
+{
+   static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value,
+                  "The matrices must have the same RealType." );
+   static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value,
+                  "The matrices must be allocated on the same device." );
+   static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value,
+                  "The matrices must have the same IndexType." );
+
+   using RealType = typename Matrix1::RealType;
+   using DeviceType = typename Matrix1::DeviceType;
+   using IndexType = typename Matrix1::IndexType;
+
+   const IndexType rows = B.getRows();
+   const IndexType cols = B.getColumns();
+
+   A.setDimensions( rows, cols );
+
+   if( std::is_same< DeviceType, Devices::Host >::value ) {
+      // set row lengths
+      typename Matrix1::CompressedRowLengthsVector rowLengths;
+      rowLengths.setSize( rows );
+#ifdef HAVE_OPENMP
+#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
+#endif
+      for( IndexType i = 0; i < rows; i++ ) {
+         const auto max_length = B.getRowLength( i );
+         const auto row = B.getRow( i );
+         IndexType length = 0;
+         for( IndexType c_j = 0; c_j < max_length; c_j++ )
+            if( row.getElementColumn( c_j ) < cols )
+               length++;
+            else
+               break;
+         rowLengths[ i ] = length;
+      }
+      A.setCompressedRowLengths( rowLengths );
+
+#ifdef HAVE_OPENMP
+#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
+#endif
+      for( IndexType i = 0; i < rows; i++ ) {
+         const auto length = rowLengths[ i ];
+         const auto rowB = B.getRow( i );
+         auto rowA = A.getRow( i );
+         for( IndexType c = 0; c < length; c++ )
+            rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
+      }
+   }
+
+   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
+#ifdef HAVE_CUDA
+      dim3 blockSize( 256 );
+      dim3 gridSize;
+      const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
+      gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );
+
+      typename Matrix1::CompressedRowLengthsVector rowLengths;
+      rowLengths.setSize( rows );
+
+      Pointers::DevicePointer< Matrix1 > Apointer( A );
+      const Pointers::DevicePointer< const Matrix2 > Bpointer( B );
+
+      // set row lengths
+      Devices::Cuda::synchronizeDevice();
+      SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
+            rowLengths.getData(),
+            &Bpointer.template getData< TNL::Devices::Cuda >(),
+            rows,
+            cols );
+      TNL_CHECK_CUDA_DEVICE;
+      Apointer->setCompressedRowLengths( rowLengths );
+
+      // copy rows
+      Devices::Cuda::synchronizeDevice();
+      SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
+            &Apointer.template modifyData< TNL::Devices::Cuda >(),
+            &Bpointer.template getData< TNL::Devices::Cuda >(),
+            rowLengths.getData(),
+            rows );
+      TNL_CHECK_CUDA_DEVICE;
+#else
+      throw Exceptions::CudaSupportMissing();
+#endif
+   }
+}
+
+} // namespace Matrices
+} // namespace TNL
diff --git a/src/TNL/Matrices/Sparse_impl.h b/src/TNL/Matrices/Sparse_impl.h
index ba4558d3f989374f65c12ace41391323e5a6adba..cd57637cbd773dc2ce25a13ad9b39191a7d2507b 100644
--- a/src/TNL/Matrices/Sparse_impl.h
+++ b/src/TNL/Matrices/Sparse_impl.h
@@ -11,7 +11,6 @@
 #pragma once
 
 #include "Sparse.h"
-#include <TNL/Pointers/DevicePointer.h>
 
 namespace TNL {
 namespace Matrices {
@@ -134,145 +133,5 @@ void Sparse< Real, Device, Index >::printStructure( std::ostream& str ) const
    TNL_ASSERT_TRUE( false, "Not implemented yet." );
 }
 
-
-#ifdef HAVE_CUDA
-template< typename Vector, typename Matrix >
-__global__ void
-SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
-                                       const Matrix* matrix,
-                                       typename Matrix::IndexType rows,
-                                       typename Matrix::IndexType cols )
-{
-   using IndexType = typename Matrix::IndexType;
-
-   IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-   const IndexType gridSize = blockDim.x * gridDim.x;
-
-   while( rowIdx < rows ) {
-      const auto max_length = matrix->getRowLengthFast( rowIdx );
-      const auto row = matrix->getRow( rowIdx );
-      IndexType length = 0;
-      for( IndexType c_j = 0; c_j < max_length; c_j++ )
-         if( row.getElementColumn( c_j ) < cols )
-            length++;
-         else
-            break;
-      rowLengths[ rowIdx ] = length;
-      rowIdx += gridSize;
-   }
-}
-
-template< typename Matrix1, typename Matrix2 >
-__global__ void
-SparseMatrixCopyKernel( Matrix1* A,
-                        const Matrix2* B,
-                        const typename Matrix2::IndexType* rowLengths,
-                        typename Matrix2::IndexType rows )
-{
-   using IndexType = typename Matrix2::IndexType;
-
-   IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-   const IndexType gridSize = blockDim.x * gridDim.x;
-
-   while( rowIdx < rows ) {
-      const auto length = rowLengths[ rowIdx ];
-      const auto rowB = B->getRow( rowIdx );
-      auto rowA = A->getRow( rowIdx );
-      for( IndexType c = 0; c < length; c++ )
-         rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
-      rowIdx += gridSize;
-   }
-}
-#endif
-
-template< typename Matrix1, typename Matrix2 >
-void
-copySparseMatrix( Matrix1& A, const Matrix2& B )
-{
-   static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value,
-                  "The matrices must have the same RealType." );
-   static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value,
-                  "The matrices must be allocated on the same device." );
-   static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value,
-                  "The matrices must have the same IndexType." );
-
-   using RealType = typename Matrix1::RealType;
-   using DeviceType = typename Matrix1::DeviceType;
-   using IndexType = typename Matrix1::IndexType;
-
-   const IndexType rows = B.getRows();
-   const IndexType cols = B.getColumns();
-
-   A.setDimensions( rows, cols );
-
-   if( std::is_same< DeviceType, Devices::Host >::value ) {
-      // set row lengths
-      typename Matrix1::CompressedRowLengthsVector rowLengths;
-      rowLengths.setSize( rows );
-#ifdef HAVE_OPENMP
-#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
-#endif
-      for( IndexType i = 0; i < rows; i++ ) {
-         const auto max_length = B.getRowLength( i );
-         const auto row = B.getRow( i );
-         IndexType length = 0;
-         for( IndexType c_j = 0; c_j < max_length; c_j++ )
-            if( row.getElementColumn( c_j ) < cols )
-               length++;
-            else
-               break;
-         rowLengths[ i ] = length;
-      }
-      A.setCompressedRowLengths( rowLengths );
-
-#ifdef HAVE_OPENMP
-#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
-#endif
-      for( IndexType i = 0; i < rows; i++ ) {
-         const auto length = rowLengths[ i ];
-         const auto rowB = B.getRow( i );
-         auto rowA = A.getRow( i );
-         for( IndexType c = 0; c < length; c++ )
-            rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
-      }
-   }
-
-   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
-#ifdef HAVE_CUDA
-      dim3 blockSize( 256 );
-      dim3 gridSize;
-      const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
-      gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );
-
-      typename Matrix1::CompressedRowLengthsVector rowLengths;
-      rowLengths.setSize( rows );
-
-      Pointers::DevicePointer< Matrix1 > Apointer( A );
-      const Pointers::DevicePointer< const Matrix2 > Bpointer( B );
-
-      // set row lengths
-      Devices::Cuda::synchronizeDevice();
-      SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
-            rowLengths.getData(),
-            &Bpointer.template getData< TNL::Devices::Cuda >(),
-            rows,
-            cols );
-      TNL_CHECK_CUDA_DEVICE;
-      Apointer->setCompressedRowLengths( rowLengths );
-
-      // copy rows
-      Devices::Cuda::synchronizeDevice();
-      SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
-            &Apointer.template modifyData< TNL::Devices::Cuda >(),
-            &Bpointer.template getData< TNL::Devices::Cuda >(),
-            rowLengths.getData(),
-            rows );
-      TNL_CHECK_CUDA_DEVICE;
-#else
-      throw Exceptions::CudaSupportMissing();
-#endif
-   }
-}
-
 } // namespace Matrices
 } // namespace TNL
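
For reference, a minimal usage sketch of the relocated copySparseMatrix() free function, not part of the patch itself. It assumes the existing CSR and Ellpack formats from this directory as the concrete Sparse subclasses and their usual setters (setElement() on the matrix, setValue() on the row-lengths vector); only setDimensions() and setCompressedRowLengths() are confirmed by the code above, so treat the rest as illustrative:

// Illustrative sketch: convert a host CSR matrix to Ellpack via copySparseMatrix().
// CSR, Ellpack and their setElement() are assumed from the existing TNL headers.
#include <TNL/Devices/Host.h>
#include <TNL/Matrices/CSR.h>      // pulls in Sparse.h, which now includes SparseOperations.h
#include <TNL/Matrices/Ellpack.h>

int main()
{
   using CSR = TNL::Matrices::CSR< double, TNL::Devices::Host, int >;
   using Ellpack = TNL::Matrices::Ellpack< double, TNL::Devices::Host, int >;

   // build a small 3x3 identity matrix in the CSR format
   CSR B;
   B.setDimensions( 3, 3 );
   CSR::CompressedRowLengthsVector rowLengths;
   rowLengths.setSize( 3 );
   rowLengths.setValue( 1 );
   B.setCompressedRowLengths( rowLengths );
   for( int i = 0; i < 3; i++ )
      B.setElement( i, i, 1.0 );

   // convert to the Ellpack format; copySparseMatrix() sets the dimensions
   // and row capacities of A before copying the rows
   Ellpack A;
   TNL::Matrices::copySparseMatrix( A, B );
}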