diff --git a/src/TNL/Containers/Segments/Ellpack.h b/src/TNL/Containers/Segments/Ellpack.h index 772566f5181d2de3c3dfa84b5067b8cfe0c857cc..dc1a717b3a0822e60c9e098795fdcfc2507e5885 100644 --- a/src/TNL/Containers/Segments/Ellpack.h +++ b/src/TNL/Containers/Segments/Ellpack.h @@ -28,6 +28,7 @@ class Ellpack using IndexType = Index; using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >; static constexpr int getAlignment() { return Alignment; } + static constexpr bool getRowMajorOrder() { return RowMajorOrder; } Ellpack(); diff --git a/src/TNL/Containers/Segments/Ellpack.hpp b/src/TNL/Containers/Segments/Ellpack.hpp index 42d7eb8c1c5a83f69e49fd7655e6904ad953a340..8a23693ec8f3d38effac6a3de3d0fb24a8838531 100644 --- a/src/TNL/Containers/Segments/Ellpack.hpp +++ b/src/TNL/Containers/Segments/Ellpack.hpp @@ -63,7 +63,7 @@ setSizes( const SizesHolder& sizes ) if( RowMajorOrder ) this->alignedSize = this->size; else - this->alignedSize = roundUpDivision( size / this->getAlignment() ) * this->getAlignment(); + this->alignedSize = roundUpDivision( size, this->getAlignment() ) * this->getAlignment(); } template< typename Device, @@ -186,17 +186,35 @@ void Ellpack< Device, Index, RowMajorOrder, Alignment >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { - using RealType = decltype( fetch( IndexType(), IndexType() ) ); - const auto offsetsView = this->offsets.getConstView(); - auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { - const IndexType begin = offsetsView[ i ]; - const IndexType end = offsetsView[ i + 1 ]; - RealType aux( zero ); - for( IndexType j = begin; j < end; j++ ) - reduction( aux, fetch( i, j, args... ) ); - keeper( i, aux ); - }; - Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); + if( RowMajorOrder ) + { + using RealType = decltype( fetch( IndexType(), IndexType() ) ); + const IndexType segmentSize = this->segmentSize; + auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { + const IndexType begin = i * segmentSize; + const IndexType end = begin + segmentSize; + RealType aux( zero ); + for( IndexType j = begin; j < end; j++ ) + reduction( aux, fetch( i, j, args... ) ); + keeper( i, aux ); + }; + Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); + } + else + { + using RealType = decltype( fetch( IndexType(), IndexType() ) ); + const IndexType storageSize = this->getStorageSize(); + const IndexType alignedSize = this->alignedSize; + auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { + const IndexType begin = i; + const IndexType end = storageSize; + RealType aux( zero ); + for( IndexType j = begin; j < end; j += alignedSize ) + reduction( aux, fetch( i, j, args... ) ); + keeper( i, aux ); + }; + Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); + } } template< typename Device, @@ -219,7 +237,9 @@ void Ellpack< Device, Index, RowMajorOrder, Alignment >:: save( File& file ) const { - file << this->offsets; + file.save( &segmentSize ); + file.save( &size ); + file.save( &alignedSize ); } template< typename Device, @@ -230,7 +250,9 @@ void Ellpack< Device, Index, RowMajorOrder, Alignment >:: load( File& file ) { - file >> this->offsets; + file.load( &segmentSize ); + file.load( &size ); + file.load( &alignedSize ); } } // namespace Segments diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp index a43ddba8297eb1624db2b8113ce4349bfda42c8c..9bc8d7fb7355130847edbf3441b6856f98ed402e 100644 --- a/src/TNL/Matrices/SparseMatrix.hpp +++ b/src/TNL/Matrices/SparseMatrix.hpp @@ -489,12 +489,16 @@ vectorProduct( const InVector& inVector, const RealType& matrixMultiplicator, const RealType& inVectorAddition ) const { - auto inVectorView = inVector.getConstView(); + const auto inVectorView = inVector.getConstView(); auto outVectorView = outVector.getView(); - auto valuesView = this->values.getConstView(); - auto columnIndexesView = this->columnIndexes.getConstView(); + const auto valuesView = this->values.getConstView(); + const auto columnIndexesView = this->columnIndexes.getConstView(); + const IndexType paddingIndex = this->getPaddingIndex(); auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType offset ) -> RealType { - return valuesView[ offset ] * inVectorView[ columnIndexesView[ offset ] ]; + const IndexType column = columnIndexesView[ offset ]; + if( column == paddingIndex ) + return 0.0; + return valuesView[ offset ] * inVectorView[ column ]; }; auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) { sum += value; diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt index 996dd0430c4dbeec7b4131c9e748836c26d3495e..ef1f043710a792b30d74cad335e6659643d3554b 100644 --- a/src/UnitTests/Matrices/CMakeLists.txt +++ b/src/UnitTests/Matrices/CMakeLists.txt @@ -31,6 +31,9 @@ IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( SparseMatrixTest_CSR_segments SparseMatrixTest_CSR_segments.cu OPTIONS ${CXX_TESTS_FLAGS} ) TARGET_LINK_LIBRARIES( SparseMatrixTest_CSR_segments ${GTEST_BOTH_LIBRARIES} ) + CUDA_ADD_EXECUTABLE( SparseMatrixTest_Ellpack_segments SparseMatrixTest_Ellpack_segments.cu OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( SparseMatrixTest_Ellpack_segments ${GTEST_BOTH_LIBRARIES} ) + ELSE( BUILD_CUDA ) ADD_EXECUTABLE( SparseMatrixCopyTest SparseMatrixCopyTest.cpp ) TARGET_COMPILE_OPTIONS( SparseMatrixCopyTest PRIVATE ${CXX_TESTS_FLAGS} ) diff --git a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.h b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.h index 79cdf06cff8e65723b7c0224ff2310b1dd0621a2..c54aab9486fdecc8836faa5f4e2e5e11a7ce36f3 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.h @@ -26,34 +26,45 @@ protected: using EllpackMatrixType = Matrix; }; +//// +// Row-major format is used for the host system +template< typename Device, typename Index > +using RowMajorEllpack = TNL::Containers::Segments::Ellpack< Device, Index, true, 32 >; + + +//// +// Column-major format is used for GPUs +template< typename Device, typename Index > +using ColumnMajorEllpack = TNL::Containers::Segments::Ellpack< Device, Index, false, 32 >; + // types for which MatrixTest is instantiated using EllpackMatrixTypes = ::testing::Types < - TNL::Matrices::SparseMatrix< int, TNL::Containers::Segments::Ellpack, TNL::Devices::Host, short >, - TNL::Matrices::SparseMatrix< long, TNL::Containers::Segments::Ellpack, TNL::Devices::Host, short >, - TNL::Matrices::SparseMatrix< float, TNL::Containers::Segments::Ellpack, TNL::Devices::Host, short >, - TNL::Matrices::SparseMatrix< double, TNL::Containers::Segments::Ellpack, TNL::Devices::Host, short >, - TNL::Matrices::SparseMatrix< int, TNL::Containers::Segments::Ellpack, TNL::Devices::Host, int >, - TNL::Matrices::SparseMatrix< long, TNL::Containers::Segments::Ellpack, TNL::Devices::Host, int >, - TNL::Matrices::SparseMatrix< float, TNL::Containers::Segments::Ellpack, TNL::Devices::Host, int >, - TNL::Matrices::SparseMatrix< double, TNL::Containers::Segments::Ellpack, TNL::Devices::Host, int >, - TNL::Matrices::SparseMatrix< int, TNL::Containers::Segments::Ellpack, TNL::Devices::Host, long >, - TNL::Matrices::SparseMatrix< long, TNL::Containers::Segments::Ellpack, TNL::Devices::Host, long >, - TNL::Matrices::SparseMatrix< float, TNL::Containers::Segments::Ellpack, TNL::Devices::Host, long >, - TNL::Matrices::SparseMatrix< double, TNL::Containers::Segments::Ellpack, TNL::Devices::Host, long > + TNL::Matrices::SparseMatrix< int, RowMajorEllpack, TNL::Devices::Host, short >, + TNL::Matrices::SparseMatrix< long, RowMajorEllpack, TNL::Devices::Host, short >, + TNL::Matrices::SparseMatrix< float, RowMajorEllpack, TNL::Devices::Host, short >, + TNL::Matrices::SparseMatrix< double, RowMajorEllpack, TNL::Devices::Host, short >, + TNL::Matrices::SparseMatrix< int, RowMajorEllpack, TNL::Devices::Host, int >, + TNL::Matrices::SparseMatrix< long, RowMajorEllpack, TNL::Devices::Host, int >, + TNL::Matrices::SparseMatrix< float, RowMajorEllpack, TNL::Devices::Host, int >, + TNL::Matrices::SparseMatrix< double, RowMajorEllpack, TNL::Devices::Host, int >, + TNL::Matrices::SparseMatrix< int, RowMajorEllpack, TNL::Devices::Host, long >, + TNL::Matrices::SparseMatrix< long, RowMajorEllpack, TNL::Devices::Host, long >, + TNL::Matrices::SparseMatrix< float, RowMajorEllpack, TNL::Devices::Host, long >, + TNL::Matrices::SparseMatrix< double, RowMajorEllpack, TNL::Devices::Host, long > #ifdef HAVE_CUDA - ,TNL::Matrices::SparseMatrix< int, TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, short >, - TNL::Matrices::SparseMatrix< long, TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, short >, - TNL::Matrices::SparseMatrix< float, TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, short >, - TNL::Matrices::SparseMatrix< double, TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, short >, - TNL::Matrices::SparseMatrix< int, TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, int >, - TNL::Matrices::SparseMatrix< long, TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, int >, - TNL::Matrices::SparseMatrix< float, TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, int >, - TNL::Matrices::SparseMatrix< double, TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, int >, - TNL::Matrices::SparseMatrix< int, TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, long >, - TNL::Matrices::SparseMatrix< long, TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, long >, - TNL::Matrices::SparseMatrix< float, TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, long >, - TNL::Matrices::SparseMatrix< double, TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, long > + ,TNL::Matrices::SparseMatrix< int, ColumnMajorEllpack, TNL::Devices::Cuda, short >, + TNL::Matrices::SparseMatrix< long, ColumnMajorEllpack, TNL::Devices::Cuda, short >, + TNL::Matrices::SparseMatrix< float, ColumnMajorEllpack, TNL::Devices::Cuda, short >, + TNL::Matrices::SparseMatrix< double, ColumnMajorEllpack, TNL::Devices::Cuda, short >, + TNL::Matrices::SparseMatrix< int, ColumnMajorEllpack, TNL::Devices::Cuda, int >, + TNL::Matrices::SparseMatrix< long, ColumnMajorEllpack, TNL::Devices::Cuda, int >, + TNL::Matrices::SparseMatrix< float, ColumnMajorEllpack, TNL::Devices::Cuda, int >, + TNL::Matrices::SparseMatrix< double, ColumnMajorEllpack, TNL::Devices::Cuda, int >, + TNL::Matrices::SparseMatrix< int, ColumnMajorEllpack, TNL::Devices::Cuda, long >, + TNL::Matrices::SparseMatrix< long, ColumnMajorEllpack, TNL::Devices::Cuda, long >, + TNL::Matrices::SparseMatrix< float, ColumnMajorEllpack, TNL::Devices::Cuda, long >, + TNL::Matrices::SparseMatrix< double, ColumnMajorEllpack, TNL::Devices::Cuda, long > #endif >;