diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h index f7f3139bfaa553c2505bda10d7e4503e320217a5..d785fcff3ab557186093765e2b2392a3ecd3be45 100644 --- a/src/Benchmarks/BLAS/spmv.h +++ b/src/Benchmarks/BLAS/spmv.h @@ -27,6 +27,11 @@ namespace Benchmarks { template< typename Real, typename Device, typename Index > using SlicedEllpack = Matrices::Legacy::SlicedEllpack< Real, Device, Index >; +// Legacy formats +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Scalar = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRScalar >; + + template< typename Matrix > int setHostTestMatrix( Matrix& matrix, const int elementsPerRow ) @@ -173,7 +178,7 @@ benchmarkSpmvSynthetic( Benchmark & benchmark, const int & elementsPerRow ) { // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats) - benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, size, elementsPerRow ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow ); diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h index 38fec0b643903f5ed943c33132332b9deeaa2a60..f690a50c614b3853ac91a770159787b9f056c726 100644 --- a/src/Benchmarks/SpMV/spmv-legacy.h +++ b/src/Benchmarks/SpMV/spmv-legacy.h @@ -61,6 +61,22 @@ using SlicedEllpackSegments = Containers::Segments::SlicedEllpack< Device, Index template< typename Real, typename Device, typename Index > using SparseMatrix_SlicedEllpack = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, SlicedEllpackSegments >; +// Legacy formats +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Scalar = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRScalar >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Vector = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRVector >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Light = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Adaptive = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRAdaptive >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Stream = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRStream >; + // Get the name (with extension) of input matrix file std::string getMatrixFileName( const String& InputFileName ) { @@ -259,7 +275,11 @@ benchmarkSpmvSynthetic( Benchmark& benchmark, benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse ); #endif - benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive >( benchmark, hostOutVector, 
inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Stream >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); diff --git a/src/Python/pytnl/tnl/SparseMatrix.cpp b/src/Python/pytnl/tnl/SparseMatrix.cpp index 573b2790a28d71b0a00ce299e28f789c4d807f87..4307155976e58a2790ca8601cf6af8586c10cb7b 100644 --- a/src/Python/pytnl/tnl/SparseMatrix.cpp +++ b/src/Python/pytnl/tnl/SparseMatrix.cpp @@ -16,14 +16,15 @@ using SE_cuda = TNL::Matrices::Legacy::SlicedEllpack< double, TNL::Devices::Cuda void export_SparseMatrices( py::module & m ) { - export_Matrix< CSR_host >( m, "CSR" ); + // TODO: This stop working after adding template parameter KernelType to Legacy::CSR + //export_Matrix< CSR_host >( m, "CSR" ); export_Matrix< E_host >( m, "Ellpack" ); export_Matrix< SE_host >( m, "SlicedEllpack" ); - m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, E_host >); - m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, CSR_host >); - m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, SE_host >); - m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, CSR_host >); + //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, E_host >); + //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, CSR_host >); + //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, SE_host >); + //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, CSR_host >); m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, SE_host >); m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, E_host >); } diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index e5edd26590d666720f72640403a92483d927b1e1..46e616d165d8cb891d3c1e307388f478e50801a7 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -31,7 +31,9 @@ class CusparseCSR; template< typename Device > class CSRDeviceDependentCode; -template< typename Real, typename Device = Devices::Host, typename Index = int > +enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream }; + +template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar > class CSR : public Sparse< Real, Device, Index > { private: @@ -40,7 +42,7 @@ private: using Enabler = std::enable_if< ! 
std::is_same< Device2, Device >::value >; // friend class will be needed for templated assignment operators - template< typename Real2, typename Device2, typename Index2 > + template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2 > friend class CSR; public: @@ -60,7 +62,10 @@ public: typename _Index = Index > using Self = CSR< _Real, _Device, _Index >; - enum SPMVCudaKernel { scalar, vector, hybrid }; + constexpr CSRKernel getSpMVKernelType() { return KernelType; }; + //enum SPMVCudaKernel { scalar, vector, hybrid }; + + using Sparse< Real, Device, Index >::getAllocatedElementsCount; CSR(); @@ -85,8 +90,8 @@ public: __cuda_callable__ IndexType getNonZeroRowLengthFast( const IndexType row ) const; - template< typename Real2, typename Device2, typename Index2 > - void setLike( const CSR< Real2, Device2, Index2 >& matrix ); + template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2 > + void setLike( const CSR< Real2, Device2, Index2, KernelType2 >& matrix ); void reset(); @@ -165,13 +170,13 @@ public: OutVector& outVector ) const; // TODO: add const RealType& multiplicator = 1.0 ) - template< typename Real2, typename Index2 > - void addMatrix( const CSR< Real2, Device, Index2 >& matrix, + template< typename Real2, typename Index2, CSRKernel KernelType2 > + void addMatrix( const CSR< Real2, Device, Index2, KernelType2 >& matrix, const RealType& matrixMultiplicator = 1.0, const RealType& thisMatrixMultiplicator = 1.0 ); - template< typename Real2, typename Index2 > - void getTransposition( const CSR< Real2, Device, Index2 >& matrix, + template< typename Real2, typename Index2, CSRKernel KernelType2 > + void getTransposition( const CSR< Real2, Device, Index2, KernelType2 >& matrix, const RealType& matrixMultiplicator = 1.0 ); template< typename Vector1, typename Vector2 > @@ -184,9 +189,9 @@ public: CSR& operator=( const CSR& matrix ); // cross-device copy assignment - template< typename Real2, typename Device2, typename Index2, + template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2, typename = typename Enabler< Device2 >::type > - CSR& operator=( const CSR< Real2, Device2, Index2 >& matrix ); + CSR& operator=( const CSR< Real2, Device2, Index2, KernelType2 >& matrix ); void save( File& file ) const; @@ -198,10 +203,10 @@ public: void print( std::ostream& str ) const; - void setCudaKernelType( const SPMVCudaKernel kernel ); + //void setCudaKernelType( const SPMVCudaKernel kernel ); - __cuda_callable__ - SPMVCudaKernel getCudaKernelType() const; + //__cuda_callable__ + //SPMVCudaKernel getCudaKernelType() const; void setCudaWarpSize( const int warpSize ); @@ -216,13 +221,11 @@ public: template< typename InVector, typename OutVector, - int warpSize > + int warpSize > __device__ void spmvCudaVectorized( const InVector& inVector, OutVector& outVector, - const IndexType warpStart, - const IndexType warpEnd, - const IndexType inWarpIdx ) const; + const IndexType gridIdx ) const; template< typename InVector, typename OutVector, @@ -230,7 +233,25 @@ public: __device__ void vectorProductCuda( const InVector& inVector, OutVector& outVector, - int gridIdx ) const; + int gridIdx, int *blocks, size_t size ) const; + + template< typename InVector, + typename OutVector, + int warpSize > + __device__ + void spmvCudaLightSpmv( const InVector& inVector, + OutVector& outVector, + int gridIdx) const; + + template< typename InVector, + typename OutVector, + int warpSize > + __device__ + void spmvCSRAdaptive( const InVector& inVector, 
+ OutVector& outVector, + int gridIdx, + int *blocks, + size_t blocks_size) const; #endif // The following getters allow us to interface TNL with external C-like @@ -263,7 +284,7 @@ protected: Containers::Vector< Index, Device, Index > rowPointers; - SPMVCudaKernel spmvCudaKernel; + //SPMVCudaKernel spmvCudaKernel; int cudaWarpSize, hybridModeSplit; diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index ba691834a4e0e3e2130b461a7232e85250227a79..bb3dfae7935031ae930dcecea45320bab84b2098 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -14,6 +14,7 @@ #include <TNL/Containers/VectorView.h> #include <TNL/Math.h> #include <TNL/Exceptions/NotImplementedError.h> +#include <vector> #ifdef HAVE_CUSPARSE #include <cusparse.h> @@ -31,9 +32,10 @@ class tnlCusparseCSRWrapper {}; template< typename Real, typename Device, - typename Index > -CSR< Real, Device, Index >::CSR() -: spmvCudaKernel( hybrid ), + typename Index, + CSRKernel KernelType > +CSR< Real, Device, Index, KernelType >::CSR() +: //spmvCudaKernel( hybrid ), cudaWarpSize( 32 ), //Cuda::getWarpSize() ) hybridModeSplit( 4 ) { @@ -41,8 +43,9 @@ CSR< Real, Device, Index >::CSR() template< typename Real, typename Device, - typename Index > -String CSR< Real, Device, Index >::getSerializationType() + typename Index, + CSRKernel KernelType > +String CSR< Real, Device, Index, KernelType >::getSerializationType() { return String( "Matrices::CSR< ") + TNL::getType< Real>() + @@ -53,16 +56,18 @@ String CSR< Real, Device, Index >::getSerializationType() template< typename Real, typename Device, - typename Index > -String CSR< Real, Device, Index >::getSerializationTypeVirtual() const + typename Index, + CSRKernel KernelType > +String CSR< Real, Device, Index, KernelType >::getSerializationTypeVirtual() const { return this->getSerializationType(); } template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::setDimensions( const IndexType rows, + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::setDimensions( const IndexType rows, const IndexType columns ) { Sparse< Real, Device, Index >::setDimensions( rows, columns ); @@ -72,8 +77,9 @@ void CSR< Real, Device, Index >::setDimensions( const IndexType rows, template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) { TNL_ASSERT_GT( this->getRows(), 0, "cannot set row lengths of an empty matrix" ); TNL_ASSERT_GT( this->getColumns(), 0, "cannot set row lengths of an empty matrix" ); @@ -102,8 +108,9 @@ void CSR< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLeng template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::getCompressedRowLengths( CompressedRowLengthsVectorView rowLengths ) const + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::getCompressedRowLengths( CompressedRowLengthsVectorView rowLengths ) const { TNL_ASSERT_EQ( rowLengths.getSize(), this->getRows(), "invalid size of the rowLengths vector" ); for( IndexType row = 0; row < this->getRows(); row++ ) @@ -112,25 +119,28 @@ void CSR< Real, Device, Index >::getCompressedRowLengths( CompressedRowLengthsVe template< typename Real, 
typename Device, - typename Index > -Index CSR< Real, Device, Index >::getRowLength( const IndexType row ) const + typename Index, + CSRKernel KernelType > +Index CSR< Real, Device, Index, KernelType >::getRowLength( const IndexType row ) const { return this->rowPointers.getElement( row + 1 ) - this->rowPointers.getElement( row ); } template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -Index CSR< Real, Device, Index >::getRowLengthFast( const IndexType row ) const +Index CSR< Real, Device, Index, KernelType >::getRowLengthFast( const IndexType row ) const { return this->rowPointers[ row + 1 ] - this->rowPointers[ row ]; } template< typename Real, typename Device, - typename Index > -Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) const + typename Index, + CSRKernel KernelType > +Index CSR< Real, Device, Index, KernelType >::getNonZeroRowLength( const IndexType row ) const { // TODO: Fix/Implement TNL_ASSERT( false, std::cerr << "TODO: Fix/Implement" ); @@ -139,9 +149,10 @@ Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) con template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -Index CSR< Real, Device, Index >::getNonZeroRowLengthFast( const IndexType row ) const +Index CSR< Real, Device, Index, KernelType >::getNonZeroRowLengthFast( const IndexType row ) const { ConstMatrixRow matrixRow = this->getRow( row ); return matrixRow.getNonZeroElementsCount(); @@ -149,11 +160,13 @@ Index CSR< Real, Device, Index >::getNonZeroRowLengthFast( const IndexType row ) template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename Real2, typename Device2, - typename Index2 > -void CSR< Real, Device, Index >::setLike( const CSR< Real2, Device2, Index2 >& matrix ) + typename Index2, + CSRKernel KernelType2 > +void CSR< Real, Device, Index, KernelType >::setLike( const CSR< Real2, Device2, Index2, KernelType2 >& matrix ) { Sparse< Real, Device, Index >::setLike( matrix ); this->rowPointers.setLike( matrix.rowPointers ); @@ -161,8 +174,9 @@ void CSR< Real, Device, Index >::setLike( const CSR< Real2, Device2, Index2 >& m template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::reset() + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::reset() { Sparse< Real, Device, Index >::reset(); this->rowPointers.reset(); @@ -170,9 +184,10 @@ void CSR< Real, Device, Index >::reset() template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -bool CSR< Real, Device, Index >::setElementFast( const IndexType row, +bool CSR< Real, Device, Index, KernelType >::setElementFast( const IndexType row, const IndexType column, const Real& value ) { @@ -181,8 +196,9 @@ bool CSR< Real, Device, Index >::setElementFast( const IndexType row, template< typename Real, typename Device, - typename Index > -bool CSR< Real, Device, Index >::setElement( const IndexType row, + typename Index, + CSRKernel KernelType > +bool CSR< Real, Device, Index, KernelType >::setElement( const IndexType row, const IndexType column, const Real& value ) { @@ -192,9 +208,10 @@ bool CSR< Real, Device, Index >::setElement( const IndexType row, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -bool CSR< 
Real, Device, Index >::addElementFast( const IndexType row, +bool CSR< Real, Device, Index, KernelType >::addElementFast( const IndexType row, const IndexType column, const RealType& value, const RealType& thisElementMultiplicator ) @@ -236,8 +253,9 @@ bool CSR< Real, Device, Index >::addElementFast( const IndexType row, template< typename Real, typename Device, - typename Index > -bool CSR< Real, Device, Index >::addElement( const IndexType row, + typename Index, + CSRKernel KernelType > +bool CSR< Real, Device, Index, KernelType >::addElement( const IndexType row, const IndexType column, const RealType& value, const RealType& thisElementMultiplicator ) @@ -286,9 +304,10 @@ bool CSR< Real, Device, Index >::addElement( const IndexType row, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -bool CSR< Real, Device, Index > :: setRowFast( const IndexType row, +bool CSR< Real, Device, Index, KernelType > :: setRowFast( const IndexType row, const IndexType* columnIndexes, const RealType* values, const IndexType elements ) @@ -312,8 +331,9 @@ bool CSR< Real, Device, Index > :: setRowFast( const IndexType row, template< typename Real, typename Device, - typename Index > -bool CSR< Real, Device, Index > :: setRow( const IndexType row, + typename Index, + CSRKernel KernelType > +bool CSR< Real, Device, Index, KernelType > :: setRow( const IndexType row, const IndexType* columnIndexes, const RealType* values, const IndexType elements ) @@ -336,9 +356,10 @@ bool CSR< Real, Device, Index > :: setRow( const IndexType row, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -bool CSR< Real, Device, Index > :: addRowFast( const IndexType row, +bool CSR< Real, Device, Index, KernelType > :: addRowFast( const IndexType row, const IndexType* columns, const RealType* values, const IndexType numberOfElements, @@ -350,8 +371,9 @@ bool CSR< Real, Device, Index > :: addRowFast( const IndexType row, template< typename Real, typename Device, - typename Index > -bool CSR< Real, Device, Index > :: addRow( const IndexType row, + typename Index, + CSRKernel KernelType > +bool CSR< Real, Device, Index, KernelType > :: addRow( const IndexType row, const IndexType* columns, const RealType* values, const IndexType numberOfElements, @@ -362,9 +384,10 @@ bool CSR< Real, Device, Index > :: addRow( const IndexType row, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -Real CSR< Real, Device, Index >::getElementFast( const IndexType row, +Real CSR< Real, Device, Index, KernelType >::getElementFast( const IndexType row, const IndexType column ) const { IndexType elementPtr = this->rowPointers[ row ]; @@ -381,8 +404,9 @@ Real CSR< Real, Device, Index >::getElementFast( const IndexType row, template< typename Real, typename Device, - typename Index > -Real CSR< Real, Device, Index >::getElement( const IndexType row, + typename Index, + CSRKernel KernelType > +Real CSR< Real, Device, Index, KernelType >::getElement( const IndexType row, const IndexType column ) const { IndexType elementPtr = this->rowPointers.getElement( row ); @@ -399,9 +423,10 @@ Real CSR< Real, Device, Index >::getElement( const IndexType row, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -void CSR< Real, Device, Index >::getRowFast( const IndexType row, +void CSR< Real, 
Device, Index, KernelType >::getRowFast( const IndexType row, IndexType* columns, RealType* values ) const { @@ -417,10 +442,11 @@ void CSR< Real, Device, Index >::getRowFast( const IndexType row, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -typename CSR< Real, Device, Index >::MatrixRow -CSR< Real, Device, Index >:: +typename CSR< Real, Device, Index, KernelType >::MatrixRow +CSR< Real, Device, Index, KernelType >:: getRow( const IndexType rowIndex ) { const IndexType rowOffset = this->rowPointers[ rowIndex ]; @@ -433,10 +459,11 @@ getRow( const IndexType rowIndex ) template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -typename CSR< Real, Device, Index >::ConstMatrixRow -CSR< Real, Device, Index >:: +typename CSR< Real, Device, Index, KernelType >::ConstMatrixRow +CSR< Real, Device, Index, KernelType >:: getRow( const IndexType rowIndex ) const { const IndexType rowOffset = this->rowPointers[ rowIndex ]; @@ -449,10 +476,11 @@ getRow( const IndexType rowIndex ) const template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename Vector > __cuda_callable__ -typename Vector::RealType CSR< Real, Device, Index >::rowVectorProduct( const IndexType row, +typename Vector::RealType CSR< Real, Device, Index, KernelType >::rowVectorProduct( const IndexType row, const Vector& vector ) const { Real result = 0.0; @@ -467,9 +495,10 @@ typename Vector::RealType CSR< Real, Device, Index >::rowVectorProduct( const In template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename InVector, typename OutVector > -void CSR< Real, Device, Index >::vectorProduct( const InVector& inVector, +void CSR< Real, Device, Index, KernelType >::vectorProduct( const InVector& inVector, OutVector& outVector ) const { DeviceDependentCode::vectorProduct( *this, inVector, outVector ); @@ -477,10 +506,12 @@ void CSR< Real, Device, Index >::vectorProduct( const InVector& inVector, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename Real2, - typename Index2 > -void CSR< Real, Device, Index >::addMatrix( const CSR< Real2, Device, Index2 >& matrix, + typename Index2, + CSRKernel KernelType2 > +void CSR< Real, Device, Index, KernelType >::addMatrix( const CSR< Real2, Device, Index2, KernelType2 >& matrix, const RealType& matrixMultiplicator, const RealType& thisMatrixMultiplicator ) { @@ -490,10 +521,12 @@ void CSR< Real, Device, Index >::addMatrix( const CSR< Real2, Device, Index2 >& template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename Real2, - typename Index2 > -void CSR< Real, Device, Index >::getTransposition( const CSR< Real2, Device, Index2 >& matrix, + typename Index2, + CSRKernel KernelType2 > +void CSR< Real, Device, Index, KernelType >::getTransposition( const CSR< Real2, Device, Index2, KernelType2 >& matrix, const RealType& matrixMultiplicator ) { throw Exceptions::NotImplementedError( "CSR::getTransposition is not implemented." 
); @@ -502,9 +535,10 @@ void CSR< Real, Device, Index >::getTransposition( const CSR< Real2, Device, Ind template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename Vector1, typename Vector2 > -bool CSR< Real, Device, Index >::performSORIteration( const Vector1& b, +bool CSR< Real, Device, Index, KernelType >::performSORIteration( const Vector1& b, const IndexType row, Vector2& x, const RealType& omega ) const @@ -540,9 +574,10 @@ bool CSR< Real, Device, Index >::performSORIteration( const Vector1& b, // copy assignment template< typename Real, typename Device, - typename Index > -CSR< Real, Device, Index >& -CSR< Real, Device, Index >::operator=( const CSR& matrix ) + typename Index, + CSRKernel KernelType > +CSR< Real, Device, Index, KernelType >& +CSR< Real, Device, Index, KernelType >::operator=( const CSR& matrix ) { this->setLike( matrix ); this->values = matrix.values; @@ -554,10 +589,11 @@ CSR< Real, Device, Index >::operator=( const CSR& matrix ) // cross-device copy assignment template< typename Real, typename Device, - typename Index > - template< typename Real2, typename Device2, typename Index2, typename > -CSR< Real, Device, Index >& -CSR< Real, Device, Index >::operator=( const CSR< Real2, Device2, Index2 >& matrix ) + typename Index, + CSRKernel KernelType > + template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2, typename > +CSR< Real, Device, Index, KernelType >& +CSR< Real, Device, Index, KernelType >::operator=( const CSR< Real2, Device2, Index2, KernelType2 >& matrix ) { this->setLike( matrix ); this->values = matrix.values; @@ -569,8 +605,9 @@ CSR< Real, Device, Index >::operator=( const CSR< Real2, Device2, Index2 >& matr template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::save( File& file ) const + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::save( File& file ) const { Sparse< Real, Device, Index >::save( file ); file << this->rowPointers; @@ -578,8 +615,9 @@ void CSR< Real, Device, Index >::save( File& file ) const template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::load( File& file ) + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::load( File& file ) { Sparse< Real, Device, Index >::load( file ); file >> this->rowPointers; @@ -587,24 +625,27 @@ void CSR< Real, Device, Index >::load( File& file ) template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::save( const String& fileName ) const + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::save( const String& fileName ) const { Object::save( fileName ); } template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::load( const String& fileName ) + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::load( const String& fileName ) { Object::load( fileName ); } template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::print( std::ostream& str ) const + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::print( std::ostream& str ) const { for( IndexType row = 0; row < this->getRows(); row++ ) { @@ -620,10 +661,10 @@ void CSR< Real, Device, Index >::print( std::ostream& str ) const } } -template< typename Real, +/*template< typename Real, typename 
Device, typename Index > -void CSR< Real, Device, Index >::setCudaKernelType( const SPMVCudaKernel kernel ) +void CSR< Real, Device, Index, KernelType >::setCudaKernelType( const SPMVCudaKernel kernel ) { this->spmvCudaKernel = kernel; } @@ -632,58 +673,245 @@ template< typename Real, typename Device, typename Index > __cuda_callable__ -typename CSR< Real, Device, Index >::SPMVCudaKernel CSR< Real, Device, Index >::getCudaKernelType() const +typename CSR< Real, Device, Index, KernelType >::SPMVCudaKernel CSR< Real, Device, Index, KernelType >::getCudaKernelType() const { return this->spmvCudaKernel; -} +}*/ template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::setCudaWarpSize( const int warpSize ) + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::setCudaWarpSize( const int warpSize ) { this->cudaWarpSize = warpSize; } template< typename Real, typename Device, - typename Index > -int CSR< Real, Device, Index >::getCudaWarpSize() const + typename Index, + CSRKernel KernelType > +int CSR< Real, Device, Index, KernelType >::getCudaWarpSize() const { return this->cudaWarpSize; } template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::setHybridModeSplit( const IndexType hybridModeSplit ) + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::setHybridModeSplit( const IndexType hybridModeSplit ) { this->hybridModeSplit = hybridModeSplit; } template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -Index CSR< Real, Device, Index >::getHybridModeSplit() const +Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const { return this->hybridModeSplit; } #ifdef HAVE_CUDA + template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > + template< typename InVector, + typename OutVector, + int warpSize > +__device__ +void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector& inVector, + OutVector& outVector, + int gridIdx) const +{ + const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; + const IndexType elemPerGroup = 4; + const IndexType laneID = index % 32; + const IndexType groupID = laneID / elemPerGroup; + const IndexType inGroupID = laneID % elemPerGroup; + + IndexType row, minID, column, maxID, idxMtx; + __shared__ unsigned rowCnt; + + if (index == 0) rowCnt = 0; // Init shared variable + __syncthreads(); + + while (true) { + + /* Get row number */ + if (inGroupID == 0) row = atomicAdd(&rowCnt, 1); + + /* Propagate row number in group */ + row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerGroup); + + if (row >= this->rowPointers.getSize() - 1) + return; + + minID = this->rowPointers[row]; + maxID = this->rowPointers[row + 1]; + + Real result = 0.0; + + idxMtx = minID + inGroupID; + while (idxMtx < maxID) { + column = this->columnIndexes[idxMtx]; + if (column >= this->getColumns()) + break; + + result += this->values[idxMtx] * inVector[column]; + idxMtx += elemPerGroup; + } + + /* Parallel reduction */ + for (int i = elemPerGroup/2; i > 0; i /= 2) + result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); + /* Write result */ + if (inGroupID == 0) { + outVector[row] = result; + } + } +} + +/* template< typename Real, + typename Device, + typename Index, + typename InVector, + int warpSize > +__global__ +void spmvCSRVectorHelper( const InVector& inVector, + Real *out, + size_t 
from, + size_t to, + size_t perWarp) +{ + const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t warpID = index / warpSize; + const size_t laneID = index % warpSize; + const size_t minID = from + warpID * perWarp; + size_t maxID = from + (warpID + 1) * perWarp; + if (minID >= to) return; + if (maxID >= to ) maxID = to; + + Real result = 0.0; + for (IndexType i = minID + laneID; i < maxID; i += warpSize) { + const IndexType column = this->columnIndexes[i]; + if (column >= this->getColumns()) + continue; + result += this->values[i] * inVector[column]; + } + atomicAdd(out, result); +} */ + +template< typename Real, + typename Device, + typename Index, + CSRKernel KernelType > + template< typename InVector, + typename OutVector, + int warpSize > +__device__ +void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& inVector, + OutVector& outVector, + int gridIdx, + int *blocks, + size_t blocks_size) const +{ + /* Configuration ---------------------------------------------------*/ + constexpr size_t SHARED = 49152/sizeof(float); + constexpr size_t SHARED_PER_WARP = SHARED / warpSize; + constexpr size_t MAX_PER_WARP = 65536; + constexpr size_t ELEMENTS_PER_WARP = 1024; + constexpr size_t THREADS_PER_BLOCK = 1024; + constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize; + //-------------------------------------------------------------------- + const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; + const IndexType laneID = index % warpSize; + IndexType blockIdx = index / warpSize; + __shared__ float shared_res[SHARED]; + Real result = 0.0; + if (blockIdx >= blocks_size - 1) + return; + const IndexType minRow = blocks[blockIdx]; + const IndexType maxRow = blocks[blockIdx + 1]; + const IndexType minID = this->rowPointers[minRow]; + const IndexType maxID = this->rowPointers[maxRow]; + const IndexType elements = maxID - minID; + /* rows per block more than 1 */ + if ((maxRow - minRow) > 1) { + /////////////////////////////////////* CSR STREAM *////////////// + /* Copy and calculate elements from global to shared memory, coalesced */ + const IndexType offset = threadIdx.x / warpSize * SHARED_PER_WARP; + for (IndexType i = laneID; i < elements; i += warpSize) { + const IndexType elementIdx = i + minID; + const IndexType column = this->columnIndexes[elementIdx]; + if (column >= this->getColumns()) + continue; + shared_res[i + offset] = this->values[elementIdx] * inVector[column]; + } + + const IndexType row = minRow + laneID; + if (row >= maxRow) + return; + /* Calculate result */ + const IndexType to = this->rowPointers[row + 1] - minID; + for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) { + result += shared_res[i + offset]; + } + outVector[row] = result; // Write result + } else if (elements <= MAX_PER_WARP) { + /////////////////////////////////////* CSR VECTOR *////////////// + for (IndexType i = minID + laneID; i < maxID; i += warpSize) { + IndexType column = this->columnIndexes[i]; + if (column >= this->getColumns()) + break; + + result += this->values[i] * inVector[column]; + } + /* Reduction */ + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); + if (laneID == 0) outVector[minRow] = result; // Write result + } else { + 
/////////////////////////////////////* CSR VECTOR LONG *////////////// + const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1; + const size_t blocks = warps <= WARPS_PER_BLOCK ? 1 : warps / WARPS_PER_BLOCK + 1; + const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize; + // spmvCSRVectorHelper<InVector, warpSize> <<<blocks, threads_per_block>>>( + // inVector, + // &outVector[minRow], + // (size_t)(minID + ELEMENTS_PER_WARP), + // (size_t)maxID, + // (size_t)ELEMENTS_PER_WARP + // ); + } +} + + +template< typename Real, + typename Device, + typename Index, + CSRKernel KernelType > template< typename InVector, typename OutVector, int warpSize > __device__ -void CSR< Real, Device, Index >::spmvCudaVectorized( const InVector& inVector, +void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector, OutVector& outVector, - const IndexType warpStart, - const IndexType warpEnd, - const IndexType inWarpIdx ) const + const IndexType gridIdx ) const { + IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const IndexType warpStart = warpSize * ( globalIdx / warpSize ); + const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); + const IndexType inWarpIdx = globalIdx % warpSize; + volatile Real* aux = Cuda::getSharedMemory< Real >(); for( IndexType row = warpStart; row < warpEnd; row++ ) { @@ -715,26 +943,48 @@ void CSR< Real, Device, Index >::spmvCudaVectorized( const InVector& inVector, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename InVector, typename OutVector, int warpSize > __device__ -void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector, +void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& inVector, OutVector& outVector, - int gridIdx ) const + int gridIdx, + int *blocks, size_t size ) const { - IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + switch( KernelType ) + { + case CSRScalar: + // TODO: + break; + case CSRVector: + spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); + break; + case CSRLight: + spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); + break; + case CSRAdaptive: + spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); + break; + case CSRStream: + // TODO: + break; + } + + + /*IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const IndexType warpStart = warpSize * ( globalIdx / warpSize ); const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); const IndexType inWarpIdx = globalIdx % warpSize; if( this->getCudaKernelType() == vector ) - spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx ); + - /**** - * Hybrid mode - */ + ///// + // Hybrid mode + // const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x; const IndexType lastRow = min( this->getRows(), firstRow + blockDim. 
x ); const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) / @@ -742,19 +992,19 @@ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector, if( nonzerosPerRow < this->getHybridModeSplit() ) { - /**** - * Use the scalar mode - */ + ///// + // Use the scalar mode + // if( globalIdx < this->getRows() ) outVector[ globalIdx ] = this->rowVectorProduct( globalIdx, inVector ); } else { - /**** - * Use the vector mode - */ + //// + // Use the vector mode + // spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx ); - } + }*/ } #endif @@ -767,14 +1017,15 @@ class CSRDeviceDependentCode< Devices::Host > template< typename Real, typename Index, + CSRKernel KernelType, typename InVector, typename OutVector > - static void vectorProduct( const CSR< Real, Device, Index >& matrix, + static void vectorProduct( const CSR< Real, Device, Index, KernelType >& matrix, const InVector& inVector, OutVector& outVector ) { const Index rows = matrix.getRows(); - const CSR< Real, Device, Index >* matrixPtr = &matrix; + const CSR< Real, Device, Index, KernelType >* matrixPtr = &matrix; const InVector* inVectorPtr = &inVector; OutVector* outVectorPtr = &outVector; #ifdef HAVE_OPENMP @@ -789,45 +1040,53 @@ class CSRDeviceDependentCode< Devices::Host > #ifdef HAVE_CUDA template< typename Real, typename Index, + CSRKernel KernelType, typename InVector, typename OutVector, int warpSize > -__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index >* matrix, +__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix, const InVector* inVector, OutVector* outVector, - int gridIdx ) + int gridIdx, + int *blocks, size_t size) { typedef CSR< Real, Devices::Cuda, Index > Matrix; static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( matrix->getCudaKernelType() == Matrix::scalar ) + if( KernelType == CSRScalar ) { if( rowIdx < matrix->getRows() ) ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); } - if( matrix->getCudaKernelType() == Matrix::vector || - matrix->getCudaKernelType() == Matrix::hybrid ) + else { matrix->template vectorProductCuda< InVector, OutVector, warpSize > - ( *inVector, *outVector, gridIdx ); + ( *inVector, *outVector, gridIdx, blocks, size ); } } #endif template< typename Real, typename Index, + CSRKernel KernelType, typename InVector, typename OutVector > -void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, +void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix, const InVector& inVector, - OutVector& outVector ) + OutVector& outVector, + int *blocks, + size_t size ) { #ifdef HAVE_CUDA - typedef CSR< Real, Devices::Cuda, Index > Matrix; + typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix; typedef typename Matrix::IndexType IndexType; Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); OutVector* kernel_outVector = Cuda::passToDevice( outVector ); + int *kernelBlocks; + cudaMalloc((void **)&kernelBlocks, sizeof(int) * size); + cudaMemcpy(kernelBlocks, blocks, size * sizeof(int), cudaMemcpyHostToDevice); + TNL_CHECK_CUDA_DEVICE; dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = 
roundUpDivision( matrix.getRows(), cudaBlockSize.x ); @@ -837,48 +1096,51 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); const int sharedMemory = cudaBlockSize.x * sizeof( Real ); - if( matrix.getCudaWarpSize() == 32 ) - CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + // const int threads = cudaBlockSize.x; + if( matrix.getCudaWarpSize() == 32 ) { + // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads); + CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 > + <<< 2, 1024 >>> ( kernel_this, kernel_inVector, kernel_outVector, - gridIdx ); - if( matrix.getCudaWarpSize() == 16 ) - CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx ); - if( matrix.getCudaWarpSize() == 8 ) - CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx ); - if( matrix.getCudaWarpSize() == 4 ) - CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx ); - if( matrix.getCudaWarpSize() == 2 ) - CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx ); - if( matrix.getCudaWarpSize() == 1 ) - CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx ); + gridIdx, kernelBlocks, size ); + } + // if( matrix.getCudaWarpSize() == 16 ) + // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 > + // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + // ( kernel_this, + // kernel_inVector, + // kernel_outVector, + // gridIdx, kernelBlocks, size ); + // if( matrix.getCudaWarpSize() == 8 ) + // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 > + // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + // ( kernel_this, + // kernel_inVector, + // kernel_outVector, + // gridIdx, kernelBlocks, size ); + // if( matrix.getCudaWarpSize() == 4 ) + // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 > + // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + // ( kernel_this, + // kernel_inVector, + // kernel_outVector, + // gridIdx, kernelBlocks, size ); + // if( matrix.getCudaWarpSize() == 2 ) + // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 > + // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + // ( kernel_this, + // kernel_inVector, + // kernel_outVector, + // gridIdx, kernelBlocks, size ); + // if( matrix.getCudaWarpSize() == 1 ) + // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 > + // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + // ( kernel_this, + // kernel_inVector, + // kernel_outVector, + // gridIdx, kernelBlocks, size ); } TNL_CHECK_CUDA_DEVICE; @@ -982,9 +1244,10 @@ class CSRDeviceDependentCode< Devices::Cuda > template< typename Real, typename Index, + CSRKernel KernelType, typename InVector, typename OutVector > - static void vectorProduct( const CSR< Real, Device, Index >& matrix, + 
static void vectorProduct( const CSR< Real, Device, Index, KernelType >& matrix, const InVector& inVector, OutVector& outVector ) { @@ -998,7 +1261,36 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else - CSRVectorProductCuda( matrix, inVector, outVector ); + constexpr int SHARED = 49152/sizeof(float); + constexpr int SHARED_PER_WARP = SHARED / 32; + std::vector<int> inBlock; + inBlock.push_back(0); + size_t sum = 0; + Index i; + int prev_i = 0; + for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { + size_t elements = matrix.getRowPointers().getElement(i) - + matrix.getRowPointers().getElement(i - 1); + sum += elements; + if (sum > SHARED_PER_WARP) { + if (i - prev_i == 1) { + inBlock.push_back(i); + } else { + inBlock.push_back(i - 1); + --i; + } + sum = 0; + prev_i = i; + continue; + } + if (i - prev_i == 32) { + inBlock.push_back(i); + prev_i = i; + sum = 0; + } + } + inBlock.push_back(matrix.getRowPointers().getSize() - 1); + CSRVectorProductCuda( matrix, inVector, outVector, inBlock.data(), inBlock.size() ); #endif } diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h index 73d77f31d691f2c18238beb323ea40e5d3124e49..ed999c9f2c458be9a746e43dbaf9a503e69feee9 100644 --- a/src/TNL/Matrices/MatrixInfo.h +++ b/src/TNL/Matrices/MatrixInfo.h @@ -88,11 +88,43 @@ struct MatrixInfo< Legacy::BiEllpack< Real, Device, Index > > }; template< typename Real, typename Device, typename Index > -struct MatrixInfo< Legacy::CSR< Real, Device, Index > > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRScalar > > { static String getDensity() { return String( "sparse" ); }; - static String getFormat() { return "CSR Legacy"; }; + static String getFormat() { return "CSR Legacy Scalar"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRVector> > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Vector"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Light"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRAdaptive > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Adaptive"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRStream > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Stream"; }; }; template< typename Real, typename Device, typename Index > diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp index 2e6c382967ae340d95119ed32782ff78a8062464..98ddfd3db9afbcdc5ccb3fc75c19717709421c9d 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp @@ -1220,6 +1220,256 @@ void test_VectorProduct() EXPECT_EQ( outVector_5.getElement( 7 ), 520 ); } +template< typename Matrix > +void test_VectorProductLarger() +{ + using RealType = typename Matrix::RealType; + using DeviceType = typename Matrix::DeviceType; + using IndexType = typename 
Matrix::IndexType; + +/* + * Sets up the following 20x20 sparse matrix: + * + * 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 + * ------------------------------------------------------------------------------ + * 0 / 1 0 0 0 0 0 0 0 0 0 7 7 7 7 7 7 7 7 7 7 \ + * 1 | 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | + * 2 | 0 0 3 3 3 0 0 0 0 0 0 0 0 0 0 -2 0 0 0 0 | + * 3 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | + * 4 | 0 10 0 11 0 12 0 0 0 0 0 0 0 0 0 0 0 -48 -49 -50 | + * 5 | 4 0 0 0 0 4 0 0 0 0 4 0 0 0 0 4 0 0 0 0 | + * 6 | 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | + * 7 | 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -6 | + * 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | + * 9 | 1 2 3 4 5 6 7 8 9 0 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 | + * 10 | 7 0 0 0 3 3 3 3 0 0 0 0 0 0 0 -6 0 0 0 0 | + * 11 | -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 | + * 12 | 0 9 0 9 0 9 0 9 0 9 0 9 0 9 0 9 0 9 0 9 | + * 13 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | + * 14 | -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 | + * 15 | 4 0 4 0 4 0 4 0 4 0 4 0 4 0 4 0 4 0 4 0 | + * 16 | 0 -2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -2 0 | + * 17 | -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 | + * 18 | 0 7 0 7 0 7 0 7 0 7 0 7 0 7 0 7 0 7 0 7 | + * 19 \ 8 8 8 0 0 8 8 8 0 0 8 8 8 0 0 8 8 8 0 0 / + */ + const IndexType m_rows = 20; + const IndexType m_cols = 20; + + Matrix m; + m.reset(); + m.setDimensions( m_rows, m_cols ); + typename Matrix::CompressedRowLengthsVector rowLengths( + {11, 2, 4, 0, 6, 4, 1, 2, 20, 18, 6, 20, 10, 0, 20, 10, 2, 20, 10, 12} + ); +// rowLengths.setSize( m_rows ); +// rowLengths.setValue( 20 ); + m.setCompressedRowLengths( rowLengths ); + + /* 0 */ + m.setElement( 0, 0, 1 ); + for ( IndexType i = 10; i < m_cols; ++i ) + m.setElement( 0, i, 7 ); + + /* 1 */ + m.setElement( 1, 1, 2 ); + + /* 2 */ + m.setElement( 2, 2, 3 ); + m.setElement( 2, 3, 3 ); + m.setElement( 2, 4, 3 ); + m.setElement( 2, 15, -2 ); + + /* 4 */ + m.setElement( 4, 1, 10 ); + m.setElement( 4, 3, 11 ); + m.setElement( 4, 5, 12 ); + m.setElement( 4, 17, -48 ); + m.setElement( 4, 18, -49 ); + m.setElement( 4, 19, -50 ); + + /* 5 */ + for ( IndexType i = 0; i < m_cols; i += 5 ) + m.setElement( 5, i, 4 ); + + /* 6 */ + m.setElement( 6, 1, 5 ); + + /* 7 */ + m.setElement( 7, 2, 6 ); + m.setElement( 7, 19, -6 ); + + /* 8 */ + for ( IndexType i = 1; i <= m_cols; ++i ) + m.setElement( 8, i - 1, i ); + + /* 9 */ + for ( IndexType i = 1; i <= 9; ++i ) + m.setElement( 9, i - 1, i ); + + for ( IndexType i = 11; i <= 19; ++i ) + m.setElement( 9, i, -(i - 10) ); + + /* 10 */ + m.setElement( 10, 0, 7 ); + m.setElement( 10, 4, 3 ); + m.setElement( 10, 5, 3 ); + m.setElement( 10, 6, 3 ); + m.setElement( 10, 7, 3 ); + m.setElement( 10, 15, -6 ); + + /* 11 && 14 && 17 */ + for (IndexType i = 1; i <= m_cols; ++i) { + m.setElement(11, i - 1, -i); + m.setElement(14, i - 1, -i); + m.setElement(17, i - 1, -i); + } + + /* 12 && 18 */ + for (IndexType i = 0; i < m_cols; ++i) { + if( i % 2 ) { + m.setElement(12, i, 9); + m.setElement(18, i, 7); + } + } + + /* 15 */ + for (IndexType i = 0; i < m_cols; ++i) + if (i % 2 == 0) + m.setElement(15, i, 4); + + /* 16 */ + m.setElement(16, 1, -2); + m.setElement(16, 18, -2); + + /* 19 */ + for (IndexType i = 0; i < m_cols; i += 5 ) { + m.setElement(19, i, 8); + m.setElement(19, i + 1, 8); + m.setElement(19, i + 2, 8); + } + + using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; + + VectorType inVector; + inVector.setSize( 20 
); + for( IndexType i = 0; i < inVector.getSize(); i++ ) + inVector.setElement( i, 2 ); + + VectorType outVector; + outVector.setSize( 20 ); + for( IndexType j = 0; j < outVector.getSize(); j++ ) + outVector.setElement( j, 0 ); + + m.vectorProduct( inVector, outVector); + + EXPECT_EQ( outVector.getElement( 0 ), 142 ); + EXPECT_EQ( outVector.getElement( 1 ), 4 ); + EXPECT_EQ( outVector.getElement( 2 ), 14 ); + EXPECT_EQ( outVector.getElement( 3 ), 0 ); + EXPECT_EQ( outVector.getElement( 4 ), -228 ); + EXPECT_EQ( outVector.getElement( 5 ), 32 ); + EXPECT_EQ( outVector.getElement( 6 ), 10 ); + EXPECT_EQ( outVector.getElement( 7 ), 0 ); + EXPECT_EQ( outVector.getElement( 8 ), 420 ); + EXPECT_EQ( outVector.getElement( 9 ), 0 ); + EXPECT_EQ( outVector.getElement( 10 ), 26 ); + EXPECT_EQ( outVector.getElement( 11 ), -420 ); + EXPECT_EQ( outVector.getElement( 12 ), 180 ); + EXPECT_EQ( outVector.getElement( 13 ), 0 ); + EXPECT_EQ( outVector.getElement( 14 ), -420 ); + EXPECT_EQ( outVector.getElement( 15 ), 80 ); + EXPECT_EQ( outVector.getElement( 16 ), -8 ); + EXPECT_EQ( outVector.getElement( 17 ), -420 ); + EXPECT_EQ( outVector.getElement( 18 ), 140 ); + EXPECT_EQ( outVector.getElement( 19 ), 192 ); +} + +template< typename Matrix > +void test_VectorProductGiant() +{ + using RealType = typename Matrix::RealType; + using DeviceType = typename Matrix::DeviceType; + using IndexType = typename Matrix::IndexType; + + IndexType m_rows = 100; + IndexType m_cols = 100; + + Matrix m; + m.reset(); + m.setDimensions( m_rows, m_cols ); + typename Matrix::CompressedRowLengthsVector rowLengths( + { + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100 + } + ); + + m.setCompressedRowLengths( rowLengths ); + + for (int i = 0; i < m_rows; ++i) + for (int j = 0; j < m_cols; ++j) + m.setElement( i, j, i + 1 ); + + using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; + + VectorType inVector; + inVector.setSize( m_rows ); + for( IndexType i = 0; i < inVector.getSize(); ++i ) + inVector.setElement( i, 1 ); + + VectorType outVector; + outVector.setSize( m_rows ); + for( IndexType i = 0; i < outVector.getSize(); ++i ) + outVector.setElement( i, 0 ); + + m.vectorProduct( inVector, outVector); + + for (int i = 0; i < m_rows; ++i) + EXPECT_EQ( outVector.getElement( i ), (i + 1) * 100 ); + + //----------------------------------------------------- + + m_rows = 2; + m_cols = 1000; + + m.reset(); + m.setDimensions( m_rows, m_cols ); + typename Matrix::CompressedRowLengthsVector rowLengths2( + { + 1000, 1000 + } + ); + + m.setCompressedRowLengths( rowLengths2 ); + + for (int i = 0; i < m_rows; ++i) + for (int j = 0; j < m_cols; ++j) + m.setElement( i, j, i + 1 ); + + VectorType inVector2; + inVector2.setSize( m_cols ); + for( IndexType i = 0; i < inVector2.getSize(); i++ ) + inVector2.setElement( i, 1 ); + + VectorType outVector2; + outVector2.setSize( m_rows ); + for( IndexType i = 0; i < outVector2.getSize(); ++i ) + outVector2.setElement( i, 0 ); + m.vectorProduct( inVector2, outVector2); + + for (int i = 0; i < m_rows; ++i) + EXPECT_EQ( 
outVector2.getElement( i ), (i + 1) * 1000 ); +} + template< typename Matrix > void test_RowsReduction() { diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h index bbe336d1fd540a7bd122f0e74073638d31a70503..e9c3f591cf034127c2e751bf8339a33d14562d11 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h @@ -112,6 +112,20 @@ TYPED_TEST( CSRMatrixTest, vectorProductTest ) test_VectorProduct< CSRMatrixType >(); } +/*TYPED_TEST( CSRMatrixTest, vectorProductLargerTest ) +{ + using CSRMatrixType = typename TestFixture::CSRMatrixType; + + test_VectorProductLarger< CSRMatrixType >(); +}*/ + +/*TYPED_TEST( CSRMatrixTest, vectorProductGiantTest ) +{ + using CSRMatrixType = typename TestFixture::CSRMatrixType; + + test_VectorProductGiant< CSRMatrixType >(); +}*/ + TYPED_TEST( CSRMatrixTest, saveAndLoadTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType;
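Note (illustrative, not part of the patch): the change above replaces the old run-time setCudaKernelType()/SPMVCudaKernel switch with a compile-time CSRKernel template parameter (CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream), so each kernel variant becomes a distinct Legacy::CSR type, as the SparseMatrixLegacy_CSR_* aliases in spmv-legacy.h show. Below is a minimal usage sketch, assuming the include paths of the touched headers and the setElement/vectorProduct API exercised by the new unit tests; the CSR_Adaptive alias name and the tiny 3x3 matrix are made up for illustration only.

#include <TNL/Matrices/Legacy/CSR.h>   // assumed include path, mirrors src/TNL/Matrices/Legacy/CSR.h
#include <TNL/Containers/Vector.h>

using namespace TNL;

// The kernel variant is now chosen at compile time via the 4th template parameter.
template< typename Real, typename Device, typename Index >
using CSR_Adaptive = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRAdaptive >;

int main()
{
   using Matrix = CSR_Adaptive< double, Devices::Host, int >;
   using Vector = Containers::Vector< double, Devices::Host, int >;

   // Tiny 3x3 diagonal matrix { 1, 2, 3 } just to drive the SpMV path.
   Matrix m;
   m.setDimensions( 3, 3 );
   Matrix::CompressedRowLengthsVector rowLengths( { 1, 1, 1 } );
   m.setCompressedRowLengths( rowLengths );
   for( int i = 0; i < 3; i++ )
      m.setElement( i, i, i + 1 );

   // getSpMVKernelType() now reports the compile-time variant (CSRAdaptive here).
   Vector in, out;
   in.setSize( 3 );
   out.setSize( 3 );
   for( int i = 0; i < 3; i++ ) {
      in.setElement( i, 1.0 );
      out.setElement( i, 0.0 );
   }

   m.vectorProduct( in, out );   // expected: out == { 1, 2, 3 }
   return 0;
}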