From 032d4ec08fbacaf856bd2f93067eaa57bb7127a9 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Sun, 8 Mar 2020 19:58:11 +0100 Subject: [PATCH 1/8] Moved changes from matrices-csr to IK/matrices-csr --- src/TNL/Matrices/Legacy/CSR.h | 20 +- src/TNL/Matrices/Legacy/CSR_impl.h | 108 ++++++++ .../Matrices/Legacy/SparseMatrixTest.hpp | 258 ++++++++++++++++++ .../Matrices/Legacy/SparseMatrixTest_CSR.h | 14 + 4 files changed, 399 insertions(+), 1 deletion(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index e5edd2659..3e76aa121 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -216,7 +216,7 @@ public: template< typename InVector, typename OutVector, - int warpSize > + int warpSize > __device__ void spmvCudaVectorized( const InVector& inVector, OutVector& outVector, @@ -231,6 +231,24 @@ public: void vectorProductCuda( const InVector& inVector, OutVector& outVector, int gridIdx ) const; + + template< typename InVector, + typename OutVector, + int warpSize > + __device__ + void spmvCudaLightSpmv( const InVector& inVector, + OutVector& outVector, + int gridIdx) const; + + template< typename InVector, + typename OutVector, + int warpSize > + __device__ + void spmvCudaVector( const InVector& inVector, + OutVector& outVector, + int gridIdx) const; + + #endif // The following getters allow us to interface TNL with external C-like diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index ba691834a..4cff6761a 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -671,6 +671,111 @@ Index CSR< Real, Device, Index >::getHybridModeSplit() const } #ifdef HAVE_CUDA + +template< typename Real, + typename Device, + typename Index > + template< typename InVector, + typename OutVector, + int warpSize > +__device__ +void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector, + OutVector& outVector, + int gridIdx) const +{ + const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; + const IndexType elemPerThread = 4; + const IndexType laneID = index % 32; + const IndexType groupID = laneID / elemPerThread; + const IndexType inGroupID = laneID % elemPerThread; + + IndexType row, minID, column, maxID, idxMtx; + __shared__ unsigned rowCnt; + + if (index == 0) rowCnt = 0; // Init shared variable + __syncthreads(); + + while (true) { + + /* Get row number */ + if (inGroupID == 0) row = atomicAdd(&rowCnt, 1); + + /* Propagate row number in group */ + row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerThread); + + if (row >= this->rowPointers.getSize() - 1) + return; + + minID = this->rowPointers[row]; + maxID = this->rowPointers[row + 1]; + + Real result = 0.0; + + idxMtx = minID + inGroupID; + while (idxMtx < maxID) { + column = this->columnIndexes[idxMtx]; + if (column >= this->getColumns()) { + break; + } + result += this->values[idxMtx] * inVector[column]; + idxMtx += elemPerThread; + } + + /* Parallel reduction */ + for (int i = elemPerThread/2; i > 0; i /= 2) + result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); + /* Write result */ + if (inGroupID == 0) { + outVector[row] = result; + } + } +} + +template< typename Real, + typename Device, + typename Index > + template< typename InVector, + typename OutVector, + int warpSize > +__device__ +void CSR< Real, Device, Index >::spmvCudaVector( const InVector& inVector, + OutVector& outVector, + int gridIdx) const +{ + const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; + const IndexType laneID = index % warpSize; + const IndexType row = index / warpSize; // warpID - one warp per row + volatile Real result = 0.0; + + if (row >= this->rowPointers.getSize() - 1) + return; + + IndexType minID = this->rowPointers[row]; + IndexType maxID = this->rowPointers[row + 1]; + + /* Calculate results */ + for (IndexType idxMtx = minID + laneID; idxMtx < maxID; idxMtx += warpSize) { + IndexType column = this->columnIndexes[idxMtx]; + if (column >= this->getColumns()) { + break; + } + // printf("%lf %lf %lf\n", (double)this->values[idxMtx], (double)inVector[column], (double)(this->values[idxMtx] * inVector[column])); + result += this->values[idxMtx] * inVector[column]; + } + // printf("=============== lane %d warp %d res %lf\n", (int)laneID, (int)row, (double)result); + /* Parallel reduction */ + result += __shfl_down_sync((warpSize - 1), result, 16); + result += __shfl_down_sync((warpSize - 1), result, 8); + result += __shfl_down_sync((warpSize - 1), result, 4); + result += __shfl_down_sync((warpSize - 1), result, 2); + result += __shfl_down_sync((warpSize - 1), result, 1); + + /* Write result */ + if (laneID == 0) { + outVector[row] = result; + } +} + template< typename Real, typename Device, typename Index > @@ -724,6 +829,9 @@ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector, OutVector& outVector, int gridIdx ) const { + spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); + return; + IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const IndexType warpStart = warpSize * ( globalIdx / warpSize ); const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp index 2e6c38296..3904ef432 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp @@ -1220,6 +1220,264 @@ void test_VectorProduct() EXPECT_EQ( outVector_5.getElement( 7 ), 520 ); } +template< typename Matrix > +void test_VectorProductLarger() +{ + using RealType = typename Matrix::RealType; + using DeviceType = typename Matrix::DeviceType; + using IndexType = typename Matrix::IndexType; + +/* + * Sets up the following 20x20 sparse matrix: + * + * 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 + * ------------------------------------------------------------------------------ + * 0 / 1 0 0 0 0 0 0 0 0 0 7 7 7 7 7 7 7 7 7 7 \ + * 1 | 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | + * 2 | 0 0 3 3 3 0 0 0 0 0 0 0 0 0 0 -2 0 0 0 0 | + * 3 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | + * 4 | 0 10 0 11 0 12 0 0 0 0 0 0 0 0 0 0 0 -48 -49 -50 | + * 5 | 4 0 0 0 0 4 0 0 0 0 4 0 0 0 0 4 0 0 0 0 | + * 6 | 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | + * 7 | 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -6 | + * 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | + * 9 | 1 2 3 4 5 6 7 8 9 0 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 | + * 10 | 7 0 0 0 3 3 3 3 0 0 0 0 0 0 0 -6 0 0 0 0 | + * 11 | -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 | + * 12 | 0 9 0 9 0 9 0 9 0 9 0 9 0 9 0 9 0 9 0 9 | + * 13 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | + * 14 | -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 | + * 15 | 4 0 4 0 4 0 4 0 4 0 4 0 4 0 4 0 4 0 4 0 | + * 16 | 0 -2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -2 0 | + * 17 | -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 | + * 18 | 0 7 0 7 0 7 0 7 0 7 0 7 0 7 0 7 0 7 0 7 | + * 19 \ 8 8 8 0 0 8 8 8 0 0 8 8 8 0 0 8 8 8 0 0 / + */ + const IndexType m_rows = 20; + const IndexType m_cols = 20; + + Matrix m; + m.reset(); + m.setDimensions( m_rows, m_cols ); + typename Matrix::CompressedRowLengthsVector rowLengths; + rowLengths.setSize( m_rows ); + rowLengths.setValue( 20 ); + m.setCompressedRowLengths( rowLengths ); + + /* 0 */ + m.setElement( 0, 0, 1 ); + for ( IndexType i = 10; i < m_cols; ++i ) + m.setElement( 0, i, 7 ); + + /* 1 */ + m.setElement( 1, 1, 2 ); + + /* 2 */ + m.setElement( 2, 2, 3 ); + m.setElement( 2, 3, 3 ); + m.setElement( 2, 4, 3 ); + m.setElement( 2, 15, -2 ); + + /* 4 */ + m.setElement( 4, 1, 10 ); + m.setElement( 4, 3, 11 ); + m.setElement( 4, 5, 12 ); + m.setElement( 4, 17, -48 ); + m.setElement( 4, 18, -49 ); + m.setElement( 4, 19, -50 ); + + /* 5 */ + for ( IndexType i = 0; i < m_cols; i += 5 ) + m.setElement( 5, i, 4 ); + + /* 6 */ + m.setElement( 6, 1, 5 ); + + /* 7 */ + m.setElement( 7, 2, 6 ); + m.setElement( 7, 19, -6 ); + + /* 8 */ + for ( IndexType i = 1; i <= m_cols; ++i ) + m.setElement( 8, i - 1, i ); + + /* 9 */ + for ( IndexType i = 1; i <= 9; ++i ) + m.setElement( 9, i - 1, i ); + + for ( IndexType i = 11; i <= 19; ++i ) + m.setElement( 9, i, -(i - 10) ); + + /* 10 */ + m.setElement( 10, 0, 7 ); + m.setElement( 10, 4, 3 ); + m.setElement( 10, 5, 3 ); + m.setElement( 10, 6, 3 ); + m.setElement( 10, 7, 3 ); + m.setElement( 10, 15, -6 ); + + /* 11 && 14 && 17 */ + for (IndexType i = 1; i <= m_cols; ++i) { + m.setElement(11, i - 1, -i); + m.setElement(14, i - 1, -i); + m.setElement(17, i - 1, -i); + } + + /* 12 && 18 */ + for (IndexType i = 0; i < m_cols; ++i) { + if( i % 2 ) { + m.setElement(12, i, 9); + m.setElement(18, i, 7); + } + } + + /* 15 */ + for (IndexType i = 0; i < m_cols; ++i) + if (i % 2 == 0) + m.setElement(15, i, 4); + + /* 16 */ + m.setElement(16, 1, -2); + m.setElement(16, 18, -2); + + /* 19 */ + for (IndexType i = 0; i < m_cols; i += 5 ) { + m.setElement(19, i, 8); + m.setElement(19, i + 1, 8); + m.setElement(19, i + 2, 8); + } + + using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; + + VectorType inVector; + inVector.setSize( 20 ); + for( IndexType i = 0; i < inVector.getSize(); i++ ) + inVector.setElement( i, 2 ); + + VectorType outVector; + outVector.setSize( 20 ); + for( IndexType j = 0; j < outVector.getSize(); j++ ) + outVector.setElement( j, 0 ); + + m.vectorProduct( inVector, outVector); + + EXPECT_EQ( outVector.getElement( 0 ), 142 ); + EXPECT_EQ( outVector.getElement( 1 ), 4 ); + EXPECT_EQ( outVector.getElement( 2 ), 14 ); + EXPECT_EQ( outVector.getElement( 3 ), 0 ); + EXPECT_EQ( outVector.getElement( 4 ), -228 ); + EXPECT_EQ( outVector.getElement( 5 ), 32 ); + EXPECT_EQ( outVector.getElement( 6 ), 10 ); + EXPECT_EQ( outVector.getElement( 7 ), 0 ); + EXPECT_EQ( outVector.getElement( 8 ), 420 ); + EXPECT_EQ( outVector.getElement( 9 ), 0 ); + EXPECT_EQ( outVector.getElement( 10 ), 26 ); + EXPECT_EQ( outVector.getElement( 11 ), -420 ); + EXPECT_EQ( outVector.getElement( 12 ), 180 ); + EXPECT_EQ( outVector.getElement( 13 ), 0 ); + EXPECT_EQ( outVector.getElement( 14 ), -420 ); + EXPECT_EQ( outVector.getElement( 15 ), 80 ); + EXPECT_EQ( outVector.getElement( 16 ), -8 ); + EXPECT_EQ( outVector.getElement( 17 ), -420 ); + EXPECT_EQ( outVector.getElement( 18 ), 140 ); + EXPECT_EQ( outVector.getElement( 19 ), 192 ); +} + +template< typename Matrix > +void test_VectorProductGiant() +{ + using RealType = typename Matrix::RealType; + using DeviceType = typename Matrix::DeviceType; + using IndexType = typename Matrix::IndexType; + const IndexType N = 200; // Should be a multiple of 10 +/* + * Sets up the following NxN sparse matrix: + * + * / 1 0 1 0 ... 1 0 \ + * | 0 0 0 0 ... 0 0 | + * | 0 2 0 2 ... 0 2 | + * | 0 0 0 0 ... 0 0 | + * | 3 -3 3 -3 ... 3 -3 | + * | 0 0 0 0 ... 0 0 | + * | 5 5 5 5 ... 5 5 | + * | 0 0 0 0 ... 0 0 | + * | 0 0 0 0 ... 0 0 | + * | 2 0 0 0 ... 0 0 | <- the only 2 in the row + * + * .................. + */ + + const IndexType m_rows = N; + const IndexType m_cols = N; + + Matrix m; + m.reset(); + m.setDimensions( m_rows, m_cols ); + typename Matrix::CompressedRowLengthsVector rowLengths; + rowLengths.setSize( m_rows ); + rowLengths.setValue( m_cols ); + m.setCompressedRowLengths( rowLengths ); + + for ( IndexType row = 0; row < m_rows; ++row ) { + if (row % 10 == 0) { + for ( IndexType col = 0; col < m_cols; ++col ) { + if (col % 2) + m.setElement( row, col, 1 ); + } + } + else if (row % 10 == 2) { + for ( IndexType col = 0; col < m_cols; ++col ) { + if (col % 2 == 1) + m.setElement( row, col, 2 ); + } + } + else if (row % 10 == 4) { + for ( IndexType col = 0; col < m_cols; ++col ) { + if (col % 2) + m.setElement( row, col, 3 ); + else + m.setElement( row, col, -3 ); + } + } + else if (row % 10 == 6) { + for ( IndexType col = 0; col < m_cols; ++col ) { + m.setElement( row, col, 5 ); + } + } + else if (row % 10 == 9) { + m.setElement( row, 0, 2 ); + } + } + + using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; + + VectorType inVector; + inVector.setSize( N ); + for( IndexType i = 0; i < inVector.getSize(); i++ ) + inVector.setElement( i, 2 ); + + VectorType outVector; + outVector.setSize( N ); + for( IndexType j = 0; j < outVector.getSize(); j++ ) + outVector.setElement( j, 0 ); + + + m.vectorProduct( inVector, outVector); + + for ( IndexType i = 0; i < outVector.getSize(); ++i ) { + if ( i % 10 == 0 ) + EXPECT_EQ( outVector.getElement( i ), 200 /* (N) */ ); + else if ( i % 10 == 2 ) + EXPECT_EQ( outVector.getElement( i ), 400 /* (2 * N) */ ); + else if ( i % 10 == 6 ) + EXPECT_EQ( outVector.getElement( i ), 2000 /* (10 * N) */ ); + else if ( i % 10 == 9 ) + EXPECT_EQ( outVector.getElement( i ), 4 ); + else + EXPECT_EQ( outVector.getElement( i ), 0 ); + } +} + template< typename Matrix > void test_RowsReduction() { diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h index bbe336d1f..5ff79213d 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h @@ -112,6 +112,20 @@ TYPED_TEST( CSRMatrixTest, vectorProductTest ) test_VectorProduct< CSRMatrixType >(); } +TYPED_TEST( CSRMatrixTest, vectorProductLargerTest ) +{ + using CSRMatrixType = typename TestFixture::CSRMatrixType; + + test_VectorProductLarger< CSRMatrixType >(); +} + +TYPED_TEST( CSRMatrixTest, vectorProductGiantTest ) +{ + using CSRMatrixType = typename TestFixture::CSRMatrixType; + + test_VectorProductGiant< CSRMatrixType >(); +} + TYPED_TEST( CSRMatrixTest, saveAndLoadTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; -- GitLab From f9ace6502159fa03b0453c40534c39fc0661c6f0 Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Mon, 16 Mar 2020 22:08:46 +0100 Subject: [PATCH 2/8] Added CSR Adaptive --- src/TNL/Matrices/Legacy/CSR.h | 14 +- src/TNL/Matrices/Legacy/CSR_impl.h | 232 +++++++++++++++++++---------- 2 files changed, 157 insertions(+), 89 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 3e76aa121..633ce665b 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -230,7 +230,7 @@ public: __device__ void vectorProductCuda( const InVector& inVector, OutVector& outVector, - int gridIdx ) const; + int gridIdx, unsigned *blocks, size_t size ) const; template< typename InVector, typename OutVector, @@ -239,16 +239,16 @@ public: void spmvCudaLightSpmv( const InVector& inVector, OutVector& outVector, int gridIdx) const; - + template< typename InVector, typename OutVector, int warpSize > __device__ - void spmvCudaVector( const InVector& inVector, - OutVector& outVector, - int gridIdx) const; - - + void spmvCSRAdaptive( const InVector& inVector, + OutVector& outVector, + int gridIdx, + unsigned *blocks, + size_t blocks_size) const; #endif // The following getters allow us to interface TNL with external C-like diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 4cff6761a..4497d18ce 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef HAVE_CUSPARSE #include @@ -683,54 +684,55 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector, OutVector& outVector, int gridIdx) const { - const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; - const IndexType elemPerThread = 4; - const IndexType laneID = index % 32; - const IndexType groupID = laneID / elemPerThread; - const IndexType inGroupID = laneID % elemPerThread; + const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; + const IndexType elemPerGroup = 4; + const IndexType laneID = index % 32; + const IndexType groupID = laneID / elemPerGroup; + const IndexType inGroupID = laneID % elemPerGroup; - IndexType row, minID, column, maxID, idxMtx; - __shared__ unsigned rowCnt; + IndexType row, minID, column, maxID, idxMtx; + __shared__ unsigned rowCnt; - if (index == 0) rowCnt = 0; // Init shared variable - __syncthreads(); + if (index == 0) rowCnt = 0; // Init shared variable + __syncthreads(); - while (true) { + while (true) { - /* Get row number */ - if (inGroupID == 0) row = atomicAdd(&rowCnt, 1); + /* Get row number */ + if (inGroupID == 0) row = atomicAdd(&rowCnt, 1); - /* Propagate row number in group */ - row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerThread); + /* Propagate row number in group */ + row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerGroup); - if (row >= this->rowPointers.getSize() - 1) - return; + if (row >= this->rowPointers.getSize() - 1) + return; - minID = this->rowPointers[row]; - maxID = this->rowPointers[row + 1]; + minID = this->rowPointers[row]; + maxID = this->rowPointers[row + 1]; - Real result = 0.0; + Real result = 0.0; - idxMtx = minID + inGroupID; - while (idxMtx < maxID) { - column = this->columnIndexes[idxMtx]; - if (column >= this->getColumns()) { - break; + idxMtx = minID + inGroupID; + while (idxMtx < maxID) { + column = this->columnIndexes[idxMtx]; + if (column >= this->getColumns()) + break; + + result += this->values[idxMtx] * inVector[column]; + idxMtx += elemPerGroup; } - result += this->values[idxMtx] * inVector[column]; - idxMtx += elemPerThread; - } - /* Parallel reduction */ - for (int i = elemPerThread/2; i > 0; i /= 2) - result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); - /* Write result */ - if (inGroupID == 0) { - outVector[row] = result; - } - } + /* Parallel reduction */ + for (int i = elemPerGroup/2; i > 0; i /= 2) + result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); + /* Write result */ + if (inGroupID == 0) { + outVector[row] = result; + } + } } + template< typename Real, typename Device, typename Index > @@ -738,44 +740,76 @@ template< typename Real, typename OutVector, int warpSize > __device__ -void CSR< Real, Device, Index >::spmvCudaVector( const InVector& inVector, - OutVector& outVector, - int gridIdx) const +void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector, + OutVector& outVector, + int gridIdx, + unsigned *blocks, + size_t blocks_size) const { - const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; - const IndexType laneID = index % warpSize; - const IndexType row = index / warpSize; // warpID - one warp per row - volatile Real result = 0.0; - - if (row >= this->rowPointers.getSize() - 1) - return; - - IndexType minID = this->rowPointers[row]; - IndexType maxID = this->rowPointers[row + 1]; - - /* Calculate results */ - for (IndexType idxMtx = minID + laneID; idxMtx < maxID; idxMtx += warpSize) { - IndexType column = this->columnIndexes[idxMtx]; - if (column >= this->getColumns()) { - break; - } - // printf("%lf %lf %lf\n", (double)this->values[idxMtx], (double)inVector[column], (double)(this->values[idxMtx] * inVector[column])); - result += this->values[idxMtx] * inVector[column]; - } - // printf("=============== lane %d warp %d res %lf\n", (int)laneID, (int)row, (double)result); - /* Parallel reduction */ - result += __shfl_down_sync((warpSize - 1), result, 16); - result += __shfl_down_sync((warpSize - 1), result, 8); - result += __shfl_down_sync((warpSize - 1), result, 4); - result += __shfl_down_sync((warpSize - 1), result, 2); - result += __shfl_down_sync((warpSize - 1), result, 1); - - /* Write result */ - if (laneID == 0) { - outVector[row] = result; - } + constexpr IndexType SHARED = 1024; /* TODO: delete */ + const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ unsigned blockCnt; + __shared__ float shared_res[SHARED]; + float result = 0.0; + if (index == 0) blockCnt = 0; // Init shared variable + __syncthreads(); + + const IndexType laneID = index % warpSize; + IndexType blockIdx; + + while (true) { + /* Get row number */ + if (laneID == 0) blockIdx = atomicAdd(&blockCnt, 1); + /* Propagate row number in warp */ + blockIdx = __shfl_sync((unsigned)(warpSize - 1), blockIdx, 0); + if (blockIdx >= blocks_size - 1) + return; + /* Number of elements */ + const IndexType minRow = blocks[blockIdx]; + const IndexType maxRow = blocks[blockIdx + 1]; + const IndexType minID = this->rowPointers[minRow]; + const IndexType maxID = this->rowPointers[maxRow]; + const IndexType elements = maxID - minID; + result = 0.0; + /* rows per block more than 1 */ + if ((maxRow - minRow) > 1) { + /////////////////////////////////////* CSR STREAM *////////////// + /* Copy and calculate elements from global to shared memory, coalesced */ + for (IndexType i = laneID; i < elements; i += warpSize) { + const IndexType elementIdx = i + minID; + const IndexType column = this->columnIndexes[elementIdx]; + if (column >= this->getColumns()) + break; + shared_res[i] = this->values[elementIdx] * inVector[column]; + } + + const IndexType row = minRow + laneID; + if (row >= maxRow - 1) + continue; + /* Calculate result */ + const IndexType to = this->rowPointers[row + 1] - minID; + for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) { + result += shared_res[i]; + } + outVector[row] = result; // Write result + } else { + /////////////////////////////////////* CSR VECTOR *////////////// + for (IndexType i = minID + laneID; i < maxID; i += warpSize) { + const IndexType column = this->columnIndexes[i]; + result += this->values[i] * inVector[column]; + } + /* Reduction */ + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); + if (laneID == 0) outVector[minRow] = result; // Write result + } + } } + template< typename Real, typename Device, typename Index > @@ -827,9 +861,10 @@ template< typename Real, __device__ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector, OutVector& outVector, - int gridIdx ) const + int gridIdx, + unsigned *blocks, size_t size ) const { - spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); + spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); return; IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; @@ -903,7 +938,8 @@ template< typename Real, __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index >* matrix, const InVector* inVector, OutVector* outVector, - int gridIdx ) + int gridIdx, + unsigned *blocks, size_t size) { typedef CSR< Real, Devices::Cuda, Index > Matrix; static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); @@ -917,7 +953,7 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde matrix->getCudaKernelType() == Matrix::hybrid ) { matrix->template vectorProductCuda< InVector, OutVector, warpSize > - ( *inVector, *outVector, gridIdx ); + ( *inVector, *outVector, gridIdx, blocks, size ); } } #endif @@ -928,7 +964,9 @@ template< typename Real, typename OutVector > void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, const InVector& inVector, - OutVector& outVector ) + OutVector& outVector, + unsigned *blocks, + size_t size ) { #ifdef HAVE_CUDA typedef CSR< Real, Devices::Cuda, Index > Matrix; @@ -936,6 +974,7 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); OutVector* kernel_outVector = Cuda::passToDevice( outVector ); + unsigned *kernelBlocks = Cuda::passToDevice( *blocks ); TNL_CHECK_CUDA_DEVICE; dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); @@ -951,42 +990,42 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, ( kernel_this, kernel_inVector, kernel_outVector, - gridIdx ); + gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 16 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, - gridIdx ); + gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 8 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, - gridIdx ); + gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 4 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, - gridIdx ); + gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 2 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, - gridIdx ); + gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 1 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, - gridIdx ); + gridIdx, kernelBlocks, size ); } TNL_CHECK_CUDA_DEVICE; @@ -1106,7 +1145,36 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else - CSRVectorProductCuda( matrix, inVector, outVector ); + std::vector inBlock; + inBlock.push_back(0); + unsigned sum = 0; + for (size_t i = 1; i < matrix.getRowPointers().getSize(); ++i) { + printf("PTR %d\n", (int)matrix.getRowPointers().getElement(i)); + } + for (size_t i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { + size_t elements = matrix.getRowPointers().getElement(i + 1) - + matrix.getRowPointers().getElement(i); + if (elements > 1024) { + if (sum == 0) { + inBlock.push_back(i); + } + else { + inBlock.push_back(i - 1); + inBlock.push_back(i); + } + sum = 0; + } + sum += elements; + + if (sum <= 1024) + continue; + else { + inBlock.push_back(i - 1); + sum = elements; + } + } + inBlock.push_back(matrix.getRowPointers().getSize() - 1); + CSRVectorProductCuda( matrix, inVector, outVector, inBlock.data(), inBlock.size() ); #endif } -- GitLab From f620b0ee3657481cd9c6d3cdc790b83db2d5c13c Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Mon, 30 Mar 2020 21:03:16 +0200 Subject: [PATCH 3/8] Added CSR Adaptive and CSR Stream --- src/TNL/Matrices/Legacy/CSR.h | 4 +- src/TNL/Matrices/Legacy/CSR_impl.h | 277 ++++++++++-------- .../Matrices/Legacy/SparseMatrixTest.hpp | 140 +++++---- 3 files changed, 229 insertions(+), 192 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 633ce665b..7ff1f037c 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -230,7 +230,7 @@ public: __device__ void vectorProductCuda( const InVector& inVector, OutVector& outVector, - int gridIdx, unsigned *blocks, size_t size ) const; + int gridIdx, int *blocks, size_t size ) const; template< typename InVector, typename OutVector, @@ -247,7 +247,7 @@ public: void spmvCSRAdaptive( const InVector& inVector, OutVector& outVector, int gridIdx, - unsigned *blocks, + int *blocks, size_t blocks_size) const; #endif diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 4497d18ce..516f21f27 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -732,6 +732,35 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector, } } +/* template< typename Real, + typename Device, + typename Index, + typename InVector, + int warpSize > +__global__ +void spmvCSRVectorHelper( const InVector& inVector, + Real *out, + size_t from, + size_t to, + size_t perWarp) +{ + const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + const size_t warpID = index / warpSize; + const size_t laneID = index % warpSize; + const size_t minID = from + warpID * perWarp; + size_t maxID = from + (warpID + 1) * perWarp; + if (minID >= to) return; + if (maxID >= to ) maxID = to; + + Real result = 0.0; + for (IndexType i = minID + laneID; i < maxID; i += warpSize) { + const IndexType column = this->columnIndexes[i]; + if (column >= this->getColumns()) + continue; + result += this->values[i] * inVector[column]; + } + atomicAdd(out, result); +} */ template< typename Real, typename Device, @@ -743,69 +772,79 @@ __device__ void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector, OutVector& outVector, int gridIdx, - unsigned *blocks, + int *blocks, size_t blocks_size) const { - constexpr IndexType SHARED = 1024; /* TODO: delete */ + /* Configuration ---------------------------------------------------*/ + constexpr size_t SHARED = 49152/sizeof(float); + constexpr size_t SHARED_PER_WARP = SHARED / warpSize; + constexpr size_t MAX_PER_WARP = 65536; + constexpr size_t ELEMENTS_PER_WARP = 1024; + constexpr size_t THREADS_PER_BLOCK = 1024; + constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize; + //-------------------------------------------------------------------- const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ unsigned blockCnt; - __shared__ float shared_res[SHARED]; - float result = 0.0; - if (index == 0) blockCnt = 0; // Init shared variable - __syncthreads(); - const IndexType laneID = index % warpSize; - IndexType blockIdx; + IndexType blockIdx = index / warpSize; + __shared__ float shared_res[SHARED]; + Real result = 0.0; + if (blockIdx >= blocks_size - 1) + return; + const IndexType minRow = blocks[blockIdx]; + const IndexType maxRow = blocks[blockIdx + 1]; + const IndexType minID = this->rowPointers[minRow]; + const IndexType maxID = this->rowPointers[maxRow]; + const IndexType elements = maxID - minID; + /* rows per block more than 1 */ + if ((maxRow - minRow) > 1) { + /////////////////////////////////////* CSR STREAM *////////////// + /* Copy and calculate elements from global to shared memory, coalesced */ + const IndexType offset = threadIdx.x / warpSize * SHARED_PER_WARP; + for (IndexType i = laneID; i < elements; i += warpSize) { + const IndexType elementIdx = i + minID; + const IndexType column = this->columnIndexes[elementIdx]; + if (column >= this->getColumns()) + continue; + shared_res[i + offset] = this->values[elementIdx] * inVector[column]; + } - while (true) { - /* Get row number */ - if (laneID == 0) blockIdx = atomicAdd(&blockCnt, 1); - /* Propagate row number in warp */ - blockIdx = __shfl_sync((unsigned)(warpSize - 1), blockIdx, 0); - if (blockIdx >= blocks_size - 1) + const IndexType row = minRow + laneID; + if (row >= maxRow) return; - /* Number of elements */ - const IndexType minRow = blocks[blockIdx]; - const IndexType maxRow = blocks[blockIdx + 1]; - const IndexType minID = this->rowPointers[minRow]; - const IndexType maxID = this->rowPointers[maxRow]; - const IndexType elements = maxID - minID; - result = 0.0; - /* rows per block more than 1 */ - if ((maxRow - minRow) > 1) { - /////////////////////////////////////* CSR STREAM *////////////// - /* Copy and calculate elements from global to shared memory, coalesced */ - for (IndexType i = laneID; i < elements; i += warpSize) { - const IndexType elementIdx = i + minID; - const IndexType column = this->columnIndexes[elementIdx]; - if (column >= this->getColumns()) - break; - shared_res[i] = this->values[elementIdx] * inVector[column]; - } + /* Calculate result */ + const IndexType to = this->rowPointers[row + 1] - minID; + for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) { + result += shared_res[i + offset]; + } + outVector[row] = result; // Write result + } else if (elements <= MAX_PER_WARP) { + /////////////////////////////////////* CSR VECTOR *////////////// + for (IndexType i = minID + laneID; i < maxID; i += warpSize) { + IndexType column = this->columnIndexes[i]; + if (column >= this->getColumns()) + break; - const IndexType row = minRow + laneID; - if (row >= maxRow - 1) - continue; - /* Calculate result */ - const IndexType to = this->rowPointers[row + 1] - minID; - for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) { - result += shared_res[i]; - } - outVector[row] = result; // Write result - } else { - /////////////////////////////////////* CSR VECTOR *////////////// - for (IndexType i = minID + laneID; i < maxID; i += warpSize) { - const IndexType column = this->columnIndexes[i]; - result += this->values[i] * inVector[column]; - } - /* Reduction */ - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); - if (laneID == 0) outVector[minRow] = result; // Write result + result += this->values[i] * inVector[column]; } + /* Reduction */ + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); + result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); + if (laneID == 0) outVector[minRow] = result; // Write result + } else { + /////////////////////////////////////* CSR VECTOR LONG *////////////// + const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1; + const size_t blocks = warps <= WARPS_PER_BLOCK ? 1 : warps / WARPS_PER_BLOCK + 1; + const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize; + // spmvCSRVectorHelper <<>>( + // inVector, + // &outVector[minRow], + // (size_t)(minID + ELEMENTS_PER_WARP), + // (size_t)maxID, + // (size_t)ELEMENTS_PER_WARP + // ); } } @@ -862,7 +901,7 @@ __device__ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector, OutVector& outVector, int gridIdx, - unsigned *blocks, size_t size ) const + int *blocks, size_t size ) const { spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); return; @@ -939,7 +978,7 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde const InVector* inVector, OutVector* outVector, int gridIdx, - unsigned *blocks, size_t size) + int *blocks, size_t size) { typedef CSR< Real, Devices::Cuda, Index > Matrix; static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); @@ -965,7 +1004,7 @@ template< typename Real, void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, const InVector& inVector, OutVector& outVector, - unsigned *blocks, + int *blocks, size_t size ) { #ifdef HAVE_CUDA @@ -974,7 +1013,10 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - unsigned *kernelBlocks = Cuda::passToDevice( *blocks ); + int *kernelBlocks; + cudaMalloc((void **)&kernelBlocks, sizeof(int) * size); + cudaMemcpy(kernelBlocks, blocks, size * sizeof(int), cudaMemcpyHostToDevice); + TNL_CHECK_CUDA_DEVICE; dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); @@ -984,48 +1026,51 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); const int sharedMemory = cudaBlockSize.x * sizeof( Real ); - if( matrix.getCudaWarpSize() == 32 ) + // const int threads = cudaBlockSize.x; + if( matrix.getCudaWarpSize() == 32 ) { + // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads); CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx, kernelBlocks, size ); - if( matrix.getCudaWarpSize() == 16 ) - CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx, kernelBlocks, size ); - if( matrix.getCudaWarpSize() == 8 ) - CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx, kernelBlocks, size ); - if( matrix.getCudaWarpSize() == 4 ) - CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx, kernelBlocks, size ); - if( matrix.getCudaWarpSize() == 2 ) - CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx, kernelBlocks, size ); - if( matrix.getCudaWarpSize() == 1 ) - CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + <<< 2, 1024 >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx, kernelBlocks, size ); + } + // if( matrix.getCudaWarpSize() == 16 ) + // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 > + // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + // ( kernel_this, + // kernel_inVector, + // kernel_outVector, + // gridIdx, kernelBlocks, size ); + // if( matrix.getCudaWarpSize() == 8 ) + // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 > + // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + // ( kernel_this, + // kernel_inVector, + // kernel_outVector, + // gridIdx, kernelBlocks, size ); + // if( matrix.getCudaWarpSize() == 4 ) + // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 > + // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + // ( kernel_this, + // kernel_inVector, + // kernel_outVector, + // gridIdx, kernelBlocks, size ); + // if( matrix.getCudaWarpSize() == 2 ) + // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 > + // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + // ( kernel_this, + // kernel_inVector, + // kernel_outVector, + // gridIdx, kernelBlocks, size ); + // if( matrix.getCudaWarpSize() == 1 ) + // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 > + // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + // ( kernel_this, + // kernel_inVector, + // kernel_outVector, + // gridIdx, kernelBlocks, size ); } TNL_CHECK_CUDA_DEVICE; @@ -1145,32 +1190,32 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else - std::vector inBlock; + constexpr int SHARED = 49152/sizeof(float); + constexpr int SHARED_PER_WARP = SHARED / 32; + std::vector inBlock; inBlock.push_back(0); - unsigned sum = 0; - for (size_t i = 1; i < matrix.getRowPointers().getSize(); ++i) { - printf("PTR %d\n", (int)matrix.getRowPointers().getElement(i)); - } - for (size_t i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { - size_t elements = matrix.getRowPointers().getElement(i + 1) - - matrix.getRowPointers().getElement(i); - if (elements > 1024) { - if (sum == 0) { + size_t sum = 0; + size_t i; + int prev_i = 0; + for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { + size_t elements = matrix.getRowPointers().getElement(i) - + matrix.getRowPointers().getElement(i - 1); + sum += elements; + if (sum > SHARED_PER_WARP) { + if (i - prev_i == 1) { inBlock.push_back(i); - } - else { + } else { inBlock.push_back(i - 1); - inBlock.push_back(i); + --i; } sum = 0; - } - sum += elements; - - if (sum <= 1024) + prev_i = i; continue; - else { - inBlock.push_back(i - 1); - sum = elements; + } + if (i - prev_i == 32) { + inBlock.push_back(i); + prev_i = i; + sum = 0; } } inBlock.push_back(matrix.getRowPointers().getSize() - 1); diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp index 3904ef432..98ddfd3db 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp @@ -1259,9 +1259,11 @@ void test_VectorProductLarger() Matrix m; m.reset(); m.setDimensions( m_rows, m_cols ); - typename Matrix::CompressedRowLengthsVector rowLengths; - rowLengths.setSize( m_rows ); - rowLengths.setValue( 20 ); + typename Matrix::CompressedRowLengthsVector rowLengths( + {11, 2, 4, 0, 6, 4, 1, 2, 20, 18, 6, 20, 10, 0, 20, 10, 2, 20, 10, 12} + ); +// rowLengths.setSize( m_rows ); +// rowLengths.setValue( 20 ); m.setCompressedRowLengths( rowLengths ); /* 0 */ @@ -1389,93 +1391,83 @@ void test_VectorProductGiant() using RealType = typename Matrix::RealType; using DeviceType = typename Matrix::DeviceType; using IndexType = typename Matrix::IndexType; - const IndexType N = 200; // Should be a multiple of 10 -/* - * Sets up the following NxN sparse matrix: - * - * / 1 0 1 0 ... 1 0 \ - * | 0 0 0 0 ... 0 0 | - * | 0 2 0 2 ... 0 2 | - * | 0 0 0 0 ... 0 0 | - * | 3 -3 3 -3 ... 3 -3 | - * | 0 0 0 0 ... 0 0 | - * | 5 5 5 5 ... 5 5 | - * | 0 0 0 0 ... 0 0 | - * | 0 0 0 0 ... 0 0 | - * | 2 0 0 0 ... 0 0 | <- the only 2 in the row - * - * .................. - */ - const IndexType m_rows = N; - const IndexType m_cols = N; + IndexType m_rows = 100; + IndexType m_cols = 100; Matrix m; m.reset(); m.setDimensions( m_rows, m_cols ); - typename Matrix::CompressedRowLengthsVector rowLengths; - rowLengths.setSize( m_rows ); - rowLengths.setValue( m_cols ); + typename Matrix::CompressedRowLengthsVector rowLengths( + { + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100 + } + ); + m.setCompressedRowLengths( rowLengths ); - for ( IndexType row = 0; row < m_rows; ++row ) { - if (row % 10 == 0) { - for ( IndexType col = 0; col < m_cols; ++col ) { - if (col % 2) - m.setElement( row, col, 1 ); - } - } - else if (row % 10 == 2) { - for ( IndexType col = 0; col < m_cols; ++col ) { - if (col % 2 == 1) - m.setElement( row, col, 2 ); - } - } - else if (row % 10 == 4) { - for ( IndexType col = 0; col < m_cols; ++col ) { - if (col % 2) - m.setElement( row, col, 3 ); - else - m.setElement( row, col, -3 ); - } - } - else if (row % 10 == 6) { - for ( IndexType col = 0; col < m_cols; ++col ) { - m.setElement( row, col, 5 ); - } - } - else if (row % 10 == 9) { - m.setElement( row, 0, 2 ); - } - } + for (int i = 0; i < m_rows; ++i) + for (int j = 0; j < m_cols; ++j) + m.setElement( i, j, i + 1 ); using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; VectorType inVector; - inVector.setSize( N ); - for( IndexType i = 0; i < inVector.getSize(); i++ ) - inVector.setElement( i, 2 ); + inVector.setSize( m_rows ); + for( IndexType i = 0; i < inVector.getSize(); ++i ) + inVector.setElement( i, 1 ); VectorType outVector; - outVector.setSize( N ); - for( IndexType j = 0; j < outVector.getSize(); j++ ) - outVector.setElement( j, 0 ); + outVector.setSize( m_rows ); + for( IndexType i = 0; i < outVector.getSize(); ++i ) + outVector.setElement( i, 0 ); - m.vectorProduct( inVector, outVector); + + for (int i = 0; i < m_rows; ++i) + EXPECT_EQ( outVector.getElement( i ), (i + 1) * 100 ); + + //----------------------------------------------------- + + m_rows = 2; + m_cols = 1000; - for ( IndexType i = 0; i < outVector.getSize(); ++i ) { - if ( i % 10 == 0 ) - EXPECT_EQ( outVector.getElement( i ), 200 /* (N) */ ); - else if ( i % 10 == 2 ) - EXPECT_EQ( outVector.getElement( i ), 400 /* (2 * N) */ ); - else if ( i % 10 == 6 ) - EXPECT_EQ( outVector.getElement( i ), 2000 /* (10 * N) */ ); - else if ( i % 10 == 9 ) - EXPECT_EQ( outVector.getElement( i ), 4 ); - else - EXPECT_EQ( outVector.getElement( i ), 0 ); - } + m.reset(); + m.setDimensions( m_rows, m_cols ); + typename Matrix::CompressedRowLengthsVector rowLengths2( + { + 1000, 1000 + } + ); + + m.setCompressedRowLengths( rowLengths2 ); + + for (int i = 0; i < m_rows; ++i) + for (int j = 0; j < m_cols; ++j) + m.setElement( i, j, i + 1 ); + + VectorType inVector2; + inVector2.setSize( m_cols ); + for( IndexType i = 0; i < inVector2.getSize(); i++ ) + inVector2.setElement( i, 1 ); + + VectorType outVector2; + outVector2.setSize( m_rows ); + for( IndexType i = 0; i < outVector2.getSize(); ++i ) + outVector2.setElement( i, 0 ); + m.vectorProduct( inVector2, outVector2); + + for (int i = 0; i < m_rows; ++i) + EXPECT_EQ( outVector2.getElement( i ), (i + 1) * 1000 ); } template< typename Matrix > -- GitLab From 04f800088d6abeb45cb75968e82843a96fdc0be7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 31 Mar 2020 10:49:39 +0200 Subject: [PATCH 4/8] Deactivating Legact CSR larger and giant vector product tests. --- src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h index 5ff79213d..e9c3f591c 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h @@ -112,19 +112,19 @@ TYPED_TEST( CSRMatrixTest, vectorProductTest ) test_VectorProduct< CSRMatrixType >(); } -TYPED_TEST( CSRMatrixTest, vectorProductLargerTest ) +/*TYPED_TEST( CSRMatrixTest, vectorProductLargerTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; test_VectorProductLarger< CSRMatrixType >(); -} +}*/ -TYPED_TEST( CSRMatrixTest, vectorProductGiantTest ) +/*TYPED_TEST( CSRMatrixTest, vectorProductGiantTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; test_VectorProductGiant< CSRMatrixType >(); -} +}*/ TYPED_TEST( CSRMatrixTest, saveAndLoadTest ) { -- GitLab From 0678f5f1b4bd685527baf117f3f46c3ed71dc858 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 31 Mar 2020 11:04:59 +0200 Subject: [PATCH 5/8] Avoiding use of enum SPMVCudaKernel in legacy CSR format. --- src/TNL/Matrices/Legacy/CSR.h | 14 ++++++++------ src/TNL/Matrices/Legacy/CSR_impl.h | 6 +++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 7ff1f037c..af4bc1d9d 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -31,7 +31,9 @@ class CusparseCSR; template< typename Device > class CSRDeviceDependentCode; -template< typename Real, typename Device = Devices::Host, typename Index = int > +enum CSRKernel { CSRScalar, CVSVector, CSRAdaptive, CSRStream }; + +template< typename Real, typename Device = Devices::Host, typename Index = int >//, CSRKernel KernelType = CSRAdaptive > class CSR : public Sparse< Real, Device, Index > { private: @@ -60,7 +62,7 @@ public: typename _Index = Index > using Self = CSR< _Real, _Device, _Index >; - enum SPMVCudaKernel { scalar, vector, hybrid }; + //enum SPMVCudaKernel { scalar, vector, hybrid }; CSR(); @@ -198,10 +200,10 @@ public: void print( std::ostream& str ) const; - void setCudaKernelType( const SPMVCudaKernel kernel ); + //void setCudaKernelType( const SPMVCudaKernel kernel ); - __cuda_callable__ - SPMVCudaKernel getCudaKernelType() const; + //__cuda_callable__ + //SPMVCudaKernel getCudaKernelType() const; void setCudaWarpSize( const int warpSize ); @@ -281,7 +283,7 @@ protected: Containers::Vector< Index, Device, Index > rowPointers; - SPMVCudaKernel spmvCudaKernel; + //SPMVCudaKernel spmvCudaKernel; int cudaWarpSize, hybridModeSplit; diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 516f21f27..9fd872167 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -34,7 +34,7 @@ template< typename Real, typename Device, typename Index > CSR< Real, Device, Index >::CSR() -: spmvCudaKernel( hybrid ), +: //spmvCudaKernel( hybrid ), cudaWarpSize( 32 ), //Cuda::getWarpSize() ) hybridModeSplit( 4 ) { @@ -621,7 +621,7 @@ void CSR< Real, Device, Index >::print( std::ostream& str ) const } } -template< typename Real, +/*template< typename Real, typename Device, typename Index > void CSR< Real, Device, Index >::setCudaKernelType( const SPMVCudaKernel kernel ) @@ -636,7 +636,7 @@ __cuda_callable__ typename CSR< Real, Device, Index >::SPMVCudaKernel CSR< Real, Device, Index >::getCudaKernelType() const { return this->spmvCudaKernel; -} +}*/ template< typename Real, typename Device, -- GitLab From d6909164fd5ec827cfebd100a28a7b874f6c8236 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 31 Mar 2020 13:13:09 +0200 Subject: [PATCH 6/8] Added template parameter KernelType to Matrices::Legacy::CSR. --- src/Benchmarks/BLAS/spmv.h | 7 +- src/Benchmarks/SpMV/spmv-legacy.h | 22 +- src/Python/pytnl/tnl/SparseMatrix.cpp | 11 +- src/TNL/Matrices/Legacy/CSR.h | 25 ++- src/TNL/Matrices/Legacy/CSR_impl.h | 278 ++++++++++++++++---------- src/TNL/Matrices/MatrixInfo.h | 36 +++- 6 files changed, 254 insertions(+), 125 deletions(-) diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h index f7f3139bf..d785fcff3 100644 --- a/src/Benchmarks/BLAS/spmv.h +++ b/src/Benchmarks/BLAS/spmv.h @@ -27,6 +27,11 @@ namespace Benchmarks { template< typename Real, typename Device, typename Index > using SlicedEllpack = Matrices::Legacy::SlicedEllpack< Real, Device, Index >; +// Legacy formats +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Scalar = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRScalar >; + + template< typename Matrix > int setHostTestMatrix( Matrix& matrix, const int elementsPerRow ) @@ -173,7 +178,7 @@ benchmarkSpmvSynthetic( Benchmark & benchmark, const int & elementsPerRow ) { // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats) - benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, size, elementsPerRow ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow ); benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow ); diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h index 38fec0b64..f690a50c6 100644 --- a/src/Benchmarks/SpMV/spmv-legacy.h +++ b/src/Benchmarks/SpMV/spmv-legacy.h @@ -61,6 +61,22 @@ using SlicedEllpackSegments = Containers::Segments::SlicedEllpack< Device, Index template< typename Real, typename Device, typename Index > using SparseMatrix_SlicedEllpack = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, SlicedEllpackSegments >; +// Legacy formats +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Scalar = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRScalar >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Vector = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRVector >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Light = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Adaptive = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRAdaptive >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Stream = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRStream >; + // Get the name (with extension) of input matrix file std::string getMatrixFileName( const String& InputFileName ) { @@ -259,7 +275,11 @@ benchmarkSpmvSynthetic( Benchmark& benchmark, benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse ); #endif - benchmarkSpMV< Real, Matrices::Legacy::CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Stream >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); diff --git a/src/Python/pytnl/tnl/SparseMatrix.cpp b/src/Python/pytnl/tnl/SparseMatrix.cpp index 573b2790a..430715597 100644 --- a/src/Python/pytnl/tnl/SparseMatrix.cpp +++ b/src/Python/pytnl/tnl/SparseMatrix.cpp @@ -16,14 +16,15 @@ using SE_cuda = TNL::Matrices::Legacy::SlicedEllpack< double, TNL::Devices::Cuda void export_SparseMatrices( py::module & m ) { - export_Matrix< CSR_host >( m, "CSR" ); + // TODO: This stop working after adding template parameter KernelType to Legacy::CSR + //export_Matrix< CSR_host >( m, "CSR" ); export_Matrix< E_host >( m, "Ellpack" ); export_Matrix< SE_host >( m, "SlicedEllpack" ); - m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, E_host >); - m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, CSR_host >); - m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, SE_host >); - m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, CSR_host >); + //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, E_host >); + //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, CSR_host >); + //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, SE_host >); + //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, CSR_host >); m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, SE_host >); m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, E_host >); } diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index af4bc1d9d..f86420339 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -31,9 +31,9 @@ class CusparseCSR; template< typename Device > class CSRDeviceDependentCode; -enum CSRKernel { CSRScalar, CVSVector, CSRAdaptive, CSRStream }; +enum CSRKernel { CSRScalar, CSRVector, CSRLight, CSRAdaptive, CSRStream }; -template< typename Real, typename Device = Devices::Host, typename Index = int >//, CSRKernel KernelType = CSRAdaptive > +template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRAdaptive > class CSR : public Sparse< Real, Device, Index > { private: @@ -42,7 +42,7 @@ private: using Enabler = std::enable_if< ! std::is_same< Device2, Device >::value >; // friend class will be needed for templated assignment operators - template< typename Real2, typename Device2, typename Index2 > + template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2 > friend class CSR; public: @@ -62,8 +62,11 @@ public: typename _Index = Index > using Self = CSR< _Real, _Device, _Index >; + constexpr CSRKernel getSpMVKernelType() { return KernelType; }; //enum SPMVCudaKernel { scalar, vector, hybrid }; + using Sparse< Real, Device, Index >::getAllocatedElementsCount; + CSR(); static String getSerializationType(); @@ -87,8 +90,8 @@ public: __cuda_callable__ IndexType getNonZeroRowLengthFast( const IndexType row ) const; - template< typename Real2, typename Device2, typename Index2 > - void setLike( const CSR< Real2, Device2, Index2 >& matrix ); + template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2 > + void setLike( const CSR< Real2, Device2, Index2, KernelType2 >& matrix ); void reset(); @@ -167,13 +170,13 @@ public: OutVector& outVector ) const; // TODO: add const RealType& multiplicator = 1.0 ) - template< typename Real2, typename Index2 > - void addMatrix( const CSR< Real2, Device, Index2 >& matrix, + template< typename Real2, typename Index2, CSRKernel KernelType2 > + void addMatrix( const CSR< Real2, Device, Index2, KernelType2 >& matrix, const RealType& matrixMultiplicator = 1.0, const RealType& thisMatrixMultiplicator = 1.0 ); - template< typename Real2, typename Index2 > - void getTransposition( const CSR< Real2, Device, Index2 >& matrix, + template< typename Real2, typename Index2, CSRKernel KernelType2 > + void getTransposition( const CSR< Real2, Device, Index2, KernelType2 >& matrix, const RealType& matrixMultiplicator = 1.0 ); template< typename Vector1, typename Vector2 > @@ -186,9 +189,9 @@ public: CSR& operator=( const CSR& matrix ); // cross-device copy assignment - template< typename Real2, typename Device2, typename Index2, + template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2, typename = typename Enabler< Device2 >::type > - CSR& operator=( const CSR< Real2, Device2, Index2 >& matrix ); + CSR& operator=( const CSR< Real2, Device2, Index2, KernelType2 >& matrix ); void save( File& file ) const; diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 9fd872167..c4c4be89d 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -32,8 +32,9 @@ class tnlCusparseCSRWrapper {}; template< typename Real, typename Device, - typename Index > -CSR< Real, Device, Index >::CSR() + typename Index, + CSRKernel KernelType > +CSR< Real, Device, Index, KernelType >::CSR() : //spmvCudaKernel( hybrid ), cudaWarpSize( 32 ), //Cuda::getWarpSize() ) hybridModeSplit( 4 ) @@ -42,8 +43,9 @@ CSR< Real, Device, Index >::CSR() template< typename Real, typename Device, - typename Index > -String CSR< Real, Device, Index >::getSerializationType() + typename Index, + CSRKernel KernelType > +String CSR< Real, Device, Index, KernelType >::getSerializationType() { return String( "Matrices::CSR< ") + TNL::getType< Real>() + @@ -54,16 +56,18 @@ String CSR< Real, Device, Index >::getSerializationType() template< typename Real, typename Device, - typename Index > -String CSR< Real, Device, Index >::getSerializationTypeVirtual() const + typename Index, + CSRKernel KernelType > +String CSR< Real, Device, Index, KernelType >::getSerializationTypeVirtual() const { return this->getSerializationType(); } template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::setDimensions( const IndexType rows, + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::setDimensions( const IndexType rows, const IndexType columns ) { Sparse< Real, Device, Index >::setDimensions( rows, columns ); @@ -73,8 +77,9 @@ void CSR< Real, Device, Index >::setDimensions( const IndexType rows, template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) { TNL_ASSERT_GT( this->getRows(), 0, "cannot set row lengths of an empty matrix" ); TNL_ASSERT_GT( this->getColumns(), 0, "cannot set row lengths of an empty matrix" ); @@ -103,8 +108,9 @@ void CSR< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLeng template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::getCompressedRowLengths( CompressedRowLengthsVectorView rowLengths ) const + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::getCompressedRowLengths( CompressedRowLengthsVectorView rowLengths ) const { TNL_ASSERT_EQ( rowLengths.getSize(), this->getRows(), "invalid size of the rowLengths vector" ); for( IndexType row = 0; row < this->getRows(); row++ ) @@ -113,25 +119,28 @@ void CSR< Real, Device, Index >::getCompressedRowLengths( CompressedRowLengthsVe template< typename Real, typename Device, - typename Index > -Index CSR< Real, Device, Index >::getRowLength( const IndexType row ) const + typename Index, + CSRKernel KernelType > +Index CSR< Real, Device, Index, KernelType >::getRowLength( const IndexType row ) const { return this->rowPointers.getElement( row + 1 ) - this->rowPointers.getElement( row ); } template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -Index CSR< Real, Device, Index >::getRowLengthFast( const IndexType row ) const +Index CSR< Real, Device, Index, KernelType >::getRowLengthFast( const IndexType row ) const { return this->rowPointers[ row + 1 ] - this->rowPointers[ row ]; } template< typename Real, typename Device, - typename Index > -Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) const + typename Index, + CSRKernel KernelType > +Index CSR< Real, Device, Index, KernelType >::getNonZeroRowLength( const IndexType row ) const { // TODO: Fix/Implement TNL_ASSERT( false, std::cerr << "TODO: Fix/Implement" ); @@ -140,9 +149,10 @@ Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) con template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -Index CSR< Real, Device, Index >::getNonZeroRowLengthFast( const IndexType row ) const +Index CSR< Real, Device, Index, KernelType >::getNonZeroRowLengthFast( const IndexType row ) const { ConstMatrixRow matrixRow = this->getRow( row ); return matrixRow.getNonZeroElementsCount(); @@ -150,11 +160,13 @@ Index CSR< Real, Device, Index >::getNonZeroRowLengthFast( const IndexType row ) template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename Real2, typename Device2, - typename Index2 > -void CSR< Real, Device, Index >::setLike( const CSR< Real2, Device2, Index2 >& matrix ) + typename Index2, + CSRKernel KernelType2 > +void CSR< Real, Device, Index, KernelType >::setLike( const CSR< Real2, Device2, Index2, KernelType2 >& matrix ) { Sparse< Real, Device, Index >::setLike( matrix ); this->rowPointers.setLike( matrix.rowPointers ); @@ -162,8 +174,9 @@ void CSR< Real, Device, Index >::setLike( const CSR< Real2, Device2, Index2 >& m template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::reset() + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::reset() { Sparse< Real, Device, Index >::reset(); this->rowPointers.reset(); @@ -171,9 +184,10 @@ void CSR< Real, Device, Index >::reset() template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -bool CSR< Real, Device, Index >::setElementFast( const IndexType row, +bool CSR< Real, Device, Index, KernelType >::setElementFast( const IndexType row, const IndexType column, const Real& value ) { @@ -182,8 +196,9 @@ bool CSR< Real, Device, Index >::setElementFast( const IndexType row, template< typename Real, typename Device, - typename Index > -bool CSR< Real, Device, Index >::setElement( const IndexType row, + typename Index, + CSRKernel KernelType > +bool CSR< Real, Device, Index, KernelType >::setElement( const IndexType row, const IndexType column, const Real& value ) { @@ -193,9 +208,10 @@ bool CSR< Real, Device, Index >::setElement( const IndexType row, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -bool CSR< Real, Device, Index >::addElementFast( const IndexType row, +bool CSR< Real, Device, Index, KernelType >::addElementFast( const IndexType row, const IndexType column, const RealType& value, const RealType& thisElementMultiplicator ) @@ -237,8 +253,9 @@ bool CSR< Real, Device, Index >::addElementFast( const IndexType row, template< typename Real, typename Device, - typename Index > -bool CSR< Real, Device, Index >::addElement( const IndexType row, + typename Index, + CSRKernel KernelType > +bool CSR< Real, Device, Index, KernelType >::addElement( const IndexType row, const IndexType column, const RealType& value, const RealType& thisElementMultiplicator ) @@ -287,9 +304,10 @@ bool CSR< Real, Device, Index >::addElement( const IndexType row, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -bool CSR< Real, Device, Index > :: setRowFast( const IndexType row, +bool CSR< Real, Device, Index, KernelType > :: setRowFast( const IndexType row, const IndexType* columnIndexes, const RealType* values, const IndexType elements ) @@ -313,8 +331,9 @@ bool CSR< Real, Device, Index > :: setRowFast( const IndexType row, template< typename Real, typename Device, - typename Index > -bool CSR< Real, Device, Index > :: setRow( const IndexType row, + typename Index, + CSRKernel KernelType > +bool CSR< Real, Device, Index, KernelType > :: setRow( const IndexType row, const IndexType* columnIndexes, const RealType* values, const IndexType elements ) @@ -337,9 +356,10 @@ bool CSR< Real, Device, Index > :: setRow( const IndexType row, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -bool CSR< Real, Device, Index > :: addRowFast( const IndexType row, +bool CSR< Real, Device, Index, KernelType > :: addRowFast( const IndexType row, const IndexType* columns, const RealType* values, const IndexType numberOfElements, @@ -351,8 +371,9 @@ bool CSR< Real, Device, Index > :: addRowFast( const IndexType row, template< typename Real, typename Device, - typename Index > -bool CSR< Real, Device, Index > :: addRow( const IndexType row, + typename Index, + CSRKernel KernelType > +bool CSR< Real, Device, Index, KernelType > :: addRow( const IndexType row, const IndexType* columns, const RealType* values, const IndexType numberOfElements, @@ -363,9 +384,10 @@ bool CSR< Real, Device, Index > :: addRow( const IndexType row, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -Real CSR< Real, Device, Index >::getElementFast( const IndexType row, +Real CSR< Real, Device, Index, KernelType >::getElementFast( const IndexType row, const IndexType column ) const { IndexType elementPtr = this->rowPointers[ row ]; @@ -382,8 +404,9 @@ Real CSR< Real, Device, Index >::getElementFast( const IndexType row, template< typename Real, typename Device, - typename Index > -Real CSR< Real, Device, Index >::getElement( const IndexType row, + typename Index, + CSRKernel KernelType > +Real CSR< Real, Device, Index, KernelType >::getElement( const IndexType row, const IndexType column ) const { IndexType elementPtr = this->rowPointers.getElement( row ); @@ -400,9 +423,10 @@ Real CSR< Real, Device, Index >::getElement( const IndexType row, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -void CSR< Real, Device, Index >::getRowFast( const IndexType row, +void CSR< Real, Device, Index, KernelType >::getRowFast( const IndexType row, IndexType* columns, RealType* values ) const { @@ -418,10 +442,11 @@ void CSR< Real, Device, Index >::getRowFast( const IndexType row, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -typename CSR< Real, Device, Index >::MatrixRow -CSR< Real, Device, Index >:: +typename CSR< Real, Device, Index, KernelType >::MatrixRow +CSR< Real, Device, Index, KernelType >:: getRow( const IndexType rowIndex ) { const IndexType rowOffset = this->rowPointers[ rowIndex ]; @@ -434,10 +459,11 @@ getRow( const IndexType rowIndex ) template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -typename CSR< Real, Device, Index >::ConstMatrixRow -CSR< Real, Device, Index >:: +typename CSR< Real, Device, Index, KernelType >::ConstMatrixRow +CSR< Real, Device, Index, KernelType >:: getRow( const IndexType rowIndex ) const { const IndexType rowOffset = this->rowPointers[ rowIndex ]; @@ -450,10 +476,11 @@ getRow( const IndexType rowIndex ) const template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename Vector > __cuda_callable__ -typename Vector::RealType CSR< Real, Device, Index >::rowVectorProduct( const IndexType row, +typename Vector::RealType CSR< Real, Device, Index, KernelType >::rowVectorProduct( const IndexType row, const Vector& vector ) const { Real result = 0.0; @@ -468,9 +495,10 @@ typename Vector::RealType CSR< Real, Device, Index >::rowVectorProduct( const In template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename InVector, typename OutVector > -void CSR< Real, Device, Index >::vectorProduct( const InVector& inVector, +void CSR< Real, Device, Index, KernelType >::vectorProduct( const InVector& inVector, OutVector& outVector ) const { DeviceDependentCode::vectorProduct( *this, inVector, outVector ); @@ -478,10 +506,12 @@ void CSR< Real, Device, Index >::vectorProduct( const InVector& inVector, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename Real2, - typename Index2 > -void CSR< Real, Device, Index >::addMatrix( const CSR< Real2, Device, Index2 >& matrix, + typename Index2, + CSRKernel KernelType2 > +void CSR< Real, Device, Index, KernelType >::addMatrix( const CSR< Real2, Device, Index2, KernelType2 >& matrix, const RealType& matrixMultiplicator, const RealType& thisMatrixMultiplicator ) { @@ -491,10 +521,12 @@ void CSR< Real, Device, Index >::addMatrix( const CSR< Real2, Device, Index2 >& template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename Real2, - typename Index2 > -void CSR< Real, Device, Index >::getTransposition( const CSR< Real2, Device, Index2 >& matrix, + typename Index2, + CSRKernel KernelType2 > +void CSR< Real, Device, Index, KernelType >::getTransposition( const CSR< Real2, Device, Index2, KernelType2 >& matrix, const RealType& matrixMultiplicator ) { throw Exceptions::NotImplementedError( "CSR::getTransposition is not implemented." ); @@ -503,9 +535,10 @@ void CSR< Real, Device, Index >::getTransposition( const CSR< Real2, Device, Ind template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename Vector1, typename Vector2 > -bool CSR< Real, Device, Index >::performSORIteration( const Vector1& b, +bool CSR< Real, Device, Index, KernelType >::performSORIteration( const Vector1& b, const IndexType row, Vector2& x, const RealType& omega ) const @@ -541,9 +574,10 @@ bool CSR< Real, Device, Index >::performSORIteration( const Vector1& b, // copy assignment template< typename Real, typename Device, - typename Index > -CSR< Real, Device, Index >& -CSR< Real, Device, Index >::operator=( const CSR& matrix ) + typename Index, + CSRKernel KernelType > +CSR< Real, Device, Index, KernelType >& +CSR< Real, Device, Index, KernelType >::operator=( const CSR& matrix ) { this->setLike( matrix ); this->values = matrix.values; @@ -555,10 +589,11 @@ CSR< Real, Device, Index >::operator=( const CSR& matrix ) // cross-device copy assignment template< typename Real, typename Device, - typename Index > - template< typename Real2, typename Device2, typename Index2, typename > -CSR< Real, Device, Index >& -CSR< Real, Device, Index >::operator=( const CSR< Real2, Device2, Index2 >& matrix ) + typename Index, + CSRKernel KernelType > + template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2, typename > +CSR< Real, Device, Index, KernelType >& +CSR< Real, Device, Index, KernelType >::operator=( const CSR< Real2, Device2, Index2, KernelType2 >& matrix ) { this->setLike( matrix ); this->values = matrix.values; @@ -570,8 +605,9 @@ CSR< Real, Device, Index >::operator=( const CSR< Real2, Device2, Index2 >& matr template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::save( File& file ) const + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::save( File& file ) const { Sparse< Real, Device, Index >::save( file ); file << this->rowPointers; @@ -579,8 +615,9 @@ void CSR< Real, Device, Index >::save( File& file ) const template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::load( File& file ) + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::load( File& file ) { Sparse< Real, Device, Index >::load( file ); file >> this->rowPointers; @@ -588,24 +625,27 @@ void CSR< Real, Device, Index >::load( File& file ) template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::save( const String& fileName ) const + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::save( const String& fileName ) const { Object::save( fileName ); } template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::load( const String& fileName ) + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::load( const String& fileName ) { Object::load( fileName ); } template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::print( std::ostream& str ) const + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::print( std::ostream& str ) const { for( IndexType row = 0; row < this->getRows(); row++ ) { @@ -624,7 +664,7 @@ void CSR< Real, Device, Index >::print( std::ostream& str ) const /*template< typename Real, typename Device, typename Index > -void CSR< Real, Device, Index >::setCudaKernelType( const SPMVCudaKernel kernel ) +void CSR< Real, Device, Index, KernelType >::setCudaKernelType( const SPMVCudaKernel kernel ) { this->spmvCudaKernel = kernel; } @@ -633,40 +673,44 @@ template< typename Real, typename Device, typename Index > __cuda_callable__ -typename CSR< Real, Device, Index >::SPMVCudaKernel CSR< Real, Device, Index >::getCudaKernelType() const +typename CSR< Real, Device, Index, KernelType >::SPMVCudaKernel CSR< Real, Device, Index, KernelType >::getCudaKernelType() const { return this->spmvCudaKernel; }*/ template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::setCudaWarpSize( const int warpSize ) + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::setCudaWarpSize( const int warpSize ) { this->cudaWarpSize = warpSize; } template< typename Real, typename Device, - typename Index > -int CSR< Real, Device, Index >::getCudaWarpSize() const + typename Index, + CSRKernel KernelType > +int CSR< Real, Device, Index, KernelType >::getCudaWarpSize() const { return this->cudaWarpSize; } template< typename Real, typename Device, - typename Index > -void CSR< Real, Device, Index >::setHybridModeSplit( const IndexType hybridModeSplit ) + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::setHybridModeSplit( const IndexType hybridModeSplit ) { this->hybridModeSplit = hybridModeSplit; } template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > __cuda_callable__ -Index CSR< Real, Device, Index >::getHybridModeSplit() const +Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const { return this->hybridModeSplit; } @@ -675,12 +719,13 @@ Index CSR< Real, Device, Index >::getHybridModeSplit() const template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename InVector, typename OutVector, int warpSize > __device__ -void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector, +void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector& inVector, OutVector& outVector, int gridIdx) const { @@ -764,12 +809,13 @@ void spmvCSRVectorHelper( const InVector& inVector, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename InVector, typename OutVector, int warpSize > __device__ -void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector, +void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& inVector, OutVector& outVector, int gridIdx, int *blocks, @@ -851,12 +897,13 @@ void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename InVector, typename OutVector, int warpSize > __device__ -void CSR< Real, Device, Index >::spmvCudaVectorized( const InVector& inVector, +void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector, OutVector& outVector, const IndexType warpStart, const IndexType warpEnd, @@ -893,17 +940,36 @@ void CSR< Real, Device, Index >::spmvCudaVectorized( const InVector& inVector, template< typename Real, typename Device, - typename Index > + typename Index, + CSRKernel KernelType > template< typename InVector, typename OutVector, int warpSize > __device__ -void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector, +void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& inVector, OutVector& outVector, int gridIdx, int *blocks, size_t size ) const { - spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); + switch( KernelType ) + { + case CSRScalar: + // TODO: + break; + case CSRVector: + spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx ); + break; + case CSRLight: + spmvCSRLight< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); + break; + case CSRAdaptive: + spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); + break; + case CSRStream: + // TODO: + break; + } + return; IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; @@ -912,7 +978,7 @@ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector, const IndexType inWarpIdx = globalIdx % warpSize; if( this->getCudaKernelType() == vector ) - spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx ); + /**** * Hybrid mode @@ -949,14 +1015,15 @@ class CSRDeviceDependentCode< Devices::Host > template< typename Real, typename Index, + CSRKernel KernelType, typename InVector, typename OutVector > - static void vectorProduct( const CSR< Real, Device, Index >& matrix, + static void vectorProduct( const CSR< Real, Device, Index, KernelType >& matrix, const InVector& inVector, OutVector& outVector ) { const Index rows = matrix.getRows(); - const CSR< Real, Device, Index >* matrixPtr = &matrix; + const CSR< Real, Device, Index, KernelType >* matrixPtr = &matrix; const InVector* inVectorPtr = &inVector; OutVector* outVectorPtr = &outVector; #ifdef HAVE_OPENMP @@ -1174,9 +1241,10 @@ class CSRDeviceDependentCode< Devices::Cuda > template< typename Real, typename Index, + CSRKernel KernelType, typename InVector, typename OutVector > - static void vectorProduct( const CSR< Real, Device, Index >& matrix, + static void vectorProduct( const CSR< Real, Device, Index, KernelType >& matrix, const InVector& inVector, OutVector& outVector ) { diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h index 73d77f31d..ed999c9f2 100644 --- a/src/TNL/Matrices/MatrixInfo.h +++ b/src/TNL/Matrices/MatrixInfo.h @@ -88,11 +88,43 @@ struct MatrixInfo< Legacy::BiEllpack< Real, Device, Index > > }; template< typename Real, typename Device, typename Index > -struct MatrixInfo< Legacy::CSR< Real, Device, Index > > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRScalar > > { static String getDensity() { return String( "sparse" ); }; - static String getFormat() { return "CSR Legacy"; }; + static String getFormat() { return "CSR Legacy Scalar"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRVector> > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Vector"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Light"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRAdaptive > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Adaptive"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRStream > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Stream"; }; }; template< typename Real, typename Device, typename Index > -- GitLab From 2641210dac5db0a06084258e476f61a15fab432c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 31 Mar 2020 15:44:45 +0200 Subject: [PATCH 7/8] Fixing CUDA code in Legacy CSR. --- src/TNL/Matrices/Legacy/CSR.h | 8 ++--- src/TNL/Matrices/Legacy/CSR_impl.h | 51 ++++++++++++++++-------------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index f86420339..46e616d16 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -31,9 +31,9 @@ class CusparseCSR; template< typename Device > class CSRDeviceDependentCode; -enum CSRKernel { CSRScalar, CSRVector, CSRLight, CSRAdaptive, CSRStream }; +enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream }; -template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRAdaptive > +template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar > class CSR : public Sparse< Real, Device, Index > { private: @@ -225,9 +225,7 @@ public: __device__ void spmvCudaVectorized( const InVector& inVector, OutVector& outVector, - const IndexType warpStart, - const IndexType warpEnd, - const IndexType inWarpIdx ) const; + const IndexType gridIdx ) const; template< typename InVector, typename OutVector, diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index c4c4be89d..12a9eb554 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -905,10 +905,13 @@ template< typename Real, __device__ void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector, OutVector& outVector, - const IndexType warpStart, - const IndexType warpEnd, - const IndexType inWarpIdx ) const + const IndexType gridIdx ) const { + IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const IndexType warpStart = warpSize * ( globalIdx / warpSize ); + const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); + const IndexType inWarpIdx = globalIdx % warpSize; + volatile Real* aux = Cuda::getSharedMemory< Real >(); for( IndexType row = warpStart; row < warpEnd; row++ ) { @@ -957,10 +960,10 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& // TODO: break; case CSRVector: - spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx ); + spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); break; case CSRLight: - spmvCSRLight< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); + spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); break; case CSRAdaptive: spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); @@ -970,9 +973,8 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& break; } - return; - IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + /*IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const IndexType warpStart = warpSize * ( globalIdx / warpSize ); const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); const IndexType inWarpIdx = globalIdx % warpSize; @@ -980,9 +982,9 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& if( this->getCudaKernelType() == vector ) - /**** - * Hybrid mode - */ + ///// + // Hybrid mode + // const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x; const IndexType lastRow = min( this->getRows(), firstRow + blockDim. x ); const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) / @@ -990,19 +992,19 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& if( nonzerosPerRow < this->getHybridModeSplit() ) { - /**** - * Use the scalar mode - */ + ///// + // Use the scalar mode + // if( globalIdx < this->getRows() ) outVector[ globalIdx ] = this->rowVectorProduct( globalIdx, inVector ); } else { - /**** - * Use the vector mode - */ + //// + // Use the vector mode + // spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx ); - } + }*/ } #endif @@ -1038,10 +1040,11 @@ class CSRDeviceDependentCode< Devices::Host > #ifdef HAVE_CUDA template< typename Real, typename Index, + CSRKernel KernelType, typename InVector, typename OutVector, int warpSize > -__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index >* matrix, +__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix, const InVector* inVector, OutVector* outVector, int gridIdx, @@ -1050,13 +1053,12 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde typedef CSR< Real, Devices::Cuda, Index > Matrix; static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( matrix->getCudaKernelType() == Matrix::scalar ) + if( KernelType == CSRScalar ) { if( rowIdx < matrix->getRows() ) ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); } - if( matrix->getCudaKernelType() == Matrix::vector || - matrix->getCudaKernelType() == Matrix::hybrid ) + else { matrix->template vectorProductCuda< InVector, OutVector, warpSize > ( *inVector, *outVector, gridIdx, blocks, size ); @@ -1066,16 +1068,17 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde template< typename Real, typename Index, + CSRKernel KernelType, typename InVector, typename OutVector > -void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, +void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix, const InVector& inVector, OutVector& outVector, int *blocks, size_t size ) { #ifdef HAVE_CUDA - typedef CSR< Real, Devices::Cuda, Index > Matrix; + typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix; typedef typename Matrix::IndexType IndexType; Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); @@ -1096,7 +1099,7 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, // const int threads = cudaBlockSize.x; if( matrix.getCudaWarpSize() == 32 ) { // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads); - CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 > + CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 > <<< 2, 1024 >>> ( kernel_this, kernel_inVector, -- GitLab From c9b316aa31ef093fa10f6b6df2e18e734c239a2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 31 Mar 2020 16:46:19 +0200 Subject: [PATCH 8/8] Fixes for CI. --- src/TNL/Matrices/Legacy/CSR_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 12a9eb554..bb3dfae79 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -1266,7 +1266,7 @@ class CSRDeviceDependentCode< Devices::Cuda > std::vector inBlock; inBlock.push_back(0); size_t sum = 0; - size_t i; + Index i; int prev_i = 0; for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { size_t elements = matrix.getRowPointers().getElement(i) - -- GitLab