Loading src/TNL/Matrices/Legacy/CSR.h +2 −2 Original line number Diff line number Diff line Loading @@ -230,7 +230,7 @@ public: __device__ void vectorProductCuda( const InVector& inVector, OutVector& outVector, int gridIdx, unsigned *blocks, size_t size ) const; int gridIdx, int *blocks, size_t size ) const; template< typename InVector, typename OutVector, Loading @@ -247,7 +247,7 @@ public: void spmvCSRAdaptive( const InVector& inVector, OutVector& outVector, int gridIdx, unsigned *blocks, int *blocks, size_t blocks_size) const; #endif Loading src/TNL/Matrices/Legacy/CSR_impl.h +161 −116 Original line number Diff line number Diff line Loading @@ -732,6 +732,35 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector, } } /* template< typename Real, typename Device, typename Index, typename InVector, int warpSize > __global__ void spmvCSRVectorHelper( const InVector& inVector, Real *out, size_t from, size_t to, size_t perWarp) { const size_t index = blockIdx.x * blockDim.x + threadIdx.x; const size_t warpID = index / warpSize; const size_t laneID = index % warpSize; const size_t minID = from + warpID * perWarp; size_t maxID = from + (warpID + 1) * perWarp; if (minID >= to) return; if (maxID >= to ) maxID = to; Real result = 0.0; for (IndexType i = minID + laneID; i < maxID; i += warpSize) { const IndexType column = this->columnIndexes[i]; if (column >= this->getColumns()) continue; result += this->values[i] * inVector[column]; } atomicAdd(out, result); } */ template< typename Real, typename Device, Loading @@ -743,59 +772,58 @@ __device__ void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector, OutVector& outVector, int gridIdx, unsigned *blocks, int *blocks, size_t blocks_size) const { constexpr IndexType SHARED = 1024; /* TODO: delete */ /* Configuration ---------------------------------------------------*/ constexpr size_t SHARED = 49152/sizeof(float); constexpr size_t SHARED_PER_WARP = SHARED / warpSize; constexpr size_t MAX_PER_WARP = 65536; constexpr size_t ELEMENTS_PER_WARP = 1024; constexpr size_t THREADS_PER_BLOCK = 1024; constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize; //-------------------------------------------------------------------- const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; __shared__ unsigned blockCnt; __shared__ float shared_res[SHARED]; float result = 0.0; if (index == 0) blockCnt = 0; // Init shared variable __syncthreads(); const IndexType laneID = index % warpSize; IndexType blockIdx; while (true) { /* Get row number */ if (laneID == 0) blockIdx = atomicAdd(&blockCnt, 1); /* Propagate row number in warp */ blockIdx = __shfl_sync((unsigned)(warpSize - 1), blockIdx, 0); IndexType blockIdx = index / warpSize; __shared__ float shared_res[SHARED]; Real result = 0.0; if (blockIdx >= blocks_size - 1) return; /* Number of elements */ const IndexType minRow = blocks[blockIdx]; const IndexType maxRow = blocks[blockIdx + 1]; const IndexType minID = this->rowPointers[minRow]; const IndexType maxID = this->rowPointers[maxRow]; const IndexType elements = maxID - minID; result = 0.0; /* rows per block more than 1 */ if ((maxRow - minRow) > 1) { /////////////////////////////////////* CSR STREAM *////////////// /* Copy and calculate elements from global to shared memory, coalesced */ const IndexType offset = threadIdx.x / warpSize * SHARED_PER_WARP; for (IndexType i = laneID; i < elements; i += warpSize) { const IndexType elementIdx = i + minID; const IndexType column = this->columnIndexes[elementIdx]; if (column >= this->getColumns()) break; shared_res[i] = this->values[elementIdx] * inVector[column]; continue; shared_res[i + offset] = this->values[elementIdx] * inVector[column]; } const IndexType row = minRow + laneID; if (row >= maxRow - 1) continue; if (row >= maxRow) return; /* Calculate result */ const IndexType to = this->rowPointers[row + 1] - minID; for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) { result += shared_res[i]; result += shared_res[i + offset]; } outVector[row] = result; // Write result } else { } else if (elements <= MAX_PER_WARP) { /////////////////////////////////////* CSR VECTOR *////////////// for (IndexType i = minID + laneID; i < maxID; i += warpSize) { const IndexType column = this->columnIndexes[i]; IndexType column = this->columnIndexes[i]; if (column >= this->getColumns()) break; result += this->values[i] * inVector[column]; } /* Reduction */ Loading @@ -805,7 +833,18 @@ void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector, result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); if (laneID == 0) outVector[minRow] = result; // Write result } } else { /////////////////////////////////////* CSR VECTOR LONG *////////////// const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1; const size_t blocks = warps <= WARPS_PER_BLOCK ? 1 : warps / WARPS_PER_BLOCK + 1; const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize; // spmvCSRVectorHelper<InVector, warpSize> <<<blocks, threads_per_block>>>( // inVector, // &outVector[minRow], // (size_t)(minID + ELEMENTS_PER_WARP), // (size_t)maxID, // (size_t)ELEMENTS_PER_WARP // ); } } Loading Loading @@ -862,7 +901,7 @@ __device__ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector, OutVector& outVector, int gridIdx, unsigned *blocks, size_t size ) const int *blocks, size_t size ) const { spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); return; Loading Loading @@ -939,7 +978,7 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde const InVector* inVector, OutVector* outVector, int gridIdx, unsigned *blocks, size_t size) int *blocks, size_t size) { typedef CSR< Real, Devices::Cuda, Index > Matrix; static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); Loading @@ -965,7 +1004,7 @@ template< typename Real, void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, const InVector& inVector, OutVector& outVector, unsigned *blocks, int *blocks, size_t size ) { #ifdef HAVE_CUDA Loading @@ -974,7 +1013,10 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); OutVector* kernel_outVector = Cuda::passToDevice( outVector ); unsigned *kernelBlocks = Cuda::passToDevice( *blocks ); int *kernelBlocks; cudaMalloc((void **)&kernelBlocks, sizeof(int) * size); cudaMemcpy(kernelBlocks, blocks, size * sizeof(int), cudaMemcpyHostToDevice); TNL_CHECK_CUDA_DEVICE; dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); Loading @@ -984,48 +1026,51 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); const int sharedMemory = cudaBlockSize.x * sizeof( Real ); if( matrix.getCudaWarpSize() == 32 ) // const int threads = cudaBlockSize.x; if( matrix.getCudaWarpSize() == 32 ) { // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads); CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 16 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 8 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 4 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 2 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 1 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> <<< 2, 1024 >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx, kernelBlocks, size ); } // if( matrix.getCudaWarpSize() == 16 ) // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 > // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> // ( kernel_this, // kernel_inVector, // kernel_outVector, // gridIdx, kernelBlocks, size ); // if( matrix.getCudaWarpSize() == 8 ) // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 > // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> // ( kernel_this, // kernel_inVector, // kernel_outVector, // gridIdx, kernelBlocks, size ); // if( matrix.getCudaWarpSize() == 4 ) // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 > // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> // ( kernel_this, // kernel_inVector, // kernel_outVector, // gridIdx, kernelBlocks, size ); // if( matrix.getCudaWarpSize() == 2 ) // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 > // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> // ( kernel_this, // kernel_inVector, // kernel_outVector, // gridIdx, kernelBlocks, size ); // if( matrix.getCudaWarpSize() == 1 ) // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 > // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> // ( kernel_this, // kernel_inVector, // kernel_outVector, // gridIdx, kernelBlocks, size ); } TNL_CHECK_CUDA_DEVICE; Loading Loading @@ -1145,32 +1190,32 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else std::vector<unsigned> inBlock; constexpr int SHARED = 49152/sizeof(float); constexpr int SHARED_PER_WARP = SHARED / 32; std::vector<int> inBlock; inBlock.push_back(0); unsigned sum = 0; for (size_t i = 1; i < matrix.getRowPointers().getSize(); ++i) { printf("PTR %d\n", (int)matrix.getRowPointers().getElement(i)); } for (size_t i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { size_t elements = matrix.getRowPointers().getElement(i + 1) - matrix.getRowPointers().getElement(i); if (elements > 1024) { if (sum == 0) { size_t sum = 0; size_t i; int prev_i = 0; for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { size_t elements = matrix.getRowPointers().getElement(i) - matrix.getRowPointers().getElement(i - 1); sum += elements; if (sum > SHARED_PER_WARP) { if (i - prev_i == 1) { inBlock.push_back(i); } else { } else { inBlock.push_back(i - 1); inBlock.push_back(i); --i; } sum = 0; } sum += elements; if (sum <= 1024) prev_i = i; continue; else { inBlock.push_back(i - 1); sum = elements; } if (i - prev_i == 32) { inBlock.push_back(i); prev_i = i; sum = 0; } } inBlock.push_back(matrix.getRowPointers().getSize() - 1); Loading src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp +66 −74 Original line number Diff line number Diff line Loading @@ -1259,9 +1259,11 @@ void test_VectorProductLarger() Matrix m; m.reset(); m.setDimensions( m_rows, m_cols ); typename Matrix::CompressedRowLengthsVector rowLengths; rowLengths.setSize( m_rows ); rowLengths.setValue( 20 ); typename Matrix::CompressedRowLengthsVector rowLengths( {11, 2, 4, 0, 6, 4, 1, 2, 20, 18, 6, 20, 10, 0, 20, 10, 2, 20, 10, 12} ); // rowLengths.setSize( m_rows ); // rowLengths.setValue( 20 ); m.setCompressedRowLengths( rowLengths ); /* 0 */ Loading Loading @@ -1389,93 +1391,83 @@ void test_VectorProductGiant() using RealType = typename Matrix::RealType; using DeviceType = typename Matrix::DeviceType; using IndexType = typename Matrix::IndexType; const IndexType N = 200; // Should be a multiple of 10 /* * Sets up the following NxN sparse matrix: * * / 1 0 1 0 ... 1 0 \ * | 0 0 0 0 ... 0 0 | * | 0 2 0 2 ... 0 2 | * | 0 0 0 0 ... 0 0 | * | 3 -3 3 -3 ... 3 -3 | * | 0 0 0 0 ... 0 0 | * | 5 5 5 5 ... 5 5 | * | 0 0 0 0 ... 0 0 | * | 0 0 0 0 ... 0 0 | * | 2 0 0 0 ... 0 0 | <- the only 2 in the row * * .................. */ const IndexType m_rows = N; const IndexType m_cols = N; IndexType m_rows = 100; IndexType m_cols = 100; Matrix m; m.reset(); m.setDimensions( m_rows, m_cols ); typename Matrix::CompressedRowLengthsVector rowLengths; rowLengths.setSize( m_rows ); rowLengths.setValue( m_cols ); typename Matrix::CompressedRowLengthsVector rowLengths( { 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100 } ); m.setCompressedRowLengths( rowLengths ); for ( IndexType row = 0; row < m_rows; ++row ) { if (row % 10 == 0) { for ( IndexType col = 0; col < m_cols; ++col ) { if (col % 2) m.setElement( row, col, 1 ); } } else if (row % 10 == 2) { for ( IndexType col = 0; col < m_cols; ++col ) { if (col % 2 == 1) m.setElement( row, col, 2 ); } } else if (row % 10 == 4) { for ( IndexType col = 0; col < m_cols; ++col ) { if (col % 2) m.setElement( row, col, 3 ); else m.setElement( row, col, -3 ); } } else if (row % 10 == 6) { for ( IndexType col = 0; col < m_cols; ++col ) { m.setElement( row, col, 5 ); } } else if (row % 10 == 9) { m.setElement( row, 0, 2 ); } } for (int i = 0; i < m_rows; ++i) for (int j = 0; j < m_cols; ++j) m.setElement( i, j, i + 1 ); using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; VectorType inVector; inVector.setSize( N ); for( IndexType i = 0; i < inVector.getSize(); i++ ) inVector.setElement( i, 2 ); inVector.setSize( m_rows ); for( IndexType i = 0; i < inVector.getSize(); ++i ) inVector.setElement( i, 1 ); VectorType outVector; outVector.setSize( N ); for( IndexType j = 0; j < outVector.getSize(); j++ ) outVector.setElement( j, 0 ); outVector.setSize( m_rows ); for( IndexType i = 0; i < outVector.getSize(); ++i ) outVector.setElement( i, 0 ); m.vectorProduct( inVector, outVector); for ( IndexType i = 0; i < outVector.getSize(); ++i ) { if ( i % 10 == 0 ) EXPECT_EQ( outVector.getElement( i ), 200 /* (N) */ ); else if ( i % 10 == 2 ) EXPECT_EQ( outVector.getElement( i ), 400 /* (2 * N) */ ); else if ( i % 10 == 6 ) EXPECT_EQ( outVector.getElement( i ), 2000 /* (10 * N) */ ); else if ( i % 10 == 9 ) EXPECT_EQ( outVector.getElement( i ), 4 ); else EXPECT_EQ( outVector.getElement( i ), 0 ); for (int i = 0; i < m_rows; ++i) EXPECT_EQ( outVector.getElement( i ), (i + 1) * 100 ); //----------------------------------------------------- m_rows = 2; m_cols = 1000; m.reset(); m.setDimensions( m_rows, m_cols ); typename Matrix::CompressedRowLengthsVector rowLengths2( { 1000, 1000 } ); m.setCompressedRowLengths( rowLengths2 ); for (int i = 0; i < m_rows; ++i) for (int j = 0; j < m_cols; ++j) m.setElement( i, j, i + 1 ); VectorType inVector2; inVector2.setSize( m_cols ); for( IndexType i = 0; i < inVector2.getSize(); i++ ) inVector2.setElement( i, 1 ); VectorType outVector2; outVector2.setSize( m_rows ); for( IndexType i = 0; i < outVector2.getSize(); ++i ) outVector2.setElement( i, 0 ); m.vectorProduct( inVector2, outVector2); for (int i = 0; i < m_rows; ++i) EXPECT_EQ( outVector2.getElement( i ), (i + 1) * 1000 ); } template< typename Matrix > Loading Loading
src/TNL/Matrices/Legacy/CSR.h +2 −2 Original line number Diff line number Diff line Loading @@ -230,7 +230,7 @@ public: __device__ void vectorProductCuda( const InVector& inVector, OutVector& outVector, int gridIdx, unsigned *blocks, size_t size ) const; int gridIdx, int *blocks, size_t size ) const; template< typename InVector, typename OutVector, Loading @@ -247,7 +247,7 @@ public: void spmvCSRAdaptive( const InVector& inVector, OutVector& outVector, int gridIdx, unsigned *blocks, int *blocks, size_t blocks_size) const; #endif Loading
src/TNL/Matrices/Legacy/CSR_impl.h +161 −116 Original line number Diff line number Diff line Loading @@ -732,6 +732,35 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector, } } /* template< typename Real, typename Device, typename Index, typename InVector, int warpSize > __global__ void spmvCSRVectorHelper( const InVector& inVector, Real *out, size_t from, size_t to, size_t perWarp) { const size_t index = blockIdx.x * blockDim.x + threadIdx.x; const size_t warpID = index / warpSize; const size_t laneID = index % warpSize; const size_t minID = from + warpID * perWarp; size_t maxID = from + (warpID + 1) * perWarp; if (minID >= to) return; if (maxID >= to ) maxID = to; Real result = 0.0; for (IndexType i = minID + laneID; i < maxID; i += warpSize) { const IndexType column = this->columnIndexes[i]; if (column >= this->getColumns()) continue; result += this->values[i] * inVector[column]; } atomicAdd(out, result); } */ template< typename Real, typename Device, Loading @@ -743,59 +772,58 @@ __device__ void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector, OutVector& outVector, int gridIdx, unsigned *blocks, int *blocks, size_t blocks_size) const { constexpr IndexType SHARED = 1024; /* TODO: delete */ /* Configuration ---------------------------------------------------*/ constexpr size_t SHARED = 49152/sizeof(float); constexpr size_t SHARED_PER_WARP = SHARED / warpSize; constexpr size_t MAX_PER_WARP = 65536; constexpr size_t ELEMENTS_PER_WARP = 1024; constexpr size_t THREADS_PER_BLOCK = 1024; constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize; //-------------------------------------------------------------------- const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; __shared__ unsigned blockCnt; __shared__ float shared_res[SHARED]; float result = 0.0; if (index == 0) blockCnt = 0; // Init shared variable __syncthreads(); const IndexType laneID = index % warpSize; IndexType blockIdx; while (true) { /* Get row number */ if (laneID == 0) blockIdx = atomicAdd(&blockCnt, 1); /* Propagate row number in warp */ blockIdx = __shfl_sync((unsigned)(warpSize - 1), blockIdx, 0); IndexType blockIdx = index / warpSize; __shared__ float shared_res[SHARED]; Real result = 0.0; if (blockIdx >= blocks_size - 1) return; /* Number of elements */ const IndexType minRow = blocks[blockIdx]; const IndexType maxRow = blocks[blockIdx + 1]; const IndexType minID = this->rowPointers[minRow]; const IndexType maxID = this->rowPointers[maxRow]; const IndexType elements = maxID - minID; result = 0.0; /* rows per block more than 1 */ if ((maxRow - minRow) > 1) { /////////////////////////////////////* CSR STREAM *////////////// /* Copy and calculate elements from global to shared memory, coalesced */ const IndexType offset = threadIdx.x / warpSize * SHARED_PER_WARP; for (IndexType i = laneID; i < elements; i += warpSize) { const IndexType elementIdx = i + minID; const IndexType column = this->columnIndexes[elementIdx]; if (column >= this->getColumns()) break; shared_res[i] = this->values[elementIdx] * inVector[column]; continue; shared_res[i + offset] = this->values[elementIdx] * inVector[column]; } const IndexType row = minRow + laneID; if (row >= maxRow - 1) continue; if (row >= maxRow) return; /* Calculate result */ const IndexType to = this->rowPointers[row + 1] - minID; for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) { result += shared_res[i]; result += shared_res[i + offset]; } outVector[row] = result; // Write result } else { } else if (elements <= MAX_PER_WARP) { /////////////////////////////////////* CSR VECTOR *////////////// for (IndexType i = minID + laneID; i < maxID; i += warpSize) { const IndexType column = this->columnIndexes[i]; IndexType column = this->columnIndexes[i]; if (column >= this->getColumns()) break; result += this->values[i] * inVector[column]; } /* Reduction */ Loading @@ -805,7 +833,18 @@ void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector, result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); if (laneID == 0) outVector[minRow] = result; // Write result } } else { /////////////////////////////////////* CSR VECTOR LONG *////////////// const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1; const size_t blocks = warps <= WARPS_PER_BLOCK ? 1 : warps / WARPS_PER_BLOCK + 1; const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize; // spmvCSRVectorHelper<InVector, warpSize> <<<blocks, threads_per_block>>>( // inVector, // &outVector[minRow], // (size_t)(minID + ELEMENTS_PER_WARP), // (size_t)maxID, // (size_t)ELEMENTS_PER_WARP // ); } } Loading Loading @@ -862,7 +901,7 @@ __device__ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector, OutVector& outVector, int gridIdx, unsigned *blocks, size_t size ) const int *blocks, size_t size ) const { spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); return; Loading Loading @@ -939,7 +978,7 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde const InVector* inVector, OutVector* outVector, int gridIdx, unsigned *blocks, size_t size) int *blocks, size_t size) { typedef CSR< Real, Devices::Cuda, Index > Matrix; static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); Loading @@ -965,7 +1004,7 @@ template< typename Real, void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, const InVector& inVector, OutVector& outVector, unsigned *blocks, int *blocks, size_t size ) { #ifdef HAVE_CUDA Loading @@ -974,7 +1013,10 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); OutVector* kernel_outVector = Cuda::passToDevice( outVector ); unsigned *kernelBlocks = Cuda::passToDevice( *blocks ); int *kernelBlocks; cudaMalloc((void **)&kernelBlocks, sizeof(int) * size); cudaMemcpy(kernelBlocks, blocks, size * sizeof(int), cudaMemcpyHostToDevice); TNL_CHECK_CUDA_DEVICE; dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); Loading @@ -984,48 +1026,51 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); const int sharedMemory = cudaBlockSize.x * sizeof( Real ); if( matrix.getCudaWarpSize() == 32 ) // const int threads = cudaBlockSize.x; if( matrix.getCudaWarpSize() == 32 ) { // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads); CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 16 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 8 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 4 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 2 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 1 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> <<< 2, 1024 >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx, kernelBlocks, size ); } // if( matrix.getCudaWarpSize() == 16 ) // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 > // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> // ( kernel_this, // kernel_inVector, // kernel_outVector, // gridIdx, kernelBlocks, size ); // if( matrix.getCudaWarpSize() == 8 ) // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 > // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> // ( kernel_this, // kernel_inVector, // kernel_outVector, // gridIdx, kernelBlocks, size ); // if( matrix.getCudaWarpSize() == 4 ) // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 > // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> // ( kernel_this, // kernel_inVector, // kernel_outVector, // gridIdx, kernelBlocks, size ); // if( matrix.getCudaWarpSize() == 2 ) // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 > // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> // ( kernel_this, // kernel_inVector, // kernel_outVector, // gridIdx, kernelBlocks, size ); // if( matrix.getCudaWarpSize() == 1 ) // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 > // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> // ( kernel_this, // kernel_inVector, // kernel_outVector, // gridIdx, kernelBlocks, size ); } TNL_CHECK_CUDA_DEVICE; Loading Loading @@ -1145,32 +1190,32 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else std::vector<unsigned> inBlock; constexpr int SHARED = 49152/sizeof(float); constexpr int SHARED_PER_WARP = SHARED / 32; std::vector<int> inBlock; inBlock.push_back(0); unsigned sum = 0; for (size_t i = 1; i < matrix.getRowPointers().getSize(); ++i) { printf("PTR %d\n", (int)matrix.getRowPointers().getElement(i)); } for (size_t i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { size_t elements = matrix.getRowPointers().getElement(i + 1) - matrix.getRowPointers().getElement(i); if (elements > 1024) { if (sum == 0) { size_t sum = 0; size_t i; int prev_i = 0; for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { size_t elements = matrix.getRowPointers().getElement(i) - matrix.getRowPointers().getElement(i - 1); sum += elements; if (sum > SHARED_PER_WARP) { if (i - prev_i == 1) { inBlock.push_back(i); } else { } else { inBlock.push_back(i - 1); inBlock.push_back(i); --i; } sum = 0; } sum += elements; if (sum <= 1024) prev_i = i; continue; else { inBlock.push_back(i - 1); sum = elements; } if (i - prev_i == 32) { inBlock.push_back(i); prev_i = i; sum = 0; } } inBlock.push_back(matrix.getRowPointers().getSize() - 1); Loading
src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp +66 −74 Original line number Diff line number Diff line Loading @@ -1259,9 +1259,11 @@ void test_VectorProductLarger() Matrix m; m.reset(); m.setDimensions( m_rows, m_cols ); typename Matrix::CompressedRowLengthsVector rowLengths; rowLengths.setSize( m_rows ); rowLengths.setValue( 20 ); typename Matrix::CompressedRowLengthsVector rowLengths( {11, 2, 4, 0, 6, 4, 1, 2, 20, 18, 6, 20, 10, 0, 20, 10, 2, 20, 10, 12} ); // rowLengths.setSize( m_rows ); // rowLengths.setValue( 20 ); m.setCompressedRowLengths( rowLengths ); /* 0 */ Loading Loading @@ -1389,93 +1391,83 @@ void test_VectorProductGiant() using RealType = typename Matrix::RealType; using DeviceType = typename Matrix::DeviceType; using IndexType = typename Matrix::IndexType; const IndexType N = 200; // Should be a multiple of 10 /* * Sets up the following NxN sparse matrix: * * / 1 0 1 0 ... 1 0 \ * | 0 0 0 0 ... 0 0 | * | 0 2 0 2 ... 0 2 | * | 0 0 0 0 ... 0 0 | * | 3 -3 3 -3 ... 3 -3 | * | 0 0 0 0 ... 0 0 | * | 5 5 5 5 ... 5 5 | * | 0 0 0 0 ... 0 0 | * | 0 0 0 0 ... 0 0 | * | 2 0 0 0 ... 0 0 | <- the only 2 in the row * * .................. */ const IndexType m_rows = N; const IndexType m_cols = N; IndexType m_rows = 100; IndexType m_cols = 100; Matrix m; m.reset(); m.setDimensions( m_rows, m_cols ); typename Matrix::CompressedRowLengthsVector rowLengths; rowLengths.setSize( m_rows ); rowLengths.setValue( m_cols ); typename Matrix::CompressedRowLengthsVector rowLengths( { 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100 } ); m.setCompressedRowLengths( rowLengths ); for ( IndexType row = 0; row < m_rows; ++row ) { if (row % 10 == 0) { for ( IndexType col = 0; col < m_cols; ++col ) { if (col % 2) m.setElement( row, col, 1 ); } } else if (row % 10 == 2) { for ( IndexType col = 0; col < m_cols; ++col ) { if (col % 2 == 1) m.setElement( row, col, 2 ); } } else if (row % 10 == 4) { for ( IndexType col = 0; col < m_cols; ++col ) { if (col % 2) m.setElement( row, col, 3 ); else m.setElement( row, col, -3 ); } } else if (row % 10 == 6) { for ( IndexType col = 0; col < m_cols; ++col ) { m.setElement( row, col, 5 ); } } else if (row % 10 == 9) { m.setElement( row, 0, 2 ); } } for (int i = 0; i < m_rows; ++i) for (int j = 0; j < m_cols; ++j) m.setElement( i, j, i + 1 ); using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; VectorType inVector; inVector.setSize( N ); for( IndexType i = 0; i < inVector.getSize(); i++ ) inVector.setElement( i, 2 ); inVector.setSize( m_rows ); for( IndexType i = 0; i < inVector.getSize(); ++i ) inVector.setElement( i, 1 ); VectorType outVector; outVector.setSize( N ); for( IndexType j = 0; j < outVector.getSize(); j++ ) outVector.setElement( j, 0 ); outVector.setSize( m_rows ); for( IndexType i = 0; i < outVector.getSize(); ++i ) outVector.setElement( i, 0 ); m.vectorProduct( inVector, outVector); for ( IndexType i = 0; i < outVector.getSize(); ++i ) { if ( i % 10 == 0 ) EXPECT_EQ( outVector.getElement( i ), 200 /* (N) */ ); else if ( i % 10 == 2 ) EXPECT_EQ( outVector.getElement( i ), 400 /* (2 * N) */ ); else if ( i % 10 == 6 ) EXPECT_EQ( outVector.getElement( i ), 2000 /* (10 * N) */ ); else if ( i % 10 == 9 ) EXPECT_EQ( outVector.getElement( i ), 4 ); else EXPECT_EQ( outVector.getElement( i ), 0 ); for (int i = 0; i < m_rows; ++i) EXPECT_EQ( outVector.getElement( i ), (i + 1) * 100 ); //----------------------------------------------------- m_rows = 2; m_cols = 1000; m.reset(); m.setDimensions( m_rows, m_cols ); typename Matrix::CompressedRowLengthsVector rowLengths2( { 1000, 1000 } ); m.setCompressedRowLengths( rowLengths2 ); for (int i = 0; i < m_rows; ++i) for (int j = 0; j < m_cols; ++j) m.setElement( i, j, i + 1 ); VectorType inVector2; inVector2.setSize( m_cols ); for( IndexType i = 0; i < inVector2.getSize(); i++ ) inVector2.setElement( i, 1 ); VectorType outVector2; outVector2.setSize( m_rows ); for( IndexType i = 0; i < outVector2.getSize(); ++i ) outVector2.setElement( i, 0 ); m.vectorProduct( inVector2, outVector2); for (int i = 0; i < m_rows; ++i) EXPECT_EQ( outVector2.getElement( i ), (i + 1) * 1000 ); } template< typename Matrix > Loading