Loading src/TNL/Matrices/Legacy/CSR.h +7 −7 Original line number Diff line number Diff line Loading @@ -230,7 +230,7 @@ public: __device__ void vectorProductCuda( const InVector& inVector, OutVector& outVector, int gridIdx ) const; int gridIdx, unsigned *blocks, size_t size ) const; template< typename InVector, typename OutVector, Loading @@ -244,11 +244,11 @@ public: typename OutVector, int warpSize > __device__ void spmvCudaVector( const InVector& inVector, void spmvCSRAdaptive( const InVector& inVector, OutVector& outVector, int gridIdx) const; int gridIdx, unsigned *blocks, size_t blocks_size) const; #endif // The following getters allow us to interface TNL with external C-like Loading src/TNL/Matrices/Legacy/CSR_impl.h +150 −82 Original line number Diff line number Diff line Loading @@ -14,6 +14,7 @@ #include <TNL/Containers/VectorView.h> #include <TNL/Math.h> #include <TNL/Exceptions/NotImplementedError.h> #include <vector> #ifdef HAVE_CUSPARSE #include <cusparse.h> Loading Loading @@ -684,10 +685,10 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector, int gridIdx) const { const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; const IndexType elemPerThread = 4; const IndexType elemPerGroup = 4; const IndexType laneID = index % 32; const IndexType groupID = laneID / elemPerThread; const IndexType inGroupID = laneID % elemPerThread; const IndexType groupID = laneID / elemPerGroup; const IndexType inGroupID = laneID % elemPerGroup; IndexType row, minID, column, maxID, idxMtx; __shared__ unsigned rowCnt; Loading @@ -701,7 +702,7 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector, if (inGroupID == 0) row = atomicAdd(&rowCnt, 1); /* Propagate row number in group */ row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerThread); row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerGroup); if (row >= this->rowPointers.getSize() - 1) return; Loading @@ -714,15 +715,15 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector, idxMtx = minID + inGroupID; while (idxMtx < maxID) { column = this->columnIndexes[idxMtx]; if (column >= this->getColumns()) { if (column >= this->getColumns()) break; } result += this->values[idxMtx] * inVector[column]; idxMtx += elemPerThread; idxMtx += elemPerGroup; } /* Parallel reduction */ for (int i = elemPerThread/2; i > 0; i /= 2) for (int i = elemPerGroup/2; i > 0; i /= 2) result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); /* Write result */ if (inGroupID == 0) { Loading @@ -731,6 +732,7 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector, } } template< typename Real, typename Device, typename Index > Loading @@ -738,44 +740,76 @@ template< typename Real, typename OutVector, int warpSize > __device__ void CSR< Real, Device, Index >::spmvCudaVector( const InVector& inVector, void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector, OutVector& outVector, int gridIdx) const int gridIdx, unsigned *blocks, size_t blocks_size) const { constexpr IndexType SHARED = 1024; /* TODO: delete */ const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; __shared__ unsigned blockCnt; __shared__ float shared_res[SHARED]; float result = 0.0; if (index == 0) blockCnt = 0; // Init shared variable __syncthreads(); const IndexType laneID = index % warpSize; const IndexType row = index / warpSize; // warpID - one warp per row volatile Real result = 0.0; IndexType blockIdx; if (row >= this->rowPointers.getSize() - 1) while (true) { /* Get row number */ if (laneID == 0) blockIdx = atomicAdd(&blockCnt, 1); /* Propagate row number in warp */ blockIdx = __shfl_sync((unsigned)(warpSize - 1), blockIdx, 0); if (blockIdx >= blocks_size - 1) return; IndexType minID = this->rowPointers[row]; IndexType maxID = this->rowPointers[row + 1]; /* Calculate results */ for (IndexType idxMtx = minID + laneID; idxMtx < maxID; idxMtx += warpSize) { IndexType column = this->columnIndexes[idxMtx]; if (column >= this->getColumns()) { /* Number of elements */ const IndexType minRow = blocks[blockIdx]; const IndexType maxRow = blocks[blockIdx + 1]; const IndexType minID = this->rowPointers[minRow]; const IndexType maxID = this->rowPointers[maxRow]; const IndexType elements = maxID - minID; result = 0.0; /* rows per block more than 1 */ if ((maxRow - minRow) > 1) { /////////////////////////////////////* CSR STREAM *////////////// /* Copy and calculate elements from global to shared memory, coalesced */ for (IndexType i = laneID; i < elements; i += warpSize) { const IndexType elementIdx = i + minID; const IndexType column = this->columnIndexes[elementIdx]; if (column >= this->getColumns()) break; shared_res[i] = this->values[elementIdx] * inVector[column]; } // printf("%lf %lf %lf\n", (double)this->values[idxMtx], (double)inVector[column], (double)(this->values[idxMtx] * inVector[column])); result += this->values[idxMtx] * inVector[column]; } // printf("=============== lane %d warp %d res %lf\n", (int)laneID, (int)row, (double)result); /* Parallel reduction */ result += __shfl_down_sync((warpSize - 1), result, 16); result += __shfl_down_sync((warpSize - 1), result, 8); result += __shfl_down_sync((warpSize - 1), result, 4); result += __shfl_down_sync((warpSize - 1), result, 2); result += __shfl_down_sync((warpSize - 1), result, 1); /* Write result */ if (laneID == 0) { outVector[row] = result; const IndexType row = minRow + laneID; if (row >= maxRow - 1) continue; /* Calculate result */ const IndexType to = this->rowPointers[row + 1] - minID; for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) { result += shared_res[i]; } outVector[row] = result; // Write result } else { /////////////////////////////////////* CSR VECTOR *////////////// for (IndexType i = minID + laneID; i < maxID; i += warpSize) { const IndexType column = this->columnIndexes[i]; result += this->values[i] * inVector[column]; } /* Reduction */ result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); if (laneID == 0) outVector[minRow] = result; // Write result } } } template< typename Real, typename Device, typename Index > Loading Loading @@ -827,9 +861,10 @@ template< typename Real, __device__ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector, OutVector& outVector, int gridIdx ) const int gridIdx, unsigned *blocks, size_t size ) const { spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); return; IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; Loading Loading @@ -903,7 +938,8 @@ template< typename Real, __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index >* matrix, const InVector* inVector, OutVector* outVector, int gridIdx ) int gridIdx, unsigned *blocks, size_t size) { typedef CSR< Real, Devices::Cuda, Index > Matrix; static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); Loading @@ -917,7 +953,7 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde matrix->getCudaKernelType() == Matrix::hybrid ) { matrix->template vectorProductCuda< InVector, OutVector, warpSize > ( *inVector, *outVector, gridIdx ); ( *inVector, *outVector, gridIdx, blocks, size ); } } #endif Loading @@ -928,7 +964,9 @@ template< typename Real, typename OutVector > void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, const InVector& inVector, OutVector& outVector ) OutVector& outVector, unsigned *blocks, size_t size ) { #ifdef HAVE_CUDA typedef CSR< Real, Devices::Cuda, Index > Matrix; Loading @@ -936,6 +974,7 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); OutVector* kernel_outVector = Cuda::passToDevice( outVector ); unsigned *kernelBlocks = Cuda::passToDevice( *blocks ); TNL_CHECK_CUDA_DEVICE; dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); Loading @@ -951,42 +990,42 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 16 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 8 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 4 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 2 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 1 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); gridIdx, kernelBlocks, size ); } TNL_CHECK_CUDA_DEVICE; Loading Loading @@ -1106,7 +1145,36 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else CSRVectorProductCuda( matrix, inVector, outVector ); std::vector<unsigned> inBlock; inBlock.push_back(0); unsigned sum = 0; for (size_t i = 1; i < matrix.getRowPointers().getSize(); ++i) { printf("PTR %d\n", (int)matrix.getRowPointers().getElement(i)); } for (size_t i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { size_t elements = matrix.getRowPointers().getElement(i + 1) - matrix.getRowPointers().getElement(i); if (elements > 1024) { if (sum == 0) { inBlock.push_back(i); } else { inBlock.push_back(i - 1); inBlock.push_back(i); } sum = 0; } sum += elements; if (sum <= 1024) continue; else { inBlock.push_back(i - 1); sum = elements; } } inBlock.push_back(matrix.getRowPointers().getSize() - 1); CSRVectorProductCuda( matrix, inVector, outVector, inBlock.data(), inBlock.size() ); #endif } Loading Loading
src/TNL/Matrices/Legacy/CSR.h +7 −7 Original line number Diff line number Diff line Loading @@ -230,7 +230,7 @@ public: __device__ void vectorProductCuda( const InVector& inVector, OutVector& outVector, int gridIdx ) const; int gridIdx, unsigned *blocks, size_t size ) const; template< typename InVector, typename OutVector, Loading @@ -244,11 +244,11 @@ public: typename OutVector, int warpSize > __device__ void spmvCudaVector( const InVector& inVector, void spmvCSRAdaptive( const InVector& inVector, OutVector& outVector, int gridIdx) const; int gridIdx, unsigned *blocks, size_t blocks_size) const; #endif // The following getters allow us to interface TNL with external C-like Loading
src/TNL/Matrices/Legacy/CSR_impl.h +150 −82 Original line number Diff line number Diff line Loading @@ -14,6 +14,7 @@ #include <TNL/Containers/VectorView.h> #include <TNL/Math.h> #include <TNL/Exceptions/NotImplementedError.h> #include <vector> #ifdef HAVE_CUSPARSE #include <cusparse.h> Loading Loading @@ -684,10 +685,10 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector, int gridIdx) const { const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; const IndexType elemPerThread = 4; const IndexType elemPerGroup = 4; const IndexType laneID = index % 32; const IndexType groupID = laneID / elemPerThread; const IndexType inGroupID = laneID % elemPerThread; const IndexType groupID = laneID / elemPerGroup; const IndexType inGroupID = laneID % elemPerGroup; IndexType row, minID, column, maxID, idxMtx; __shared__ unsigned rowCnt; Loading @@ -701,7 +702,7 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector, if (inGroupID == 0) row = atomicAdd(&rowCnt, 1); /* Propagate row number in group */ row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerThread); row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerGroup); if (row >= this->rowPointers.getSize() - 1) return; Loading @@ -714,15 +715,15 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector, idxMtx = minID + inGroupID; while (idxMtx < maxID) { column = this->columnIndexes[idxMtx]; if (column >= this->getColumns()) { if (column >= this->getColumns()) break; } result += this->values[idxMtx] * inVector[column]; idxMtx += elemPerThread; idxMtx += elemPerGroup; } /* Parallel reduction */ for (int i = elemPerThread/2; i > 0; i /= 2) for (int i = elemPerGroup/2; i > 0; i /= 2) result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); /* Write result */ if (inGroupID == 0) { Loading @@ -731,6 +732,7 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector, } } template< typename Real, typename Device, typename Index > Loading @@ -738,44 +740,76 @@ template< typename Real, typename OutVector, int warpSize > __device__ void CSR< Real, Device, Index >::spmvCudaVector( const InVector& inVector, void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector, OutVector& outVector, int gridIdx) const int gridIdx, unsigned *blocks, size_t blocks_size) const { constexpr IndexType SHARED = 1024; /* TODO: delete */ const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; __shared__ unsigned blockCnt; __shared__ float shared_res[SHARED]; float result = 0.0; if (index == 0) blockCnt = 0; // Init shared variable __syncthreads(); const IndexType laneID = index % warpSize; const IndexType row = index / warpSize; // warpID - one warp per row volatile Real result = 0.0; IndexType blockIdx; if (row >= this->rowPointers.getSize() - 1) while (true) { /* Get row number */ if (laneID == 0) blockIdx = atomicAdd(&blockCnt, 1); /* Propagate row number in warp */ blockIdx = __shfl_sync((unsigned)(warpSize - 1), blockIdx, 0); if (blockIdx >= blocks_size - 1) return; IndexType minID = this->rowPointers[row]; IndexType maxID = this->rowPointers[row + 1]; /* Calculate results */ for (IndexType idxMtx = minID + laneID; idxMtx < maxID; idxMtx += warpSize) { IndexType column = this->columnIndexes[idxMtx]; if (column >= this->getColumns()) { /* Number of elements */ const IndexType minRow = blocks[blockIdx]; const IndexType maxRow = blocks[blockIdx + 1]; const IndexType minID = this->rowPointers[minRow]; const IndexType maxID = this->rowPointers[maxRow]; const IndexType elements = maxID - minID; result = 0.0; /* rows per block more than 1 */ if ((maxRow - minRow) > 1) { /////////////////////////////////////* CSR STREAM *////////////// /* Copy and calculate elements from global to shared memory, coalesced */ for (IndexType i = laneID; i < elements; i += warpSize) { const IndexType elementIdx = i + minID; const IndexType column = this->columnIndexes[elementIdx]; if (column >= this->getColumns()) break; shared_res[i] = this->values[elementIdx] * inVector[column]; } // printf("%lf %lf %lf\n", (double)this->values[idxMtx], (double)inVector[column], (double)(this->values[idxMtx] * inVector[column])); result += this->values[idxMtx] * inVector[column]; } // printf("=============== lane %d warp %d res %lf\n", (int)laneID, (int)row, (double)result); /* Parallel reduction */ result += __shfl_down_sync((warpSize - 1), result, 16); result += __shfl_down_sync((warpSize - 1), result, 8); result += __shfl_down_sync((warpSize - 1), result, 4); result += __shfl_down_sync((warpSize - 1), result, 2); result += __shfl_down_sync((warpSize - 1), result, 1); /* Write result */ if (laneID == 0) { outVector[row] = result; const IndexType row = minRow + laneID; if (row >= maxRow - 1) continue; /* Calculate result */ const IndexType to = this->rowPointers[row + 1] - minID; for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) { result += shared_res[i]; } outVector[row] = result; // Write result } else { /////////////////////////////////////* CSR VECTOR *////////////// for (IndexType i = minID + laneID; i < maxID; i += warpSize) { const IndexType column = this->columnIndexes[i]; result += this->values[i] * inVector[column]; } /* Reduction */ result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); if (laneID == 0) outVector[minRow] = result; // Write result } } } template< typename Real, typename Device, typename Index > Loading Loading @@ -827,9 +861,10 @@ template< typename Real, __device__ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector, OutVector& outVector, int gridIdx ) const int gridIdx, unsigned *blocks, size_t size ) const { spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); return; IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; Loading Loading @@ -903,7 +938,8 @@ template< typename Real, __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index >* matrix, const InVector* inVector, OutVector* outVector, int gridIdx ) int gridIdx, unsigned *blocks, size_t size) { typedef CSR< Real, Devices::Cuda, Index > Matrix; static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); Loading @@ -917,7 +953,7 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde matrix->getCudaKernelType() == Matrix::hybrid ) { matrix->template vectorProductCuda< InVector, OutVector, warpSize > ( *inVector, *outVector, gridIdx ); ( *inVector, *outVector, gridIdx, blocks, size ); } } #endif Loading @@ -928,7 +964,9 @@ template< typename Real, typename OutVector > void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, const InVector& inVector, OutVector& outVector ) OutVector& outVector, unsigned *blocks, size_t size ) { #ifdef HAVE_CUDA typedef CSR< Real, Devices::Cuda, Index > Matrix; Loading @@ -936,6 +974,7 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); OutVector* kernel_outVector = Cuda::passToDevice( outVector ); unsigned *kernelBlocks = Cuda::passToDevice( *blocks ); TNL_CHECK_CUDA_DEVICE; dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); Loading @@ -951,42 +990,42 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 16 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 8 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 4 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 2 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); gridIdx, kernelBlocks, size ); if( matrix.getCudaWarpSize() == 1 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); gridIdx, kernelBlocks, size ); } TNL_CHECK_CUDA_DEVICE; Loading Loading @@ -1106,7 +1145,36 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else CSRVectorProductCuda( matrix, inVector, outVector ); std::vector<unsigned> inBlock; inBlock.push_back(0); unsigned sum = 0; for (size_t i = 1; i < matrix.getRowPointers().getSize(); ++i) { printf("PTR %d\n", (int)matrix.getRowPointers().getElement(i)); } for (size_t i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { size_t elements = matrix.getRowPointers().getElement(i + 1) - matrix.getRowPointers().getElement(i); if (elements > 1024) { if (sum == 0) { inBlock.push_back(i); } else { inBlock.push_back(i - 1); inBlock.push_back(i); } sum = 0; } sum += elements; if (sum <= 1024) continue; else { inBlock.push_back(i - 1); sum = elements; } } inBlock.push_back(matrix.getRowPointers().getSize() - 1); CSRVectorProductCuda( matrix, inVector, outVector, inBlock.data(), inBlock.size() ); #endif } Loading