From ecdac7b0a68526606ed07b9afaf493b18687cfa2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 12 Feb 2021 20:35:48 +0100
Subject: [PATCH] Refactoring legacy adaptive CSR sparse matrix.

---
 .../SpMV/ReferenceFormats/Legacy/CSR_impl.h   | 216 ++++++++++--------
 1 file changed, 123 insertions(+), 93 deletions(-)

diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
index caded91b9a..827e2c3116 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
@@ -136,7 +136,7 @@ Index findLimit(const Index start,
                 Type &type,
                 Index &sum) {
    sum = 0;
-   for (Index current = start; current < size - 1; ++current) {
+   for( Index current = start; current < size - 1; ++current) {
       Index elements = matrix.getRowPointers().getElement(current + 1) -
                        matrix.getRowPointers().getElement(current);
       sum += elements;
@@ -171,7 +171,7 @@ void CSR< Real, Device, Index, KernelType >::setBlocks()
    std::vector<Block<Index>> inBlock;
    inBlock.reserve(rows); // reserve space to avoid reallocation
 
-   while (nextStart != rows - 1)
+   while( nextStart != rows - 1 )
    {
       Type type;
       nextStart = findLimit<Real, Index, Device, KernelType>(
@@ -193,7 +193,6 @@ void CSR< Real, Device, Index, KernelType >::setBlocks()
             this->rowPointers.getElement(start)
          );
       }
-
       start = nextStart;
    }
    inBlock.emplace_back(nextStart);
@@ -836,88 +835,6 @@ Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const
 
 #ifdef HAVE_CUDA
 
-template< typename Real,
-          typename Index,
-          int warpSize,
-          int WARPS,
-          int SHARED_PER_WARP,
-          int MAX_ELEM_PER_WARP >
-__global__
-void SpMVCSRAdaptive( const Real *inVector,
-                      Real *outVector,
-                      const Index* rowPointers,
-                      const Index* columnIndexes,
-                      const Real* values,
-                      const Block<Index> *blocks,
-                      Index blocksSize,
-                      Index gridID) {
-   __shared__ Real shared[WARPS][SHARED_PER_WARP];
-   const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const Index blockIdx = index / warpSize;
-   if (blockIdx >= blocksSize)
-      return;
-
-   Real result = 0.0;
-   const Index laneID = threadIdx.x & 31; // & is cheaper than %
-   Block<Index> block = blocks[blockIdx];
-   const Index minID = rowPointers[block.index[0]/* minRow */];
-   Index i, to, maxID;
-   if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) {
-      /////////////////////////////////////* CSR STREAM *//////////////
-      const Index warpID = threadIdx.x / 32;
-      maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
-
-      /* Stream data to shared memory */
-      for (i = laneID + minID; i < maxID; i += warpSize)
-         shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]];
-
-      const Index maxRow = block.index[0]/* minRow */ +
-               /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
-      /* Calculate result */
-      for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) {
-         to = rowPointers[i + 1] - minID; // end of preprocessed data
-         result = 0;
-         /* Scalar reduction */
-         for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID)
-            result += shared[warpID][sharedID];
-
-         outVector[i] = result; // Write result
-      }
-   } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) {
-      /////////////////////////////////////* CSR VECTOR *//////////////
-      maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
-
-      for (i = minID + laneID; i < maxID; i += warpSize)
-         result += values[i] * inVector[columnIndexes[i]];
-
-      /* Parallel reduction */
-      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
-      if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result
-   } else {
-      /////////////////////////////////////* CSR VECTOR L */////////////
-      /* Number of elements processed by previous warps */
-      const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP;
-      to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP;
-      maxID = rowPointers[block.index[0]/* minRow */ + 1];
-      if (to > maxID) to = maxID;
-      for (i = minID + offset + laneID; i < to; i += warpSize)
-      {
-         result += values[i] * inVector[columnIndexes[i]];
-      }
-
-      /* Parallel reduction */
-      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
-      if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result);
-   }
-}
 
 template< typename Real,
           typename Index>
@@ -1767,6 +1684,110 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
    }
 }
 
+template< typename Real, typename Index >
+__device__ Real CSRFetch( const Index* columnIndexes, const Real* values, const Real* vector, const Index i )
+{
+   return values[ i ] * vector[ columnIndexes[ i ] ];
+}
+
+template< typename Real,
+          typename Index,
+          int warpSize,
+          int WARPS,
+          int SHARED_PER_WARP,
+          int MAX_ELEM_PER_WARP,
+          typename Fetch >
+__global__
+void SpMVCSRAdaptive( const Real *inVector,
+                      Real *outVector,
+                      const Index* rowPointers,
+                      const Index* columnIndexes,
+                      const Real* values,
+                      const Block<Index> *blocks,
+                      Index blocksSize,
+                      Index gridID,
+                      const Fetch fetch )
+{
+   __shared__ Real shared[WARPS][SHARED_PER_WARP];
+   const Index index = ( ( gridID * MAX_X_DIM + blockIdx.x ) * blockDim.x ) + threadIdx.x;
+   const Index blockIdx = index / warpSize;
+   if( blockIdx >= blocksSize )
+      return;
+
+   Real result = 0.0;
+   bool compute( true );
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   Block<Index> block = blocks[blockIdx];
+   const Index minID = rowPointers[block.index[0]/* minRow */];
+   Index i, to, maxID;
+
+   if( block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000)
+   {
+      /////////////////////////////////////* CSR STREAM *//////////////
+      const Index warpID = threadIdx.x / 32;
+      maxID = minID + block.twobytes[sizeof(Index) == 4 ? 2 : 4];
+      // ^-> maxID - minID
+
+      // Stream data to shared memory
+      for (i = laneID + minID; i < maxID; i += warpSize)
+         //shared[warpID][i - minID] = fetch( i, compute ); //CSRFetch( columnIndexes, values, inVector, i );
+         shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]];
+
+      const Index maxRow = block.index[0] + // minRow
+               (block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); // maxRow - minRow
+      // Calculate result
+      for (i = block.index[0]+ laneID; i < maxRow; i += warpSize) // block.index[0] -> minRow
+      {
+         to = rowPointers[i + 1] - minID; // end of preprocessed data
+         result = 0;
+         // Scalar reduction
+         for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID)
+            result += shared[warpID][sharedID];
+
+         outVector[i] = result; // Write result
+      }
+   } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) {
+      //////////////////////////////////// CSR VECTOR /////////////
+      maxID = minID + // maxID - minID
+              block.twobytes[sizeof(Index) == 4 ? 2 : 4];
+
+      for (i = minID + laneID; i < maxID; i += warpSize)
+         result += values[i] * inVector[columnIndexes[i]];
+
+      // Parallel reduction
+      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+      // Write result
+      if (laneID == 0) outVector[block.index[0]] = result; // block.index[0] -> minRow
+   } else {
+      //////////////////////////////////// CSR VECTOR L ////////////
+      // Number of elements processed by previous warps
+      const Index offset = block.index[1] * MAX_ELEM_PER_WARP;
+      // ^ warpInRow
+      to = minID + (block.index[1] + 1) * MAX_ELEM_PER_WARP;
+      // ^ warpInRow
+      maxID = rowPointers[block.index[0] + 1];
+      // ^ minRow
+      if (to > maxID) to = maxID;
+      for (i = minID + offset + laneID; i < to; i += warpSize)
+      {
+         result += values[i] * inVector[columnIndexes[i]];
+      }
+
+      // Parallel reduction
+      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+      if (laneID == 0) atomicAdd(&outVector[block.index[0]], result);
+      // ^ minRow
+   }
+}
+
 template< typename Real,
           typename Index,
           typename Device,
@@ -1778,18 +1799,27 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
    Index blocks;
    const Index threads = matrix.THREADS_ADAPTIVE;
-   /* Fill blocks */
+   const Index* columnIndexesData = matrix.getColumnIndexes().getData();
+   const Real* valuesData = matrix.getValues().getData();
+   auto fetch = [=] __cuda_callable__ ( Index globalIdx, bool& compute ) -> Real {
+      return valuesData[ globalIdx ] * inVector[ columnIndexesData[ globalIdx ] ];
+   };
+
+   // Fill blocks
    size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block
 
-   /* Execute kernels on device */
-   for (Index grid = 0; neededThreads != 0; ++grid) {
-      if (MAX_X_DIM * threads >= neededThreads) {
+   // Execute kernels on device
+   for( Index grid = 0; neededThreads != 0; ++grid )
+   {
+      if( MAX_X_DIM * threads >= neededThreads )
+      {
         blocks = roundUpDivision(neededThreads, threads);
         neededThreads = 0;
-      } else {
+      }
+      else
+      {
        blocks = MAX_X_DIM;
        neededThreads -= MAX_X_DIM * threads;
      }
-
      SpMVCSRAdaptive< Real, Index, warpSize,
           matrix.WARPS,
           matrix.SHARED_PER_WARP,
@@ -1802,8 +1832,8 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
          matrix.getValues().getData(),
          matrix.blocks.getData(),
          matrix.blocks.getSize() - 1, // last block shouldn't be used
-         grid
-   );
+         grid,
+         fetch );
    }
 }
 
-- 
GitLab
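
Note on the refactor: the patch threads a new `Fetch` template parameter into `SpMVCSRAdaptive` and builds a `fetch` lambda in `SpMVCSRAdaptivePrepare`, but inside the kernel the `fetch( i, compute )` call is still commented out and the direct `values[i] * inVector[columnIndexes[i]]` loads remain. The sketch below shows the intended pattern in isolation: a CSR SpMV kernel whose per-nonzero product is supplied by a functor, with one warp per row and the same `__shfl_down_sync` reduction ladder. It is a minimal illustration, not TNL code; the names `DirectFetch` and `spmvCSRVector` and the fixed int/double types are hypothetical stand-ins.

// Minimal sketch (assumed names, not TNL API): fetch-parameterized CSR SpMV.
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical fetch functor: computes one nonzero's contribution
// values[i] * x[columnIndexes[i]], as the patch's CSRFetch/lambda does.
struct DirectFetch
{
   const int* columnIndexes;
   const double* values;
   const double* x;

   __device__ double operator()( int i ) const
   {
      return values[ i ] * x[ columnIndexes[ i ] ];
   }
};

// One warp per row: each lane accumulates a strided partial sum over the
// row's nonzeros, then a shuffle ladder combines the 32 lane partials.
template< typename Fetch >
__global__ void spmvCSRVector( const int* rowPointers, int rows, double* y, Fetch fetch )
{
   const int row = ( blockIdx.x * blockDim.x + threadIdx.x ) / 32;
   if( row >= rows )
      return;
   const int lane = threadIdx.x & 31;

   double result = 0.0;
   for( int i = rowPointers[ row ] + lane; i < rowPointers[ row + 1 ]; i += 32 )
      result += fetch( i );

   // Parallel reduction across the warp (offsets 16, 8, 4, 2, 1).
   for( int offset = 16; offset > 0; offset /= 2 )
      result += __shfl_down_sync( 0xFFFFFFFF, result, offset );
   if( lane == 0 )
      y[ row ] = result;
}

int main()
{
   // 2x2 CSR matrix [[1 2],[0 3]] times x = (1, 1); expected y = (3, 3).
   const int rowPtrHost[] = { 0, 2, 3 };
   const int colHost[] = { 0, 1, 1 };
   const double valHost[] = { 1.0, 2.0, 3.0 };
   const double xHost[] = { 1.0, 1.0 };

   int *rowPtr, *col;
   double *val, *x, *y;
   cudaMalloc( &rowPtr, sizeof( rowPtrHost ) );
   cudaMalloc( &col, sizeof( colHost ) );
   cudaMalloc( &val, sizeof( valHost ) );
   cudaMalloc( &x, sizeof( xHost ) );
   cudaMalloc( &y, 2 * sizeof( double ) );
   cudaMemcpy( rowPtr, rowPtrHost, sizeof( rowPtrHost ), cudaMemcpyHostToDevice );
   cudaMemcpy( col, colHost, sizeof( colHost ), cudaMemcpyHostToDevice );
   cudaMemcpy( val, valHost, sizeof( valHost ), cudaMemcpyHostToDevice );
   cudaMemcpy( x, xHost, sizeof( xHost ), cudaMemcpyHostToDevice );

   // Two rows -> two warps -> 64 threads in one block.
   spmvCSRVector<<< 1, 64 >>>( rowPtr, 2, y, DirectFetch{ col, val, x } );

   double yHost[ 2 ];
   cudaMemcpy( yHost, y, sizeof( yHost ), cudaMemcpyDeviceToHost );
   printf( "y = (%g, %g)\n", yHost[ 0 ], yHost[ 1 ] );
   return 0;
}

Parameterizing the fetch this way lets one kernel serve plain SpMV as well as fused variants (scaled or masked products) without duplicating the block dispatch and reduction logic, which appears to be the direction this refactoring is heading.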