Loading src/TNL/Algorithms/detail/CudaScanKernel.h +223 −110 Original line number Diff line number Diff line Loading @@ -21,33 +21,159 @@ namespace Algorithms { namespace detail { #ifdef HAVE_CUDA /* Template for cooperative scan across the CUDA block of threads. * It is a *cooperative* operation - all threads must call the operation, * otherwise it will deadlock! * * The default implementation is generic and the reduction is done using * shared memory. Specializations can be made based on `Reduction` and * `ValueType`, e.g. using the `__shfl_sync` intrinsics for supported * value types. */ template< ScanType scanType, int blockSize, typename Reduction, typename ValueType > struct CudaBlockScan { // storage to be allocated in shared memory struct Storage { ValueType chunkResults[ blockSize + blockSize / Cuda::getNumberOfSharedMemoryBanks() ]; // accessed via Cuda::getInterleaving() ValueType warpResults[ Cuda::getWarpSize() ]; }; /* Cooperative scan across the CUDA block - each thread will get the * result of the scan according to its ID. * * \param reduction The binary reduction functor. * \param zero Neutral element for given reduction operation, i.e. * value such that `reduction(zero, x) == x` for any `x`. * \param threadValue Value of the calling thread to be reduced. * \param tid Index of the calling thread (usually `threadIdx.x`, * unless you know what you are doing). * \param storage Auxiliary storage (must be allocated as a __shared__ * variable). */ __device__ static ValueType scan( const Reduction& reduction, ValueType zero, ValueType threadValue, int tid, Storage& storage ) { // verify the configuration TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaBlockScan::scan" ); static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(), "blockSize is too large, it would not be possible to scan warpResults using one warp" ); // Store the threadValue in the shared memory. const int chunkResultIdx = Cuda::getInterleaving( tid ); storage.chunkResults[ chunkResultIdx ] = threadValue; __syncthreads(); // Perform the parallel scan on chunkResults inside warps. const int threadInWarpIdx = tid % Cuda::getWarpSize(); const int warpIdx = tid / Cuda::getWarpSize(); #pragma unroll for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) { storage.chunkResults[ chunkResultIdx ] = reduction( storage.chunkResults[ chunkResultIdx ], storage.chunkResults[ Cuda::getInterleaving( tid - stride ) ] ); } __syncwarp(); } threadValue = storage.chunkResults[ chunkResultIdx ]; // The last thread in warp stores the intermediate result in warpResults. if( threadInWarpIdx == Cuda::getWarpSize() - 1 ) storage.warpResults[ warpIdx ] = threadValue; __syncthreads(); // Perform the scan of warpResults using one warp. if( warpIdx == 0 ) #pragma unroll for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) storage.warpResults[ tid ] = reduction( storage.warpResults[ tid ], storage.warpResults[ tid - stride ] ); __syncwarp(); } __syncthreads(); // Shift threadValue by the warpResults. if( warpIdx > 0 ) threadValue = reduction( threadValue, storage.warpResults[ warpIdx - 1 ] ); // Shift the result for exclusive scan. if( scanType == ScanType::Exclusive ) { storage.chunkResults[ chunkResultIdx ] = threadValue; __syncthreads(); threadValue = (tid == 0) ? zero : storage.chunkResults[ Cuda::getInterleaving( tid - 1 ) ]; } __syncthreads(); return threadValue; } }; /* Template for cooperative scan of a data tile in the global memory. * It is a *cooperative* operation - all threads must call the operation, * otherwise it will deadlock! */ template< ScanType scanType, int blockSize, int valuesPerThread, typename InputView, typename OutputView, typename Reduction > __global__ void CudaScanKernelFirstPhase( const InputView input, typename Reduction, typename ValueType > struct CudaTileScan { using BlockScan = CudaBlockScan< ScanType::Exclusive, blockSize, Reduction, ValueType >; // storage to be allocated in shared memory struct Storage { ValueType data[ blockSize * valuesPerThread ]; typename BlockScan::Storage blockScanStorage; }; /* Cooperative scan of a data tile in the global memory - each thread will * get the result of its chunk (i.e. the last value of the (inclusive) scan * in the chunk) according to the thread ID. * * \param input The input array to be scanned. * \param output The array where the result will be stored. * \param begin The first element in the array to be scanned. * \param end the last element in the array to be scanned. * \param outputBegin The first element in the output array to be written. There * must be at least `end - begin` elements in the output * array starting at the position given by `outputBegin`. * \param reduction The binary reduction functor. * \param zero Neutral element for given reduction operation, i.e. * value such that `reduction(zero, x) == x` for any `x`. * \param shift A global shift to be applied to all elements in the * chunk processed by this thread. * \param storage Auxiliary storage (must be allocated as a __shared__ * variable). */ template< typename InputView, typename OutputView > __device__ static ValueType scan( const InputView input, OutputView output, typename InputView::IndexType begin, typename InputView::IndexType end, typename OutputView::IndexType outputBegin, Reduction reduction, typename OutputView::ValueType zero, typename OutputView::ValueType* blockResults ) const Reduction& reduction, ValueType zero, ValueType shift, Storage& storage ) { using ValueType = typename OutputView::ValueType; using IndexType = typename InputView::IndexType; // verify the configuration TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelFirstPhase" ); static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(), "blockSize is too large, it would not be possible to scan warpResults using one warp" ); TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaTileScan::scan" ); static_assert( valuesPerThread % 2, "valuesPerThread must be odd, otherwise there would be shared memory bank conflicts " "when threads access their chunks in sharedData sequentially" ); "when threads access their chunks in shared memory sequentially" ); // calculate indices constexpr int maxElementsInBlock = blockSize * valuesPerThread; Loading @@ -59,17 +185,12 @@ CudaScanKernelFirstPhase( const InputView input, begin += threadOffset; outputBegin += threadOffset; // allocate shared memory __shared__ ValueType sharedData[ maxElementsInBlock ]; __shared__ ValueType chunkResults[ blockSize + blockSize / Cuda::getNumberOfSharedMemoryBanks() ]; // accessed via Cuda::getInterleaving() __shared__ ValueType warpResults[ Cuda::getWarpSize() ]; // Load data into the shared memory. { int idx = threadIdx.x; while( idx < elementsInBlock ) { sharedData[ idx ] = input[ begin ]; storage.data[ idx ] = input[ begin ]; begin += blockDim.x; idx += blockDim.x; } Loading @@ -77,77 +198,35 @@ CudaScanKernelFirstPhase( const InputView input, // (this helps to avoid divergent branches in the blocks below) while( idx < maxElementsInBlock ) { sharedData[ idx ] = zero; storage.data[ idx ] = zero; idx += blockDim.x; } } __syncthreads(); // Perform sequential reduction of the chunk in shared memory. // Perform sequential reduction of the thread's chunk in shared memory. const int chunkOffset = threadIdx.x * valuesPerThread; const int chunkResultIdx = Cuda::getInterleaving( threadIdx.x ); { ValueType chunkResult = sharedData[ chunkOffset ]; ValueType value = storage.data[ chunkOffset ]; #pragma unroll for( int i = 1; i < valuesPerThread; i++ ) chunkResult = reduction( chunkResult, sharedData[ chunkOffset + i ] ); value = reduction( value, storage.data[ chunkOffset + i ] ); // store the result of the sequential reduction of the chunk in chunkResults chunkResults[ chunkResultIdx ] = chunkResult; } __syncthreads(); // Scan the spine to obtain the initial value ("offset") for the downsweep. value = BlockScan::scan( reduction, zero, value, threadIdx.x, storage.blockScanStorage ); // Perform the parallel scan on chunkResults inside warps. const int threadInWarpIdx = threadIdx.x % Cuda::getWarpSize(); const int warpIdx = threadIdx.x / Cuda::getWarpSize(); #pragma unroll for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) { chunkResults[ chunkResultIdx ] = reduction( chunkResults[ chunkResultIdx ], chunkResults[ Cuda::getInterleaving( threadIdx.x - stride ) ] ); } __syncwarp(); } // The last thread in warp stores the intermediate result in warpResults. if( threadInWarpIdx == Cuda::getWarpSize() - 1 ) warpResults[ warpIdx ] = chunkResults[ chunkResultIdx ]; __syncthreads(); // Perform the scan of warpResults using one warp. if( warpIdx == 0 ) #pragma unroll for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) warpResults[ threadIdx.x ] = reduction( warpResults[ threadIdx.x ], warpResults[ threadIdx.x - stride ] ); __syncwarp(); } __syncthreads(); // Shift chunkResults by the warpResults. if( warpIdx > 0 ) chunkResults[ chunkResultIdx ] = reduction( chunkResults[ chunkResultIdx ], warpResults[ warpIdx - 1 ] ); __syncthreads(); // Downsweep step: scan the chunks and use the chunk result as the initial value. { ValueType value = zero; if( threadIdx.x > 0 ) value = chunkResults[ Cuda::getInterleaving( threadIdx.x - 1 ) ]; // Apply the global shift. value = reduction( value, shift ); // Downsweep step: scan the chunks and use the result of spine scan as the initial value. #pragma unroll for( int i = 0; i < valuesPerThread; i++ ) { const ValueType inputValue = sharedData[ chunkOffset + i ]; const ValueType inputValue = storage.data[ chunkOffset + i ]; if( scanType == ScanType::Exclusive ) sharedData[ chunkOffset + i ] = value; storage.data[ chunkOffset + i ] = value; value = reduction( value, inputValue ); if( scanType == ScanType::Inclusive ) sharedData[ chunkOffset + i ] = value; } // The last thread of the block stores the block result in the global memory. if( blockResults && threadIdx.x == blockDim.x - 1 ) blockResults[ blockIdx.x ] = value; storage.data[ chunkOffset + i ] = value; } __syncthreads(); Loading @@ -156,11 +235,45 @@ CudaScanKernelFirstPhase( const InputView input, int idx = threadIdx.x; while( idx < elementsInBlock ) { output[ outputBegin ] = sharedData[ idx ]; output[ outputBegin ] = storage.data[ idx ]; outputBegin += blockDim.x; idx += blockDim.x; } } // Return the last (inclusive) scan value of the chunk processed by this thread. return value; } }; template< ScanType scanType, int blockSize, int valuesPerThread, typename InputView, typename OutputView, typename Reduction > __global__ void CudaScanKernelFirstPhase( const InputView input, OutputView output, typename InputView::IndexType begin, typename InputView::IndexType end, typename OutputView::IndexType outputBegin, Reduction reduction, typename OutputView::ValueType zero, typename OutputView::ValueType* blockResults ) { using ValueType = typename OutputView::ValueType; using TileScan = CudaTileScan< scanType, blockSize, valuesPerThread, Reduction, ValueType >; // allocate shared memory __shared__ typename TileScan::Storage storage; // scan from input into output const ValueType value = TileScan::scan( input, output, begin, end, outputBegin, reduction, zero, zero, storage ); // The last thread of the block stores the block result in the global memory. if( blockResults && threadIdx.x == blockDim.x - 1 ) blockResults[ blockIdx.x ] = value; } template< int blockSize, Loading Loading
src/TNL/Algorithms/detail/CudaScanKernel.h +223 −110 Original line number Diff line number Diff line Loading @@ -21,33 +21,159 @@ namespace Algorithms { namespace detail { #ifdef HAVE_CUDA /* Template for cooperative scan across the CUDA block of threads. * It is a *cooperative* operation - all threads must call the operation, * otherwise it will deadlock! * * The default implementation is generic and the reduction is done using * shared memory. Specializations can be made based on `Reduction` and * `ValueType`, e.g. using the `__shfl_sync` intrinsics for supported * value types. */ template< ScanType scanType, int blockSize, typename Reduction, typename ValueType > struct CudaBlockScan { // storage to be allocated in shared memory struct Storage { ValueType chunkResults[ blockSize + blockSize / Cuda::getNumberOfSharedMemoryBanks() ]; // accessed via Cuda::getInterleaving() ValueType warpResults[ Cuda::getWarpSize() ]; }; /* Cooperative scan across the CUDA block - each thread will get the * result of the scan according to its ID. * * \param reduction The binary reduction functor. * \param zero Neutral element for given reduction operation, i.e. * value such that `reduction(zero, x) == x` for any `x`. * \param threadValue Value of the calling thread to be reduced. * \param tid Index of the calling thread (usually `threadIdx.x`, * unless you know what you are doing). * \param storage Auxiliary storage (must be allocated as a __shared__ * variable). */ __device__ static ValueType scan( const Reduction& reduction, ValueType zero, ValueType threadValue, int tid, Storage& storage ) { // verify the configuration TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaBlockScan::scan" ); static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(), "blockSize is too large, it would not be possible to scan warpResults using one warp" ); // Store the threadValue in the shared memory. const int chunkResultIdx = Cuda::getInterleaving( tid ); storage.chunkResults[ chunkResultIdx ] = threadValue; __syncthreads(); // Perform the parallel scan on chunkResults inside warps. const int threadInWarpIdx = tid % Cuda::getWarpSize(); const int warpIdx = tid / Cuda::getWarpSize(); #pragma unroll for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) { storage.chunkResults[ chunkResultIdx ] = reduction( storage.chunkResults[ chunkResultIdx ], storage.chunkResults[ Cuda::getInterleaving( tid - stride ) ] ); } __syncwarp(); } threadValue = storage.chunkResults[ chunkResultIdx ]; // The last thread in warp stores the intermediate result in warpResults. if( threadInWarpIdx == Cuda::getWarpSize() - 1 ) storage.warpResults[ warpIdx ] = threadValue; __syncthreads(); // Perform the scan of warpResults using one warp. if( warpIdx == 0 ) #pragma unroll for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) storage.warpResults[ tid ] = reduction( storage.warpResults[ tid ], storage.warpResults[ tid - stride ] ); __syncwarp(); } __syncthreads(); // Shift threadValue by the warpResults. if( warpIdx > 0 ) threadValue = reduction( threadValue, storage.warpResults[ warpIdx - 1 ] ); // Shift the result for exclusive scan. if( scanType == ScanType::Exclusive ) { storage.chunkResults[ chunkResultIdx ] = threadValue; __syncthreads(); threadValue = (tid == 0) ? zero : storage.chunkResults[ Cuda::getInterleaving( tid - 1 ) ]; } __syncthreads(); return threadValue; } }; /* Template for cooperative scan of a data tile in the global memory. * It is a *cooperative* operation - all threads must call the operation, * otherwise it will deadlock! */ template< ScanType scanType, int blockSize, int valuesPerThread, typename InputView, typename OutputView, typename Reduction > __global__ void CudaScanKernelFirstPhase( const InputView input, typename Reduction, typename ValueType > struct CudaTileScan { using BlockScan = CudaBlockScan< ScanType::Exclusive, blockSize, Reduction, ValueType >; // storage to be allocated in shared memory struct Storage { ValueType data[ blockSize * valuesPerThread ]; typename BlockScan::Storage blockScanStorage; }; /* Cooperative scan of a data tile in the global memory - each thread will * get the result of its chunk (i.e. the last value of the (inclusive) scan * in the chunk) according to the thread ID. * * \param input The input array to be scanned. * \param output The array where the result will be stored. * \param begin The first element in the array to be scanned. * \param end the last element in the array to be scanned. * \param outputBegin The first element in the output array to be written. There * must be at least `end - begin` elements in the output * array starting at the position given by `outputBegin`. * \param reduction The binary reduction functor. * \param zero Neutral element for given reduction operation, i.e. * value such that `reduction(zero, x) == x` for any `x`. * \param shift A global shift to be applied to all elements in the * chunk processed by this thread. * \param storage Auxiliary storage (must be allocated as a __shared__ * variable). */ template< typename InputView, typename OutputView > __device__ static ValueType scan( const InputView input, OutputView output, typename InputView::IndexType begin, typename InputView::IndexType end, typename OutputView::IndexType outputBegin, Reduction reduction, typename OutputView::ValueType zero, typename OutputView::ValueType* blockResults ) const Reduction& reduction, ValueType zero, ValueType shift, Storage& storage ) { using ValueType = typename OutputView::ValueType; using IndexType = typename InputView::IndexType; // verify the configuration TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelFirstPhase" ); static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(), "blockSize is too large, it would not be possible to scan warpResults using one warp" ); TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaTileScan::scan" ); static_assert( valuesPerThread % 2, "valuesPerThread must be odd, otherwise there would be shared memory bank conflicts " "when threads access their chunks in sharedData sequentially" ); "when threads access their chunks in shared memory sequentially" ); // calculate indices constexpr int maxElementsInBlock = blockSize * valuesPerThread; Loading @@ -59,17 +185,12 @@ CudaScanKernelFirstPhase( const InputView input, begin += threadOffset; outputBegin += threadOffset; // allocate shared memory __shared__ ValueType sharedData[ maxElementsInBlock ]; __shared__ ValueType chunkResults[ blockSize + blockSize / Cuda::getNumberOfSharedMemoryBanks() ]; // accessed via Cuda::getInterleaving() __shared__ ValueType warpResults[ Cuda::getWarpSize() ]; // Load data into the shared memory. { int idx = threadIdx.x; while( idx < elementsInBlock ) { sharedData[ idx ] = input[ begin ]; storage.data[ idx ] = input[ begin ]; begin += blockDim.x; idx += blockDim.x; } Loading @@ -77,77 +198,35 @@ CudaScanKernelFirstPhase( const InputView input, // (this helps to avoid divergent branches in the blocks below) while( idx < maxElementsInBlock ) { sharedData[ idx ] = zero; storage.data[ idx ] = zero; idx += blockDim.x; } } __syncthreads(); // Perform sequential reduction of the chunk in shared memory. // Perform sequential reduction of the thread's chunk in shared memory. const int chunkOffset = threadIdx.x * valuesPerThread; const int chunkResultIdx = Cuda::getInterleaving( threadIdx.x ); { ValueType chunkResult = sharedData[ chunkOffset ]; ValueType value = storage.data[ chunkOffset ]; #pragma unroll for( int i = 1; i < valuesPerThread; i++ ) chunkResult = reduction( chunkResult, sharedData[ chunkOffset + i ] ); value = reduction( value, storage.data[ chunkOffset + i ] ); // store the result of the sequential reduction of the chunk in chunkResults chunkResults[ chunkResultIdx ] = chunkResult; } __syncthreads(); // Scan the spine to obtain the initial value ("offset") for the downsweep. value = BlockScan::scan( reduction, zero, value, threadIdx.x, storage.blockScanStorage ); // Perform the parallel scan on chunkResults inside warps. const int threadInWarpIdx = threadIdx.x % Cuda::getWarpSize(); const int warpIdx = threadIdx.x / Cuda::getWarpSize(); #pragma unroll for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) { chunkResults[ chunkResultIdx ] = reduction( chunkResults[ chunkResultIdx ], chunkResults[ Cuda::getInterleaving( threadIdx.x - stride ) ] ); } __syncwarp(); } // The last thread in warp stores the intermediate result in warpResults. if( threadInWarpIdx == Cuda::getWarpSize() - 1 ) warpResults[ warpIdx ] = chunkResults[ chunkResultIdx ]; __syncthreads(); // Perform the scan of warpResults using one warp. if( warpIdx == 0 ) #pragma unroll for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) warpResults[ threadIdx.x ] = reduction( warpResults[ threadIdx.x ], warpResults[ threadIdx.x - stride ] ); __syncwarp(); } __syncthreads(); // Shift chunkResults by the warpResults. if( warpIdx > 0 ) chunkResults[ chunkResultIdx ] = reduction( chunkResults[ chunkResultIdx ], warpResults[ warpIdx - 1 ] ); __syncthreads(); // Downsweep step: scan the chunks and use the chunk result as the initial value. { ValueType value = zero; if( threadIdx.x > 0 ) value = chunkResults[ Cuda::getInterleaving( threadIdx.x - 1 ) ]; // Apply the global shift. value = reduction( value, shift ); // Downsweep step: scan the chunks and use the result of spine scan as the initial value. #pragma unroll for( int i = 0; i < valuesPerThread; i++ ) { const ValueType inputValue = sharedData[ chunkOffset + i ]; const ValueType inputValue = storage.data[ chunkOffset + i ]; if( scanType == ScanType::Exclusive ) sharedData[ chunkOffset + i ] = value; storage.data[ chunkOffset + i ] = value; value = reduction( value, inputValue ); if( scanType == ScanType::Inclusive ) sharedData[ chunkOffset + i ] = value; } // The last thread of the block stores the block result in the global memory. if( blockResults && threadIdx.x == blockDim.x - 1 ) blockResults[ blockIdx.x ] = value; storage.data[ chunkOffset + i ] = value; } __syncthreads(); Loading @@ -156,11 +235,45 @@ CudaScanKernelFirstPhase( const InputView input, int idx = threadIdx.x; while( idx < elementsInBlock ) { output[ outputBegin ] = sharedData[ idx ]; output[ outputBegin ] = storage.data[ idx ]; outputBegin += blockDim.x; idx += blockDim.x; } } // Return the last (inclusive) scan value of the chunk processed by this thread. return value; } }; template< ScanType scanType, int blockSize, int valuesPerThread, typename InputView, typename OutputView, typename Reduction > __global__ void CudaScanKernelFirstPhase( const InputView input, OutputView output, typename InputView::IndexType begin, typename InputView::IndexType end, typename OutputView::IndexType outputBegin, Reduction reduction, typename OutputView::ValueType zero, typename OutputView::ValueType* blockResults ) { using ValueType = typename OutputView::ValueType; using TileScan = CudaTileScan< scanType, blockSize, valuesPerThread, Reduction, ValueType >; // allocate shared memory __shared__ typename TileScan::Storage storage; // scan from input into output const ValueType value = TileScan::scan( input, output, begin, end, outputBegin, reduction, zero, zero, storage ); // The last thread of the block stores the block result in the global memory. if( blockResults && threadIdx.x == blockDim.x - 1 ) blockResults[ blockIdx.x ] = value; } template< int blockSize, Loading