src/TNL/Algorithms/Sorting/detail/reduction.h  deleted (100644 → 0), +0 −138

/***************************************************************************
                          reduction.h  -  description
                             -------------------
    begin                : Jul 13, 2021
    copyright            : (C) 2021 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Xuan Thang Nguyen

#pragma once

namespace TNL {
namespace Algorithms {
namespace Sorting {

#ifdef HAVE_CUDA

/**
 * https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/
 */
__device__ int warpReduceSum( int initVal )
{
   const unsigned int maskConstant = 0xffffffff;  // full warp mask for __shfl_xor_sync
   for( unsigned int mask = warpSize / 2; mask > 0; mask >>= 1 )
      initVal += __shfl_xor_sync( maskConstant, initVal, mask );
   return initVal;
}

__device__ int blockReduceSum( int val )
{
   static __shared__ int shared[ 32 ];
   int lane = threadIdx.x & ( warpSize - 1 );
   int wid = threadIdx.x / warpSize;

   val = warpReduceSum( val );

   if( lane == 0 )
      shared[ wid ] = val;
   __syncthreads();

   val = ( threadIdx.x < blockDim.x / warpSize ) ? shared[ lane ] : 0;
   if( wid == 0 )
      val = warpReduceSum( val );
   if( threadIdx.x == 0 )
      shared[ 0 ] = val;
   __syncthreads();

   return shared[ 0 ];
}

//-------------------------------------------------------------------------------

__device__ int warpInclusivePrefixSum( int value )
{
   int laneId = threadIdx.x & ( 32 - 1 );
   #pragma unroll
   for( int i = 1; i * 2 <= 32; i *= 2 )  // 32 here is warp size
   {
      int n = __shfl_up_sync( 0xffffffff, value, i );
      if( ( laneId & ( warpSize - 1 ) ) >= i )
         value += n;
   }
   return value;
}

__device__ int blockInclusivePrefixSum( int value )
{
   static __shared__ int shared[ 32 ];
   int lane = threadIdx.x & ( warpSize - 1 );
   int wid = threadIdx.x / warpSize;

   int tmp = warpInclusivePrefixSum( value );
   if( lane == warpSize - 1 )
      shared[ wid ] = tmp;
   __syncthreads();

   int tmp2 = ( threadIdx.x < blockDim.x / warpSize ) ? shared[ lane ] : 0;
   if( wid == 0 )
      shared[ lane ] = warpInclusivePrefixSum( tmp2 ) - tmp2;
   __syncthreads();

   tmp += shared[ wid ];
   return tmp;
}

//--------------------------------------------------------------------

template< typename Operator >
__device__ int warpCmpReduce( int initVal, const Operator& Cmp )
{
   const unsigned int maskConstant = 0xffffffff;  // full warp mask for __shfl_xor_sync
   for( unsigned int mask = warpSize / 2; mask > 0; mask >>= 1 )
      initVal = Cmp( initVal, __shfl_xor_sync( maskConstant, initVal, mask ) );
   return initVal;
}

template< typename Operator >
__device__ int blockCmpReduce( int val, const Operator& Cmp )
{
   static __shared__ int shared[ 32 ];
   int lane = threadIdx.x & ( warpSize - 1 );
   int wid = threadIdx.x / warpSize;

   val = warpCmpReduce( val, Cmp );

   if( lane == 0 )
      shared[ wid ] = val;
   __syncthreads();

   val = ( threadIdx.x < blockDim.x / warpSize ) ? shared[ lane ] : shared[ 0 ];
   if( wid == 0 )
      val = warpCmpReduce( val, Cmp );
   if( threadIdx.x == 0 )
      shared[ 0 ] = val;
   __syncthreads();

   return shared[ 0 ];
}

#endif

} // namespace Sorting
} // namespace Algorithms
} // namespace TNL
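The deleted helpers were thin wrappers around the warp shuffle intrinsics from the linked NVIDIA post: `__shfl_xor_sync` implements a butterfly exchange, so after log2(32) = 5 steps every lane of the warp holds the full sum. A minimal self-contained sketch of that pattern (illustration only, not part of the patch; the function and kernel names and the fixed warp size of 32 are assumptions):

#include <cstdio>

// Butterfly warp reduction, as in the deleted warpReduceSum: each XOR step
// exchanges partial sums between lane pairs at halving distances until
// every lane holds the sum over all 32 lanes.
__device__ int warpSumSketch( int value )
{
   for( int offset = 16; offset > 0; offset >>= 1 )
      value += __shfl_xor_sync( 0xffffffff, value, offset );
   return value;
}

__global__ void warpSumDemo()
{
   // each lane contributes its lane index; 0 + 1 + ... + 31 = 496
   const int result = warpSumSketch( threadIdx.x );
   if( threadIdx.x == 0 )
      printf( "warp sum = %d\n", result );  // prints 496
}

int main()
{
   warpSumDemo<<< 1, 32 >>>();
   cudaDeviceSynchronize();
}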
src/TNL/Algorithms/detail/CudaScanKernel.h  +173 −15

@@ -68,43 +68,43 @@ struct CudaBlockScan
       static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(),
                      "blockSize is too large, it would not be possible to scan warpResults using one warp" );

-      // Store the threadValue in the shared memory.
+      // store the threadValue in the shared memory
       const int chunkResultIdx = Cuda::getInterleaving( tid );
       storage.chunkResults[ chunkResultIdx ] = threadValue;
       __syncthreads();

-      // Perform the parallel scan on chunkResults inside warps.
-      const int threadInWarpIdx = tid % Cuda::getWarpSize();
-      const int warpIdx = tid / Cuda::getWarpSize();
+      // perform the parallel scan on chunkResults inside warps
+      const int lane_id = tid % Cuda::getWarpSize();
+      const int warp_id = tid / Cuda::getWarpSize();
       #pragma unroll
       for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
-         if( threadInWarpIdx >= stride ) {
+         if( lane_id >= stride ) {
            storage.chunkResults[ chunkResultIdx ] = reduction( storage.chunkResults[ chunkResultIdx ],
                                                                storage.chunkResults[ Cuda::getInterleaving( tid - stride ) ] );
          }
          __syncwarp();
       }
       threadValue = storage.chunkResults[ chunkResultIdx ];

-      // The last thread in warp stores the intermediate result in warpResults.
-      if( threadInWarpIdx == Cuda::getWarpSize() - 1 )
-         storage.warpResults[ warpIdx ] = threadValue;
+      // the last thread in warp stores the intermediate result in warpResults
+      if( lane_id == Cuda::getWarpSize() - 1 )
+         storage.warpResults[ warp_id ] = threadValue;
       __syncthreads();

-      // Perform the scan of warpResults using one warp.
-      if( warpIdx == 0 )
+      // perform the scan of warpResults using one warp
+      if( warp_id == 0 )
          #pragma unroll
          for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) {
-            if( threadInWarpIdx >= stride )
+            if( lane_id >= stride )
               storage.warpResults[ tid ] = reduction( storage.warpResults[ tid ], storage.warpResults[ tid - stride ] );
            __syncwarp();
         }
      __syncthreads();

-      // Shift threadValue by the warpResults.
-      if( warpIdx > 0 )
-         threadValue = reduction( threadValue, storage.warpResults[ warpIdx - 1 ] );
+      // shift threadValue by the warpResults
+      if( warp_id > 0 )
+         threadValue = reduction( threadValue, storage.warpResults[ warp_id - 1 ] );

-      // Shift the result for exclusive scan.
+      // shift the result for exclusive scan
       if( scanType == ScanType::Exclusive ) {
          storage.chunkResults[ chunkResultIdx ] = threadValue;
          __syncthreads();
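The hunk above only renames variables (threadInWarpIdx → lane_id, warpIdx → warp_id) and normalizes comment style; the strided loop itself is a Hillis-Steele inclusive scan. As a reference for what each pass does, here is a sequential model (illustration only; the function name is made up) that produces the same result on a power-of-two sized array:

#include <vector>
#include <cassert>

// Sequential model of the strided loop above: after the pass with a given
// stride, each element i >= stride has absorbed the value stride positions
// to its left. Iterating i downward reads only values from the previous
// pass, which mimics the lockstep parallel update guarded by __syncwarp().
void hillisSteeleSketch( std::vector< int >& data )
{
   const int n = (int) data.size();  // assumed to be a power of two
   for( int stride = 1; stride < n; stride *= 2 )
      for( int i = n - 1; i >= stride; i-- )
         data[ i ] += data[ i - stride ];
}

int main()
{
   std::vector< int > v{ 1, 1, 1, 1, 1, 1, 1, 1 };
   hillisSteeleSketch( v );
   assert( v.back() == 8 );  // inclusive prefix sums: 1 2 3 4 5 6 7 8
}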
@@ -116,6 +116,164 @@ struct CudaBlockScan
    }
 };

+template< ScanType scanType,
+          int __unused,  // the __shfl implementation does not depend on the blockSize
+          typename Reduction,
+          typename ValueType >
+struct CudaBlockScanShfl
+{
+   // storage to be allocated in shared memory
+   struct Storage
+   {
+      ValueType warpResults[ Cuda::getWarpSize() ];
+   };
+
+   /* Cooperative scan across the CUDA block - each thread will get the
+    * result of the scan according to its ID.
+    *
+    * \param reduction The binary reduction functor.
+    * \param identity Neutral element for given reduction operation, i.e.
+    *                 value such that `reduction(identity, x) == x` for any `x`.
+    * \param threadValue Value of the calling thread to be reduced.
+    * \param tid Index of the calling thread (usually `threadIdx.x`,
+    *            unless you know what you are doing).
+    * \param storage Auxiliary storage (must be allocated as a __shared__
+    *                variable).
+    */
+   __device__ static ValueType scan( const Reduction& reduction,
+                                     ValueType identity,
+                                     ValueType threadValue,
+                                     int tid,
+                                     Storage& storage )
+   {
+      const int lane_id = tid % Cuda::getWarpSize();
+      const int warp_id = tid / Cuda::getWarpSize();
+
+      // perform the parallel scan across warps
+      ValueType total;
+      threadValue = warpScan< scanType >( reduction, identity, threadValue, lane_id, total );
+
+      // the last thread in warp stores the result of inclusive scan in warpResults
+      if( lane_id == Cuda::getWarpSize() - 1 )
+         storage.warpResults[ warp_id ] = total;
+      __syncthreads();
+
+      // the first warp performs the scan of warpResults
+      if( warp_id == 0 ) {
+         // read from shared memory only if that warp existed
+         if( tid < blockDim.x / Cuda::getWarpSize() )
+            total = storage.warpResults[ lane_id ];
+         else
+            total = identity;
+         storage.warpResults[ lane_id ] = warpScan< ScanType::Inclusive >( reduction, identity, total, lane_id, total );
+      }
+      __syncthreads();
+
+      // shift threadValue by the warpResults
+      if( warp_id > 0 )
+         threadValue = reduction( threadValue, storage.warpResults[ warp_id - 1 ] );
+      __syncthreads();
+
+      return threadValue;
+   }
+
+   /* Helper function.
+    * Cooperative scan across the warp - each thread will get the result of the
+    * scan according to its ID.
+    *
+    * return value = thread's result of the *warpScanType* scan
+    * total = thread's result of the *inclusive* scan
+    */
+   template< ScanType warpScanType >
+   __device__ static ValueType warpScan( const Reduction& reduction,
+                                         ValueType identity,
+                                         ValueType threadValue,
+                                         int lane_id,
+                                         ValueType& total )
+   {
+      constexpr unsigned mask = 0xffffffff;
+
+      // perform an inclusive scan
+      #pragma unroll
+      for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
+         const ValueType otherValue = __shfl_up_sync( mask, threadValue, stride );
+         if( lane_id >= stride )
+            threadValue = reduction( threadValue, otherValue );
+      }
+
+      // set the result of the inclusive scan
+      total = threadValue;
+
+      // shift the result for exclusive scan
+      if( warpScanType == ScanType::Exclusive ) {
+         threadValue = __shfl_up_sync( mask, threadValue, 1 );
+         if( lane_id == 0 )
+            threadValue = identity;
+      }
+
+      return threadValue;
+   }
+};
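For orientation, here is how a kernel might invoke the struct added above. The `scan` signature and the `Storage` type come from the patch; the kernel name, the `AddSketch` functor, and the launch configuration are hypothetical, and namespace qualifications are elided:

// Hypothetical usage sketch: inclusive prefix sum of one 256-element tile.
// AddSketch stands in for a reduction functor; it is not a TNL type.
struct AddSketch
{
   __device__ int operator()( int a, int b ) const { return a + b; }
};

template< int blockSize >
__global__ void blockScanSketch( int* data )
{
   using BlockScan = CudaBlockScanShfl< ScanType::Inclusive, blockSize, AddSketch, int >;
   // the auxiliary storage must be allocated in shared memory, as documented above
   __shared__ typename BlockScan::Storage storage;

   int value = data[ threadIdx.x ];
   value = BlockScan::scan( AddSketch{}, 0, value, threadIdx.x, storage );
   data[ threadIdx.x ] = value;  // element i now holds data[0] + ... + data[i]
}

// hypothetical launch for one tile:  blockScanSketch< 256 ><<< 1, 256 >>>( d_data );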
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, int >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, int >
+{};
+
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, unsigned int >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned int >
+{};
+
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, long >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, long >
+{};
+
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, unsigned long >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned long >
+{};
+
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, long long >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, long long >
+{};
+
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, unsigned long long >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned long long >
+{};
+
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, float >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, float >
+{};
+
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, double >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, double >
+{};

 /* Template for cooperative scan of a data tile in the global memory.
  * It is a *cooperative* operation - all threads must call the operation,
  * otherwise it will deadlock!
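The deadlock warning in this comment is worth spelling out: `scan` contains `__syncthreads()`, so a call guarded by a per-thread condition hangs the whole block. A hedged sketch of the wrong and right call patterns (reusing the hypothetical `AddSketch` functor and elided namespaces from the previous example):

template< int blockSize >
__global__ void partialTileScanSketch( int* data, int elements )
{
   using BlockScan = CudaBlockScanShfl< ScanType::Inclusive, blockSize, AddSketch, int >;
   __shared__ typename BlockScan::Storage storage;

   // WRONG: `if( threadIdx.x < elements ) BlockScan::scan( ... );` -- the
   // guarded-off threads would never reach the __syncthreads() inside scan(),
   // and the rest of the block would wait at the barrier forever.

   // RIGHT: every thread enters the scan; out-of-range threads pass the
   // identity element, which leaves the reduction result unchanged.
   const int value = ( (int) threadIdx.x < elements ) ? data[ threadIdx.x ] : 0;
   const int result = BlockScan::scan( AddSketch{}, 0, value, threadIdx.x, storage );
   if( (int) threadIdx.x < elements )
      data[ threadIdx.x ] = result;
}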