Commit 1a64a618 authored by Jakub Klinkovský

Added specialization of CudaBlockScan using __shfl instructions

parent 4bbf495c
reduction.h (deleted)  +0 −138
/***************************************************************************
                          reduction.h  -  description
                             -------------------
    begin                : Jul 13, 2021
    copyright            : (C) 2021 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Xuan Thang Nguyen

#pragma once

namespace TNL {
    namespace Algorithms {
        namespace Sorting {

#ifdef HAVE_CUDA

/**
 * https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/
 * */


__device__ int warpReduceSum(int initVal)
{
    const unsigned int fullMask = 0xffffffff; // all lanes of the warp participate in the shuffle
    // butterfly reduction: each step exchanges values between lanes whose IDs differ in one bit
    for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1)
        initVal += __shfl_xor_sync(fullMask, initVal, mask);

    return initVal;
}

__device__ int blockReduceSum(int val)
{
    static __shared__ int shared[32]; // one partial sum per warp (at most 32 warps per block)
    int lane = threadIdx.x & (warpSize - 1);
    int wid = threadIdx.x / warpSize;

    // reduce within each warp
    val = warpReduceSum(val);

    // the first lane of each warp stores its partial sum
    if (lane == 0)
        shared[wid] = val;
    __syncthreads();

    // load the per-warp partial sums; 0 is the identity for addition
    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;

    // the first warp reduces the partial sums
    if (wid == 0)
        val = warpReduceSum(val);

    // broadcast the block-wide sum to all threads
    if (threadIdx.x == 0)
        shared[0] = val;
    __syncthreads();

    return shared[0];
}
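
// For orientation, a hypothetical usage sketch (not part of this commit):
// a kernel that reduces an array with blockReduceSum, writing one partial sum
// per block. It assumes blockDim.x is a multiple of warpSize (so every warp
// is complete) and at most 1024 (so the partial sums fit into the 32-slot buffer).
__global__ void sumKernel(const int* input, int* blockSums, int size)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    int val = (i < size) ? input[i] : 0;  // 0 is the identity for addition
    val = blockReduceSum(val);            // cooperative: every thread must call it
    if (threadIdx.x == 0)
        blockSums[blockIdx.x] = val;
}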

//-------------------------------------------------------------------------------

__device__ int warpInclusivePrefixSum(int value)
{
    int laneId = threadIdx.x & (warpSize - 1);

    #pragma unroll
    for (int i = 1; i * 2 <= 32; i *= 2) // 32 is the warp size, written as a literal so the loop can unroll
    {
        // fetch the value of the lane `i` positions below; lanes 0..i-1 keep their value
        int n = __shfl_up_sync(0xffffffff, value, i);
        if (laneId >= i)
            value += n;
    }

    return value;
}

__device__ int blockInclusivePrefixSum(int value)
{
    static __shared__ int shared[32]; // one warp total per warp (at most 32 warps per block)
    int lane = threadIdx.x & (warpSize - 1);
    int wid = threadIdx.x / warpSize;

    // inclusive scan within each warp
    int tmp = warpInclusivePrefixSum(value);

    // the last lane of each warp stores the warp's total
    if (lane == warpSize - 1)
        shared[wid] = tmp;
    __syncthreads();

    // the first warp turns the warp totals into exclusive prefix sums
    int tmp2 = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        shared[lane] = warpInclusivePrefixSum(tmp2) - tmp2;
    __syncthreads();

    // shift each thread's warp-local result by the total of the preceding warps
    tmp += shared[wid];
    return tmp;
}
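
// Another hypothetical sketch (not part of this commit), under the same
// complete-warps assumption: a per-block inclusive prefix sum of an array.
__global__ void prefixSumKernel(const int* input, int* output, int size)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    int val = (i < size) ? input[i] : 0;   // 0 is the identity for addition
    val = blockInclusivePrefixSum(val);    // cooperative: all threads participate
    if (i < size)
        output[i] = val;
}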

//--------------------------------------------------------------------

template<typename Operator>
__device__ int warpCmpReduce(int initVal, const Operator & Cmp)
{
    const unsigned int fullMask = 0xffffffff; // all lanes of the warp participate in the shuffle
    for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1)
        initVal = Cmp(initVal, __shfl_xor_sync(fullMask, initVal, mask));

    return initVal;
}

template<typename Operator>
__device__ int blockCmpReduce(int val, const Operator & Cmp)
{
    static __shared__ int shared[32]; // one partial result per warp (at most 32 warps per block)
    int lane = threadIdx.x & (warpSize - 1);
    int wid = threadIdx.x / warpSize;

    // reduce within each warp
    val = warpCmpReduce(val, Cmp);

    if (lane == 0)
        shared[wid] = val;
    __syncthreads();

    // pad unused slots with shared[0]; this is safe only for idempotent
    // operators such as min or max, where duplicating a value is harmless
    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : shared[0];

    // the first warp reduces the partial results
    if (wid == 0)
        val = warpCmpReduce(val, Cmp);

    // broadcast the block-wide result to all threads
    if (threadIdx.x == 0)
        shared[0] = val;
    __syncthreads();

    return shared[0];
}
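
// A hypothetical example (not part of this commit) of the comparison-based
// reduction: a block-wide maximum. Because inactive slots are padded with
// shared[0] rather than an identity, the functor must tolerate duplicated
// values, which holds for min, max and similar idempotent operations.
struct Max
{
    __device__ int operator()(int a, int b) const { return a > b ? a : b; }
};

__global__ void maxKernel(const int* input, int* blockMaxima, int size)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    // pad out-of-range threads with an element already present in the data,
    // which is safe for idempotent operations like max
    int val = (i < size) ? input[i] : input[0];
    val = blockCmpReduce(val, Max{});  // cooperative call
    if (threadIdx.x == 0)
        blockMaxima[blockIdx.x] = val;
}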

#endif

        } // namespace Sorting
    } // namespace Algorithms
} // namespace TNL
 No newline at end of file
+173 −15
@@ -68,43 +68,43 @@ struct CudaBlockScan
      static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(),
                     "blockSize is too large, it would not be possible to scan warpResults using one warp" );

-      // Store the threadValue in the shared memory.
+      // store the threadValue in the shared memory
      const int chunkResultIdx = Cuda::getInterleaving( tid );
      storage.chunkResults[ chunkResultIdx ] = threadValue;
      __syncthreads();

-      // Perform the parallel scan on chunkResults inside warps.
-      const int threadInWarpIdx = tid % Cuda::getWarpSize();
-      const int warpIdx = tid / Cuda::getWarpSize();
+      // perform the parallel scan on chunkResults inside warps
+      const int lane_id = tid % Cuda::getWarpSize();
+      const int warp_id = tid / Cuda::getWarpSize();
      #pragma unroll
      for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
-         if( threadInWarpIdx >= stride ) {
+         if( lane_id >= stride ) {
            storage.chunkResults[ chunkResultIdx ] = reduction( storage.chunkResults[ chunkResultIdx ], storage.chunkResults[ Cuda::getInterleaving( tid - stride ) ] );
         }
         __syncwarp();
      }
      threadValue = storage.chunkResults[ chunkResultIdx ];

-      // The last thread in warp stores the intermediate result in warpResults.
-      if( threadInWarpIdx == Cuda::getWarpSize() - 1 )
-         storage.warpResults[ warpIdx ] = threadValue;
+      // the last thread in warp stores the intermediate result in warpResults
+      if( lane_id == Cuda::getWarpSize() - 1 )
+         storage.warpResults[ warp_id ] = threadValue;
      __syncthreads();

-      // Perform the scan of warpResults using one warp.
-      if( warpIdx == 0 )
+      // perform the scan of warpResults using one warp
+      if( warp_id == 0 )
         #pragma unroll
         for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) {
-            if( threadInWarpIdx >= stride )
+            if( lane_id >= stride )
               storage.warpResults[ tid ] = reduction( storage.warpResults[ tid ], storage.warpResults[ tid - stride ] );
            __syncwarp();
         }
      __syncthreads();

-      // Shift threadValue by the warpResults.
-      if( warpIdx > 0 )
-         threadValue = reduction( threadValue, storage.warpResults[ warpIdx - 1 ] );
+      // shift threadValue by the warpResults
+      if( warp_id > 0 )
+         threadValue = reduction( threadValue, storage.warpResults[ warp_id - 1 ] );

-      // Shift the result for exclusive scan.
+      // shift the result for exclusive scan
      if( scanType == ScanType::Exclusive ) {
         storage.chunkResults[ chunkResultIdx ] = threadValue;
         __syncthreads();
@@ -116,6 +116,164 @@ struct CudaBlockScan
   }
};

template< ScanType scanType,
          int __unused,  // the __shfl implementation does not depend on the blockSize
          typename Reduction,
          typename ValueType >
struct CudaBlockScanShfl
{
   // storage to be allocated in shared memory
   struct Storage
   {
      ValueType warpResults[ Cuda::getWarpSize() ];
   };

   /* Cooperative scan across the CUDA block - each thread will get the
    * result of the scan according to its ID.
    *
    * \param reduction    The binary reduction functor.
    * \param identity     Neutral element for given reduction operation, i.e.
    *                     value such that `reduction(identity, x) == x` for any `x`.
    * \param threadValue  Value of the calling thread to be included in the scan.
    * \param tid          Index of the calling thread (usually `threadIdx.x`,
    *                     unless you know what you are doing).
    * \param storage      Auxiliary storage (must be allocated as a __shared__
    *                     variable).
    */
   __device__ static
   ValueType
   scan( const Reduction& reduction,
         ValueType identity,
         ValueType threadValue,
         int tid,
         Storage& storage )
   {
      const int lane_id = tid % Cuda::getWarpSize();
      const int warp_id = tid / Cuda::getWarpSize();

      // perform the parallel scan across warps
      ValueType total;
      threadValue = warpScan< scanType >( reduction, identity, threadValue, lane_id, total );

      // the last thread in warp stores the result of inclusive scan in warpResults
      if( lane_id == Cuda::getWarpSize() - 1 )
         storage.warpResults[ warp_id ] = total;
      __syncthreads();

      // the first warp performs the scan of warpResults
      if( warp_id == 0 ) {
         // read from shared memory only if that warp existed
         if( tid < blockDim.x / Cuda::getWarpSize() )
            total = storage.warpResults[ lane_id ];
         else
            total = identity;
         storage.warpResults[ lane_id ] = warpScan< ScanType::Inclusive >( reduction, identity, total, lane_id, total );
      }
      __syncthreads();

      // shift threadValue by the warpResults
      if( warp_id > 0 )
         threadValue = reduction( threadValue, storage.warpResults[ warp_id - 1 ] );

      __syncthreads();
      return threadValue;
   }

   /* Helper function.
    * Cooperative scan across the warp - each thread will get the result of the
    * scan according to its ID.
    * return value = thread's result of the *warpScanType* scan
    * total = thread's result of the *inclusive* scan
    */
   template< ScanType warpScanType >
   __device__ static
   ValueType
   warpScan( const Reduction& reduction,
             ValueType identity,
             ValueType threadValue,
             int lane_id,
             ValueType& total )
   {
      constexpr unsigned mask = 0xffffffff;

      // perform an inclusive scan
      #pragma unroll
      for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
         const ValueType otherValue = __shfl_up_sync( mask, threadValue, stride );
         if( lane_id >= stride )
            threadValue = reduction( threadValue, otherValue );
      }

      // set the result of the inclusive scan
      total = threadValue;

      // shift the result for exclusive scan
      if( warpScanType == ScanType::Exclusive ) {
         threadValue = __shfl_up_sync( mask, threadValue, 1 );
         if( lane_id == 0 )
            threadValue = identity;
      }

      return threadValue;
   }
};
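
// A hypothetical kernel sketch (not part of this commit) showing how the
// struct is meant to be driven: the Storage lives in shared memory and every
// thread of the block calls scan(). The Add functor is a stand-in for any
// associative operation with a known identity.
struct Add
{
   __device__ int operator()( int a, int b ) const { return a + b; }
};

template< int blockSize >
__global__ void exclusiveScanKernel( const int* input, int* output, int size )
{
   using BlockScan = CudaBlockScanShfl< ScanType::Exclusive, blockSize, Add, int >;
   __shared__ typename BlockScan::Storage storage;

   const int i = blockIdx.x * blockDim.x + threadIdx.x;
   const int value = ( i < size ) ? input[ i ] : 0;
   // cooperative call: all threads of the block must participate
   const int result = BlockScan::scan( Add{}, 0, value, threadIdx.x, storage );
   if( i < size )
      output[ i ] = result;
}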

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, int >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, int >
{};

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, unsigned int >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned int >
{};

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, long >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, long >
{};

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, unsigned long >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned long >
{};

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, long long >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, long long >
{};

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, unsigned long long >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned long long >
{};

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, float >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, float >
{};

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, double >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, double >
{};
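
// Net effect of the specializations above: instantiating CudaBlockScan with
// any of these eight fundamental types now resolves to the __shfl-based
// implementation, e.g. (hypothetical instantiation, reusing the Add functor
// from the sketch above):
//
//    using IntScan = CudaBlockScan< ScanType::Inclusive, 256, Add, int >;
//
// Other value types keep the generic shared-memory implementation.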

/* Template for cooperative scan of a data tile in the global memory.
 * It is a *cooperative* operation - all threads must call the operation,
 * otherwise it will deadlock!