Refactored CudaScanKernelFirstPhase - split out CudaBlockScan and CudaTileScan (2f61104b) · Commits · TNL / tnl-dev

src/TNL/Algorithms/detail/CudaScanKernel.h

+223 −110

Original line number	Diff line number	Diff line
		@@ -21,33 +21,159 @@ namespace Algorithms {
		namespace detail {

		#ifdef HAVE_CUDA
		/* Template for cooperative scan across the CUDA block of threads.
		* It is a cooperative operation - all threads must call the operation,
		* otherwise it will deadlock!
		*
		* The default implementation is generic and the reduction is done using
		* shared memory. Specializations can be made based on `Reduction` and
		* `ValueType`, e.g. using the `__shfl_sync` intrinsics for supported
		* value types.
		*/
		template< ScanType scanType,
		int blockSize,
		typename Reduction,
		typename ValueType >
		struct CudaBlockScan
		{
		// storage to be allocated in shared memory
		struct Storage
		{
		ValueType chunkResults[ blockSize + blockSize / Cuda::getNumberOfSharedMemoryBanks() ]; // accessed via Cuda::getInterleaving()
		ValueType warpResults[ Cuda::getWarpSize() ];
		};

		/* Cooperative scan across the CUDA block - each thread will get the
		* result of the scan according to its ID.
		*
		* \param reduction The binary reduction functor.
		* \param zero Neutral element for given reduction operation, i.e.
		* value such that `reduction(zero, x) == x` for any `x`.
		* \param threadValue Value of the calling thread to be reduced.
		* \param tid Index of the calling thread (usually `threadIdx.x`,
		* unless you know what you are doing).
		* \param storage Auxiliary storage (must be allocated as a __shared__
		* variable).
		*/
		__device__ static
		ValueType
		scan( const Reduction& reduction,
		ValueType zero,
		ValueType threadValue,
		int tid,
		Storage& storage )
		{
		// verify the configuration
		TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaBlockScan::scan" );
		static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(),
		"blockSize is too large, it would not be possible to scan warpResults using one warp" );

		// Store the threadValue in the shared memory.
		const int chunkResultIdx = Cuda::getInterleaving( tid );
		storage.chunkResults[ chunkResultIdx ] = threadValue;
		__syncthreads();

		// Perform the parallel scan on chunkResults inside warps.
		const int threadInWarpIdx = tid % Cuda::getWarpSize();
		const int warpIdx = tid / Cuda::getWarpSize();
		#pragma unroll
		for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
		if( threadInWarpIdx >= stride ) {
		storage.chunkResults[ chunkResultIdx ] = reduction( storage.chunkResults[ chunkResultIdx ], storage.chunkResults[ Cuda::getInterleaving( tid - stride ) ] );
		}
		__syncwarp();
		}
		threadValue = storage.chunkResults[ chunkResultIdx ];

		// The last thread in warp stores the intermediate result in warpResults.
		if( threadInWarpIdx == Cuda::getWarpSize() - 1 )
		storage.warpResults[ warpIdx ] = threadValue;
		__syncthreads();

		// Perform the scan of warpResults using one warp.
		if( warpIdx == 0 )
		#pragma unroll
		for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) {
		if( threadInWarpIdx >= stride )
		storage.warpResults[ tid ] = reduction( storage.warpResults[ tid ], storage.warpResults[ tid - stride ] );
		__syncwarp();
		}
		__syncthreads();

		// Shift threadValue by the warpResults.
		if( warpIdx > 0 )
		threadValue = reduction( threadValue, storage.warpResults[ warpIdx - 1 ] );

		// Shift the result for exclusive scan.
		if( scanType == ScanType::Exclusive ) {
		storage.chunkResults[ chunkResultIdx ] = threadValue;
		__syncthreads();
		threadValue = (tid == 0) ? zero : storage.chunkResults[ Cuda::getInterleaving( tid - 1 ) ];
		}

		__syncthreads();
		return threadValue;
		}
		};

		/* Template for cooperative scan of a data tile in the global memory.
		* It is a cooperative operation - all threads must call the operation,
		* otherwise it will deadlock!
		*/
		template< ScanType scanType,
		int blockSize,
		int valuesPerThread,
		typename InputView,
		typename OutputView,
		typename Reduction >
		__global__ void
		CudaScanKernelFirstPhase( const InputView input,
		typename Reduction,
		typename ValueType >
		struct CudaTileScan
		{
		using BlockScan = CudaBlockScan< ScanType::Exclusive, blockSize, Reduction, ValueType >;

		// storage to be allocated in shared memory
		struct Storage
		{
		ValueType data[ blockSize * valuesPerThread ];
		typename BlockScan::Storage blockScanStorage;
		};

		/* Cooperative scan of a data tile in the global memory - each thread will
		* get the result of its chunk (i.e. the last value of the (inclusive) scan
		* in the chunk) according to the thread ID.
		*
		* \param input The input array to be scanned.
		* \param output The array where the result will be stored.
		* \param begin The first element in the array to be scanned.
		* \param end the last element in the array to be scanned.
		* \param outputBegin The first element in the output array to be written. There
		* must be at least `end - begin` elements in the output
		* array starting at the position given by `outputBegin`.
		* \param reduction The binary reduction functor.
		* \param zero Neutral element for given reduction operation, i.e.
		* value such that `reduction(zero, x) == x` for any `x`.
		* \param shift A global shift to be applied to all elements in the
		* chunk processed by this thread.
		* \param storage Auxiliary storage (must be allocated as a __shared__
		* variable).
		*/
		template< typename InputView,
		typename OutputView >
		__device__ static
		ValueType
		scan( const InputView input,
		OutputView output,
		typename InputView::IndexType begin,
		typename InputView::IndexType end,
		typename OutputView::IndexType outputBegin,
		Reduction reduction,
		typename OutputView::ValueType zero,
		typename OutputView::ValueType* blockResults )
		const Reduction& reduction,
		ValueType zero,
		ValueType shift,
		Storage& storage )
		{
		using ValueType = typename OutputView::ValueType;
		using IndexType = typename InputView::IndexType;

		// verify the configuration
		TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelFirstPhase" );
		static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(),
		"blockSize is too large, it would not be possible to scan warpResults using one warp" );
		TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaTileScan::scan" );
		static_assert( valuesPerThread % 2,
		"valuesPerThread must be odd, otherwise there would be shared memory bank conflicts "
		"when threads access their chunks in sharedData sequentially" );
		"when threads access their chunks in shared memory sequentially" );

		// calculate indices
		constexpr int maxElementsInBlock = blockSize * valuesPerThread;
		@@ -59,17 +185,12 @@ CudaScanKernelFirstPhase( const InputView input,
		begin += threadOffset;
		outputBegin += threadOffset;

		// allocate shared memory
		__shared__ ValueType sharedData[ maxElementsInBlock ];
		__shared__ ValueType chunkResults[ blockSize + blockSize / Cuda::getNumberOfSharedMemoryBanks() ]; // accessed via Cuda::getInterleaving()
		__shared__ ValueType warpResults[ Cuda::getWarpSize() ];

		// Load data into the shared memory.
		{
		int idx = threadIdx.x;
		while( idx < elementsInBlock )
		{
		sharedData[ idx ] = input[ begin ];
		storage.data[ idx ] = input[ begin ];
		begin += blockDim.x;
		idx += blockDim.x;
		}
		@@ -77,77 +198,35 @@ CudaScanKernelFirstPhase( const InputView input,
		// (this helps to avoid divergent branches in the blocks below)
		while( idx < maxElementsInBlock )
		{
		sharedData[ idx ] = zero;
		storage.data[ idx ] = zero;
		idx += blockDim.x;
		}
		}
		__syncthreads();

		// Perform sequential reduction of the chunk in shared memory.
		// Perform sequential reduction of the thread's chunk in shared memory.
		const int chunkOffset = threadIdx.x * valuesPerThread;
		const int chunkResultIdx = Cuda::getInterleaving( threadIdx.x );
		{
		ValueType chunkResult = sharedData[ chunkOffset ];
		ValueType value = storage.data[ chunkOffset ];
		#pragma unroll
		for( int i = 1; i < valuesPerThread; i++ )
		chunkResult = reduction( chunkResult, sharedData[ chunkOffset + i ] );
		value = reduction( value, storage.data[ chunkOffset + i ] );

		// store the result of the sequential reduction of the chunk in chunkResults
		chunkResults[ chunkResultIdx ] = chunkResult;
		}
		__syncthreads();
		// Scan the spine to obtain the initial value ("offset") for the downsweep.
		value = BlockScan::scan( reduction, zero, value, threadIdx.x, storage.blockScanStorage );

		// Perform the parallel scan on chunkResults inside warps.
		const int threadInWarpIdx = threadIdx.x % Cuda::getWarpSize();
		const int warpIdx = threadIdx.x / Cuda::getWarpSize();
		#pragma unroll
		for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
		if( threadInWarpIdx >= stride ) {
		chunkResults[ chunkResultIdx ] = reduction( chunkResults[ chunkResultIdx ], chunkResults[ Cuda::getInterleaving( threadIdx.x - stride ) ] );
		}
		__syncwarp();
		}

		// The last thread in warp stores the intermediate result in warpResults.
		if( threadInWarpIdx == Cuda::getWarpSize() - 1 )
		warpResults[ warpIdx ] = chunkResults[ chunkResultIdx ];
		__syncthreads();

		// Perform the scan of warpResults using one warp.
		if( warpIdx == 0 )
		#pragma unroll
		for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) {
		if( threadInWarpIdx >= stride )
		warpResults[ threadIdx.x ] = reduction( warpResults[ threadIdx.x ], warpResults[ threadIdx.x - stride ] );
		__syncwarp();
		}
		__syncthreads();

		// Shift chunkResults by the warpResults.
		if( warpIdx > 0 )
		chunkResults[ chunkResultIdx ] = reduction( chunkResults[ chunkResultIdx ], warpResults[ warpIdx - 1 ] );
		__syncthreads();

		// Downsweep step: scan the chunks and use the chunk result as the initial value.
		{
		ValueType value = zero;
		if( threadIdx.x > 0 )
		value = chunkResults[ Cuda::getInterleaving( threadIdx.x - 1 ) ];
		// Apply the global shift.
		value = reduction( value, shift );

		// Downsweep step: scan the chunks and use the result of spine scan as the initial value.
		#pragma unroll
		for( int i = 0; i < valuesPerThread; i++ )
		{
		const ValueType inputValue = sharedData[ chunkOffset + i ];
		const ValueType inputValue = storage.data[ chunkOffset + i ];
		if( scanType == ScanType::Exclusive )
		sharedData[ chunkOffset + i ] = value;
		storage.data[ chunkOffset + i ] = value;
		value = reduction( value, inputValue );
		if( scanType == ScanType::Inclusive )
		sharedData[ chunkOffset + i ] = value;
		}

		// The last thread of the block stores the block result in the global memory.
		if( blockResults && threadIdx.x == blockDim.x - 1 )
		blockResults[ blockIdx.x ] = value;
		storage.data[ chunkOffset + i ] = value;
		}
		__syncthreads();

		@@ -156,11 +235,45 @@ CudaScanKernelFirstPhase( const InputView input,
		int idx = threadIdx.x;
		while( idx < elementsInBlock )
		{
		output[ outputBegin ] = sharedData[ idx ];
		output[ outputBegin ] = storage.data[ idx ];
		outputBegin += blockDim.x;
		idx += blockDim.x;
		}
		}

		// Return the last (inclusive) scan value of the chunk processed by this thread.
		return value;
		}
		};

		template< ScanType scanType,
		int blockSize,
		int valuesPerThread,
		typename InputView,
		typename OutputView,
		typename Reduction >
		__global__ void
		CudaScanKernelFirstPhase( const InputView input,
		OutputView output,
		typename InputView::IndexType begin,
		typename InputView::IndexType end,
		typename OutputView::IndexType outputBegin,
		Reduction reduction,
		typename OutputView::ValueType zero,
		typename OutputView::ValueType* blockResults )
		{
		using ValueType = typename OutputView::ValueType;
		using TileScan = CudaTileScan< scanType, blockSize, valuesPerThread, Reduction, ValueType >;

		// allocate shared memory
		__shared__ typename TileScan::Storage storage;

		// scan from input into output
		const ValueType value = TileScan::scan( input, output, begin, end, outputBegin, reduction, zero, zero, storage );

		// The last thread of the block stores the block result in the global memory.
		if( blockResults && threadIdx.x == blockDim.x - 1 )
		blockResults[ blockIdx.x ] = value;
		}

		template< int blockSize,