Commit e470040a authored by Jakub Klinkovský, committed by Jakub Klinkovský

Style changes in the code for reduction

parent 1a82b047
+227 −240
@@ -12,20 +12,17 @@

+#include <utility>  // std::pair
 
-#ifdef HAVE_CUDA
-#include <cuda.h>
-#endif
-
 #include <TNL/Assert.h>
 #include <TNL/Math.h>
 #include <TNL/Devices/CudaDeviceInfo.h>
 #include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
 #include <TNL/Containers/Algorithms/ArrayOperations.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
 
 namespace TNL {
 namespace Containers {
 namespace Algorithms {
 
-#ifdef HAVE_CUDA
 /****
  * The performance of this kernel is very sensitive to register usage.
  * Compile with --ptxas-options=-v and configure these constants for given
@@ -34,6 +31,7 @@ namespace Algorithms {
 static constexpr int Reduction_maxThreadsPerBlock = 256;  // must be a power of 2
 static constexpr int Reduction_registersPerThread = 32;   // empirically determined optimal value
 
+#ifdef HAVE_CUDA
 // __CUDA_ARCH__ is defined only in device code!
 #if (__CUDA_ARCH__ >= 300 )
    static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
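For context: tuning constants like these are typically enforced with `__launch_bounds__`, which tells `ptxas` the maximum block size and the desired number of resident blocks per multiprocessor, so the compiler caps registers per thread instead of spilling silently. A minimal sketch (illustrative names, not the diff's code):

```cpp
// Hypothetical kernel showing how such tuning constants are usually applied.
static constexpr int maxThreadsPerBlock = 256;        // must be a power of 2
static constexpr int minBlocksPerMultiprocessor = 8;  // tuned per architecture

template< typename Result >
__global__ void
__launch_bounds__( maxThreadsPerBlock, minBlocksPerMultiprocessor )
exampleReductionKernel( const Result* input, Result* output, int size )
{
   // ptxas now limits register usage so that minBlocksPerMultiprocessor
   // blocks of maxThreadsPerBlock threads can be resident at once; verify
   // the actual usage with: nvcc --ptxas-options=-v
}
```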
@@ -56,19 +54,16 @@ CudaReductionKernel( const Result zero,
                      const Index size,
                      Result* output )
 {
-   using IndexType = Index;
-   using ResultType = Result;
-
-   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
+   Result* sdata = Devices::Cuda::getSharedMemory< Result >();
 
    // Get the thread id (tid), global thread id (gid) and gridSize.
-   const IndexType tid = threadIdx.x;
-         IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;
-   const IndexType gridSize = blockDim.x * gridDim.x;
+   const Index tid = threadIdx.x;
+         Index gid = blockIdx.x * blockDim.x + threadIdx.x;
+   const Index gridSize = blockDim.x * gridDim.x;
 
    sdata[ tid ] = zero;
 
-   // Read data into the shared memory. We start with the sequential reduction.
+   // Start with the sequential reduction and push the result into the shared memory.
    while( gid + 4 * gridSize < size ) {
       reduction( sdata[ tid ], dataFetcher( gid ) );
       reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
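The loop above is the first, sequential phase: each thread privately reduces a strided slice of the input (unrolled four-fold) before any inter-thread combination happens. The same pattern with a plain sum, as a self-contained sketch (names are illustrative, not TNL's):

```cpp
// Grid-stride sequential phase of a sum reduction (illustrative sketch).
// Assumes blockResults was zero-initialized before the launch.
__global__ void partialSums( const float* input, float* blockResults, int size )
{
   float acc = 0;
   int gid = blockIdx.x * blockDim.x + threadIdx.x;
   const int gridSize = blockDim.x * gridDim.x;

   // unrolled four-fold, mirroring the loop in the diff
   while( gid + 4 * gridSize < size ) {
      acc += input[ gid ];
      acc += input[ gid + gridSize ];
      acc += input[ gid + 2 * gridSize ];
      acc += input[ gid + 3 * gridSize ];
      gid += 4 * gridSize;
   }
   // remainder, one stride at a time
   while( gid < size ) {
      acc += input[ gid ];
      gid += gridSize;
   }
   // acc would next go to shared memory for the tree phase; here we simply
   // fold it into one result per block to keep the sketch short
   atomicAdd( &blockResults[ blockIdx.x ], acc );
}
```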
@@ -111,29 +106,24 @@ CudaReductionKernel( const Result zero,

    // This runs in one warp so it is synchronized implicitly.
    if( tid < 32 ) {
-      volatile ResultType* vsdata = sdata;
-      if( blockSize >= 64 ) {
+      volatile Result* vsdata = sdata;
+      if( blockSize >= 64 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
-      }
       // Note that here we do not have to check if tid < 16 etc, because we have
-      // twice as much shared memory, so we do not access out of bounds. The
-      // results for the upper half will be undefined, but unused anyway.
-      if( blockSize >= 32 ) {
+      // 2 * blockSize.x elements of shared memory per block, so we do not
+      // access out of bounds. The results for the upper half will be undefined,
+      // but unused anyway.
+      if( blockSize >= 32 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
-      }
-      if( blockSize >= 16 ) {
+      if( blockSize >= 16 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
-      }
-      if( blockSize >=  8 ) {
+      if( blockSize >=  8 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
-      }
-      if( blockSize >=  4 ) {
+      if( blockSize >=  4 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
-      }
-      if( blockSize >=  2 ) {
+      if( blockSize >=  2 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
-      }
    }
 
    // Store the result back in the global memory.
    if( tid == 0 )
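The `tid < 32` block above is the classic warp-synchronous tail: once only a single warp is active, `__syncthreads()` can be skipped because (on the pre-Volta execution model) the warp runs in lockstep, and `volatile` stops the compiler from caching shared-memory values in registers between steps. Stripped of TNL's functors, the idiom looks like this:

```cpp
// Warp-synchronous tail of a shared-memory sum reduction (illustrative).
// volatile forces every access to actually touch shared memory, which the
// lockstep execution of a single warp relies on.
__device__ void warpReduceSum( volatile float* sdata, int tid )
{
   sdata[ tid ] += sdata[ tid + 32 ];
   sdata[ tid ] += sdata[ tid + 16 ];
   sdata[ tid ] += sdata[ tid +  8 ];
   sdata[ tid ] += sdata[ tid +  4 ];
   sdata[ tid ] += sdata[ tid +  2 ];
   sdata[ tid ] += sdata[ tid +  1 ];
}
```

Note that Volta's independent thread scheduling later weakened the implicit lockstep guarantee, so newer code tends to use `__syncwarp()` or warp shuffle intrinsics instead.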
@@ -157,18 +147,15 @@ CudaReductionWithArgumentKernel( const Result zero,
                                  Index* idxOutput,
                                  const Index* idxInput = nullptr )
 {
-   using IndexType = Index;
-   using ResultType = Result;
-
-   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
-   IndexType* sidx = reinterpret_cast< IndexType* >( &sdata[ blockDim.x ] );
+   Result* sdata = Devices::Cuda::getSharedMemory< Result >();
+   Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] );
 
    // Get the thread id (tid), global thread id (gid) and gridSize.
-   const IndexType tid = threadIdx.x;
-         IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;
-   const IndexType gridSize = blockDim.x * gridDim.x;
+   const Index tid = threadIdx.x;
+         Index gid = blockIdx.x * blockDim.x + threadIdx.x;
+   const Index gridSize = blockDim.x * gridDim.x;
 
-   // Read data into the shared memory. We start with the sequential reduction.
+   // Start with the sequential reduction and push the result into the shared memory.
    if( idxInput ) {
       if( gid < size ) {
          sdata[ tid ] = dataFetcher( gid );
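The with-argument variant tracks the index of the winning element alongside its value, so one dynamic shared-memory allocation is carved into two arrays: `blockDim.x` results followed by `blockDim.x` indices. A standalone sketch of that carving (safe as long as `alignof( Result ) >= alignof( Index )`, which holds for the usual float/double value plus int index combinations):

```cpp
// Carving one dynamic shared-memory block into a value array and an index
// array, as the kernel above does via getSharedMemory + reinterpret_cast.
extern __shared__ unsigned char smemPool[];

template< typename Result, typename Index >
__device__ void carveSharedMemory( Result*& sdata, Index*& sidx )
{
   sdata = reinterpret_cast< Result* >( smemPool );
   sidx  = reinterpret_cast< Index* >( &sdata[ blockDim.x ] );
}
```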
@@ -245,30 +232,25 @@ CudaReductionWithArgumentKernel( const Result zero,

    // This runs in one warp so it is synchronized implicitly.
    if( tid < 32 ) {
-      volatile ResultType* vsdata = sdata;
-      volatile IndexType* vsidx = sidx;
-      if( blockSize >= 64 ) {
+      volatile Result* vsdata = sdata;
+      volatile Index* vsidx = sidx;
+      if( blockSize >= 64 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 32 ], vsdata[ tid ], vsdata[ tid + 32 ] );
-      }
       // Note that here we do not have to check if tid < 16 etc, because we have
-      // twice as much shared memory, so we do not access out of bounds. The
-      // results for the upper half will be undefined, but unused anyway.
-      if( blockSize >= 32 ) {
+      // 2 * blockSize.x elements of shared memory per block, so we do not
+      // access out of bounds. The results for the upper half will be undefined,
+      // but unused anyway.
+      if( blockSize >= 32 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 16 ], vsdata[ tid ], vsdata[ tid + 16 ] );
-      }
-      if( blockSize >= 16 ) {
+      if( blockSize >= 16 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 8 ], vsdata[ tid ], vsdata[ tid + 8 ] );
-      }
-      if( blockSize >=  8 ) {
+      if( blockSize >=  8 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 4 ], vsdata[ tid ], vsdata[ tid + 4 ] );
-      }
-      if( blockSize >=  4 ) {
+      if( blockSize >=  4 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 2 ], vsdata[ tid ], vsdata[ tid + 2 ] );
-      }
-      if( blockSize >=  2 ) {
+      if( blockSize >=  2 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 1 ], vsdata[ tid ], vsdata[ tid + 1 ] );
-      }
    }
 
    // Store the result back in the global memory.
    if( tid == 0 ) {
@@ -276,15 +258,13 @@ CudaReductionWithArgumentKernel( const Result zero,
       idxOutput[ blockIdx.x ] = sidx[ 0 ];
    }
 }
+#endif
 
 
 template< typename Index,
           typename Result >
 struct CudaReductionKernelLauncher
 {
-   using IndexType = Index;
-   using ResultType = Result;
-
    // The number of blocks should be a multiple of the number of multiprocessors
    // to ensure optimum balancing of the load. This is very important, because
    // we run the kernel with a fixed number of blocks, so the amount of work per
@@ -315,13 +295,13 @@ struct CudaReductionKernelLauncher
               const VolatileReduction& volatileReduction,
               const DataFetcher& dataFetcher,
               const Result& zero,
-              ResultType*& output )
+              Result*& output )
    {
       // create reference to the reduction buffer singleton and set size
-      const std::size_t buf_size = 2 * desGridSize * sizeof( ResultType );
+      const std::size_t buf_size = 2 * desGridSize * sizeof( Result );
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
       cudaReductionBuffer.setSize( buf_size );
-      output = cudaReductionBuffer.template getData< ResultType >();
+      output = cudaReductionBuffer.template getData< Result >();
 
       this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output );
       return this->reducedSize;
@@ -334,15 +314,15 @@ struct CudaReductionKernelLauncher
                           const VolatileReduction& volatileReduction,
                           const DataFetcher& dataFetcher,
                           const Result& zero,
-                          ResultType*& output,
-                          IndexType*& idxOutput )
+                          Result*& output,
+                          Index*& idxOutput )
    {
       // create reference to the reduction buffer singleton and set size
-      const std::size_t buf_size = 2 * desGridSize * ( sizeof( ResultType ) + sizeof( IndexType ) );
+      const std::size_t buf_size = 2 * desGridSize * ( sizeof( Result ) + sizeof( Index ) );
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
       cudaReductionBuffer.setSize( buf_size );
-      output = cudaReductionBuffer.template getData< ResultType >();
-      idxOutput = reinterpret_cast< IndexType* >( &output[ 2 * desGridSize ] );
+      output = cudaReductionBuffer.template getData< Result >();
+      idxOutput = reinterpret_cast< Index* >( &output[ 2 * desGridSize ] );
 
       this->reducedSize = this->launchWithArgument( originalSize, reduction, volatileReduction, dataFetcher, zero, output, idxOutput, nullptr );
       return this->reducedSize;
@@ -357,13 +337,13 @@ struct CudaReductionKernelLauncher
    {
       // Input is the first half of the buffer, output is the second half
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
-      ResultType* input = cudaReductionBuffer.template getData< ResultType >();
-      ResultType* output = &input[ desGridSize ];
+      Result* input = cudaReductionBuffer.template getData< Result >();
+      Result* output = &input[ desGridSize ];
 
       while( this->reducedSize > 1 )
       {
          // this lambda has to be defined inside the loop, because the captured variable changes
-         auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
+         auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; };
          this->reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output );
          std::swap( input, output );
       }
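`finish` thus reduces the partial results in place: each pass launches the kernel with a trivial fetcher that just reads the previous pass's output, and the two halves of the reduction buffer alternate roles until a single value is left. A simplified host-side version of the loop, with a hypothetical `launchReductionKernel` standing in for `this->launch`:

```cpp
#include <utility>         // std::swap
#include <cuda_runtime.h>  // cudaMemcpy

// Hypothetical stand-in: reduces n floats from in into at most gridSize
// partial results written to out, returning how many partials were produced.
int launchReductionKernel( const float* in, float* out, int n );

float finishSum( float* input, float* output, int reducedSize )
{
   while( reducedSize > 1 ) {
      reducedSize = launchReductionKernel( input, output, reducedSize );
      std::swap( input, output );   // last output becomes next input
   }
   // after the final swap the result sits in input[ 0 ]; copy it to the host
   float result;
   cudaMemcpy( &result, input, sizeof( float ), cudaMemcpyDeviceToHost );
   return result;
}
```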
@@ -373,7 +353,7 @@ struct CudaReductionKernelLauncher
       std::swap( input, output );
 
       // Copy result on CPU
-      ResultType result;
+      Result result;
       ArrayOperations< Devices::Host, Devices::Cuda >::copy( &result, output, 1 );
       return result;
    }
@@ -387,15 +367,15 @@ struct CudaReductionKernelLauncher
    {
       // Input is the first half of the buffer, output is the second half
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
-      ResultType* input = cudaReductionBuffer.template getData< ResultType >();
-      ResultType* output = &input[ desGridSize ];
-      IndexType* idxInput = reinterpret_cast< IndexType* >( &output[ desGridSize ] );
-      IndexType* idxOutput = &idxInput[ desGridSize ];
+      Result* input = cudaReductionBuffer.template getData< Result >();
+      Result* output = &input[ desGridSize ];
+      Index* idxInput = reinterpret_cast< Index* >( &output[ desGridSize ] );
+      Index* idxOutput = &idxInput[ desGridSize ];
 
       while( this->reducedSize > 1 )
       {
          // this lambda has to be defined inside the loop, because the captured variable changes
-         auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
+         auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; };
          this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output, idxOutput, idxInput );
          std::swap( input, output );
          std::swap( idxInput, idxOutput );
@@ -426,15 +406,16 @@ struct CudaReductionKernelLauncher
                   const Result& zero,
                   Result* output )
       {
+#ifdef HAVE_CUDA
          dim3 blockSize, gridSize;
          blockSize.x = Reduction_maxThreadsPerBlock;
          gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
 
          // when there is only one warp per blockSize.x, we need to allocate two warps
          // worth of shared memory so that we don't index shared memory out of bounds
-         const IndexType shmem = (blockSize.x <= 32)
-                  ? 2 * blockSize.x * sizeof( ResultType )
-                  : blockSize.x * sizeof( ResultType );
+         const Index shmem = (blockSize.x <= 32)
+                  ? 2 * blockSize.x * sizeof( Result )
+                  : blockSize.x * sizeof( Result );
 
          // This is "general", but this method always sets blockSize.x to a specific value,
          // so runtime switch is not necessary - it only prolongs the compilation time.
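The ternary above implements the same guarantee the kernel comments rely on: a block of at most 32 threads (a single warp) still reads `sdata[ tid + 32 ]` in the warp-synchronous tail, so two warps' worth of shared memory must be allocated. As a standalone rule (sketch):

```cpp
#include <cstddef>

// Shared-memory sizing rule used above: with one warp or less per block,
// allocate twice the elements so reads past blockDim.x stay in bounds.
template< typename Result >
constexpr std::size_t reductionShmemSize( unsigned blockSizeX )
{
   return ( blockSizeX <= 32 )
        ? 2 * blockSizeX * sizeof( Result )
        :     blockSizeX * sizeof( Result );
}

// e.g. reductionShmemSize< double >( 32 ) == 512 bytes, not 256
```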
@@ -515,6 +496,9 @@ struct CudaReductionKernelLauncher

          // Return the size of the output array on the CUDA device
          return gridSize.x;
+#else
+         throw Exceptions::CudaSupportMissing();
+#endif
       }
 
       template< typename DataFetcher,
@@ -529,15 +513,16 @@ struct CudaReductionKernelLauncher
                               Index* idxOutput,
                               const Index* idxInput )
       {
+#ifdef HAVE_CUDA
          dim3 blockSize, gridSize;
          blockSize.x = Reduction_maxThreadsPerBlock;
          gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
 
          // when there is only one warp per blockSize.x, we need to allocate two warps
          // worth of shared memory so that we don't index shared memory out of bounds
-         const IndexType shmem = (blockSize.x <= 32)
-                  ? 2 * blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) )
-                  : blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) );
+         const Index shmem = (blockSize.x <= 32)
+                  ? 2 * blockSize.x * ( sizeof( Result ) + sizeof( Index ) )
+                  : blockSize.x * ( sizeof( Result ) + sizeof( Index ) );
 
          // This is "general", but this method always sets blockSize.x to a specific value,
          // so runtime switch is not necessary - it only prolongs the compilation time.
@@ -618,16 +603,18 @@ struct CudaReductionKernelLauncher

          // return the size of the output array on the CUDA device
          return gridSize.x;
+#else
+         throw Exceptions::CudaSupportMissing();
+#endif
       }
 
 
       const int activeDevice;
       const int blocksdPerMultiprocessor;
       const int desGridSize;
-      const IndexType originalSize;
-      IndexType reducedSize;
+      const Index originalSize;
+      Index reducedSize;
 };
-#endif
 
 } // namespace Algorithms
 } // namespace Containers
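The launcher's comment (in the hunk introducing the struct) explains why `desGridSize` should be a multiple of the multiprocessor count: the kernel runs with a fixed number of blocks, so an uneven split leaves some SMs idle. TNL derives this through its `CudaDeviceInfo` wrapper; with the plain CUDA runtime API the same quantity could be computed like this (a sketch):

```cpp
#include <cuda_runtime.h>

// Pick a grid size that divides evenly among the device's multiprocessors,
// so every SM receives the same share of the fixed number of blocks.
int desiredGridSize( int blocksPerMultiprocessor )
{
   int device = 0;
   int smCount = 0;
   cudaGetDevice( &device );
   cudaDeviceGetAttribute( &smCount, cudaDevAttrMultiProcessorCount, device );
   return blocksPerMultiprocessor * smCount;
}
```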
+47 −49
@@ -22,12 +22,11 @@ namespace Containers {
 namespace Algorithms {
 
 template< typename Device >
-class Reduction;
+struct Reduction;
 
 template<>
-class Reduction< Devices::Host >
+struct Reduction< Devices::Host >
 {
-   public:
    template< typename Index,
              typename Result,
              typename ReductionOperation,
@@ -54,9 +53,8 @@ class Reduction< Devices::Host >
 };
 
 template<>
-class Reduction< Devices::Cuda >
+struct Reduction< Devices::Cuda >
 {
-   public:
    template< typename Index,
              typename Result,
              typename ReductionOperation,
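This second file's change is purely cosmetic: a `class` whose first member section is `public:` is interchangeable with a `struct`, whose members are public by default, so dropping the access specifier changes nothing about the generated code. For example:

```cpp
// These two declarations are equivalent: struct members are public by default.
class ReductionA
{
   public:
   static constexpr int value = 1;
};

struct ReductionB
{
   static constexpr int value = 1;
};

static_assert( ReductionA::value == ReductionB::value, "identical interfaces" );
```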
+53 −97

(third file's diff collapsed: preview size limit exceeded)