Optimizing CUDA parallel reduction. (e36a896a) · Commits · TNL / tnl-dev

src/core/cuda/cuda-prefix-sum_impl.h

+15 −10

Original line number	Diff line number	Diff line
		@@ -86,7 +86,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		while( chunkPointer < chunkSize &&
		chunkOffset + chunkPointer < lastElementInBlock )
		{
		operation.performInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
		operation.commonReductionOnDevice( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
		sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
		auxData[ threadIdx. x ] =
		sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ];
		@@ -100,7 +100,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		const int warpIdx = threadIdx. x / tnlCuda::getWarpSize();
		for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
		if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks )
		operation.performInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );
		operation.commonReductionOnDevice( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );

		if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 )
		warpSums[ warpIdx ] = auxData[ threadIdx. x ];
		@@ -112,14 +112,14 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		if( warpIdx == 0 )
		for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
		if( threadInWarpIdx >= stride )
		operation.performInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
		operation.commonReductionOnDevice( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
		__syncthreads();

		/****
		* Shift the warp prefix-sums.
		*/
		if( warpIdx > 0 )
		operation.performInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );
		operation.commonReductionOnDevice( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );

		/***
		* Store the result back in global memory.
		@@ -132,7 +132,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		DataType chunkShift( operation.initialValue() );
		if( chunkIdx > 0 )
		chunkShift = auxData[ chunkIdx - 1 ];
		operation.performInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
		operation.commonReductionOnDevice( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
		output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ];
		idx += blockDim. x;
		}
		@@ -141,10 +141,15 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		if( threadIdx. x == 0 )
		{
		if( prefixSumType == exclusivePrefixSum )
		auxArray[ blockIdx. x ] =
		operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ),
		{
		/*auxArray[ blockIdx. x ] = operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ),
		tnlCuda::getInterleaving( lastElementInBlock ),
		sharedData );
		sharedData );*/
		DataType aux = operation.initialValue();
		operation.commonReductionOnDevice( aux, sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ] );
		operation.commonReductionOnDevice( aux, sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] );
		auxArray[ blockIdx. x ] = aux;
		}
		else
		auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ];
		}

src/core/cuda/cuda-reduction_impl.h

+83 −146

Original line number	Diff line number	Diff line
		@@ -37,27 +37,25 @@

		using namespace std;


		/****
		* Arrays smaller than the following constant
		* are reduced on CPU. The constant must not be larger
		* than maximal CUDA grid size.
		*/
		const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256;
		const int minGPUReductionDataSize = 128;//65536; //16384;//1024;//256;

		static tnlCudaReductionBuffer cudaReductionBuffer( 8 * minGPUReductionDataSize );

		#ifdef HAVE_CUDA


		template< typename Operation, int blockSize, bool isSizePow2 >
		template< typename Operation, int blockSize >
		__global__ void tnlCUDAReductionKernel( const Operation operation,
		const typename Operation :: IndexType size,
		const typename Operation :: RealType* input1,
		const typename Operation :: RealType* input2,
		typename Operation :: ResultType* output )
		{
		typedef tnlCUDAReduction< Operation, blockSize, isSizePow2 > Reduction;
		typedef tnlCUDAReduction< Operation, blockSize > Reduction;
		Reduction::reduce( operation, size, input1, input2, output );
		};

		@@ -74,63 +72,52 @@ typename Operation::IndexType reduceOnCudaDevice( const Operation& operation,

		const IndexType desGridSize( minGPUReductionDataSize );
		dim3 blockSize( 256 ), gridSize( 0 );

		gridSize. x = Min( tnlCuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

		/*#ifdef CUDA_REDUCTION_PROFILING
		tnlTimerRT timer;
		timer.reset();
		timer.start();
		#endif */

		if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) )
		return false;
		output = cudaReductionBuffer.template getData< ResultType >();


		IndexType shmem = blockSize.x * sizeof( ResultType );

		/***
		* Depending on the blockSize we generate appropriate template instance.
		*/

		if( isPow2( size ) )
		{
		switch( blockSize.x )
		{
		case 512:
		tnlCUDAReductionKernel< Operation, 512, true >
		tnlCUDAReductionKernel< Operation, 512 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 256:
		tnlCUDAReductionKernel< Operation, 256, true >
		tnlCUDAReductionKernel< Operation, 256 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 128:
		tnlCUDAReductionKernel< Operation, 128, true >
		tnlCUDAReductionKernel< Operation, 128 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 64:
		tnlCUDAReductionKernel< Operation, 64, true >
		tnlCUDAReductionKernel< Operation, 64 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 32:
		tnlCUDAReductionKernel< Operation, 32, true >
		tnlCUDAReductionKernel< Operation, 32 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 16:
		tnlCUDAReductionKernel< Operation, 16, true >
		tnlCUDAReductionKernel< Operation, 16 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 8:
		tnlCUDAReductionKernel< Operation, 8, true >
		tnlCUDAReductionKernel< Operation, 8 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 4:
		tnlCUDAReductionKernel< Operation, 4, true >
		tnlCUDAReductionKernel< Operation, 4 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 2:
		tnlCUDAReductionKernel< Operation, 2, true >
		tnlCUDAReductionKernel< Operation, 2 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 1:
		@@ -138,59 +125,7 @@ typename Operation::IndexType reduceOnCudaDevice( const Operation& operation,
		default:
		tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
		}
		}
		else
		{
		switch( blockSize.x )
		{
		case 512:
		tnlCUDAReductionKernel< Operation, 512, false >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 256:
		tnlCUDAReductionKernel< Operation, 256, false >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 128:
		tnlCUDAReductionKernel< Operation, 128, false >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 64:
		tnlCUDAReductionKernel< Operation, 64, false >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 32:
		tnlCUDAReductionKernel< Operation, 32, false >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 16:
		tnlCUDAReductionKernel< Operation, 16, false >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 8:
		tnlCUDAReductionKernel< Operation, 8, false >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 4:
		tnlCUDAReductionKernel< Operation, 4, false >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 2:
		tnlCUDAReductionKernel< Operation, 2, false >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		break;
		case 1:
		tnlAssert( false, cerr << "blockSize should not be 1." << endl );
		default:
		tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
		}
		}
		//checkCudaDevice;
		/*#ifdef CUDA_REDUCTION_PROFILING
		//cudaThreadSynchronize();
		timer.stop();
		cout << " Main reduction on GPU took " << timer.getTime() << " sec. " << endl;
		#endif */
		return gridSize. x;
		}
		#endif
		@@ -222,20 +157,21 @@ bool reductionOnCudaDevice( const Operation& operation,
		if( deviceInput2 && !
		tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, size ) )
		return false;
		result = operation. initialValueOnHost( 0, hostArray1, hostArray2 );
		for( IndexType i = 1; i < size; i ++ )
		result = operation.initialValue();
		for( IndexType i = 0; i < size; i ++ )
		result = operation.reduceOnHost( i, result, hostArray1, hostArray2 );
		return true;
		}

		/****
		* Reduce the data on the CUDA device.
		*/
		#ifdef CUDA_REDUCTION_PROFILING
		tnlTimerRT timer;
		timer.reset();
		timer.start();
		#endif

		/****
		* Reduce the data on the CUDA device.
		*/
		ResultType* deviceAux1( 0 );
		IndexType reducedSize = reduceOnCudaDevice( operation,
		size,
		@@ -245,39 +181,40 @@ bool reductionOnCudaDevice( const Operation& operation,
		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getTime() << " sec. " << endl;
		timer.reset();
		timer.start();
		#endif

		/***
		* Transfer the reduced data from device to host.
		*/
		#ifdef CUDA_REDUCTION_PROFILING
		timer.reset();
		timer.start();
		#endif
		ResultType resultArray[ minGPUReductionDataSize ];
		if( ! tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) )
		return false;

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		cout << " Transferring data to CPU took " << timer.getTime() << " sec. " << endl;
		#endif

		/***
		* Reduce the data on the host system.
		*/
		LaterReductionOperation laterReductionOperation;
		#ifdef CUDA_REDUCTION_PROFILING
		timer.reset();
		timer.start();
		#endif
		result = laterReductionOperation. initialValueOnHost( 0, resultArray, ( ResultType* ) 0 );
		for( IndexType i = 1; i < reducedSize; i ++ )

		/***
		* Reduce the data on the host system.
		*/
		LaterReductionOperation laterReductionOperation;
		result = laterReductionOperation. initialValue();
		for( IndexType i = 0; i < reducedSize; i ++ )
		result = laterReductionOperation.reduceOnHost( i, result, resultArray, ( ResultType*) 0 );

		#ifdef CUDA_REDUCTION_PROFILING
		cudaThreadSynchronize();
		timer.stop();
		cout << " Reduction of small data set on CPU took " << timer.getTime() << " sec. " << endl;
		#endif

		return checkCudaDevice;
		#else
		tnlCudaSupportMissingMessage;;

src/core/cuda/reduction-operations.h

+83 −1264

File changed.

Preview size limit exceeded, changes collapsed.

src/core/cuda/tnlCudaReduction.h

+3 −3

Original line number	Diff line number	Diff line
		@@ -20,7 +20,7 @@

		#ifdef HAVE_CUDA

		template< typename Operation, int blockSize, bool isSizePow2 >
		template< typename Operation, int blockSize >
		class tnlCUDAReduction
		{
		public:
		@@ -37,8 +37,8 @@ class tnlCUDAReduction
		ResultType* output );
		};

		/*template< typename Real, typename Index, int blockSize, bool isSizePow2 >
		class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize, isSizePow2 >
		/*template< typename Real, typename Index, int blockSize >
		class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >
		{
		public:

src/core/cuda/tnlCudaReduction_impl.h

+4 −4

Original line number	Diff line number	Diff line
		@@ -18,10 +18,10 @@
		#ifndef TNLCUDAREDUCTION_IMPL_H
		#define TNLCUDAREDUCTION_IMPL_H

		template< typename Operation, int blockSize, bool isSizePow2 >
		template< typename Operation, int blockSize >
		__device__
		void
		tnlCUDAReduction< Operation, blockSize, isSizePow2 >::
		tnlCUDAReduction< Operation, blockSize >::
		reduce( const Operation operation,
		const IndexType size,
		const RealType* input1,
		@@ -154,10 +154,10 @@ reduce( const Operation operation,

		#ifdef UNDEF

		template< typename Real, typename Index, int blockSize, bool isSizePow2 >
		template< typename Real, typename Index, int blockSize >
		__device__
		void
		tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize, isSizePow2 >::
		tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >::
		reduce( const Operation operation,
		const IndexType size,
		const RealType* input1,