Implementing the prefix-sum in CUDA. (11898c70) · Commits · TNL / tnl-dev

src/core/cuda/cuda-prefix-sum.h

+4 −3

Original line number	Diff line number	Diff line
		@@ -21,7 +21,7 @@
		enum enumPrefixSumType { exclusivePrefixSum = 0,
		inclusivePrefixSum };

		template< typename DataType >
		/*template< typename DataType >
		class operationSum
		{
		public:
		@@ -58,16 +58,17 @@ class operationSum
		};
		#endif

		};
		};*/


		template< typename DataType,
		template< typename T > class Operation,
		typename Operation,
		typename Index >
		bool cudaPrefixSum( const Index size,
		const Index blockSize,
		const DataType *deviceInput,
		DataType* deviceOutput,
		const Operation& operation,
		const enumPrefixSumType prefixSumType = inclusivePrefixSum );

src/core/cuda/reduction-operations.h

+19 −0

Original line number	Diff line number	Diff line
		@@ -176,6 +176,25 @@ class tnlParallelReductionSum
		{
		return data[ idx1 ] + data[ idx2 ];
		};

		__device__ ResultType commonReductionOnDevice( const ResultType& a,
		const ResultType& b ) const
		{
		return a + b;
		};


		__device__ RealType identity() const
		{
		return 0;
		};

		__device__ void performInPlace( ResultType& a,
		const ResultType& b )
		{
		a += b;
		}

		#endif
		};

src/implementation/core/cuda/cuda-prefix-sum_impl.h

+13 −11

Original line number	Diff line number	Diff line
		@@ -47,7 +47,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		if( prefixSumType == exclusivePrefixSum )
		{
		if( idx == 0 )
		sharedData[ 0 ] = operation. cudaIdentity();
		sharedData[ 0 ] = operation.identity();
		while( idx < elementsInBlock && blockOffset + idx < size )
		{
		sharedData[ tnlCuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ];
		@@ -80,7 +80,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		while( chunkPointer < chunkSize &&
		chunkOffset + chunkPointer < lastElementInBlock )
		{
		operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
		operation.performInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
		sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
		auxData[ threadIdx. x ] =
		sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ];
		@@ -94,7 +94,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		const int warpIdx = threadIdx. x / tnlCuda::getWarpSize();
		for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
		if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks )
		operation. cudaPerformInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );
		operation.performInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );

		if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 )
		warpSums[ warpIdx ] = auxData[ threadIdx. x ];
		@@ -106,14 +106,14 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		if( warpIdx == 0 )
		for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
		if( threadInWarpIdx >= stride )
		operation. cudaPerformInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
		operation.performInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
		__syncthreads();

		/****
		* Shift the warp prefix-sums.
		*/
		if( warpIdx > 0 )
		operation. cudaPerformInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );
		operation.performInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );

		/***
		* Store the result back in global memory.
		@@ -126,7 +126,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		Index chunkShift( operation. cudaIdentity() );
		if( chunkIdx > 0 )
		chunkShift = auxData[ chunkIdx - 1 ];
		operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
		operation.performInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
		output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ];
		idx += blockDim. x;
		}
		@@ -136,8 +136,9 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		{
		if( prefixSumType == exclusivePrefixSum )
		auxArray[ blockIdx. x ] =
		operation. cudaPerform( sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ],
		sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] );
		operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ),
		tnlCuda::getInterleaving( lastElementInBlock ),
		sharedData );
		else
		auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ];
		}
		@@ -156,7 +157,7 @@ __global__ void cudaSecondPhaseBlockPrefixSum( const Index size,
		Operation< DataType > operation;
		if( blockIdx. x > 0 )
		{
		const Index shift = operation. cudaPerform( gridShift, auxArray[ blockIdx. x - 1 ] );
		const Index shift = operation.commonReductionOnDevice( gridShift, auxArray[ blockIdx. x - 1 ] );

		const Index readOffset = blockIdx. x * elementsInBlock;
		Index readIdx = threadIdx. x;
		@@ -288,12 +289,13 @@ bool cudaGridPrefixSum( enumPrefixSumType prefixSumType,
		}

		template< typename DataType,
		template< typename T > class Operation,
		typename Operation,
		typename Index >
		bool cudaPrefixSum( const Index size,
		const Index blockSize,
		const DataType *deviceInput,
		DataType* deviceOutput,
		const Operation& operation,
		const enumPrefixSumType prefixSumType )
		{
		/****

src/implementation/core/vectors/tnlVectorOperationsCuda_impl.h

+17 −6

Original line number	Diff line number	Diff line
		@@ -565,12 +565,17 @@ void tnlVectorOperations< tnlCuda >::computePrefixSum( Vector& v,
		typename Vector::IndexType begin,
		typename Vector::IndexType end )
		{
		typedef tnlParallelReductionSum< typename Vector::IndexType,
		typename Vector::RealType > OperationType;

		OperationType operation;
		cudaPrefixSum< typename Vector::RealType,
		operationSum,
		OperationType,
		typename Vector::IndexType >( end - begin,
		256,
		&v.getData()[ begin ],
		&v.getData()[ begin ],
		operation,
		inclusivePrefixSum );
		}

		@@ -579,12 +584,18 @@ void tnlVectorOperations< tnlCuda >::computeExclusivePrefixSum( Vector& v,
		typename Vector::IndexType begin,
		typename Vector::IndexType end )
		{
		typedef tnlParallelReductionSum< typename Vector::IndexType,
		typename Vector::RealType > OperationType;

		OperationType operation;

		cudaPrefixSum< typename Vector::RealType,
		operationSum,
		OperationType,
		typename Vector::IndexType >( end - begin,
		256,
		&v.getData()[ begin ],
		&v.getData()[ begin ],
		operation,
		exclusivePrefixSum );
		}