Implementing the prefix-sum in CUDA. (b2b2a5a4) · Commits · TNL / tnl-dev

install

+1 −0

Original line number	Diff line number	Diff line
		@@ -11,6 +11,7 @@ TEMPLATE_EXPLICIT_INSTANTIATION=yes
		CMAKE="cmake"
		CPUS=`grep -c processor /proc/cpuinfo`

		CPUS="1"
		echo "Building $TARGET using $CPUS processors."

		if [ ! -d Debug ];

src/core/cuda/cuda-prefix-sum.h

+42 −0

Original line number	Diff line number	Diff line
		@@ -21,6 +21,46 @@
		enum enumPrefixSumType { exclusivePrefixSum = 0,
		inclusivePrefixSum };

		template< typename DataType >
		class operationSum
		{
		public:

		DataType identity() const
		{
		return ( DataType ) 0.0;
		};

		void performInPlace( DataType& a, const DataType& b ) const
		{
		a += b;
		};

		DataType perform( const DataType& a, const DataType& b )
		{
		return a + b;
		};

		#ifdef HAVE_CUDA
		__device__ DataType cudaIdentity() const
		{
		return ( DataType ) 0.0;
		};

		__device__ void cudaPerformInPlace( DataType& a, const DataType& b ) const
		{
		a += b;
		};

		__device__ DataType cudaPerform( const DataType& a, const DataType& b )
		{
		return a + b;
		};
		#endif

		};


		template< typename DataType,
		template< typename T > class Operation,
		typename Index >
		@@ -31,4 +71,6 @@ bool cudaPrefixSum( const Index size,
		const enumPrefixSumType prefixSumType = inclusivePrefixSum );


		#include <implementation/core/cuda/cuda-prefix-sum_impl.h>

		#endif /* CUDA_PREFIX_SUM_H_ */

src/core/tnlCuda.h

+37 −2

Original line number	Diff line number	Diff line
		@@ -42,10 +42,20 @@ class tnlCuda

		static int getGPUTransferBufferSize();

		static bool checkDevice( const char* file_name, int line );

		static size_t getFreeMemory();

		#ifdef HAVE_CUDA
		static inline __host__ __device__ int getNumberOfSharedMemoryBanks();

		static inline __host__ __device__ int getWarpSize();

		template< typename Index >
		static __device__ Index getInterleaving( const Index index );
		#endif


		static bool checkDevice( const char* file_name, int line );

		protected:

		static int maxGridSize, maxBlockSize;
		@@ -56,4 +66,29 @@ class tnlCuda
		#define tnlCudaSupportMissingMessage \
		std::cerr << "The CUDA support is missing in the source file " << __FILE__ << " at line " << __LINE__ << ". Please set WITH_CUDA=yes in the install script. " << std::endl;


		// TODO: This would be nice in tnlCuda but C++ standard does not allow it.
		#ifdef HAVE_CUDA
		template< typename Element >
		struct getSharedMemory
		{
		__device__ operator Element*();
		};

		template<>
		struct getSharedMemory< double >
		{
		inline __device__ operator double*();
		};

		template<>
		struct getSharedMemory< long int >
		{
		inline __device__ operator long int*();
		};

		#endif

		#include <implementation/core/tnlCuda_impl.h>

		#endif /* TNLCUDA_H_ */

src/core/tnlFile.h

+0 −2

Original line number	Diff line number	Diff line
		@@ -154,8 +154,6 @@ bool tnlFile :: read( Type* buffer,
		cerr << "File " << fileName << " was not opened for reading. " << endl;
		return false;
		}

		int bytesRead( 0 );
		this->readElements = 0;
		const Index host_buffer_size = :: Min( ( Index ) ( tnlFileGPUvsCPUTransferBufferSize / sizeof( Type ) ),
		elements );

src/implementation/core/cuda/cuda-prefix-sum_impl.h

+26 −21

Original line number	Diff line number	Diff line
		@@ -18,6 +18,8 @@
		#ifndef CUDA_PREFIX_SUM_IMPL_H_
		#define CUDA_PREFIX_SUM_IMPL_H_

		#ifdef HAVE_CUDA

		template< typename DataType,
		template< typename T > class Operation,
		typename Index >
		@@ -28,8 +30,8 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		DataType* output,
		DataType* auxArray )
		{
		DataType* sharedData = sharedMemory< DataType >();
		DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / shmBanks + 2 ];
		DataType* sharedData = getSharedMemory< DataType >();
		DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / tnlCuda::getNumberOfSharedMemoryBanks() + 2 ];
		DataType* warpSums = &auxData[ blockDim. x ];
		Operation< DataType > operation;

		@@ -45,17 +47,17 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		if( prefixSumType == exclusivePrefixSum )
		{
		if( idx == 0 )
		sharedData[ interleave< shmBanks >( 0 ) ] = operation. cudaIdentity();
		sharedData[ 0 ] = operation. cudaIdentity();
		while( idx < elementsInBlock && blockOffset + idx < size )
		{
		sharedData[ interleave< shmBanks >( idx + 1 ) ] = input[ blockOffset + idx ];
		sharedData[ tnlCuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ];
		idx += blockDim. x;
		}
		}
		else
		while( idx < elementsInBlock && blockOffset + idx < size )
		{
		sharedData[ interleave< shmBanks >( idx ) ] = input[ blockOffset + idx ];
		sharedData[ tnlCuda::getInterleaving( idx ) ] = input[ blockOffset + idx ];
		idx += blockDim. x;
		}

		@@ -71,30 +73,30 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		if( chunkOffset < lastElementInBlock )
		{
		auxData[ threadIdx. x ] =
		sharedData[ interleave< shmBanks >( chunkOffset ) ];
		sharedData[ tnlCuda::getInterleaving( chunkOffset ) ];
		}

		Index chunkPointer( 1 );
		while( chunkPointer < chunkSize &&
		chunkOffset + chunkPointer < lastElementInBlock )
		{
		operation. cudaPerformInPlace( sharedData[ interleave< shmBanks >( chunkOffset + chunkPointer ) ],
		sharedData[ interleave< shmBanks >( chunkOffset + chunkPointer - 1 ) ] );
		operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
		sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
		auxData[ threadIdx. x ] =
		sharedData[ interleave< shmBanks >( chunkOffset + chunkPointer ) ];
		sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ];
		chunkPointer ++;
		}

		/***
		* Perform the parallel prefix-sum inside warps.
		*/
		const int threadInWarpIdx = threadIdx. x % warpSize;
		const int warpIdx = threadIdx. x / warpSize;
		for( int stride = 1; stride < warpSize; stride *= 2 )
		const int threadInWarpIdx = threadIdx. x % tnlCuda::getWarpSize();
		const int warpIdx = threadIdx. x / tnlCuda::getWarpSize();
		for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
		if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks )
		operation. cudaPerformInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );

		if( threadInWarpIdx == warpSize - 1 )
		if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 )
		warpSums[ warpIdx ] = auxData[ threadIdx. x ];
		__syncthreads();

		@@ -102,7 +104,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		* Compute prefix-sum of warp sums using one warp
		*/
		if( warpIdx == 0 )
		for( int stride = 1; stride < warpSize; stride *= 2 )
		for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
		if( threadInWarpIdx >= stride )
		operation. cudaPerformInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
		__syncthreads();
		@@ -124,8 +126,8 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		Index chunkShift( operation. cudaIdentity() );
		if( chunkIdx > 0 )
		chunkShift = auxData[ chunkIdx - 1 ];
		operation. cudaPerformInPlace( sharedData[ interleave< shmBanks >( idx ) ], chunkShift );
		output[ blockOffset + idx ] = sharedData[ interleave< shmBanks >( idx ) ];
		operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
		output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ];
		idx += blockDim. x;
		}
		__syncthreads();
		@@ -134,10 +136,10 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		{
		if( prefixSumType == exclusivePrefixSum )
		auxArray[ blockIdx. x ] =
		operation. cudaPerform( sharedData[ interleave< shmBanks >( lastElementInBlock - 1 ) ],
		sharedData[ interleave< shmBanks >( lastElementInBlock ) ] );
		operation. cudaPerform( sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ],
		sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] );
		else
		auxArray[ blockIdx. x ] = sharedData[ interleave< shmBanks >( lastElementInBlock - 1 ) ];
		auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ];
		}

		}
		@@ -202,8 +204,9 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType,
		/****
		* Run the kernel.
		*/
		size_t sharedDataSize = elementsInBlock + elementsInBlock / shmBanks + 2;
		size_t sharedMemory = ( sharedDataSize + blockSize + warpSize ) * sizeof( DataType );
		size_t sharedDataSize = elementsInBlock +
		elementsInBlock / tnlCuda::getNumberOfSharedMemoryBanks() + 2;
		size_t sharedMemory = ( sharedDataSize + blockSize + tnlCuda::getWarpSize() ) * sizeof( DataType );
		cudaFirstPhaseBlockPrefixSum< DataType, Operation, Index >
		<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		( prefixSumType,
		@@ -332,4 +335,6 @@ bool cudaPrefixSum( const Index size,
		return true;
		}

		#endif

		#endif /* CUDA_PREFIX_SUM_IMPL_H_ */