Commit 99e854a7 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Fixed parallel prefix sum

parent 73744cf5
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -608,7 +608,7 @@ computePrefixSum( Vector& v,
                                   &v.getData()[ begin ],
                                   &v.getData()[ begin ],
                                   operation,
                                   Algorithms::inclusivePrefixSum );
                                   Algorithms::PrefixSumType::inclusive );
#else
   throw Exceptions::CudaSupportMissing();
#endif
@@ -633,7 +633,7 @@ computeExclusivePrefixSum( Vector& v,
                                   &v.getData()[ begin ],
                                   &v.getData()[ begin ],
                                   operation,
                                   Algorithms::exclusivePrefixSum );
                                   Algorithms::PrefixSumType::exclusive );
#endif
}

+2 −0
Original line number Diff line number Diff line
@@ -627,6 +627,7 @@ computePrefixSum( Vector& v,
{
   typedef typename Vector::IndexType Index;

   // TODO: parallelize with OpenMP
   for( Index i = begin + 1; i < end; i++ )
      v[ i ] += v[ i - 1 ];
}
@@ -641,6 +642,7 @@ computeExclusivePrefixSum( Vector& v,
   typedef typename Vector::IndexType Index;
   typedef typename Vector::RealType Real;

   // TODO: parallelize with OpenMP
   Real aux( v[ begin ] );
   v[ begin ] = 0.0;
   for( Index i = begin + 1; i < end; i++ )
+6 −3
Original line number Diff line number Diff line
@@ -14,8 +14,11 @@ namespace TNL {
namespace Containers {
namespace Algorithms {
   
enum enumPrefixSumType { exclusivePrefixSum = 0,
                         inclusivePrefixSum };
// Kind of prefix-sum (scan) to perform.
enum class PrefixSumType
{
   exclusive,  // output[i] covers input[0..i-1]; position 0 gets the operation's initial value (see the exclusive branch in the first-phase kernel)
   inclusive   // output[i] covers input[0..i]
};

template< typename DataType,
          typename Operation,
@@ -25,7 +28,7 @@ bool cudaPrefixSum( const Index size,
                    const DataType *deviceInput,
                    DataType* deviceOutput,
                    const Operation& operation,
                    const enumPrefixSumType prefixSumType = inclusivePrefixSum );
                    const PrefixSumType prefixSumType = PrefixSumType::inclusive );

} // namespace Algorithms
} // namespace Containers
+146 −171
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@
   
#include <iostream>

#include <TNL/Math.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Containers/Algorithms/reduction-operations.h>
@@ -25,7 +26,8 @@ namespace Algorithms {
template< typename DataType,
          typename Operation,
          typename Index >
__global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumType,
__global__ void
cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
                              Operation operation,
                              const Index size,
                              const Index elementsInBlock,
@@ -46,7 +48,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    */
   const Index blockOffset = blockIdx.x * elementsInBlock;
   Index idx = threadIdx.x;
   if( prefixSumType == exclusivePrefixSum )
   if( prefixSumType == PrefixSumType::exclusive )
   {
      if( idx == 0 )
         sharedData[ 0 ] = operation.initialValue();
@@ -69,8 +71,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   __syncthreads();
   const int chunkSize = elementsInBlock / blockDim.x;
   const int chunkOffset = threadIdx.x * chunkSize;
   const int numberOfChunks = lastElementInBlock / chunkSize +
                            ( lastElementInBlock % chunkSize != 0 );
   const int numberOfChunks = roundUpDivision( lastElementInBlock, chunkSize );

   if( chunkOffset < lastElementInBlock )
   {
@@ -136,7 +137,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT

   if( threadIdx.x == 0 )
   {
      if( prefixSumType == exclusivePrefixSum )
      if( prefixSumType == PrefixSumType::exclusive )
      {
         /*auxArray[ blockIdx.x ] = operation.commonReductionOnDevice( Devices::Cuda::getInterleaving( lastElementInBlock - 1 ),
                                                                      Devices::Cuda::getInterleaving( lastElementInBlock ),
@@ -149,13 +150,13 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
      else
         auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ];
   }

}

template< typename DataType,
          typename Operation,
          typename Index >
__global__ void cudaSecondPhaseBlockPrefixSum( Operation operation,
__global__ void
cudaSecondPhaseBlockPrefixSum( Operation operation,
                               const Index size,
                               const Index elementsInBlock,
                               const Index gridShift,
@@ -181,7 +182,8 @@ __global__ void cudaSecondPhaseBlockPrefixSum( Operation operation,
template< typename DataType,
          typename Operation,
          typename Index >
bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType,
void
cudaRecursivePrefixSum( const PrefixSumType prefixSumType,
                        Operation& operation,
                        const Index size,
                        const Index blockSize,
@@ -190,75 +192,59 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType,
                        const DataType* input,
                        DataType *output )
{
   const Index numberOfBlocks = ceil( ( double ) size / ( double ) elementsInBlock );
   const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
   const Index auxArraySize = numberOfBlocks * sizeof( DataType );
   DataType *auxArray1, *auxArray2;

   if( cudaMalloc( ( void** ) &auxArray1, auxArraySize ) != cudaSuccess ||
       cudaMalloc( ( void** ) &auxArray2, auxArraySize ) != cudaSuccess  )
      throw Exceptions::CudaBadAlloc();
   Array< DataType, Devices::Cuda > auxArray1, auxArray2;
   auxArray1.setSize( auxArraySize );
   auxArray2.setSize( auxArraySize );

   /****
    * Setup block and grid size.
    */
   dim3 cudaBlockSize( 0 ), cudaGridSize( 0 );
   cudaBlockSize.x = blockSize;
   cudaGridSize. x = size / elementsInBlock +
                     ( size % elementsInBlock != 0 );
   cudaGridSize.x = roundUpDivision( size, elementsInBlock );

   /****
    * Run the kernel.
    */
   size_t sharedDataSize = elementsInBlock +
   const std::size_t sharedDataSize = elementsInBlock +
                                      elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2;
   size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize()  ) * sizeof( DataType );
   cudaFirstPhaseBlockPrefixSum< DataType, Operation, Index >
                                <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
   const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize()  ) * sizeof( DataType );
   cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
      ( prefixSumType,
        operation,
        size,
        elementsInBlock,
        input,
        output,
                                   auxArray1 );
   if( ! TNL_CHECK_CUDA_DEVICE )
   {
      std::cerr << "The CUDA kernel 'cudaFirstPhaseBlockPrefixSum' ended with error." << std::endl;
      cudaFree( auxArray1 );
      cudaFree( auxArray2 );
      return false;
   }
        auxArray1.getData() );
   TNL_CHECK_CUDA_DEVICE;

   /***
    * In auxArray1 there is now a sum of numbers in each block.
    * We must compute prefix-sum of auxArray1 and then shift
    * each block.
    */
   if( numberOfBlocks > 1 &&
       ! cudaRecursivePrefixSum< DataType, Operation, Index >
                               ( inclusivePrefixSum,
   if( numberOfBlocks > 1 )
       cudaRecursivePrefixSum( PrefixSumType::inclusive,
                               operation,
                               numberOfBlocks,
                               blockSize,
                               elementsInBlock,
                                 0,
                                 auxArray1,
                                 auxArray2 ) )
      return false;
   cudaSecondPhaseBlockPrefixSum< DataType, Operation, Index >
                                <<< cudaGridSize, cudaBlockSize >>>
                                 ( operation, size, elementsInBlock, gridShift, auxArray2, output );

   if( ! TNL_CHECK_CUDA_DEVICE )
   {
      std::cerr << "The CUDA kernel 'cudaSecondPhaseBlockPrefixSum' ended with error." << std::endl;
      cudaFree( auxArray1 );
      cudaFree( auxArray2 );
      return false;
   }
   cudaFree( auxArray1 );
   cudaFree( auxArray2 );
   return true;
                               (Index) 0,
                               auxArray1.getData(),
                               auxArray2.getData() );

   cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>>
      ( operation,
        size,
        elementsInBlock,
        gridShift,
        auxArray2.getData(),
        output );
   TNL_CHECK_CUDA_DEVICE;
}


@@ -266,7 +252,8 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType,
template< typename DataType,
          typename Operation,
          typename Index >
bool cudaGridPrefixSum( enumPrefixSumType prefixSumType,
void
cudaGridPrefixSum( PrefixSumType prefixSumType,
                   Operation& operation,
                   const Index size,
                   const Index blockSize,
@@ -275,76 +262,64 @@ bool cudaGridPrefixSum( enumPrefixSumType prefixSumType,
                   DataType *deviceOutput,
                   Index& gridShift )
{

   if( ! cudaRecursivePrefixSum< DataType, Operation, Index >
                               ( prefixSumType,
   cudaRecursivePrefixSum( prefixSumType,
                           operation,
                           size,
                           blockSize,
                           elementsInBlock,
                           gridShift,
                           deviceInput,
                                 deviceOutput ) )
      return false;
   if( cudaMemcpy( &gridShift,
                           deviceOutput );

   cudaMemcpy( &gridShift,
               &deviceOutput[ size - 1 ],
               sizeof( DataType ),
                   cudaMemcpyDeviceToHost ) != cudaSuccess )
   {
      std::cerr << "I am not able to copy data from device to host." << std::endl;
      return false;
   }
   return true;
               cudaMemcpyDeviceToHost );
   TNL_CHECK_CUDA_DEVICE;
}

template< typename DataType,
          typename Operation,
          typename Index >
bool cudaPrefixSum( const Index size,
void
cudaPrefixSum( const Index size,
               const Index blockSize,
               const DataType *deviceInput,
               DataType* deviceOutput,
               Operation& operation,
                    const enumPrefixSumType prefixSumType )
               const PrefixSumType prefixSumType )
{
   /****
    * Compute the number of grids
    */
   const Index elementsInBlock = 8 * blockSize;
   const Index gridSize = size / elementsInBlock + ( size % elementsInBlock != 0 );
   const Index maxGridSize = 65536;
   const Index gridsNumber = gridSize / maxGridSize + ( gridSize % maxGridSize != 0 );
   const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
   const auto maxGridSize = Devices::Cuda::getMaxGridSize();
   const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize );

   /****
    * Loop over all grids.
    */
   Index gridShift( 0 );
   for( Index gridIdx = 0; gridIdx < gridsNumber; gridIdx ++ )
   Index gridShift = 0;
   for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ )
   {
      /****
       * Compute current grid size and size of data to be scanned
       */
      Index gridSize = ( size - gridIdx * maxGridSize * elementsInBlock ) /
                     elementsInBlock;
      Index currentSize = size - gridIdx * maxGridSize * elementsInBlock;
      if( gridSize > maxGridSize )
      {
         gridSize = maxGridSize;
      if( currentSize / elementsInBlock > maxGridSize )
         currentSize = maxGridSize * elementsInBlock;
      }
      Index gridOffset = gridIdx * maxGridSize * elementsInBlock;
      if( ! cudaGridPrefixSum< DataType, Operation, Index >
                             ( prefixSumType,
      const Index gridOffset = gridIdx * maxGridSize * elementsInBlock;

      cudaGridPrefixSum( prefixSumType,
                         operation,
                         currentSize,
                         blockSize,
                         elementsInBlock,
                         &deviceInput[ gridOffset ],
                         &deviceOutput[ gridOffset ],
                               gridShift ) )
         return false;
                         gridShift );
   }
   return true;
}

#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
@@ -353,7 +328,7 @@ extern template bool cudaPrefixSum( const int size,
                                    const int *deviceInput,
                                    int* deviceOutput,
                                    tnlParallelReductionSum< int, int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );


extern template bool cudaPrefixSum( const int size,
@@ -361,14 +336,14 @@ extern template bool cudaPrefixSum( const int size,
                                    const float *deviceInput,
                                    float* deviceOutput,
                                    tnlParallelReductionSum< float, int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );

extern template bool cudaPrefixSum( const int size,
                                    const int blockSize,
                                    const double *deviceInput,
                                    double* deviceOutput,
                                    tnlParallelReductionSum< double, int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );

#ifdef INSTANTIATE_LONG_DOUBLE
extern template bool cudaPrefixSum( const int size,
@@ -376,7 +351,7 @@ extern template bool cudaPrefixSum( const int size,
                                    const long double *deviceInput,
                                    long double* deviceOutput,
                                    tnlParallelReductionSum< long double, int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );
#endif

#ifdef INSTANTIATE_LONG_INT
@@ -385,7 +360,7 @@ extern template bool cudaPrefixSum( const long int size,
                                    const int *deviceInput,
                                    int* deviceOutput,
                                    tnlParallelReductionSum< int, long int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );


extern template bool cudaPrefixSum( const long int size,
@@ -393,14 +368,14 @@ extern template bool cudaPrefixSum( const long int size,
                                    const float *deviceInput,
                                    float* deviceOutput,
                                    tnlParallelReductionSum< float, long int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );

extern template bool cudaPrefixSum( const long int size,
                                    const long int blockSize,
                                    const double *deviceInput,
                                    double* deviceOutput,
                                    tnlParallelReductionSum< double, long int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );

#ifdef INSTANTIATE_LONG_DOUBLE
extern template bool cudaPrefixSum( const long int size,
@@ -408,7 +383,7 @@ extern template bool cudaPrefixSum( const long int size,
                                    const long double *deviceInput,
                                    long double* deviceOutput,
                                    tnlParallelReductionSum< long double, long int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );
#endif
#endif