CUDA prefix-sum: separated the implementation of the first and second phase (2c40015f) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h

+164 −122

Original line number	Original line	Diff line number	Diff line
	@@ -31,7 +31,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
	Reduction reduction,		Reduction reduction,
	const Real zero,		const Real zero,
	const Index size,		const Index size,
	const Index elementsInBlock,		const int elementsInBlock,
	const Real* input,		const Real* input,
	Real* output,		Real* output,
	Real* auxArray )		Real* auxArray )
	@@ -46,8 +46,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
	/***		/***
	* Load data into the shared memory.		* Load data into the shared memory.
	*/		*/
	const Index blockOffset = blockIdx.x * elementsInBlock;		const int blockOffset = blockIdx.x * elementsInBlock;
	Index idx = threadIdx.x;		int idx = threadIdx.x;
	if( prefixSumType == PrefixSumType::Exclusive )		if( prefixSumType == PrefixSumType::Exclusive )
	{		{
	if( idx == 0 )		if( idx == 0 )
	@@ -81,7 +81,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
	sharedData[ Devices::Cuda::getInterleaving( chunkOffset ) ];		sharedData[ Devices::Cuda::getInterleaving( chunkOffset ) ];
	}		}

	Index chunkPointer( 1 );		int chunkPointer = 1;
	while( chunkPointer < chunkSize &&		while( chunkPointer < chunkSize &&
	chunkOffset + chunkPointer < lastElementInBlock )		chunkOffset + chunkPointer < lastElementInBlock )
	{		{
	@@ -132,7 +132,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
	idx = threadIdx.x;		idx = threadIdx.x;
	while( idx < elementsInBlock && blockOffset + idx < size )		while( idx < elementsInBlock && blockOffset + idx < size )
	{		{
	const Index chunkIdx = idx / chunkSize;		const int chunkIdx = idx / chunkSize;
	Real chunkShift( zero );		Real chunkShift( zero );
	if( chunkIdx > 0 )		if( chunkIdx > 0 )
	chunkShift = auxData[ chunkIdx - 1 ];		chunkShift = auxData[ chunkIdx - 1 ];
	@@ -161,18 +161,20 @@ template< typename Real,
	__global__ void		__global__ void
	cudaSecondPhaseBlockPrefixSum( Reduction reduction,		cudaSecondPhaseBlockPrefixSum( Reduction reduction,
	const Index size,		const Index size,
	const Index elementsInBlock,		const int elementsInBlock,
	Real gridShift,		const Index gridIdx,
			const Index maxGridSize,
	const Real* auxArray,		const Real* auxArray,
	Real* data )		Real* data,
			Real shift )
	{		{
	if( blockIdx.x > 0 )		if( gridIdx > 0 \|\| blockIdx.x > 0 )
	gridShift = reduction( gridShift, auxArray[ blockIdx.x - 1 ] );		shift = reduction( shift, auxArray[ gridIdx * maxGridSize + blockIdx.x - 1 ] );
	const Index readOffset = blockIdx.x * elementsInBlock;		const int readOffset = blockIdx.x * elementsInBlock;
	Index readIdx = threadIdx.x;		int readIdx = threadIdx.x;
	while( readIdx < elementsInBlock && readOffset + readIdx < size )		while( readIdx < elementsInBlock && readOffset + readIdx < size )
	{		{
	data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], gridShift );		data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift );
	readIdx += blockDim.x;		readIdx += blockDim.x;
	}		}
	}		}
	@@ -182,143 +184,183 @@ template< PrefixSumType prefixSumType,
	typename Index >		typename Index >
	struct CudaPrefixSumKernelLauncher		struct CudaPrefixSumKernelLauncher
	{		{
			/****
			* \brief Performs both phases of prefix sum.
			*
			* \param size Number of elements to be scanned.
			* \param deviceInput Pointer to input data on GPU.
			* \param deviceOutput Pointer to output array on GPU, can be the same as input.
			* \param reduction Symmetric binary function representing the reduction operation
			* (usually addition, i.e. an instance of \ref std::plus).
			* \param zero Neutral element for given reduction operation, i.e. value such that
			* `reduction(zero, x) == x` for any `x`.
			* \param blockSize The CUDA block size to be used for kernel launch.
			*/
	template< typename Reduction >		template< typename Reduction >
	static void		static void
	cudaRecursivePrefixSum( PrefixSumType prefixSumType_,		perform( const Index size,
			const Real* deviceInput,
			Real* deviceOutput,
	Reduction& reduction,		Reduction& reduction,
	const Real& zero,		const Real zero,
	const Index size,		const int blockSize = 256 )
	const Index blockSize,
	const Index elementsInBlock,
	Real& gridShift,
	const Real* input,
	Real* output )
	{		{
	const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );		const auto blockShifts = performFirstPhase(
	const Index auxArraySize = numberOfBlocks;		size,
			deviceInput,
	Array< Real, Devices::Cuda > auxArray1, auxArray2;		deviceOutput,
	auxArray1.setSize( auxArraySize );		reduction,
	auxArray2.setSize( auxArraySize );		zero,
			blockSize );
			performSecondPhase(
			size,
			deviceOutput,
			blockShifts.getData(),
			reduction,
			zero,
			blockSize );
			}

	/****		/****
	* Setup block and grid size.		* \brief Performs the first phase of prefix sum.
			*
			* \param size Number of elements to be scanned.
			* \param deviceInput Pointer to input data on GPU.
			* \param deviceOutput Pointer to output array on GPU, can be the same as input.
			* \param reduction Symmetric binary function representing the reduction operation
			* (usually addition, i.e. an instance of \ref std::plus).
			* \param zero Neutral value for given reduction operation, i.e. value such that
			* `reduction(zero, x) == x` for any `x`.
			* \param blockSize The CUDA block size to be used for kernel launch.
	*/		*/
	dim3 cudaBlockSize( 0 ), cudaGridSize( 0 );		template< typename Reduction >
			static auto
			performFirstPhase( const Index size,
			const Real* deviceInput,
			Real* deviceOutput,
			Reduction& reduction,
			const Real zero,
			const int blockSize = 256 )
			{
			// compute the number of grids
			const int elementsInBlock = 8 * blockSize;
			const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
			const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
			//std::cerr << "numberOfgrids = " << numberOfGrids << std::endl;

			// allocate array for the block sums
			Array< Real, Devices::Cuda > blockSums;
			blockSums.setSize( numberOfBlocks );

			// loop over all grids
			for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
			// compute current grid size and size of data to be scanned
			const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock;
			Index currentSize = size - gridOffset;
			if( currentSize / elementsInBlock > maxGridSize() )
			currentSize = maxGridSize() * elementsInBlock;
			//std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl;

			// setup block and grid size
			dim3 cudaBlockSize, cudaGridSize;
	cudaBlockSize.x = blockSize;		cudaBlockSize.x = blockSize;
	cudaGridSize.x = roundUpDivision( size, elementsInBlock );		cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock );

	/****		// run the kernel
	* Run the kernel.
	*/
	const std::size_t sharedDataSize = elementsInBlock +		const std::size_t sharedDataSize = elementsInBlock +
	elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2;		elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2;
	const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real );		const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real );
	cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>>		cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
	( prefixSumType_,		( prefixSumType,
	reduction,		reduction,
	zero,		zero,
	size,		currentSize,
	elementsInBlock,		elementsInBlock,
	input,		&deviceInput[ gridOffset ],
	output,		&deviceOutput[ gridOffset ],
	auxArray1.getData() );		&blockSums[ gridIdx * maxGridSize() ] );
			}

			// synchronize the null-stream after all grids
	cudaStreamSynchronize(0);		cudaStreamSynchronize(0);
	TNL_CHECK_CUDA_DEVICE;		TNL_CHECK_CUDA_DEVICE;

			// blockSums now contains sums of numbers in each block. The first phase
	//std::cerr << " auxArray1 = " << auxArray1 << std::endl;		// ends by computing prefix-sum of this array.
	/***		if( numberOfBlocks > 1 ) {
	* In auxArray1 there is now a sum of numbers in each block.		CudaPrefixSumKernelLauncher< PrefixSumType::Inclusive, Real, Index >::perform(
	* We must compute prefix-sum of auxArray1 and then shift		blockSums.getSize(),
	* each block.		blockSums.getData(),
	*/		blockSums.getData(),
	Real gridShift2 = zero;
	if( numberOfBlocks > 1 )
	cudaRecursivePrefixSum( PrefixSumType::Inclusive,
	reduction,		reduction,
	zero,		zero,
	numberOfBlocks,		blockSize );
	blockSize,		}
	elementsInBlock,
	gridShift2,
	auxArray1.getData(),
	auxArray2.getData() );

	//std::cerr << " auxArray2 = " << auxArray2 << std::endl;		// Store the number of CUDA grids for the purpose of unit testing, i.e.
	cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>>		// to check if we test the algorithm with more than one CUDA grid.
	( reduction,		gridsCount() = numberOfGrids;
	size,
	elementsInBlock,
	gridShift,
	auxArray2.getData(),
	output );
	cudaStreamSynchronize(0);
	TNL_CHECK_CUDA_DEVICE;

	gridShift = auxArray2.getElement( auxArraySize - 1 );		// blockSums now contains shift values for each block - to be used in the second phase
	//std::cerr << "gridShift = " << gridShift << std::endl;		return blockSums;
	}		}

	/****		/****
	* \brief Starts prefix sum in CUDA.		* \brief Performs the second phase of prefix sum.
	*		*
	* \tparam Reduction reduction to be performed on particular elements - addition usually		* \param size Number of elements to be scanned.
	* \param size is number of elements to be scanned		* \param deviceOutput Pointer to output array on GPU.
	* \param blockSize is CUDA block size		* \param blockShifts Pointer to a GPU array containing the block shifts. It is the
	* \param deviceInput is pointer to input data on GPU		* result of the first phase.
	* \param deviceOutput is pointer to resulting array, can be the same as input		* \param reduction Symmetric binary function representing the reduction operation
	* \param reduction is instance of Reduction		* (usually addition, i.e. an instance of \ref std::plus).
	* \param zero is neutral element for given Reduction		* \param shift A constant shifting all elements of the array (usually `zero`, i.e.
			* the neutral value).
			* \param blockSize The CUDA block size to be used for kernel launch.
	*/		*/
	template< typename Reduction >		template< typename Reduction >
	static void		static void
	start( const Index size,		performSecondPhase( const Index size,
	const Index blockSize,
	const Real *deviceInput,
	Real* deviceOutput,		Real* deviceOutput,
			const Real* blockShifts,
	Reduction& reduction,		Reduction& reduction,
	const Real& zero )		const Real shift,
			const Index blockSize = 256 )
	{		{
	/****		// compute the number of grids
	* Compute the number of grids		const int elementsInBlock = 8 * blockSize;
	*/
	const Index elementsInBlock = 8 * blockSize;
	const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );		const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
	const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );		const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
	Real gridShift = zero;
	//std::cerr << "numberOfgrids = " << numberOfGrids << std::endl;

	/****		// loop over all grids
	* Loop over all grids.		for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
	*/		// compute current grid size and size of data to be scanned
	for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ )
	{
	/****
	* Compute current grid size and size of data to be scanned
	*/
	const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock;		const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock;
	Index currentSize = size - gridOffset;		Index currentSize = size - gridOffset;
	if( currentSize / elementsInBlock > maxGridSize() )		if( currentSize / elementsInBlock > maxGridSize() )
	currentSize = maxGridSize() * elementsInBlock;		currentSize = maxGridSize() * elementsInBlock;

	//std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl;		//std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl;
	cudaRecursivePrefixSum( prefixSumType,
	reduction,		// setup block and grid size
	zero,		dim3 cudaBlockSize, cudaGridSize;
	currentSize,		cudaBlockSize.x = blockSize;
	blockSize,		cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock );

			// run the kernel
			cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>>
			( reduction,
			size,
	elementsInBlock,		elementsInBlock,
	gridShift,		gridIdx,
	&deviceInput[ gridOffset ],		(Index) maxGridSize(),
	&deviceOutput[ gridOffset ] );		blockShifts,
			&deviceOutput[ gridOffset ],
			shift );
	}		}

	/***		// synchronize the null-stream after all grids
	* Store the number of CUDA grids for the purpose of unit testing, i.e.		cudaStreamSynchronize(0);
	* to check if we test the algorithm with more than one CUDA grid.		TNL_CHECK_CUDA_DEVICE;
	*/
	gridsCount() = numberOfGrids;
	}		}

	/****		/****

src/TNL/Containers/Algorithms/PrefixSum.hpp

+13 −17

Original line number	Original line	Diff line number	Diff line
	@@ -141,17 +141,18 @@ perform( Vector& v,
	const Reduction& reduction,		const Reduction& reduction,
	const typename Vector::RealType& zero )		const typename Vector::RealType& zero )
	{		{
			#ifdef HAVE_CUDA
	using RealType = typename Vector::RealType;		using RealType = typename Vector::RealType;
	using IndexType = typename Vector::IndexType;		using IndexType = typename Vector::IndexType;
	using IndexType = typename Vector::IndexType;
	#ifdef HAVE_CUDA		CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::perform(
	CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::start(		end - begin,
	( IndexType ) ( end - begin ),		&v[ begin ], // input
	( IndexType ) 256,		&v[ begin ], // output
	&v[ begin ],
	&v[ begin ],
	reduction,		reduction,
	zero );		zero );
			#else
			throw Exceptions::CudaSupportMissing();
	#endif		#endif
	}		}

	@@ -211,18 +212,13 @@ perform( Vector& v,
	const Reduction& reduction,		const Reduction& reduction,
	const typename Vector::RealType& zero )		const typename Vector::RealType& zero )
	{		{
			#ifdef HAVE_CUDA
	using RealType = typename Vector::RealType;		using RealType = typename Vector::RealType;
	using IndexType = typename Vector::IndexType;		using IndexType = typename Vector::IndexType;
	using IndexType = typename Vector::IndexType;
	#ifdef HAVE_CUDA		throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." );
	throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." ); // NOT IMPLEMENTED YET		#else
	/*CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::start(		throw Exceptions::CudaSupportMissing();
	( IndexType ) ( end - begin ),
	( IndexType ) 256,
	&v[ begin ],
	&v[ begin ],
	reduction,
	zero );*/
	#endif		#endif
	}		}