Removed volatile reduction from PrefixSum and updated the normal reduction operation (8d0d2638) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h

+37 −44

Original line number	Diff line number	Diff line
		@@ -24,13 +24,11 @@ namespace Algorithms {
		#ifdef HAVE_CUDA

		template< typename Real,
		typename Operation,
		typename VolatileOperation,
		typename Reduction,
		typename Index >
		__global__ void
		cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
		Operation operation,
		VolatileOperation volatileOperation,
		Reduction reduction,
		const Real zero,
		const Index size,
		const Index elementsInBlock,
		@@ -40,8 +38,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
		const Real gridShift )
		{
		Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >();
		volatile Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ];
		volatile Real* warpSums = &auxData[ blockDim.x ];
		Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ];
		Real* warpSums = &auxData[ blockDim.x ];

		const Index lastElementIdx = size - blockIdx.x * elementsInBlock;
		const Index lastElementInBlock = TNL::min( lastElementIdx, elementsInBlock );
		@@ -70,7 +68,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
		}
		}
		if( blockIdx.x == 0 && threadIdx.x == 0 )
		operation( sharedData[ 0 ], gridShift );
		sharedData[ 0 ] = reduction( sharedData[ 0 ], gridShift );

		/***
		* Perform the sequential prefix-sum.
		@@ -90,7 +88,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
		while( chunkPointer < chunkSize &&
		chunkOffset + chunkPointer < lastElementInBlock )
		{
		operation( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ],
		sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ] =
		reduction( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ],
		sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
		auxData[ threadIdx.x ] =
		sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ];
		@@ -102,9 +101,11 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
		*/
		const int threadInWarpIdx = threadIdx.x % Devices::Cuda::getWarpSize();
		const int warpIdx = threadIdx.x / Devices::Cuda::getWarpSize();
		for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 )
		for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) {
		if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks )
		volatileOperation( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] );
		auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] );
		__syncwarp();
		}

		if( threadInWarpIdx == Devices::Cuda::getWarpSize() - 1 )
		warpSums[ warpIdx ] = auxData[ threadIdx.x ];
		@@ -114,21 +115,23 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
		* Compute prefix-sum of warp sums using one warp
		*/
		if( warpIdx == 0 )
		for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 )
		for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) {
		if( threadInWarpIdx >= stride )
		volatileOperation( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] );
		warpSums[ threadIdx.x ] = reduction( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] );
		__syncwarp();
		}
		__syncthreads();

		/****
		* Shift the warp prefix-sums.
		*/
		if( warpIdx > 0 )
		volatileOperation( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] );
		auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] );
		__syncthreads();

		/***
		* Store the result back in global memory.
		*/
		__syncthreads();
		idx = threadIdx.x;
		while( idx < elementsInBlock && blockOffset + idx < size )
		{
		@@ -136,7 +139,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
		Real chunkShift( zero );
		if( chunkIdx > 0 )
		chunkShift = auxData[ chunkIdx - 1 ];
		operation( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift );
		sharedData[ Devices::Cuda::getInterleaving( idx ) ] =
		reduction( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift );
		output[ blockOffset + idx ] = sharedData[ Devices::Cuda::getInterleaving( idx ) ];
		idx += blockDim.x;
		}
		@@ -147,8 +151,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
		if( prefixSumType == PrefixSumType::Exclusive )
		{
		Real aux = zero;
		operation( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] );
		operation( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] );
		aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] );
		aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] );
		auxArray[ blockIdx.x ] = aux;
		}
		else
		@@ -157,10 +161,10 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
		}

		template< typename Real,
		typename Operation,
		typename Reduction,
		typename Index >
		__global__ void
		cudaSecondPhaseBlockPrefixSum( Operation operation,
		cudaSecondPhaseBlockPrefixSum( Reduction reduction,
		const Index size,
		const Index elementsInBlock,
		Real gridShift,
		@@ -174,7 +178,7 @@ cudaSecondPhaseBlockPrefixSum( Operation operation,
		Index readIdx = threadIdx.x;
		while( readIdx < elementsInBlock && readOffset + readIdx < size )
		{
		operation( data[ readIdx + readOffset ], shift );
		data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift );
		readIdx += blockDim.x;
		}
		}
		@@ -185,12 +189,10 @@ template< PrefixSumType prefixSumType,
		typename Index >
		struct CudaPrefixSumKernelLauncher
		{
		template< typename Operation,
		typename VolatileOperation >
		template< typename Reduction >
		static void
		cudaRecursivePrefixSum( PrefixSumType prefixSumType_,
		Operation& operation,
		VolatileOperation& volatileOperation,
		Reduction& reduction,
		const Real& zero,
		const Index size,
		const Index blockSize,
		@@ -221,8 +223,7 @@ struct CudaPrefixSumKernelLauncher
		const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real );
		cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		( prefixSumType_,
		operation,
		volatileOperation,
		reduction,
		zero,
		size,
		elementsInBlock,
		@@ -242,8 +243,7 @@ struct CudaPrefixSumKernelLauncher
		Real gridShift2 = zero;
		if( numberOfBlocks > 1 )
		cudaRecursivePrefixSum( PrefixSumType::Inclusive,
		operation,
		volatileOperation,
		reduction,
		zero,
		numberOfBlocks,
		blockSize,
		@@ -254,7 +254,7 @@ struct CudaPrefixSumKernelLauncher

		//std::cerr << " auxArray2 = " << auxArray2 << std::endl;
		cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>>
		( operation,
		( reduction,
		size,
		elementsInBlock,
		gridShift,
		@@ -273,25 +273,21 @@ struct CudaPrefixSumKernelLauncher
		/****
		* \brief Starts prefix sum in CUDA.
		*
		* \tparam Operation operation to be performed on particular elements - addition usually
		* \tparam VolatileOperation - volatile version of Operation
		* \tparam Reduction reduction to be performed on particular elements - addition usually
		* \param size is number of elements to be scanned
		* \param blockSize is CUDA block size
		* \param deviceInput is pointer to input data on GPU
		* \param deviceOutput is pointer to resulting array, can be the same as input
		* \param operation is instance of Operation
		* \param volatileOperation is instance of VolatileOperation
		* \param zero is neutral element for given Operation
		* \param reduction is instance of Reduction
		* \param zero is neutral element for given Reduction
		*/
		template< typename Operation,
		typename VolatileOperation >
		template< typename Reduction >
		static void
		start( const Index size,
		const Index blockSize,
		const Real *deviceInput,
		Real* deviceOutput,
		Operation& operation,
		VolatileOperation& volatileOperation,
		Reduction& reduction,
		const Real& zero )
		{
		/****
		@@ -319,8 +315,7 @@ struct CudaPrefixSumKernelLauncher

		//std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl;
		cudaRecursivePrefixSum( prefixSumType,
		operation,
		volatileOperation,
		reduction,
		zero,
		currentSize,
		blockSize,
		@@ -371,5 +366,3 @@ int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::gridsCount = -1;
		} // namespace Algorithms
		} // namespace Containers
		} // namespace TNL

src/TNL/Containers/Algorithms/PrefixSum.h

+8 −18

Original line number	Diff line number	Diff line
		@@ -38,14 +38,12 @@ class PrefixSum< Devices::Host, Type >
		{
		public:
		template< typename Vector,
		typename PrefixSumOperation,
		typename VolatilePrefixSumOperation >
		typename Reduction >
		static void
		perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		PrefixSumOperation& reduction,
		VolatilePrefixSumOperation& volatilePrefixSum,
		const Reduction& reduction,
		const typename Vector::RealType& zero );
		};

		@@ -54,14 +52,12 @@ class PrefixSum< Devices::Cuda, Type >
		{
		public:
		template< typename Vector,
		typename PrefixSumOperation,
		typename VolatilePrefixSumOperation >
		typename Reduction >
		static void
		perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		PrefixSumOperation& reduction,
		VolatilePrefixSumOperation& volatilePrefixSum,
		const Reduction& reduction,
		const typename Vector::RealType& zero );
		};

		@@ -70,16 +66,14 @@ class SegmentedPrefixSum< Devices::Host, Type >
		{
		public:
		template< typename Vector,
		typename PrefixSumOperation,
		typename VolatilePrefixSumOperation,
		typename Reduction,
		typename Flags >
		static void
		perform( Vector& v,
		Flags& flags,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		PrefixSumOperation& reduction,
		VolatilePrefixSumOperation& volatilePrefixSum,
		const Reduction& reduction,
		const typename Vector::RealType& zero );
		};

		@@ -88,21 +82,17 @@ class SegmentedPrefixSum< Devices::Cuda, Type >
		{
		public:
		template< typename Vector,
		typename PrefixSumOperation,
		typename VolatilePrefixSumOperation,
		typename Reduction,
		typename Flags >
		static void
		perform( Vector& v,
		Flags& flags,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		PrefixSumOperation& reduction,
		VolatilePrefixSumOperation& volatilePrefixSum,
		const Reduction& reduction,
		const typename Vector::RealType& zero );
		};



		} // namespace Algorithms
		} // namespace Containers
		} // namespace TNL

src/TNL/Containers/Algorithms/PrefixSum.hpp

+16 −26

Original line number	Diff line number	Diff line
		@@ -44,15 +44,13 @@ static constexpr int PrefixSum_minGpuDataSize = 256;//65536; //16384;//1024;//25
		// PrefixSum on host
		template< PrefixSumType Type >
		template< typename Vector,
		typename PrefixSumOperation,
		typename VolatilePrefixSumOperation >
		typename Reduction >
		void
		PrefixSum< Devices::Host, Type >::
		perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		PrefixSumOperation& reduction,
		VolatilePrefixSumOperation& volatilePrefixSum,
		const Reduction& reduction,
		const typename Vector::RealType& zero )
		{
		using RealType = typename Vector::RealType;
		@@ -86,7 +84,7 @@ perform( Vector& v,
		if( Type == PrefixSumType::Inclusive ) {
		#pragma omp for schedule(static)
		for( IndexType i = begin; i < end; i++ ) {
		reduction( block_sum, v[ i ] );
		block_sum = reduction( block_sum, v[ i ] );
		v[ i ] = block_sum;
		}
		}
		@@ -95,7 +93,7 @@ perform( Vector& v,
		for( IndexType i = begin; i < end; i++ ) {
		const RealType x = v[ i ];
		v[ i ] = block_sum;
		reduction( block_sum, x );
		block_sum = reduction( block_sum, x );
		}
		}

		@@ -106,17 +104,17 @@ perform( Vector& v,
		// calculate per-block offsets
		RealType offset = 0;
		for( int i = 0; i < thread_idx + 1; i++ )
		reduction( offset, block_sums[ i ] );
		offset = reduction( offset, block_sums[ i ] );

		// shift intermediate results by the offset
		#pragma omp for schedule(static)
		for( IndexType i = begin; i < end; i++ )
		reduction( v[ i ], offset );
		v[ i ] = reduction( v[ i ], offset );
		}
		#else
		if( Type == PrefixSumType::Inclusive ) {
		for( IndexType i = begin + 1; i < end; i++ )
		reduction( v[ i ], v[ i - 1 ] );
		v[ i ] = reduction( v[ i ], v[ i - 1 ] );
		}
		else // Exclusive prefix sum
		{
		@@ -124,7 +122,7 @@ perform( Vector& v,
		for( IndexType i = begin; i < end; i++ ) {
		const RealType x = v[ i ];
		v[ i ] = aux;
		reduction( aux, x );
		aux = reduction( aux, x );
		}
		}
		#endif
		@@ -134,15 +132,13 @@ perform( Vector& v,
		// PrefixSum on CUDA device
		template< PrefixSumType Type >
		template< typename Vector,
		typename PrefixSumOperation,
		typename VolatilePrefixSumOperation >
		typename Reduction >
		void
		PrefixSum< Devices::Cuda, Type >::
		perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		PrefixSumOperation& reduction,
		VolatilePrefixSumOperation& volatileReduction,
		const Reduction& reduction,
		const typename Vector::RealType& zero )
		{
		using RealType = typename Vector::RealType;
		@@ -155,7 +151,6 @@ perform( Vector& v,
		&v[ begin ],
		&v[ begin ],
		reduction,
		volatileReduction,
		zero );
		#endif
		}
		@@ -165,8 +160,7 @@ perform( Vector& v,
		// PrefixSum on host
		template< PrefixSumType Type >
		template< typename Vector,
		typename PrefixSumOperation,
		typename VolatilePrefixSumOperation,
		typename Reduction,
		typename Flags >
		void
		SegmentedPrefixSum< Devices::Host, Type >::
		@@ -174,8 +168,7 @@ perform( Vector& v,
		Flags& flags,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		PrefixSumOperation& reduction,
		VolatilePrefixSumOperation& volatilePrefixSum,
		const Reduction& reduction,
		const typename Vector::RealType& zero )
		{
		using RealType = typename Vector::RealType;
		@@ -186,7 +179,7 @@ perform( Vector& v,
		{
		for( IndexType i = begin + 1; i < end; i++ )
		if( ! flags[ i ] )
		reduction( v[ i ], v[ i - 1 ] );
		v[ i ] = reduction( v[ i ], v[ i - 1 ] );
		}
		else // Exclusive prefix sum
		{
		@@ -198,7 +191,7 @@ perform( Vector& v,
		if( flags[ i ] )
		aux = zero;
		v[ i ] = aux;
		reduction( aux, x );
		aux = reduction( aux, x );
		}
		}
		}
		@@ -207,8 +200,7 @@ perform( Vector& v,
		// PrefixSum on CUDA device
		template< PrefixSumType Type >
		template< typename Vector,
		typename PrefixSumOperation,
		typename VolatilePrefixSumOperation,
		typename Reduction,
		typename Flags >
		void
		SegmentedPrefixSum< Devices::Cuda, Type >::
		@@ -216,8 +208,7 @@ perform( Vector& v,
		Flags& flags,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		PrefixSumOperation& reduction,
		VolatilePrefixSumOperation& volatileReduction,
		const Reduction& reduction,
		const typename Vector::RealType& zero )
		{
		using RealType = typename Vector::RealType;
		@@ -231,7 +222,6 @@ perform( Vector& v,
		&v[ begin ],
		&v[ begin ],
		reduction,
		volatileReduction,
		zero );*/
		#endif
		}

src/TNL/Containers/Vector.hpp

+2 −8

Original line number	Diff line number	Diff line
		@@ -175,10 +175,7 @@ prefixSum( IndexType begin, IndexType end )
		{
		if( end == 0 )
		end = this->getSize();

		auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; };
		Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, reduction, volatileReduction, (RealType) 0.0 );
		Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 );
		}

		template< typename Real,
		@@ -193,10 +190,7 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end )
		{
		if( end == 0 )
		end = this->getSize();

		auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; };
		Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, reduction, volatileReduction, (RealType) 0.0 );
		Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 );
		}

		template< typename Real,

src/TNL/Containers/VectorView.hpp

+2 −8

Original line number	Diff line number	Diff line
		@@ -125,10 +125,7 @@ prefixSum( IndexType begin, IndexType end )
		{
		if( end == 0 )
		end = this->getSize();

		auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; };
		Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, reduction, volatileReduction, (RealType) 0.0 );
		Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 );
		}

		template< typename Real,
		@@ -142,10 +139,7 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end )
		{
		if( end == 0 )
		end = this->getSize();

		auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; };
		Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, reduction, volatileReduction, (RealType) 0.0 );
		Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 );
		}

		template< typename Real,

src/UnitTests/Containers/VectorPrefixSumTest.h

+6 −6

File changed.

Contains only whitespace changes.