PrefixSum: separate first and second phase for OpenMP implementation and... (174ad5fd) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/PrefixSum.h

+82 −46

Original line number	Original line	Diff line number	Diff line
	@@ -26,17 +26,16 @@ enum class PrefixSumType {

	template< typename Device,		template< typename Device,
	PrefixSumType Type = PrefixSumType::Inclusive >		PrefixSumType Type = PrefixSumType::Inclusive >
	class PrefixSum;		struct PrefixSum;

	template< typename Device,		template< typename Device,
	PrefixSumType Type = PrefixSumType::Inclusive >		PrefixSumType Type = PrefixSumType::Inclusive >
	class SegmentedPrefixSum;		struct SegmentedPrefixSum;


	template< PrefixSumType Type >		template< PrefixSumType Type >
	class PrefixSum< Devices::Host, Type >		struct PrefixSum< Devices::Host, Type >
	{		{
	public:
	template< typename Vector,		template< typename Vector,
	typename Reduction >		typename Reduction >
	static void		static void
	@@ -44,13 +43,32 @@ class PrefixSum< Devices::Host, Type >
	const typename Vector::IndexType begin,		const typename Vector::IndexType begin,
	const typename Vector::IndexType end,		const typename Vector::IndexType end,
	const Reduction& reduction,		const Reduction& reduction,
	const typename Vector::RealType& zero );		const typename Vector::RealType zero );

			template< typename Vector,
			typename Reduction >
			static auto
			performFirstPhase( Vector& v,
			const typename Vector::IndexType begin,
			const typename Vector::IndexType end,
			const Reduction& reduction,
			const typename Vector::RealType zero );

			template< typename Vector,
			typename BlockShifts,
			typename Reduction >
			static void
			performSecondPhase( Vector& v,
			const BlockShifts& blockShifts,
			const typename Vector::IndexType begin,
			const typename Vector::IndexType end,
			const Reduction& reduction,
			const typename Vector::RealType shift );
	};		};

	template< PrefixSumType Type >		template< PrefixSumType Type >
	class PrefixSum< Devices::Cuda, Type >		struct PrefixSum< Devices::Cuda, Type >
	{		{
	public:
	template< typename Vector,		template< typename Vector,
	typename Reduction >		typename Reduction >
	static void		static void
	@@ -58,13 +76,32 @@ class PrefixSum< Devices::Cuda, Type >
	const typename Vector::IndexType begin,		const typename Vector::IndexType begin,
	const typename Vector::IndexType end,		const typename Vector::IndexType end,
	const Reduction& reduction,		const Reduction& reduction,
	const typename Vector::RealType& zero );		const typename Vector::RealType zero );

			template< typename Vector,
			typename Reduction >
			static auto
			performFirstPhase( Vector& v,
			const typename Vector::IndexType begin,
			const typename Vector::IndexType end,
			const Reduction& reduction,
			const typename Vector::RealType zero );

			template< typename Vector,
			typename BlockShifts,
			typename Reduction >
			static void
			performSecondPhase( Vector& v,
			const BlockShifts& blockShifts,
			const typename Vector::IndexType begin,
			const typename Vector::IndexType end,
			const Reduction& reduction,
			const typename Vector::RealType shift );
	};		};

	template< PrefixSumType Type >		template< PrefixSumType Type >
	class SegmentedPrefixSum< Devices::Host, Type >		struct SegmentedPrefixSum< Devices::Host, Type >
	{		{
	public:
	template< typename Vector,		template< typename Vector,
	typename Reduction,		typename Reduction,
	typename Flags >		typename Flags >
	@@ -74,13 +111,12 @@ class SegmentedPrefixSum< Devices::Host, Type >
	const typename Vector::IndexType begin,		const typename Vector::IndexType begin,
	const typename Vector::IndexType end,		const typename Vector::IndexType end,
	const Reduction& reduction,		const Reduction& reduction,
	const typename Vector::RealType& zero );		const typename Vector::RealType zero );
	};		};

	template< PrefixSumType Type >		template< PrefixSumType Type >
	class SegmentedPrefixSum< Devices::Cuda, Type >		struct SegmentedPrefixSum< Devices::Cuda, Type >
	{		{
	public:
	template< typename Vector,		template< typename Vector,
	typename Reduction,		typename Reduction,
	typename Flags >		typename Flags >
	@@ -90,7 +126,7 @@ class SegmentedPrefixSum< Devices::Cuda, Type >
	const typename Vector::IndexType begin,		const typename Vector::IndexType begin,
	const typename Vector::IndexType end,		const typename Vector::IndexType end,
	const Reduction& reduction,		const Reduction& reduction,
	const typename Vector::RealType& zero );		const typename Vector::RealType zero );
	};		};

	} // namespace Algorithms		} // namespace Algorithms

src/TNL/Containers/Algorithms/PrefixSum.hpp

+131 −59

Original line number	Original line	Diff line number	Diff line
	@@ -12,36 +12,18 @@

	#pragma once		#pragma once

	#include <memory> // std::unique_ptr

	#include "PrefixSum.h"		#include "PrefixSum.h"

	//#define CUDA_REDUCTION_PROFILING

	#include <TNL/Assert.h>		#include <TNL/Assert.h>
	#include <TNL/Exceptions/CudaSupportMissing.h>		#include <TNL/Containers/Array.h>
	#include <TNL/Containers/Algorithms/ArrayOperations.h>
	#include <TNL/Containers/Algorithms/CudaPrefixSumKernel.h>		#include <TNL/Containers/Algorithms/CudaPrefixSumKernel.h>
			#include <TNL/Exceptions/CudaSupportMissing.h>
	#include <TNL/Exceptions/NotImplementedError.h>		#include <TNL/Exceptions/NotImplementedError.h>

	#ifdef CUDA_REDUCTION_PROFILING
	#include <iostream>
	#include <TNL/Timer.h>
	#endif

	namespace TNL {		namespace TNL {
	namespace Containers {		namespace Containers {
	namespace Algorithms {		namespace Algorithms {

	/****
	* Arrays smaller than the following constant
	* are reduced on CPU. The constant must not be larger
	* than maximal CUDA grid size.
	*/
	static constexpr int PrefixSum_minGpuDataSize = 256;//65536; //16384;//1024;//256;

	////
	// PrefixSum on host
	template< PrefixSumType Type >		template< PrefixSumType Type >
	template< typename Vector,		template< typename Vector,
	typename Reduction >		typename Reduction >
	@@ -51,30 +33,37 @@ perform( Vector& v,
	const typename Vector::IndexType begin,		const typename Vector::IndexType begin,
	const typename Vector::IndexType end,		const typename Vector::IndexType end,
	const Reduction& reduction,		const Reduction& reduction,
	const typename Vector::RealType& zero )		const typename Vector::RealType zero )
			{
			#ifdef HAVE_OPENMP
			const auto blockShifts = performFirstPhase( v, begin, end, reduction, zero );
			performSecondPhase( v, blockShifts, begin, end, reduction, zero );
			#else
			// sequential prefix-sum does not need a second phase
			performFirstPhase( v, begin, end, reduction, zero );
			#endif
			}

			template< PrefixSumType Type >
			template< typename Vector,
			typename Reduction >
			auto
			PrefixSum< Devices::Host, Type >::
			performFirstPhase( Vector& v,
			const typename Vector::IndexType begin,
			const typename Vector::IndexType end,
			const Reduction& reduction,
			const typename Vector::RealType zero )
	{		{
	using RealType = typename Vector::RealType;		using RealType = typename Vector::RealType;
	using IndexType = typename Vector::IndexType;		using IndexType = typename Vector::IndexType;

	#ifdef HAVE_OPENMP		#ifdef HAVE_OPENMP
	const int threads = Devices::Host::getMaxThreadsCount();		const int threads = Devices::Host::getMaxThreadsCount();
	std::unique_ptr< RealType[] > block_sums{		Array< RealType, Devices::Host > block_sums( threads + 1 );
	// Workaround for nvcc 10.1.168 - it would modifie the simple expression
	// `new RealType[reducedSize]` in the source code to `new (RealType[reducedSize])`
	// which is not correct - see e.g. https://stackoverflow.com/a/39671946
	// Thus, the host compiler would spit out hundreds of warnings...
	// Funnily enough, nvcc's behaviour depends on the context rather than the
	// expression, because exactly the same simple expression in different places
	// does not produce warnings.
	#ifdef __NVCC__
	new RealType[ static_cast<const int&>(threads) + 1 ]
	#else
	new RealType[ threads + 1 ]
	#endif
	};
	block_sums[ 0 ] = zero;		block_sums[ 0 ] = zero;

	#pragma omp parallel		#pragma omp parallel num_threads(threads)
	{		{
	// init		// init
	const int thread_idx = omp_get_thread_num();		const int thread_idx = omp_get_thread_num();
	@@ -99,18 +88,15 @@ perform( Vector& v,

	// write the block sums into the buffer		// write the block sums into the buffer
	block_sums[ thread_idx + 1 ] = block_sum;		block_sums[ thread_idx + 1 ] = block_sum;
	#pragma omp barrier		}

	// calculate per-block offsets		// block_sums now contains sums of numbers in each block. The first phase
	RealType offset = 0;		// ends by computing prefix-sum of this array.
	for( int i = 0; i < thread_idx + 1; i++ )		for( int i = 1; i < threads + 1; i++ )
	offset = reduction( offset, block_sums[ i ] );		block_sums[ i ] = reduction( block_sums[ i ], block_sums[ i - 1 ] );

	// shift intermediate results by the offset		// block_sums now contains shift values for each block - to be used in the second phase
	#pragma omp for schedule(static)		return block_sums;
	for( IndexType i = begin; i < end; i++ )
	v[ i ] = reduction( v[ i ], offset );
	}
	#else		#else
	if( Type == PrefixSumType::Inclusive ) {		if( Type == PrefixSumType::Inclusive ) {
	for( IndexType i = begin + 1; i < end; i++ )		for( IndexType i = begin + 1; i < end; i++ )
	@@ -125,11 +111,47 @@ perform( Vector& v,
	aux = reduction( aux, x );		aux = reduction( aux, x );
	}		}
	}		}

			return 0;
			#endif
			}

			template< PrefixSumType Type >
			template< typename Vector,
			typename BlockShifts,
			typename Reduction >
			void
			PrefixSum< Devices::Host, Type >::
			performSecondPhase( Vector& v,
			const BlockShifts& blockShifts,
			const typename Vector::IndexType begin,
			const typename Vector::IndexType end,
			const Reduction& reduction,
			const typename Vector::RealType shift )
			{
			using RealType = typename Vector::RealType;
			using IndexType = typename Vector::IndexType;

			#ifdef HAVE_OPENMP
			const int threads = blockShifts.getSize() - 1;

			// launch exactly the same number of threads as in the first phase
			#pragma omp parallel num_threads(threads)
			{
			const int thread_idx = omp_get_thread_num();
			const RealType offset = reduction( blockShifts[ thread_idx ], shift );

			// shift intermediate results by the offset
			#pragma omp for schedule(static)
			for( IndexType i = begin; i < end; i++ )
			v[ i ] = reduction( v[ i ], offset );
			}
			#else
			for( IndexType i = begin; i < end; i++ )
			v[ i ] = reduction( v[ i ], shift );
	#endif		#endif
	}		}

	////
	// PrefixSum on CUDA device
	template< PrefixSumType Type >		template< PrefixSumType Type >
	template< typename Vector,		template< typename Vector,
	typename Reduction >		typename Reduction >
	@@ -139,7 +161,7 @@ perform( Vector& v,
	const typename Vector::IndexType begin,		const typename Vector::IndexType begin,
	const typename Vector::IndexType end,		const typename Vector::IndexType end,
	const Reduction& reduction,		const Reduction& reduction,
	const typename Vector::RealType& zero )		const typename Vector::RealType zero )
	{		{
	#ifdef HAVE_CUDA		#ifdef HAVE_CUDA
	using RealType = typename Vector::RealType;		using RealType = typename Vector::RealType;
	@@ -156,9 +178,61 @@ perform( Vector& v,
	#endif		#endif
	}		}

			template< PrefixSumType Type >
			template< typename Vector,
			typename Reduction >
			auto
			PrefixSum< Devices::Cuda, Type >::
			performFirstPhase( Vector& v,
			const typename Vector::IndexType begin,
			const typename Vector::IndexType end,
			const Reduction& reduction,
			const typename Vector::RealType zero )
			{
			#ifdef HAVE_CUDA
			using RealType = typename Vector::RealType;
			using IndexType = typename Vector::IndexType;

			return CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::performFirstPhase(
			end - begin,
			&v[ begin ], // input
			&v[ begin ], // output
			reduction,
			zero );
			#else
			throw Exceptions::CudaSupportMissing();
			#endif
			}

			template< PrefixSumType Type >
			template< typename Vector,
			typename BlockShifts,
			typename Reduction >
			void
			PrefixSum< Devices::Cuda, Type >::
			performSecondPhase( Vector& v,
			const BlockShifts& blockShifts,
			const typename Vector::IndexType begin,
			const typename Vector::IndexType end,
			const Reduction& reduction,
			const typename Vector::RealType shift )
			{
			#ifdef HAVE_CUDA
			using RealType = typename Vector::RealType;
			using IndexType = typename Vector::IndexType;

			CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::performSecondPhase(
			end - begin,
			&v[ begin ], // output
			blockShifts.getData(),
			reduction,
			shift );
			#else
			throw Exceptions::CudaSupportMissing();
			#endif
			}


	////
	// PrefixSum on host
	template< PrefixSumType Type >		template< PrefixSumType Type >
	template< typename Vector,		template< typename Vector,
	typename Reduction,		typename Reduction,
	@@ -170,7 +244,7 @@ perform( Vector& v,
	const typename Vector::IndexType begin,		const typename Vector::IndexType begin,
	const typename Vector::IndexType end,		const typename Vector::IndexType end,
	const Reduction& reduction,		const Reduction& reduction,
	const typename Vector::RealType& zero )		const typename Vector::RealType zero )
	{		{
	using RealType = typename Vector::RealType;		using RealType = typename Vector::RealType;
	using IndexType = typename Vector::IndexType;		using IndexType = typename Vector::IndexType;
	@@ -197,8 +271,6 @@ perform( Vector& v,
	}		}
	}		}

	////
	// PrefixSum on CUDA device
	template< PrefixSumType Type >		template< PrefixSumType Type >
	template< typename Vector,		template< typename Vector,
	typename Reduction,		typename Reduction,
	@@ -210,7 +282,7 @@ perform( Vector& v,
	const typename Vector::IndexType begin,		const typename Vector::IndexType begin,
	const typename Vector::IndexType end,		const typename Vector::IndexType end,
	const Reduction& reduction,		const Reduction& reduction,
	const typename Vector::RealType& zero )		const typename Vector::RealType zero )
	{		{
	#ifdef HAVE_CUDA		#ifdef HAVE_CUDA
	using RealType = typename Vector::RealType;		using RealType = typename Vector::RealType;