Refactoring scan (dfe6b1e8) · Commits · TNL / tnl-dev

src/TNL/Algorithms/DistributedScan.h

+13 −13

Original line number	Diff line number	Diff line
		@@ -29,9 +29,9 @@ struct DistributedScan
		typename DistributedVector::IndexType begin,
		typename DistributedVector::IndexType end,
		const Reduction& reduction,
		const typename DistributedVector::RealType zero )
		const typename DistributedVector::ValueType zero )
		{
		using RealType = typename DistributedVector::RealType;
		using ValueType = typename DistributedVector::ValueType;
		using DeviceType = typename DistributedVector::DeviceType;

		const auto group = v.getCommunicationGroup();
		@@ -43,23 +43,23 @@ struct DistributedScan

		// perform first phase on the local data
		auto localView = v.getLocalView();
		const auto blockShifts = Scan< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero );
		const RealType localSum = blockShifts.getElement( blockShifts.getSize() - 1 );
		const auto block_results = Scan< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero );
		const ValueType local_result = block_results.getElement( block_results.getSize() - 1 );

		// exchange local sums between ranks
		// exchange local results between ranks
		const int nproc = MPI::GetSize( group );
		RealType dataForScatter[ nproc ];
		for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum;
		Containers::Vector< RealType, Devices::Host > rankSums( nproc );
		ValueType dataForScatter[ nproc ];
		for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = local_result;
		Containers::Vector< ValueType, Devices::Host > rank_results( nproc );
		// NOTE: exchanging general data types does not work with MPI
		MPI::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group );
		MPI::Alltoall( dataForScatter, 1, rank_results.getData(), 1, group );

		// compute the scan of the per-rank sums
		Scan< Devices::Host, ScanType::Exclusive >::perform( rankSums, 0, nproc, reduction, zero );
		// compute the scan of the per-rank results
		Scan< Devices::Host, ScanType::Exclusive >::perform( rank_results, 0, nproc, reduction, zero );

		// perform second phase: shift by the per-block and per-rank offsets
		// perform the second phase, using the per-block and per-rank results
		const int rank = MPI::GetRank( group );
		Scan< DeviceType, Type >::performSecondPhase( localView, blockShifts, begin, end, reduction, rankSums[ rank ] );
		Scan< DeviceType, Type >::performSecondPhase( localView, block_results, begin, end, reduction, rank_results[ rank ] );
		}
		}
		};

src/TNL/Algorithms/Scan.h

+12 −12

Original line number	Diff line number	Diff line
		@@ -134,7 +134,7 @@ struct Scan< Devices::Sequential, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero );
		const typename Vector::ValueType zero );

		template< typename Vector,
		typename Reduction >
		@@ -143,7 +143,7 @@ struct Scan< Devices::Sequential, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero );
		const typename Vector::ValueType zero );

		template< typename Vector,
		typename BlockShifts,
		@@ -154,7 +154,7 @@ struct Scan< Devices::Sequential, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType shift );
		const typename Vector::ValueType shift );
		};

		template< ScanType Type >
		@@ -194,7 +194,7 @@ struct Scan< Devices::Host, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero );
		const typename Vector::ValueType zero );

		template< typename Vector,
		typename Reduction >
		@@ -203,7 +203,7 @@ struct Scan< Devices::Host, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero );
		const typename Vector::ValueType zero );

		template< typename Vector,
		typename BlockShifts,
		@@ -214,7 +214,7 @@ struct Scan< Devices::Host, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType shift );
		const typename Vector::ValueType shift );
		};

		template< ScanType Type >
		@@ -254,7 +254,7 @@ struct Scan< Devices::Cuda, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero );
		const typename Vector::ValueType zero );

		template< typename Vector,
		typename Reduction >
		@@ -263,7 +263,7 @@ struct Scan< Devices::Cuda, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero );
		const typename Vector::ValueType zero );

		template< typename Vector,
		typename BlockShifts,
		@@ -274,7 +274,7 @@ struct Scan< Devices::Cuda, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType shift );
		const typename Vector::ValueType shift );
		};

		template< ScanType Type >
		@@ -318,7 +318,7 @@ struct SegmentedScan< Devices::Sequential, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero );
		const typename Vector::ValueType zero );
		};

		template< ScanType Type >
		@@ -362,7 +362,7 @@ struct SegmentedScan< Devices::Host, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero );
		const typename Vector::ValueType zero );
		};

		template< ScanType Type >
		@@ -408,7 +408,7 @@ struct SegmentedScan< Devices::Cuda, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero );
		const typename Vector::ValueType zero );
		};

		} // namespace Algorithms

src/TNL/Algorithms/Scan.hpp

+53 −55

Original line number	Diff line number	Diff line
		@@ -33,9 +33,9 @@ perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero )
		const typename Vector::ValueType zero )
		{
		// sequential prefix-sum does not need a second phase
		// sequential scan does not need a second phase
		performFirstPhase( v, begin, end, reduction, zero );
		}

		@@ -48,33 +48,31 @@ performFirstPhase( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero )
		const typename Vector::ValueType zero )
		{
		using RealType = typename Vector::RealType;
		using ValueType = typename Vector::ValueType;
		using IndexType = typename Vector::IndexType;

		// FIXME: StaticArray does not have getElement() which is used in DistributedScan
		// return Containers::StaticArray< 1, RealType > block_sums;
		Containers::Array< RealType, Devices::Host > block_sums( 1 );
		block_sums[ 0 ] = zero;

		if( Type == ScanType::Inclusive ) {
		for( IndexType i = begin + 1; i < end; i++ )
		v[ i ] = reduction( v[ i ], v[ i - 1 ] );
		block_sums[ 0 ] = v[ end - 1 ];
		}
		else // Exclusive prefix sum
		else // Exclusive scan
		{
		RealType aux = zero;
		ValueType aux = zero;
		for( IndexType i = begin; i < end; i++ ) {
		const RealType x = v[ i ];
		const ValueType x = v[ i ];
		v[ i ] = aux;
		aux = reduction( aux, x );
		}
		block_sums[ 0 ] = aux;
		}

		return block_sums;
		// sequential scan = one block, so the exclusive scan is trivially [zero]
		// FIXME: StaticArray does not have getElement() which is used in DistributedScan
		// Containers::StaticArray< 1, ValueType > block_results;
		Containers::Array< ValueType, Devices::Host > block_results( 1 );
		block_results[ 0 ] = zero;
		return block_results;
		}

		template< ScanType Type >
		@@ -88,7 +86,7 @@ performSecondPhase( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType shift )
		const typename Vector::ValueType shift )
		{
		using IndexType = typename Vector::IndexType;

		@@ -105,7 +103,7 @@ perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero )
		const typename Vector::ValueType zero )
		{
		#ifdef HAVE_OPENMP
		if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() >= 2 ) {
		@@ -128,50 +126,50 @@ performFirstPhase( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero )
		const typename Vector::ValueType zero )
		{
		#ifdef HAVE_OPENMP
		using RealType = typename Vector::RealType;
		using ValueType = typename Vector::ValueType;
		using IndexType = typename Vector::IndexType;

		const int threads = Devices::Host::getMaxThreadsCount();
		Containers::Array< RealType > block_sums( threads + 1 );
		block_sums[ 0 ] = zero;
		Containers::Array< ValueType > block_results( threads + 1 );

		#pragma omp parallel num_threads(threads)
		{
		// init
		const int thread_idx = omp_get_thread_num();
		RealType block_sum = zero;
		ValueType block_result = zero;

		// perform prefix-sum on blocks statically assigned to threads
		// perform scan on blocks statically assigned to threads
		if( Type == ScanType::Inclusive ) {
		#pragma omp for schedule(static)
		for( IndexType i = begin; i < end; i++ ) {
		block_sum = reduction( block_sum, v[ i ] );
		v[ i ] = block_sum;
		block_result = reduction( block_result, v[ i ] );
		v[ i ] = block_result;
		}
		}
		else {
		#pragma omp for schedule(static)
		for( IndexType i = begin; i < end; i++ ) {
		const RealType x = v[ i ];
		v[ i ] = block_sum;
		block_sum = reduction( block_sum, x );
		const ValueType x = v[ i ];
		v[ i ] = block_result;
		block_result = reduction( block_result, x );
		}
		}

		// write the block sums into the buffer
		block_sums[ thread_idx + 1 ] = block_sum;
		// write the block result into the buffer
		block_results[ thread_idx + 1 ] = block_result;
		}

		// block_sums now contains sums of numbers in each block. The first phase
		// ends by computing prefix-sum of this array.
		// block_results now contains scan results for each block. The first phase
		// ends by computing an exclusive scan of this array.
		block_results[ 0 ] = zero;
		for( int i = 1; i < threads + 1; i++ )
		block_sums[ i ] = reduction( block_sums[ i ], block_sums[ i - 1 ] );
		block_results[ i ] = reduction( block_results[ i ], block_results[ i - 1 ] );

		// block_sums now contains shift values for each block - to be used in the second phase
		return block_sums;
		// block_results now contains shift values for each block - to be used in the second phase
		return block_results;
		#else
		return Scan< Devices::Sequential, Type >::performFirstPhase( v, begin, end, reduction, zero );
		#endif
		@@ -188,10 +186,10 @@ performSecondPhase( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType shift )
		const typename Vector::ValueType shift )
		{
		#ifdef HAVE_OPENMP
		using RealType = typename Vector::RealType;
		using ValueType = typename Vector::ValueType;
		using IndexType = typename Vector::IndexType;

		const int threads = blockShifts.getSize() - 1;
		@@ -200,7 +198,7 @@ performSecondPhase( Vector& v,
		#pragma omp parallel num_threads(threads)
		{
		const int thread_idx = omp_get_thread_num();
		const RealType offset = reduction( blockShifts[ thread_idx ], shift );
		const ValueType offset = reduction( blockShifts[ thread_idx ], shift );

		// shift intermediate results by the offset
		#pragma omp for schedule(static)
		@@ -221,13 +219,13 @@ perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero )
		const typename Vector::ValueType zero )
		{
		#ifdef HAVE_CUDA
		using RealType = typename Vector::RealType;
		using ValueType = typename Vector::ValueType;
		using IndexType = typename Vector::IndexType;

		detail::CudaScanKernelLauncher< Type, RealType, IndexType >::perform(
		detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::perform(
		end - begin,
		&v.getData()[ begin ], // input
		&v.getData()[ begin ], // output
		@@ -247,13 +245,13 @@ performFirstPhase( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero )
		const typename Vector::ValueType zero )
		{
		#ifdef HAVE_CUDA
		using RealType = typename Vector::RealType;
		using ValueType = typename Vector::ValueType;
		using IndexType = typename Vector::IndexType;

		return detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase(
		return detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::performFirstPhase(
		end - begin,
		&v.getData()[ begin ], // input
		&v.getData()[ begin ], // output
		@@ -275,13 +273,13 @@ performSecondPhase( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType shift )
		const typename Vector::ValueType shift )
		{
		#ifdef HAVE_CUDA
		using RealType = typename Vector::RealType;
		using ValueType = typename Vector::ValueType;
		using IndexType = typename Vector::IndexType;

		detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase(
		detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::performSecondPhase(
		end - begin,
		&v.getData()[ begin ], // output
		blockShifts.getData(),
		@@ -304,9 +302,9 @@ perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero )
		const typename Vector::ValueType zero )
		{
		using RealType = typename Vector::RealType;
		using ValueType = typename Vector::ValueType;
		using IndexType = typename Vector::IndexType;

		if( Type == ScanType::Inclusive )
		@@ -315,13 +313,13 @@ perform( Vector& v,
		if( ! flags[ i ] )
		v[ i ] = reduction( v[ i ], v[ i - 1 ] );
		}
		else // Exclusive prefix sum
		else // Exclusive scan
		{
		RealType aux( v[ begin ] );
		ValueType aux( v[ begin ] );
		v[ begin ] = zero;
		for( IndexType i = begin + 1; i < end; i++ )
		{
		RealType x = v[ i ];
		ValueType x = v[ i ];
		if( flags[ i ] )
		aux = zero;
		v[ i ] = aux;
		@@ -341,7 +339,7 @@ perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero )
		const typename Vector::ValueType zero )
		{
		#ifdef HAVE_OPENMP
		// TODO: parallelize with OpenMP
		@@ -362,10 +360,10 @@ perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero )
		const typename Vector::ValueType zero )
		{
		#ifdef HAVE_CUDA
		using RealType = typename Vector::RealType;
		using ValueType = typename Vector::ValueType;
		using IndexType = typename Vector::IndexType;

		throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." );

src/TNL/Algorithms/detail/CudaScanKernel.h

+19 −21

Original line number	Diff line number	Diff line
		@@ -34,7 +34,7 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
		const int elementsInBlock,
		const Real* input,
		Real* output,
		Real* auxArray )
		Real* blockResults )
		{
		Real* sharedData = TNL::Cuda::getSharedMemory< Real >();
		Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2 ];
		@@ -147,11 +147,11 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
		{
		if( scanType == ScanType::Exclusive )
		{
		auxArray[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ],
		blockResults[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ],
		sharedData[ Cuda::getInterleaving( lastElementInBlock ) ] );
		}
		else
		auxArray[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ];
		blockResults[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ];
		}
		}

		@@ -164,12 +164,12 @@ cudaSecondPhaseBlockScan( Reduction reduction,
		const int elementsInBlock,
		const Index gridIdx,
		const Index maxGridSize,
		const Real* auxArray,
		const Real* blockResults,
		Real* data,
		Real shift )
		{
		if( gridIdx > 0 \|\| blockIdx.x > 0 )
		shift = reduction( shift, auxArray[ gridIdx * maxGridSize + blockIdx.x - 1 ] );
		shift = reduction( shift, blockResults[ gridIdx * maxGridSize + blockIdx.x - 1 ] );
		const int readOffset = blockIdx.x * elementsInBlock;
		int readIdx = threadIdx.x;
		while( readIdx < elementsInBlock && readOffset + readIdx < size )
		@@ -248,9 +248,9 @@ struct CudaScanKernelLauncher
		const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
		//std::cerr << "numberOfgrids = " << numberOfGrids << std::endl;

		// allocate array for the block sums
		Containers::Array< Real, Devices::Cuda > blockSums;
		blockSums.setSize( numberOfBlocks );
		// allocate array for the block results
		Containers::Array< Real, Devices::Cuda > blockResults;
		blockResults.setSize( numberOfBlocks );

		// loop over all grids
		for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
		@@ -278,20 +278,20 @@ struct CudaScanKernelLauncher
		elementsInBlock,
		&deviceInput[ gridOffset ],
		&deviceOutput[ gridOffset ],
		&blockSums.getData()[ gridIdx * maxGridSize() ] );
		&blockResults.getData()[ gridIdx * maxGridSize() ] );
		}

		// synchronize the null-stream after all grids
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;

		// blockSums now contains sums of numbers in each block. The first phase
		// ends by computing prefix-sum of this array.
		// blockResults now contains scan results for each block. The first phase
		// ends by computing an exclusive scan of this array.
		if( numberOfBlocks > 1 ) {
		CudaScanKernelLauncher< ScanType::Inclusive, Real, Index >::perform(
		blockSums.getSize(),
		blockSums.getData(),
		blockSums.getData(),
		blockResults.getSize(),
		blockResults.getData(),
		blockResults.getData(),
		reduction,
		zero,
		blockSize );
		@@ -301,8 +301,8 @@ struct CudaScanKernelLauncher
		// to check if we test the algorithm with more than one CUDA grid.
		gridsCount() = numberOfGrids;

		// blockSums now contains shift values for each block - to be used in the second phase
		return blockSums;
		// blockResults now contains shift values for each block - to be used in the second phase
		return blockResults;
		}

		/****
		@@ -363,10 +363,8 @@ struct CudaScanKernelLauncher
		TNL_CHECK_CUDA_DEVICE;
		}

		/****
		* The following serves for setting smaller maxGridSize so that we can force
		* the prefix sum in CUDA to run with more the one grids in unit tests.
		*/
		// The following serves for setting smaller maxGridSize so that we can force
		// the scan in CUDA to run with more than one grid in unit tests.
		static int& maxGridSize()
		{
		static int maxGridSize = Cuda::getMaxGridSize();