Loading src/TNL/Algorithms/DistributedScan.h +13 −13 Original line number Diff line number Diff line Loading @@ -29,9 +29,9 @@ struct DistributedScan typename DistributedVector::IndexType begin, typename DistributedVector::IndexType end, const Reduction& reduction, const typename DistributedVector::RealType zero ) const typename DistributedVector::ValueType zero ) { using RealType = typename DistributedVector::RealType; using ValueType = typename DistributedVector::ValueType; using DeviceType = typename DistributedVector::DeviceType; const auto group = v.getCommunicationGroup(); Loading @@ -43,23 +43,23 @@ struct DistributedScan // perform first phase on the local data auto localView = v.getLocalView(); const auto blockShifts = Scan< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero ); const RealType localSum = blockShifts.getElement( blockShifts.getSize() - 1 ); const auto block_results = Scan< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero ); const ValueType local_result = block_results.getElement( block_results.getSize() - 1 ); // exchange local sums between ranks // exchange local results between ranks const int nproc = MPI::GetSize( group ); RealType dataForScatter[ nproc ]; for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum; Containers::Vector< RealType, Devices::Host > rankSums( nproc ); ValueType dataForScatter[ nproc ]; for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = local_result; Containers::Vector< ValueType, Devices::Host > rank_results( nproc ); // NOTE: exchanging general data types does not work with MPI MPI::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group ); MPI::Alltoall( dataForScatter, 1, rank_results.getData(), 1, group ); // compute the scan of the per-rank sums Scan< Devices::Host, ScanType::Exclusive >::perform( rankSums, 0, nproc, reduction, zero ); // compute the scan of the per-rank results Scan< Devices::Host, ScanType::Exclusive >::perform( rank_results, 0, nproc, reduction, zero ); // perform second phase: shift by the per-block and per-rank offsets // perform the second phase, using the per-block and per-rank results const int rank = MPI::GetRank( group ); Scan< DeviceType, Type >::performSecondPhase( localView, blockShifts, begin, end, reduction, rankSums[ rank ] ); Scan< DeviceType, Type >::performSecondPhase( localView, block_results, begin, end, reduction, rank_results[ rank ] ); } } }; Loading src/TNL/Algorithms/Scan.h +12 −12 Original line number Diff line number Diff line Loading @@ -134,7 +134,7 @@ struct Scan< Devices::Sequential, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); template< typename Vector, typename Reduction > Loading @@ -143,7 +143,7 @@ struct Scan< Devices::Sequential, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); template< typename Vector, typename BlockShifts, Loading @@ -154,7 +154,7 @@ struct Scan< Devices::Sequential, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ); const typename Vector::ValueType shift ); }; template< ScanType Type > Loading Loading @@ -194,7 +194,7 @@ struct Scan< Devices::Host, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); template< typename Vector, typename Reduction > Loading @@ -203,7 +203,7 @@ struct Scan< Devices::Host, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); template< typename Vector, typename BlockShifts, Loading @@ -214,7 +214,7 @@ struct Scan< Devices::Host, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ); const typename Vector::ValueType shift ); }; template< ScanType Type > Loading Loading @@ -254,7 +254,7 @@ struct Scan< Devices::Cuda, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); template< typename Vector, typename Reduction > Loading @@ -263,7 +263,7 @@ struct Scan< Devices::Cuda, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); template< typename Vector, typename BlockShifts, Loading @@ -274,7 +274,7 @@ struct Scan< Devices::Cuda, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ); const typename Vector::ValueType shift ); }; template< ScanType Type > Loading Loading @@ -318,7 +318,7 @@ struct SegmentedScan< Devices::Sequential, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); }; template< ScanType Type > Loading Loading @@ -362,7 +362,7 @@ struct SegmentedScan< Devices::Host, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); }; template< ScanType Type > Loading Loading @@ -408,7 +408,7 @@ struct SegmentedScan< Devices::Cuda, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); }; } // namespace Algorithms Loading src/TNL/Algorithms/Scan.hpp +53 −55 Original line number Diff line number Diff line Loading @@ -33,9 +33,9 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { // sequential prefix-sum does not need a second phase // sequential scan does not need a second phase performFirstPhase( v, begin, end, reduction, zero ); } Loading @@ -48,33 +48,31 @@ performFirstPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; // FIXME: StaticArray does not have getElement() which is used in DistributedScan // return Containers::StaticArray< 1, RealType > block_sums; Containers::Array< RealType, Devices::Host > block_sums( 1 ); block_sums[ 0 ] = zero; if( Type == ScanType::Inclusive ) { for( IndexType i = begin + 1; i < end; i++ ) v[ i ] = reduction( v[ i ], v[ i - 1 ] ); block_sums[ 0 ] = v[ end - 1 ]; } else // Exclusive prefix sum else // Exclusive scan { RealType aux = zero; ValueType aux = zero; for( IndexType i = begin; i < end; i++ ) { const RealType x = v[ i ]; const ValueType x = v[ i ]; v[ i ] = aux; aux = reduction( aux, x ); } block_sums[ 0 ] = aux; } return block_sums; // sequential scan = one block, so the exclusive scan is trivially [zero] // FIXME: StaticArray does not have getElement() which is used in DistributedScan // Containers::StaticArray< 1, ValueType > block_results; Containers::Array< ValueType, Devices::Host > block_results( 1 ); block_results[ 0 ] = zero; return block_results; } template< ScanType Type > Loading @@ -88,7 +86,7 @@ performSecondPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ) const typename Vector::ValueType shift ) { using IndexType = typename Vector::IndexType; Loading @@ -105,7 +103,7 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { #ifdef HAVE_OPENMP if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() >= 2 ) { Loading @@ -128,50 +126,50 @@ performFirstPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { #ifdef HAVE_OPENMP using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; const int threads = Devices::Host::getMaxThreadsCount(); Containers::Array< RealType > block_sums( threads + 1 ); block_sums[ 0 ] = zero; Containers::Array< ValueType > block_results( threads + 1 ); #pragma omp parallel num_threads(threads) { // init const int thread_idx = omp_get_thread_num(); RealType block_sum = zero; ValueType block_result = zero; // perform prefix-sum on blocks statically assigned to threads // perform scan on blocks statically assigned to threads if( Type == ScanType::Inclusive ) { #pragma omp for schedule(static) for( IndexType i = begin; i < end; i++ ) { block_sum = reduction( block_sum, v[ i ] ); v[ i ] = block_sum; block_result = reduction( block_result, v[ i ] ); v[ i ] = block_result; } } else { #pragma omp for schedule(static) for( IndexType i = begin; i < end; i++ ) { const RealType x = v[ i ]; v[ i ] = block_sum; block_sum = reduction( block_sum, x ); const ValueType x = v[ i ]; v[ i ] = block_result; block_result = reduction( block_result, x ); } } // write the block sums into the buffer block_sums[ thread_idx + 1 ] = block_sum; // write the block result into the buffer block_results[ thread_idx + 1 ] = block_result; } // block_sums now contains sums of numbers in each block. The first phase // ends by computing prefix-sum of this array. // block_results now contains scan results for each block. The first phase // ends by computing an exclusive scan of this array. block_results[ 0 ] = zero; for( int i = 1; i < threads + 1; i++ ) block_sums[ i ] = reduction( block_sums[ i ], block_sums[ i - 1 ] ); block_results[ i ] = reduction( block_results[ i ], block_results[ i - 1 ] ); // block_sums now contains shift values for each block - to be used in the second phase return block_sums; // block_results now contains shift values for each block - to be used in the second phase return block_results; #else return Scan< Devices::Sequential, Type >::performFirstPhase( v, begin, end, reduction, zero ); #endif Loading @@ -188,10 +186,10 @@ performSecondPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ) const typename Vector::ValueType shift ) { #ifdef HAVE_OPENMP using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; const int threads = blockShifts.getSize() - 1; Loading @@ -200,7 +198,7 @@ performSecondPhase( Vector& v, #pragma omp parallel num_threads(threads) { const int thread_idx = omp_get_thread_num(); const RealType offset = reduction( blockShifts[ thread_idx ], shift ); const ValueType offset = reduction( blockShifts[ thread_idx ], shift ); // shift intermediate results by the offset #pragma omp for schedule(static) Loading @@ -221,13 +219,13 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; detail::CudaScanKernelLauncher< Type, RealType, IndexType >::perform( detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::perform( end - begin, &v.getData()[ begin ], // input &v.getData()[ begin ], // output Loading @@ -247,13 +245,13 @@ performFirstPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; return detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase( return detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::performFirstPhase( end - begin, &v.getData()[ begin ], // input &v.getData()[ begin ], // output Loading @@ -275,13 +273,13 @@ performSecondPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ) const typename Vector::ValueType shift ) { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase( detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::performSecondPhase( end - begin, &v.getData()[ begin ], // output blockShifts.getData(), Loading @@ -304,9 +302,9 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; if( Type == ScanType::Inclusive ) Loading @@ -315,13 +313,13 @@ perform( Vector& v, if( ! flags[ i ] ) v[ i ] = reduction( v[ i ], v[ i - 1 ] ); } else // Exclusive prefix sum else // Exclusive scan { RealType aux( v[ begin ] ); ValueType aux( v[ begin ] ); v[ begin ] = zero; for( IndexType i = begin + 1; i < end; i++ ) { RealType x = v[ i ]; ValueType x = v[ i ]; if( flags[ i ] ) aux = zero; v[ i ] = aux; Loading @@ -341,7 +339,7 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { #ifdef HAVE_OPENMP // TODO: parallelize with OpenMP Loading @@ -362,10 +360,10 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." ); Loading src/TNL/Algorithms/detail/CudaScanKernel.h +19 −21 Original line number Diff line number Diff line Loading @@ -34,7 +34,7 @@ cudaFirstPhaseBlockScan( const ScanType scanType, const int elementsInBlock, const Real* input, Real* output, Real* auxArray ) Real* blockResults ) { Real* sharedData = TNL::Cuda::getSharedMemory< Real >(); Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2 ]; Loading Loading @@ -147,11 +147,11 @@ cudaFirstPhaseBlockScan( const ScanType scanType, { if( scanType == ScanType::Exclusive ) { auxArray[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ], blockResults[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ], sharedData[ Cuda::getInterleaving( lastElementInBlock ) ] ); } else auxArray[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ]; blockResults[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ]; } } Loading @@ -164,12 +164,12 @@ cudaSecondPhaseBlockScan( Reduction reduction, const int elementsInBlock, const Index gridIdx, const Index maxGridSize, const Real* auxArray, const Real* blockResults, Real* data, Real shift ) { if( gridIdx > 0 || blockIdx.x > 0 ) shift = reduction( shift, auxArray[ gridIdx * maxGridSize + blockIdx.x - 1 ] ); shift = reduction( shift, blockResults[ gridIdx * maxGridSize + blockIdx.x - 1 ] ); const int readOffset = blockIdx.x * elementsInBlock; int readIdx = threadIdx.x; while( readIdx < elementsInBlock && readOffset + readIdx < size ) Loading Loading @@ -248,9 +248,9 @@ struct CudaScanKernelLauncher const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; // allocate array for the block sums Containers::Array< Real, Devices::Cuda > blockSums; blockSums.setSize( numberOfBlocks ); // allocate array for the block results Containers::Array< Real, Devices::Cuda > blockResults; blockResults.setSize( numberOfBlocks ); // loop over all grids for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { Loading Loading @@ -278,20 +278,20 @@ struct CudaScanKernelLauncher elementsInBlock, &deviceInput[ gridOffset ], &deviceOutput[ gridOffset ], &blockSums.getData()[ gridIdx * maxGridSize() ] ); &blockResults.getData()[ gridIdx * maxGridSize() ] ); } // synchronize the null-stream after all grids cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; // blockSums now contains sums of numbers in each block. The first phase // ends by computing prefix-sum of this array. // blockResults now contains scan results for each block. The first phase // ends by computing an exclusive scan of this array. if( numberOfBlocks > 1 ) { CudaScanKernelLauncher< ScanType::Inclusive, Real, Index >::perform( blockSums.getSize(), blockSums.getData(), blockSums.getData(), blockResults.getSize(), blockResults.getData(), blockResults.getData(), reduction, zero, blockSize ); Loading @@ -301,8 +301,8 @@ struct CudaScanKernelLauncher // to check if we test the algorithm with more than one CUDA grid. gridsCount() = numberOfGrids; // blockSums now contains shift values for each block - to be used in the second phase return blockSums; // blockResults now contains shift values for each block - to be used in the second phase return blockResults; } /**** Loading Loading @@ -363,10 +363,8 @@ struct CudaScanKernelLauncher TNL_CHECK_CUDA_DEVICE; } /**** * The following serves for setting smaller maxGridSize so that we can force * the prefix sum in CUDA to run with more the one grids in unit tests. */ // The following serves for setting smaller maxGridSize so that we can force // the scan in CUDA to run with more than one grid in unit tests. static int& maxGridSize() { static int maxGridSize = Cuda::getMaxGridSize(); Loading Loading
src/TNL/Algorithms/DistributedScan.h +13 −13 Original line number Diff line number Diff line Loading @@ -29,9 +29,9 @@ struct DistributedScan typename DistributedVector::IndexType begin, typename DistributedVector::IndexType end, const Reduction& reduction, const typename DistributedVector::RealType zero ) const typename DistributedVector::ValueType zero ) { using RealType = typename DistributedVector::RealType; using ValueType = typename DistributedVector::ValueType; using DeviceType = typename DistributedVector::DeviceType; const auto group = v.getCommunicationGroup(); Loading @@ -43,23 +43,23 @@ struct DistributedScan // perform first phase on the local data auto localView = v.getLocalView(); const auto blockShifts = Scan< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero ); const RealType localSum = blockShifts.getElement( blockShifts.getSize() - 1 ); const auto block_results = Scan< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero ); const ValueType local_result = block_results.getElement( block_results.getSize() - 1 ); // exchange local sums between ranks // exchange local results between ranks const int nproc = MPI::GetSize( group ); RealType dataForScatter[ nproc ]; for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum; Containers::Vector< RealType, Devices::Host > rankSums( nproc ); ValueType dataForScatter[ nproc ]; for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = local_result; Containers::Vector< ValueType, Devices::Host > rank_results( nproc ); // NOTE: exchanging general data types does not work with MPI MPI::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group ); MPI::Alltoall( dataForScatter, 1, rank_results.getData(), 1, group ); // compute the scan of the per-rank sums Scan< Devices::Host, ScanType::Exclusive >::perform( rankSums, 0, nproc, reduction, zero ); // compute the scan of the per-rank results Scan< Devices::Host, ScanType::Exclusive >::perform( rank_results, 0, nproc, reduction, zero ); // perform second phase: shift by the per-block and per-rank offsets // perform the second phase, using the per-block and per-rank results const int rank = MPI::GetRank( group ); Scan< DeviceType, Type >::performSecondPhase( localView, blockShifts, begin, end, reduction, rankSums[ rank ] ); Scan< DeviceType, Type >::performSecondPhase( localView, block_results, begin, end, reduction, rank_results[ rank ] ); } } }; Loading
src/TNL/Algorithms/Scan.h +12 −12 Original line number Diff line number Diff line Loading @@ -134,7 +134,7 @@ struct Scan< Devices::Sequential, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); template< typename Vector, typename Reduction > Loading @@ -143,7 +143,7 @@ struct Scan< Devices::Sequential, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); template< typename Vector, typename BlockShifts, Loading @@ -154,7 +154,7 @@ struct Scan< Devices::Sequential, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ); const typename Vector::ValueType shift ); }; template< ScanType Type > Loading Loading @@ -194,7 +194,7 @@ struct Scan< Devices::Host, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); template< typename Vector, typename Reduction > Loading @@ -203,7 +203,7 @@ struct Scan< Devices::Host, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); template< typename Vector, typename BlockShifts, Loading @@ -214,7 +214,7 @@ struct Scan< Devices::Host, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ); const typename Vector::ValueType shift ); }; template< ScanType Type > Loading Loading @@ -254,7 +254,7 @@ struct Scan< Devices::Cuda, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); template< typename Vector, typename Reduction > Loading @@ -263,7 +263,7 @@ struct Scan< Devices::Cuda, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); template< typename Vector, typename BlockShifts, Loading @@ -274,7 +274,7 @@ struct Scan< Devices::Cuda, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ); const typename Vector::ValueType shift ); }; template< ScanType Type > Loading Loading @@ -318,7 +318,7 @@ struct SegmentedScan< Devices::Sequential, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); }; template< ScanType Type > Loading Loading @@ -362,7 +362,7 @@ struct SegmentedScan< Devices::Host, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); }; template< ScanType Type > Loading Loading @@ -408,7 +408,7 @@ struct SegmentedScan< Devices::Cuda, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); const typename Vector::ValueType zero ); }; } // namespace Algorithms Loading
src/TNL/Algorithms/Scan.hpp +53 −55 Original line number Diff line number Diff line Loading @@ -33,9 +33,9 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { // sequential prefix-sum does not need a second phase // sequential scan does not need a second phase performFirstPhase( v, begin, end, reduction, zero ); } Loading @@ -48,33 +48,31 @@ performFirstPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; // FIXME: StaticArray does not have getElement() which is used in DistributedScan // return Containers::StaticArray< 1, RealType > block_sums; Containers::Array< RealType, Devices::Host > block_sums( 1 ); block_sums[ 0 ] = zero; if( Type == ScanType::Inclusive ) { for( IndexType i = begin + 1; i < end; i++ ) v[ i ] = reduction( v[ i ], v[ i - 1 ] ); block_sums[ 0 ] = v[ end - 1 ]; } else // Exclusive prefix sum else // Exclusive scan { RealType aux = zero; ValueType aux = zero; for( IndexType i = begin; i < end; i++ ) { const RealType x = v[ i ]; const ValueType x = v[ i ]; v[ i ] = aux; aux = reduction( aux, x ); } block_sums[ 0 ] = aux; } return block_sums; // sequential scan = one block, so the exclusive scan is trivially [zero] // FIXME: StaticArray does not have getElement() which is used in DistributedScan // Containers::StaticArray< 1, ValueType > block_results; Containers::Array< ValueType, Devices::Host > block_results( 1 ); block_results[ 0 ] = zero; return block_results; } template< ScanType Type > Loading @@ -88,7 +86,7 @@ performSecondPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ) const typename Vector::ValueType shift ) { using IndexType = typename Vector::IndexType; Loading @@ -105,7 +103,7 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { #ifdef HAVE_OPENMP if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() >= 2 ) { Loading @@ -128,50 +126,50 @@ performFirstPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { #ifdef HAVE_OPENMP using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; const int threads = Devices::Host::getMaxThreadsCount(); Containers::Array< RealType > block_sums( threads + 1 ); block_sums[ 0 ] = zero; Containers::Array< ValueType > block_results( threads + 1 ); #pragma omp parallel num_threads(threads) { // init const int thread_idx = omp_get_thread_num(); RealType block_sum = zero; ValueType block_result = zero; // perform prefix-sum on blocks statically assigned to threads // perform scan on blocks statically assigned to threads if( Type == ScanType::Inclusive ) { #pragma omp for schedule(static) for( IndexType i = begin; i < end; i++ ) { block_sum = reduction( block_sum, v[ i ] ); v[ i ] = block_sum; block_result = reduction( block_result, v[ i ] ); v[ i ] = block_result; } } else { #pragma omp for schedule(static) for( IndexType i = begin; i < end; i++ ) { const RealType x = v[ i ]; v[ i ] = block_sum; block_sum = reduction( block_sum, x ); const ValueType x = v[ i ]; v[ i ] = block_result; block_result = reduction( block_result, x ); } } // write the block sums into the buffer block_sums[ thread_idx + 1 ] = block_sum; // write the block result into the buffer block_results[ thread_idx + 1 ] = block_result; } // block_sums now contains sums of numbers in each block. The first phase // ends by computing prefix-sum of this array. // block_results now contains scan results for each block. The first phase // ends by computing an exclusive scan of this array. block_results[ 0 ] = zero; for( int i = 1; i < threads + 1; i++ ) block_sums[ i ] = reduction( block_sums[ i ], block_sums[ i - 1 ] ); block_results[ i ] = reduction( block_results[ i ], block_results[ i - 1 ] ); // block_sums now contains shift values for each block - to be used in the second phase return block_sums; // block_results now contains shift values for each block - to be used in the second phase return block_results; #else return Scan< Devices::Sequential, Type >::performFirstPhase( v, begin, end, reduction, zero ); #endif Loading @@ -188,10 +186,10 @@ performSecondPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ) const typename Vector::ValueType shift ) { #ifdef HAVE_OPENMP using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; const int threads = blockShifts.getSize() - 1; Loading @@ -200,7 +198,7 @@ performSecondPhase( Vector& v, #pragma omp parallel num_threads(threads) { const int thread_idx = omp_get_thread_num(); const RealType offset = reduction( blockShifts[ thread_idx ], shift ); const ValueType offset = reduction( blockShifts[ thread_idx ], shift ); // shift intermediate results by the offset #pragma omp for schedule(static) Loading @@ -221,13 +219,13 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; detail::CudaScanKernelLauncher< Type, RealType, IndexType >::perform( detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::perform( end - begin, &v.getData()[ begin ], // input &v.getData()[ begin ], // output Loading @@ -247,13 +245,13 @@ performFirstPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; return detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase( return detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::performFirstPhase( end - begin, &v.getData()[ begin ], // input &v.getData()[ begin ], // output Loading @@ -275,13 +273,13 @@ performSecondPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ) const typename Vector::ValueType shift ) { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase( detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::performSecondPhase( end - begin, &v.getData()[ begin ], // output blockShifts.getData(), Loading @@ -304,9 +302,9 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; if( Type == ScanType::Inclusive ) Loading @@ -315,13 +313,13 @@ perform( Vector& v, if( ! flags[ i ] ) v[ i ] = reduction( v[ i ], v[ i - 1 ] ); } else // Exclusive prefix sum else // Exclusive scan { RealType aux( v[ begin ] ); ValueType aux( v[ begin ] ); v[ begin ] = zero; for( IndexType i = begin + 1; i < end; i++ ) { RealType x = v[ i ]; ValueType x = v[ i ]; if( flags[ i ] ) aux = zero; v[ i ] = aux; Loading @@ -341,7 +339,7 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { #ifdef HAVE_OPENMP // TODO: parallelize with OpenMP Loading @@ -362,10 +360,10 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) const typename Vector::ValueType zero ) { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." ); Loading
src/TNL/Algorithms/detail/CudaScanKernel.h +19 −21 Original line number Diff line number Diff line Loading @@ -34,7 +34,7 @@ cudaFirstPhaseBlockScan( const ScanType scanType, const int elementsInBlock, const Real* input, Real* output, Real* auxArray ) Real* blockResults ) { Real* sharedData = TNL::Cuda::getSharedMemory< Real >(); Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2 ]; Loading Loading @@ -147,11 +147,11 @@ cudaFirstPhaseBlockScan( const ScanType scanType, { if( scanType == ScanType::Exclusive ) { auxArray[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ], blockResults[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ], sharedData[ Cuda::getInterleaving( lastElementInBlock ) ] ); } else auxArray[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ]; blockResults[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ]; } } Loading @@ -164,12 +164,12 @@ cudaSecondPhaseBlockScan( Reduction reduction, const int elementsInBlock, const Index gridIdx, const Index maxGridSize, const Real* auxArray, const Real* blockResults, Real* data, Real shift ) { if( gridIdx > 0 || blockIdx.x > 0 ) shift = reduction( shift, auxArray[ gridIdx * maxGridSize + blockIdx.x - 1 ] ); shift = reduction( shift, blockResults[ gridIdx * maxGridSize + blockIdx.x - 1 ] ); const int readOffset = blockIdx.x * elementsInBlock; int readIdx = threadIdx.x; while( readIdx < elementsInBlock && readOffset + readIdx < size ) Loading Loading @@ -248,9 +248,9 @@ struct CudaScanKernelLauncher const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; // allocate array for the block sums Containers::Array< Real, Devices::Cuda > blockSums; blockSums.setSize( numberOfBlocks ); // allocate array for the block results Containers::Array< Real, Devices::Cuda > blockResults; blockResults.setSize( numberOfBlocks ); // loop over all grids for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { Loading Loading @@ -278,20 +278,20 @@ struct CudaScanKernelLauncher elementsInBlock, &deviceInput[ gridOffset ], &deviceOutput[ gridOffset ], &blockSums.getData()[ gridIdx * maxGridSize() ] ); &blockResults.getData()[ gridIdx * maxGridSize() ] ); } // synchronize the null-stream after all grids cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; // blockSums now contains sums of numbers in each block. The first phase // ends by computing prefix-sum of this array. // blockResults now contains scan results for each block. The first phase // ends by computing an exclusive scan of this array. if( numberOfBlocks > 1 ) { CudaScanKernelLauncher< ScanType::Inclusive, Real, Index >::perform( blockSums.getSize(), blockSums.getData(), blockSums.getData(), blockResults.getSize(), blockResults.getData(), blockResults.getData(), reduction, zero, blockSize ); Loading @@ -301,8 +301,8 @@ struct CudaScanKernelLauncher // to check if we test the algorithm with more than one CUDA grid. gridsCount() = numberOfGrids; // blockSums now contains shift values for each block - to be used in the second phase return blockSums; // blockResults now contains shift values for each block - to be used in the second phase return blockResults; } /**** Loading Loading @@ -363,10 +363,8 @@ struct CudaScanKernelLauncher TNL_CHECK_CUDA_DEVICE; } /**** * The following serves for setting smaller maxGridSize so that we can force * the prefix sum in CUDA to run with more the one grids in unit tests. */ // The following serves for setting smaller maxGridSize so that we can force // the scan in CUDA to run with more than one grid in unit tests. static int& maxGridSize() { static int maxGridSize = Cuda::getMaxGridSize(); Loading