Loading src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +37 −44 Original line number Diff line number Diff line Loading @@ -24,13 +24,11 @@ namespace Algorithms { #ifdef HAVE_CUDA template< typename Real, typename Operation, typename VolatileOperation, typename Reduction, typename Index > __global__ void cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, Operation operation, VolatileOperation volatileOperation, Reduction reduction, const Real zero, const Index size, const Index elementsInBlock, Loading @@ -40,8 +38,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, const Real gridShift ) { Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >(); volatile Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; volatile Real* warpSums = &auxData[ blockDim.x ]; Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; Real* warpSums = &auxData[ blockDim.x ]; const Index lastElementIdx = size - blockIdx.x * elementsInBlock; const Index lastElementInBlock = TNL::min( lastElementIdx, elementsInBlock ); Loading Loading @@ -70,7 +68,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, } } if( blockIdx.x == 0 && threadIdx.x == 0 ) operation( sharedData[ 0 ], gridShift ); sharedData[ 0 ] = reduction( sharedData[ 0 ], gridShift ); /*** * Perform the sequential prefix-sum. Loading @@ -90,7 +88,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { operation( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ], sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ] = reduction( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ], sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); auxData[ threadIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ]; Loading @@ -102,9 +101,11 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, */ const int threadInWarpIdx = threadIdx.x % Devices::Cuda::getWarpSize(); const int warpIdx = threadIdx.x / Devices::Cuda::getWarpSize(); for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks ) volatileOperation( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] ); auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] ); __syncwarp(); } if( threadInWarpIdx == Devices::Cuda::getWarpSize() - 1 ) warpSums[ warpIdx ] = auxData[ threadIdx.x ]; Loading @@ -114,21 +115,23 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, * Compute prefix-sum of warp sums using one warp */ if( warpIdx == 0 ) for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) volatileOperation( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] ); warpSums[ threadIdx.x ] = reduction( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] ); __syncwarp(); } __syncthreads(); /**** * Shift the warp prefix-sums. */ if( warpIdx > 0 ) volatileOperation( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] ); auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] ); __syncthreads(); /*** * Store the result back in global memory. */ __syncthreads(); idx = threadIdx.x; while( idx < elementsInBlock && blockOffset + idx < size ) { Loading @@ -136,7 +139,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, Real chunkShift( zero ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; operation( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift ); sharedData[ Devices::Cuda::getInterleaving( idx ) ] = reduction( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift ); output[ blockOffset + idx ] = sharedData[ Devices::Cuda::getInterleaving( idx ) ]; idx += blockDim.x; } Loading @@ -147,8 +151,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, if( prefixSumType == PrefixSumType::Exclusive ) { Real aux = zero; operation( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] ); operation( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] ); aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); auxArray[ blockIdx.x ] = aux; } else Loading @@ -157,10 +161,10 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, } template< typename Real, typename Operation, typename Reduction, typename Index > __global__ void cudaSecondPhaseBlockPrefixSum( Operation operation, cudaSecondPhaseBlockPrefixSum( Reduction reduction, const Index size, const Index elementsInBlock, Real gridShift, Loading @@ -174,7 +178,7 @@ cudaSecondPhaseBlockPrefixSum( Operation operation, Index readIdx = threadIdx.x; while( readIdx < elementsInBlock && readOffset + readIdx < size ) { operation( data[ readIdx + readOffset ], shift ); data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift ); readIdx += blockDim.x; } } Loading @@ -185,12 +189,10 @@ template< PrefixSumType prefixSumType, typename Index > struct CudaPrefixSumKernelLauncher { template< typename Operation, typename VolatileOperation > template< typename Reduction > static void cudaRecursivePrefixSum( PrefixSumType prefixSumType_, Operation& operation, VolatileOperation& volatileOperation, Reduction& reduction, const Real& zero, const Index size, const Index blockSize, Loading Loading @@ -221,8 +223,7 @@ struct CudaPrefixSumKernelLauncher const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( prefixSumType_, operation, volatileOperation, reduction, zero, size, elementsInBlock, Loading @@ -242,8 +243,7 @@ struct CudaPrefixSumKernelLauncher Real gridShift2 = zero; if( numberOfBlocks > 1 ) cudaRecursivePrefixSum( PrefixSumType::Inclusive, operation, volatileOperation, reduction, zero, numberOfBlocks, blockSize, Loading @@ -254,7 +254,7 @@ struct CudaPrefixSumKernelLauncher //std::cerr << " auxArray2 = " << auxArray2 << std::endl; cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> ( operation, ( reduction, size, elementsInBlock, gridShift, Loading @@ -273,25 +273,21 @@ struct CudaPrefixSumKernelLauncher /**** * \brief Starts prefix sum in CUDA. * * \tparam Operation operation to be performed on particular elements - addition usually * \tparam VolatileOperation - volatile version of Operation * \tparam Reduction reduction to be performed on particular elements - addition usually * \param size is number of elements to be scanned * \param blockSize is CUDA block size * \param deviceInput is pointer to input data on GPU * \param deviceOutput is pointer to resulting array, can be the same as input * \param operation is instance of Operation * \param volatileOperation is instance of VolatileOperation * \param zero is neutral element for given Operation * \param reduction is instance of Reduction * \param zero is neutral element for given Reduction */ template< typename Operation, typename VolatileOperation > template< typename Reduction > static void start( const Index size, const Index blockSize, const Real *deviceInput, Real* deviceOutput, Operation& operation, VolatileOperation& volatileOperation, Reduction& reduction, const Real& zero ) { /**** Loading Loading @@ -319,8 +315,7 @@ struct CudaPrefixSumKernelLauncher //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; cudaRecursivePrefixSum( prefixSumType, operation, volatileOperation, reduction, zero, currentSize, blockSize, Loading Loading @@ -371,5 +366,3 @@ int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::gridsCount = -1; } // namespace Algorithms } // namespace Containers } // namespace TNL src/TNL/Containers/Algorithms/PrefixSum.h +8 −18 Original line number Diff line number Diff line Loading @@ -38,14 +38,12 @@ class PrefixSum< Devices::Host, Type > { public: template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation > typename Reduction > static void perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatilePrefixSum, const Reduction& reduction, const typename Vector::RealType& zero ); }; Loading @@ -54,14 +52,12 @@ class PrefixSum< Devices::Cuda, Type > { public: template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation > typename Reduction > static void perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatilePrefixSum, const Reduction& reduction, const typename Vector::RealType& zero ); }; Loading @@ -70,16 +66,14 @@ class SegmentedPrefixSum< Devices::Host, Type > { public: template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation, typename Reduction, typename Flags > static void perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatilePrefixSum, const Reduction& reduction, const typename Vector::RealType& zero ); }; Loading @@ -88,21 +82,17 @@ class SegmentedPrefixSum< Devices::Cuda, Type > { public: template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation, typename Reduction, typename Flags > static void perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatilePrefixSum, const Reduction& reduction, const typename Vector::RealType& zero ); }; } // namespace Algorithms } // namespace Containers } // namespace TNL Loading src/TNL/Containers/Algorithms/PrefixSum.hpp +16 −26 Original line number Diff line number Diff line Loading @@ -44,15 +44,13 @@ static constexpr int PrefixSum_minGpuDataSize = 256;//65536; //16384;//1024;//25 // PrefixSum on host template< PrefixSumType Type > template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation > typename Reduction > void PrefixSum< Devices::Host, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatilePrefixSum, const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; Loading Loading @@ -86,7 +84,7 @@ perform( Vector& v, if( Type == PrefixSumType::Inclusive ) { #pragma omp for schedule(static) for( IndexType i = begin; i < end; i++ ) { reduction( block_sum, v[ i ] ); block_sum = reduction( block_sum, v[ i ] ); v[ i ] = block_sum; } } Loading @@ -95,7 +93,7 @@ perform( Vector& v, for( IndexType i = begin; i < end; i++ ) { const RealType x = v[ i ]; v[ i ] = block_sum; reduction( block_sum, x ); block_sum = reduction( block_sum, x ); } } Loading @@ -106,17 +104,17 @@ perform( Vector& v, // calculate per-block offsets RealType offset = 0; for( int i = 0; i < thread_idx + 1; i++ ) reduction( offset, block_sums[ i ] ); offset = reduction( offset, block_sums[ i ] ); // shift intermediate results by the offset #pragma omp for schedule(static) for( IndexType i = begin; i < end; i++ ) reduction( v[ i ], offset ); v[ i ] = reduction( v[ i ], offset ); } #else if( Type == PrefixSumType::Inclusive ) { for( IndexType i = begin + 1; i < end; i++ ) reduction( v[ i ], v[ i - 1 ] ); v[ i ] = reduction( v[ i ], v[ i - 1 ] ); } else // Exclusive prefix sum { Loading @@ -124,7 +122,7 @@ perform( Vector& v, for( IndexType i = begin; i < end; i++ ) { const RealType x = v[ i ]; v[ i ] = aux; reduction( aux, x ); aux = reduction( aux, x ); } } #endif Loading @@ -134,15 +132,13 @@ perform( Vector& v, // PrefixSum on CUDA device template< PrefixSumType Type > template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation > typename Reduction > void PrefixSum< Devices::Cuda, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatileReduction, const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; Loading @@ -155,7 +151,6 @@ perform( Vector& v, &v[ begin ], &v[ begin ], reduction, volatileReduction, zero ); #endif } Loading @@ -165,8 +160,7 @@ perform( Vector& v, // PrefixSum on host template< PrefixSumType Type > template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation, typename Reduction, typename Flags > void SegmentedPrefixSum< Devices::Host, Type >:: Loading @@ -174,8 +168,7 @@ perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatilePrefixSum, const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; Loading @@ -186,7 +179,7 @@ perform( Vector& v, { for( IndexType i = begin + 1; i < end; i++ ) if( ! flags[ i ] ) reduction( v[ i ], v[ i - 1 ] ); v[ i ] = reduction( v[ i ], v[ i - 1 ] ); } else // Exclusive prefix sum { Loading @@ -198,7 +191,7 @@ perform( Vector& v, if( flags[ i ] ) aux = zero; v[ i ] = aux; reduction( aux, x ); aux = reduction( aux, x ); } } } Loading @@ -207,8 +200,7 @@ perform( Vector& v, // PrefixSum on CUDA device template< PrefixSumType Type > template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation, typename Reduction, typename Flags > void SegmentedPrefixSum< Devices::Cuda, Type >:: Loading @@ -216,8 +208,7 @@ perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatileReduction, const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; Loading @@ -231,7 +222,6 @@ perform( Vector& v, &v[ begin ], &v[ begin ], reduction, volatileReduction, zero );*/ #endif } Loading src/TNL/Containers/Vector.hpp +2 −8 Original line number Diff line number Diff line Loading @@ -175,10 +175,7 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, reduction, volatileReduction, (RealType) 0.0 ); Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, Loading @@ -193,10 +190,7 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, reduction, volatileReduction, (RealType) 0.0 ); Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, Loading src/TNL/Containers/VectorView.hpp +2 −8 Original line number Diff line number Diff line Loading @@ -125,10 +125,7 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, reduction, volatileReduction, (RealType) 0.0 ); Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, Loading @@ -142,10 +139,7 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, reduction, volatileReduction, (RealType) 0.0 ); Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, Loading src/UnitTests/Containers/VectorPrefixSumTest.h +6 −6 File changed.Contains only whitespace changes. Show changes Loading
src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +37 −44 Original line number Diff line number Diff line Loading @@ -24,13 +24,11 @@ namespace Algorithms { #ifdef HAVE_CUDA template< typename Real, typename Operation, typename VolatileOperation, typename Reduction, typename Index > __global__ void cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, Operation operation, VolatileOperation volatileOperation, Reduction reduction, const Real zero, const Index size, const Index elementsInBlock, Loading @@ -40,8 +38,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, const Real gridShift ) { Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >(); volatile Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; volatile Real* warpSums = &auxData[ blockDim.x ]; Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; Real* warpSums = &auxData[ blockDim.x ]; const Index lastElementIdx = size - blockIdx.x * elementsInBlock; const Index lastElementInBlock = TNL::min( lastElementIdx, elementsInBlock ); Loading Loading @@ -70,7 +68,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, } } if( blockIdx.x == 0 && threadIdx.x == 0 ) operation( sharedData[ 0 ], gridShift ); sharedData[ 0 ] = reduction( sharedData[ 0 ], gridShift ); /*** * Perform the sequential prefix-sum. Loading @@ -90,7 +88,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { operation( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ], sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ] = reduction( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ], sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); auxData[ threadIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ]; Loading @@ -102,9 +101,11 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, */ const int threadInWarpIdx = threadIdx.x % Devices::Cuda::getWarpSize(); const int warpIdx = threadIdx.x / Devices::Cuda::getWarpSize(); for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks ) volatileOperation( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] ); auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] ); __syncwarp(); } if( threadInWarpIdx == Devices::Cuda::getWarpSize() - 1 ) warpSums[ warpIdx ] = auxData[ threadIdx.x ]; Loading @@ -114,21 +115,23 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, * Compute prefix-sum of warp sums using one warp */ if( warpIdx == 0 ) for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) volatileOperation( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] ); warpSums[ threadIdx.x ] = reduction( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] ); __syncwarp(); } __syncthreads(); /**** * Shift the warp prefix-sums. */ if( warpIdx > 0 ) volatileOperation( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] ); auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] ); __syncthreads(); /*** * Store the result back in global memory. */ __syncthreads(); idx = threadIdx.x; while( idx < elementsInBlock && blockOffset + idx < size ) { Loading @@ -136,7 +139,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, Real chunkShift( zero ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; operation( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift ); sharedData[ Devices::Cuda::getInterleaving( idx ) ] = reduction( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift ); output[ blockOffset + idx ] = sharedData[ Devices::Cuda::getInterleaving( idx ) ]; idx += blockDim.x; } Loading @@ -147,8 +151,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, if( prefixSumType == PrefixSumType::Exclusive ) { Real aux = zero; operation( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] ); operation( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] ); aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); auxArray[ blockIdx.x ] = aux; } else Loading @@ -157,10 +161,10 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, } template< typename Real, typename Operation, typename Reduction, typename Index > __global__ void cudaSecondPhaseBlockPrefixSum( Operation operation, cudaSecondPhaseBlockPrefixSum( Reduction reduction, const Index size, const Index elementsInBlock, Real gridShift, Loading @@ -174,7 +178,7 @@ cudaSecondPhaseBlockPrefixSum( Operation operation, Index readIdx = threadIdx.x; while( readIdx < elementsInBlock && readOffset + readIdx < size ) { operation( data[ readIdx + readOffset ], shift ); data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift ); readIdx += blockDim.x; } } Loading @@ -185,12 +189,10 @@ template< PrefixSumType prefixSumType, typename Index > struct CudaPrefixSumKernelLauncher { template< typename Operation, typename VolatileOperation > template< typename Reduction > static void cudaRecursivePrefixSum( PrefixSumType prefixSumType_, Operation& operation, VolatileOperation& volatileOperation, Reduction& reduction, const Real& zero, const Index size, const Index blockSize, Loading Loading @@ -221,8 +223,7 @@ struct CudaPrefixSumKernelLauncher const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( prefixSumType_, operation, volatileOperation, reduction, zero, size, elementsInBlock, Loading @@ -242,8 +243,7 @@ struct CudaPrefixSumKernelLauncher Real gridShift2 = zero; if( numberOfBlocks > 1 ) cudaRecursivePrefixSum( PrefixSumType::Inclusive, operation, volatileOperation, reduction, zero, numberOfBlocks, blockSize, Loading @@ -254,7 +254,7 @@ struct CudaPrefixSumKernelLauncher //std::cerr << " auxArray2 = " << auxArray2 << std::endl; cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> ( operation, ( reduction, size, elementsInBlock, gridShift, Loading @@ -273,25 +273,21 @@ struct CudaPrefixSumKernelLauncher /**** * \brief Starts prefix sum in CUDA. * * \tparam Operation operation to be performed on particular elements - addition usually * \tparam VolatileOperation - volatile version of Operation * \tparam Reduction reduction to be performed on particular elements - addition usually * \param size is number of elements to be scanned * \param blockSize is CUDA block size * \param deviceInput is pointer to input data on GPU * \param deviceOutput is pointer to resulting array, can be the same as input * \param operation is instance of Operation * \param volatileOperation is instance of VolatileOperation * \param zero is neutral element for given Operation * \param reduction is instance of Reduction * \param zero is neutral element for given Reduction */ template< typename Operation, typename VolatileOperation > template< typename Reduction > static void start( const Index size, const Index blockSize, const Real *deviceInput, Real* deviceOutput, Operation& operation, VolatileOperation& volatileOperation, Reduction& reduction, const Real& zero ) { /**** Loading Loading @@ -319,8 +315,7 @@ struct CudaPrefixSumKernelLauncher //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; cudaRecursivePrefixSum( prefixSumType, operation, volatileOperation, reduction, zero, currentSize, blockSize, Loading Loading @@ -371,5 +366,3 @@ int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::gridsCount = -1; } // namespace Algorithms } // namespace Containers } // namespace TNL
src/TNL/Containers/Algorithms/PrefixSum.h +8 −18 Original line number Diff line number Diff line Loading @@ -38,14 +38,12 @@ class PrefixSum< Devices::Host, Type > { public: template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation > typename Reduction > static void perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatilePrefixSum, const Reduction& reduction, const typename Vector::RealType& zero ); }; Loading @@ -54,14 +52,12 @@ class PrefixSum< Devices::Cuda, Type > { public: template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation > typename Reduction > static void perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatilePrefixSum, const Reduction& reduction, const typename Vector::RealType& zero ); }; Loading @@ -70,16 +66,14 @@ class SegmentedPrefixSum< Devices::Host, Type > { public: template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation, typename Reduction, typename Flags > static void perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatilePrefixSum, const Reduction& reduction, const typename Vector::RealType& zero ); }; Loading @@ -88,21 +82,17 @@ class SegmentedPrefixSum< Devices::Cuda, Type > { public: template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation, typename Reduction, typename Flags > static void perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatilePrefixSum, const Reduction& reduction, const typename Vector::RealType& zero ); }; } // namespace Algorithms } // namespace Containers } // namespace TNL Loading
src/TNL/Containers/Algorithms/PrefixSum.hpp +16 −26 Original line number Diff line number Diff line Loading @@ -44,15 +44,13 @@ static constexpr int PrefixSum_minGpuDataSize = 256;//65536; //16384;//1024;//25 // PrefixSum on host template< PrefixSumType Type > template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation > typename Reduction > void PrefixSum< Devices::Host, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatilePrefixSum, const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; Loading Loading @@ -86,7 +84,7 @@ perform( Vector& v, if( Type == PrefixSumType::Inclusive ) { #pragma omp for schedule(static) for( IndexType i = begin; i < end; i++ ) { reduction( block_sum, v[ i ] ); block_sum = reduction( block_sum, v[ i ] ); v[ i ] = block_sum; } } Loading @@ -95,7 +93,7 @@ perform( Vector& v, for( IndexType i = begin; i < end; i++ ) { const RealType x = v[ i ]; v[ i ] = block_sum; reduction( block_sum, x ); block_sum = reduction( block_sum, x ); } } Loading @@ -106,17 +104,17 @@ perform( Vector& v, // calculate per-block offsets RealType offset = 0; for( int i = 0; i < thread_idx + 1; i++ ) reduction( offset, block_sums[ i ] ); offset = reduction( offset, block_sums[ i ] ); // shift intermediate results by the offset #pragma omp for schedule(static) for( IndexType i = begin; i < end; i++ ) reduction( v[ i ], offset ); v[ i ] = reduction( v[ i ], offset ); } #else if( Type == PrefixSumType::Inclusive ) { for( IndexType i = begin + 1; i < end; i++ ) reduction( v[ i ], v[ i - 1 ] ); v[ i ] = reduction( v[ i ], v[ i - 1 ] ); } else // Exclusive prefix sum { Loading @@ -124,7 +122,7 @@ perform( Vector& v, for( IndexType i = begin; i < end; i++ ) { const RealType x = v[ i ]; v[ i ] = aux; reduction( aux, x ); aux = reduction( aux, x ); } } #endif Loading @@ -134,15 +132,13 @@ perform( Vector& v, // PrefixSum on CUDA device template< PrefixSumType Type > template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation > typename Reduction > void PrefixSum< Devices::Cuda, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatileReduction, const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; Loading @@ -155,7 +151,6 @@ perform( Vector& v, &v[ begin ], &v[ begin ], reduction, volatileReduction, zero ); #endif } Loading @@ -165,8 +160,7 @@ perform( Vector& v, // PrefixSum on host template< PrefixSumType Type > template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation, typename Reduction, typename Flags > void SegmentedPrefixSum< Devices::Host, Type >:: Loading @@ -174,8 +168,7 @@ perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatilePrefixSum, const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; Loading @@ -186,7 +179,7 @@ perform( Vector& v, { for( IndexType i = begin + 1; i < end; i++ ) if( ! flags[ i ] ) reduction( v[ i ], v[ i - 1 ] ); v[ i ] = reduction( v[ i ], v[ i - 1 ] ); } else // Exclusive prefix sum { Loading @@ -198,7 +191,7 @@ perform( Vector& v, if( flags[ i ] ) aux = zero; v[ i ] = aux; reduction( aux, x ); aux = reduction( aux, x ); } } } Loading @@ -207,8 +200,7 @@ perform( Vector& v, // PrefixSum on CUDA device template< PrefixSumType Type > template< typename Vector, typename PrefixSumOperation, typename VolatilePrefixSumOperation, typename Reduction, typename Flags > void SegmentedPrefixSum< Devices::Cuda, Type >:: Loading @@ -216,8 +208,7 @@ perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, PrefixSumOperation& reduction, VolatilePrefixSumOperation& volatileReduction, const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; Loading @@ -231,7 +222,6 @@ perform( Vector& v, &v[ begin ], &v[ begin ], reduction, volatileReduction, zero );*/ #endif } Loading
src/TNL/Containers/Vector.hpp +2 −8 Original line number Diff line number Diff line Loading @@ -175,10 +175,7 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, reduction, volatileReduction, (RealType) 0.0 ); Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, Loading @@ -193,10 +190,7 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, reduction, volatileReduction, (RealType) 0.0 ); Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, Loading
src/TNL/Containers/VectorView.hpp +2 −8 Original line number Diff line number Diff line Loading @@ -125,10 +125,7 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, reduction, volatileReduction, (RealType) 0.0 ); Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, Loading @@ -142,10 +139,7 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, reduction, volatileReduction, (RealType) 0.0 ); Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, Loading
src/UnitTests/Containers/VectorPrefixSumTest.h +6 −6 File changed.Contains only whitespace changes. Show changes