Loading src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +164 −122 Original line number Original line Diff line number Diff line Loading @@ -31,7 +31,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, Reduction reduction, Reduction reduction, const Real zero, const Real zero, const Index size, const Index size, const Index elementsInBlock, const int elementsInBlock, const Real* input, const Real* input, Real* output, Real* output, Real* auxArray ) Real* auxArray ) Loading @@ -46,8 +46,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, /*** /*** * Load data into the shared memory. * Load data into the shared memory. */ */ const Index blockOffset = blockIdx.x * elementsInBlock; const int blockOffset = blockIdx.x * elementsInBlock; Index idx = threadIdx.x; int idx = threadIdx.x; if( prefixSumType == PrefixSumType::Exclusive ) if( prefixSumType == PrefixSumType::Exclusive ) { { if( idx == 0 ) if( idx == 0 ) Loading Loading @@ -81,7 +81,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, sharedData[ Devices::Cuda::getInterleaving( chunkOffset ) ]; sharedData[ Devices::Cuda::getInterleaving( chunkOffset ) ]; } } Index chunkPointer( 1 ); int chunkPointer = 1; while( chunkPointer < chunkSize && while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) chunkOffset + chunkPointer < lastElementInBlock ) { { Loading Loading @@ -132,7 +132,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, idx = threadIdx.x; idx = threadIdx.x; while( idx < elementsInBlock && blockOffset + idx < size ) while( idx < elementsInBlock && blockOffset + idx < size ) { { const Index chunkIdx = idx / chunkSize; const int chunkIdx = idx / chunkSize; Real chunkShift( zero ); Real chunkShift( zero ); if( chunkIdx > 0 ) if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; chunkShift = auxData[ chunkIdx - 1 ]; Loading Loading @@ -161,18 +161,20 @@ template< typename Real, __global__ void __global__ void cudaSecondPhaseBlockPrefixSum( Reduction reduction, cudaSecondPhaseBlockPrefixSum( Reduction reduction, const Index size, const Index size, const Index elementsInBlock, const int elementsInBlock, Real gridShift, const Index gridIdx, const Index maxGridSize, const Real* auxArray, const Real* auxArray, Real* data ) Real* data, Real shift ) { { if( blockIdx.x > 0 ) if( gridIdx > 0 || blockIdx.x > 0 ) gridShift = reduction( gridShift, auxArray[ blockIdx.x - 1 ] ); shift = reduction( shift, auxArray[ gridIdx * maxGridSize + blockIdx.x - 1 ] ); const Index readOffset = blockIdx.x * elementsInBlock; const int readOffset = blockIdx.x * elementsInBlock; Index readIdx = threadIdx.x; int readIdx = threadIdx.x; while( readIdx < elementsInBlock && readOffset + readIdx < size ) while( readIdx < elementsInBlock && readOffset + readIdx < size ) { { data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], gridShift ); data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift ); readIdx += blockDim.x; readIdx += blockDim.x; } } } } Loading @@ -182,143 +184,183 @@ template< PrefixSumType prefixSumType, typename Index > typename Index > struct CudaPrefixSumKernelLauncher struct CudaPrefixSumKernelLauncher { { /**** * \brief Performs both phases of prefix sum. * * \param size Number of elements to be scanned. * \param deviceInput Pointer to input data on GPU. * \param deviceOutput Pointer to output array on GPU, can be the same as input. * \param reduction Symmetric binary function representing the reduction operation * (usually addition, i.e. an instance of \ref std::plus). * \param zero Neutral element for given reduction operation, i.e. value such that * `reduction(zero, x) == x` for any `x`. * \param blockSize The CUDA block size to be used for kernel launch. */ template< typename Reduction > template< typename Reduction > static void static void cudaRecursivePrefixSum( PrefixSumType prefixSumType_, perform( const Index size, const Real* deviceInput, Real* deviceOutput, Reduction& reduction, Reduction& reduction, const Real& zero, const Real zero, const Index size, const int blockSize = 256 ) const Index blockSize, const Index elementsInBlock, Real& gridShift, const Real* input, Real* output ) { { const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); const auto blockShifts = performFirstPhase( const Index auxArraySize = numberOfBlocks; size, deviceInput, Array< Real, Devices::Cuda > auxArray1, auxArray2; deviceOutput, auxArray1.setSize( auxArraySize ); reduction, auxArray2.setSize( auxArraySize ); zero, blockSize ); performSecondPhase( size, deviceOutput, blockShifts.getData(), reduction, zero, blockSize ); } /**** /**** * Setup block and grid size. * \brief Performs the first phase of prefix sum. * * \param size Number of elements to be scanned. * \param deviceInput Pointer to input data on GPU. * \param deviceOutput Pointer to output array on GPU, can be the same as input. * \param reduction Symmetric binary function representing the reduction operation * (usually addition, i.e. an instance of \ref std::plus). * \param zero Neutral value for given reduction operation, i.e. value such that * `reduction(zero, x) == x` for any `x`. * \param blockSize The CUDA block size to be used for kernel launch. */ */ dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); template< typename Reduction > static auto performFirstPhase( const Index size, const Real* deviceInput, Real* deviceOutput, Reduction& reduction, const Real zero, const int blockSize = 256 ) { // compute the number of grids const int elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; // allocate array for the block sums Array< Real, Devices::Cuda > blockSums; blockSums.setSize( numberOfBlocks ); // loop over all grids for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { // compute current grid size and size of data to be scanned const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; Index currentSize = size - gridOffset; if( currentSize / elementsInBlock > maxGridSize() ) currentSize = maxGridSize() * elementsInBlock; //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; // setup block and grid size dim3 cudaBlockSize, cudaGridSize; cudaBlockSize.x = blockSize; cudaBlockSize.x = blockSize; cudaGridSize.x = roundUpDivision( size, elementsInBlock ); cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock ); /**** // run the kernel * Run the kernel. */ const std::size_t sharedDataSize = elementsInBlock + const std::size_t sharedDataSize = elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( prefixSumType_, ( prefixSumType, reduction, reduction, zero, zero, size, currentSize, elementsInBlock, elementsInBlock, input, &deviceInput[ gridOffset ], output, &deviceOutput[ gridOffset ], auxArray1.getData() ); &blockSums[ gridIdx * maxGridSize() ] ); } // synchronize the null-stream after all grids cudaStreamSynchronize(0); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; TNL_CHECK_CUDA_DEVICE; // blockSums now contains sums of numbers in each block. The first phase //std::cerr << " auxArray1 = " << auxArray1 << std::endl; // ends by computing prefix-sum of this array. /*** if( numberOfBlocks > 1 ) { * In auxArray1 there is now a sum of numbers in each block. CudaPrefixSumKernelLauncher< PrefixSumType::Inclusive, Real, Index >::perform( * We must compute prefix-sum of auxArray1 and then shift blockSums.getSize(), * each block. blockSums.getData(), */ blockSums.getData(), Real gridShift2 = zero; if( numberOfBlocks > 1 ) cudaRecursivePrefixSum( PrefixSumType::Inclusive, reduction, reduction, zero, zero, numberOfBlocks, blockSize ); blockSize, } elementsInBlock, gridShift2, auxArray1.getData(), auxArray2.getData() ); //std::cerr << " auxArray2 = " << auxArray2 << std::endl; // Store the number of CUDA grids for the purpose of unit testing, i.e. cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> // to check if we test the algorithm with more than one CUDA grid. ( reduction, gridsCount() = numberOfGrids; size, elementsInBlock, gridShift, auxArray2.getData(), output ); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; gridShift = auxArray2.getElement( auxArraySize - 1 ); // blockSums now contains shift values for each block - to be used in the second phase //std::cerr << "gridShift = " << gridShift << std::endl; return blockSums; } } /**** /**** * \brief Starts prefix sum in CUDA. * \brief Performs the seocond phase of prefix sum. * * * \tparam Reduction reduction to be performed on particular elements - addition usually * \param size Number of elements to be scanned. * \param size is number of elements to be scanned * \param deviceOutput Pointer to output array on GPU. * \param blockSize is CUDA block size * \param blockShifts Pointer to a GPU array containing the block shifts. It is the * \param deviceInput is pointer to input data on GPU * result of the first phase. * \param deviceOutput is pointer to resulting array, can be the same as input * \param reduction Symmetric binary function representing the reduction operation * \param reduction is instance of Reduction * (usually addition, i.e. an instance of \ref std::plus). * \param zero is neutral element for given Reduction * \param shift A constant shifting all elements of the array (usually `zero`, i.e. * the neutral value). * \param blockSize The CUDA block size to be used for kernel launch. */ */ template< typename Reduction > template< typename Reduction > static void static void start( const Index size, performSecondPhase( const Index size, const Index blockSize, const Real *deviceInput, Real* deviceOutput, Real* deviceOutput, const Real* blockShifts, Reduction& reduction, Reduction& reduction, const Real& zero ) const Real shift, const Index blockSize = 256 ) { { /**** // compute the number of grids * Compute the number of grids const int elementsInBlock = 8 * blockSize; */ const Index elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); Real gridShift = zero; //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; /**** // loop over all grids * Loop over all grids. for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { */ // compute current grid size and size of data to be scanned for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { /**** * Compute current grid size and size of data to be scanned */ const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; Index currentSize = size - gridOffset; Index currentSize = size - gridOffset; if( currentSize / elementsInBlock > maxGridSize() ) if( currentSize / elementsInBlock > maxGridSize() ) currentSize = maxGridSize() * elementsInBlock; currentSize = maxGridSize() * elementsInBlock; //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; cudaRecursivePrefixSum( prefixSumType, reduction, // setup block and grid size zero, dim3 cudaBlockSize, cudaGridSize; currentSize, cudaBlockSize.x = blockSize; blockSize, cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock ); // run the kernel cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> ( reduction, size, elementsInBlock, elementsInBlock, gridShift, gridIdx, &deviceInput[ gridOffset ], (Index) maxGridSize(), &deviceOutput[ gridOffset ] ); blockShifts, &deviceOutput[ gridOffset ], shift ); } } /*** // synchronize the null-stream after all grids * Store the number of CUDA grids for the purpose of unit testing, i.e. cudaStreamSynchronize(0); * to check if we test the algorithm with more than one CUDA grid. TNL_CHECK_CUDA_DEVICE; */ gridsCount() = numberOfGrids; } } /**** /**** Loading src/TNL/Containers/Algorithms/PrefixSum.hpp +13 −17 Original line number Original line Diff line number Diff line Loading @@ -141,17 +141,18 @@ perform( Vector& v, const Reduction& reduction, const Reduction& reduction, const typename Vector::RealType& zero ) const typename Vector::RealType& zero ) { { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; using IndexType = typename Vector::IndexType; using IndexType = typename Vector::IndexType; #ifdef HAVE_CUDA CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::perform( CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::start( end - begin, ( IndexType ) ( end - begin ), &v[ begin ], // input ( IndexType ) 256, &v[ begin ], // output &v[ begin ], &v[ begin ], reduction, reduction, zero ); zero ); #else throw Exceptions::CudaSupportMissing(); #endif #endif } } Loading Loading @@ -211,18 +212,13 @@ perform( Vector& v, const Reduction& reduction, const Reduction& reduction, const typename Vector::RealType& zero ) const typename Vector::RealType& zero ) { { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; using IndexType = typename Vector::IndexType; using IndexType = typename Vector::IndexType; #ifdef HAVE_CUDA throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." ); throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." ); // NOT IMPLEMENTED YET #else /*CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::start( throw Exceptions::CudaSupportMissing(); ( IndexType ) ( end - begin ), ( IndexType ) 256, &v[ begin ], &v[ begin ], reduction, zero );*/ #endif #endif } } Loading Loading
src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +164 −122 Original line number Original line Diff line number Diff line Loading @@ -31,7 +31,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, Reduction reduction, Reduction reduction, const Real zero, const Real zero, const Index size, const Index size, const Index elementsInBlock, const int elementsInBlock, const Real* input, const Real* input, Real* output, Real* output, Real* auxArray ) Real* auxArray ) Loading @@ -46,8 +46,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, /*** /*** * Load data into the shared memory. * Load data into the shared memory. */ */ const Index blockOffset = blockIdx.x * elementsInBlock; const int blockOffset = blockIdx.x * elementsInBlock; Index idx = threadIdx.x; int idx = threadIdx.x; if( prefixSumType == PrefixSumType::Exclusive ) if( prefixSumType == PrefixSumType::Exclusive ) { { if( idx == 0 ) if( idx == 0 ) Loading Loading @@ -81,7 +81,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, sharedData[ Devices::Cuda::getInterleaving( chunkOffset ) ]; sharedData[ Devices::Cuda::getInterleaving( chunkOffset ) ]; } } Index chunkPointer( 1 ); int chunkPointer = 1; while( chunkPointer < chunkSize && while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) chunkOffset + chunkPointer < lastElementInBlock ) { { Loading Loading @@ -132,7 +132,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, idx = threadIdx.x; idx = threadIdx.x; while( idx < elementsInBlock && blockOffset + idx < size ) while( idx < elementsInBlock && blockOffset + idx < size ) { { const Index chunkIdx = idx / chunkSize; const int chunkIdx = idx / chunkSize; Real chunkShift( zero ); Real chunkShift( zero ); if( chunkIdx > 0 ) if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; chunkShift = auxData[ chunkIdx - 1 ]; Loading Loading @@ -161,18 +161,20 @@ template< typename Real, __global__ void __global__ void cudaSecondPhaseBlockPrefixSum( Reduction reduction, cudaSecondPhaseBlockPrefixSum( Reduction reduction, const Index size, const Index size, const Index elementsInBlock, const int elementsInBlock, Real gridShift, const Index gridIdx, const Index maxGridSize, const Real* auxArray, const Real* auxArray, Real* data ) Real* data, Real shift ) { { if( blockIdx.x > 0 ) if( gridIdx > 0 || blockIdx.x > 0 ) gridShift = reduction( gridShift, auxArray[ blockIdx.x - 1 ] ); shift = reduction( shift, auxArray[ gridIdx * maxGridSize + blockIdx.x - 1 ] ); const Index readOffset = blockIdx.x * elementsInBlock; const int readOffset = blockIdx.x * elementsInBlock; Index readIdx = threadIdx.x; int readIdx = threadIdx.x; while( readIdx < elementsInBlock && readOffset + readIdx < size ) while( readIdx < elementsInBlock && readOffset + readIdx < size ) { { data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], gridShift ); data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift ); readIdx += blockDim.x; readIdx += blockDim.x; } } } } Loading @@ -182,143 +184,183 @@ template< PrefixSumType prefixSumType, typename Index > typename Index > struct CudaPrefixSumKernelLauncher struct CudaPrefixSumKernelLauncher { { /**** * \brief Performs both phases of prefix sum. * * \param size Number of elements to be scanned. * \param deviceInput Pointer to input data on GPU. * \param deviceOutput Pointer to output array on GPU, can be the same as input. * \param reduction Symmetric binary function representing the reduction operation * (usually addition, i.e. an instance of \ref std::plus). * \param zero Neutral element for given reduction operation, i.e. value such that * `reduction(zero, x) == x` for any `x`. * \param blockSize The CUDA block size to be used for kernel launch. */ template< typename Reduction > template< typename Reduction > static void static void cudaRecursivePrefixSum( PrefixSumType prefixSumType_, perform( const Index size, const Real* deviceInput, Real* deviceOutput, Reduction& reduction, Reduction& reduction, const Real& zero, const Real zero, const Index size, const int blockSize = 256 ) const Index blockSize, const Index elementsInBlock, Real& gridShift, const Real* input, Real* output ) { { const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); const auto blockShifts = performFirstPhase( const Index auxArraySize = numberOfBlocks; size, deviceInput, Array< Real, Devices::Cuda > auxArray1, auxArray2; deviceOutput, auxArray1.setSize( auxArraySize ); reduction, auxArray2.setSize( auxArraySize ); zero, blockSize ); performSecondPhase( size, deviceOutput, blockShifts.getData(), reduction, zero, blockSize ); } /**** /**** * Setup block and grid size. * \brief Performs the first phase of prefix sum. * * \param size Number of elements to be scanned. * \param deviceInput Pointer to input data on GPU. * \param deviceOutput Pointer to output array on GPU, can be the same as input. * \param reduction Symmetric binary function representing the reduction operation * (usually addition, i.e. an instance of \ref std::plus). * \param zero Neutral value for given reduction operation, i.e. value such that * `reduction(zero, x) == x` for any `x`. * \param blockSize The CUDA block size to be used for kernel launch. */ */ dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); template< typename Reduction > static auto performFirstPhase( const Index size, const Real* deviceInput, Real* deviceOutput, Reduction& reduction, const Real zero, const int blockSize = 256 ) { // compute the number of grids const int elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; // allocate array for the block sums Array< Real, Devices::Cuda > blockSums; blockSums.setSize( numberOfBlocks ); // loop over all grids for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { // compute current grid size and size of data to be scanned const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; Index currentSize = size - gridOffset; if( currentSize / elementsInBlock > maxGridSize() ) currentSize = maxGridSize() * elementsInBlock; //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; // setup block and grid size dim3 cudaBlockSize, cudaGridSize; cudaBlockSize.x = blockSize; cudaBlockSize.x = blockSize; cudaGridSize.x = roundUpDivision( size, elementsInBlock ); cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock ); /**** // run the kernel * Run the kernel. */ const std::size_t sharedDataSize = elementsInBlock + const std::size_t sharedDataSize = elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( prefixSumType_, ( prefixSumType, reduction, reduction, zero, zero, size, currentSize, elementsInBlock, elementsInBlock, input, &deviceInput[ gridOffset ], output, &deviceOutput[ gridOffset ], auxArray1.getData() ); &blockSums[ gridIdx * maxGridSize() ] ); } // synchronize the null-stream after all grids cudaStreamSynchronize(0); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; TNL_CHECK_CUDA_DEVICE; // blockSums now contains sums of numbers in each block. The first phase //std::cerr << " auxArray1 = " << auxArray1 << std::endl; // ends by computing prefix-sum of this array. /*** if( numberOfBlocks > 1 ) { * In auxArray1 there is now a sum of numbers in each block. CudaPrefixSumKernelLauncher< PrefixSumType::Inclusive, Real, Index >::perform( * We must compute prefix-sum of auxArray1 and then shift blockSums.getSize(), * each block. blockSums.getData(), */ blockSums.getData(), Real gridShift2 = zero; if( numberOfBlocks > 1 ) cudaRecursivePrefixSum( PrefixSumType::Inclusive, reduction, reduction, zero, zero, numberOfBlocks, blockSize ); blockSize, } elementsInBlock, gridShift2, auxArray1.getData(), auxArray2.getData() ); //std::cerr << " auxArray2 = " << auxArray2 << std::endl; // Store the number of CUDA grids for the purpose of unit testing, i.e. cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> // to check if we test the algorithm with more than one CUDA grid. ( reduction, gridsCount() = numberOfGrids; size, elementsInBlock, gridShift, auxArray2.getData(), output ); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; gridShift = auxArray2.getElement( auxArraySize - 1 ); // blockSums now contains shift values for each block - to be used in the second phase //std::cerr << "gridShift = " << gridShift << std::endl; return blockSums; } } /**** /**** * \brief Starts prefix sum in CUDA. * \brief Performs the seocond phase of prefix sum. * * * \tparam Reduction reduction to be performed on particular elements - addition usually * \param size Number of elements to be scanned. * \param size is number of elements to be scanned * \param deviceOutput Pointer to output array on GPU. * \param blockSize is CUDA block size * \param blockShifts Pointer to a GPU array containing the block shifts. It is the * \param deviceInput is pointer to input data on GPU * result of the first phase. * \param deviceOutput is pointer to resulting array, can be the same as input * \param reduction Symmetric binary function representing the reduction operation * \param reduction is instance of Reduction * (usually addition, i.e. an instance of \ref std::plus). * \param zero is neutral element for given Reduction * \param shift A constant shifting all elements of the array (usually `zero`, i.e. * the neutral value). * \param blockSize The CUDA block size to be used for kernel launch. */ */ template< typename Reduction > template< typename Reduction > static void static void start( const Index size, performSecondPhase( const Index size, const Index blockSize, const Real *deviceInput, Real* deviceOutput, Real* deviceOutput, const Real* blockShifts, Reduction& reduction, Reduction& reduction, const Real& zero ) const Real shift, const Index blockSize = 256 ) { { /**** // compute the number of grids * Compute the number of grids const int elementsInBlock = 8 * blockSize; */ const Index elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); Real gridShift = zero; //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; /**** // loop over all grids * Loop over all grids. for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { */ // compute current grid size and size of data to be scanned for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { /**** * Compute current grid size and size of data to be scanned */ const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; Index currentSize = size - gridOffset; Index currentSize = size - gridOffset; if( currentSize / elementsInBlock > maxGridSize() ) if( currentSize / elementsInBlock > maxGridSize() ) currentSize = maxGridSize() * elementsInBlock; currentSize = maxGridSize() * elementsInBlock; //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; cudaRecursivePrefixSum( prefixSumType, reduction, // setup block and grid size zero, dim3 cudaBlockSize, cudaGridSize; currentSize, cudaBlockSize.x = blockSize; blockSize, cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock ); // run the kernel cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> ( reduction, size, elementsInBlock, elementsInBlock, gridShift, gridIdx, &deviceInput[ gridOffset ], (Index) maxGridSize(), &deviceOutput[ gridOffset ] ); blockShifts, &deviceOutput[ gridOffset ], shift ); } } /*** // synchronize the null-stream after all grids * Store the number of CUDA grids for the purpose of unit testing, i.e. cudaStreamSynchronize(0); * to check if we test the algorithm with more than one CUDA grid. TNL_CHECK_CUDA_DEVICE; */ gridsCount() = numberOfGrids; } } /**** /**** Loading
src/TNL/Containers/Algorithms/PrefixSum.hpp +13 −17 Original line number Original line Diff line number Diff line Loading @@ -141,17 +141,18 @@ perform( Vector& v, const Reduction& reduction, const Reduction& reduction, const typename Vector::RealType& zero ) const typename Vector::RealType& zero ) { { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; using IndexType = typename Vector::IndexType; using IndexType = typename Vector::IndexType; #ifdef HAVE_CUDA CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::perform( CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::start( end - begin, ( IndexType ) ( end - begin ), &v[ begin ], // input ( IndexType ) 256, &v[ begin ], // output &v[ begin ], &v[ begin ], reduction, reduction, zero ); zero ); #else throw Exceptions::CudaSupportMissing(); #endif #endif } } Loading Loading @@ -211,18 +212,13 @@ perform( Vector& v, const Reduction& reduction, const Reduction& reduction, const typename Vector::RealType& zero ) const typename Vector::RealType& zero ) { { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; using IndexType = typename Vector::IndexType; using IndexType = typename Vector::IndexType; #ifdef HAVE_CUDA throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." ); throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." ); // NOT IMPLEMENTED YET #else /*CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::start( throw Exceptions::CudaSupportMissing(); ( IndexType ) ( end - begin ), ( IndexType ) 256, &v[ begin ], &v[ begin ], reduction, zero );*/ #endif #endif } } Loading