Loading src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h +2 −2 Original line number Diff line number Diff line Loading @@ -608,7 +608,7 @@ computePrefixSum( Vector& v, &v.getData()[ begin ], &v.getData()[ begin ], operation, Algorithms::inclusivePrefixSum ); Algorithms::PrefixSumType::inclusive ); #else throw Exceptions::CudaSupportMissing(); #endif Loading @@ -633,7 +633,7 @@ computeExclusivePrefixSum( Vector& v, &v.getData()[ begin ], &v.getData()[ begin ], operation, Algorithms::exclusivePrefixSum ); Algorithms::PrefixSumType::exclusive ); #endif } Loading src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +2 −0 Original line number Diff line number Diff line Loading @@ -627,6 +627,7 @@ computePrefixSum( Vector& v, { typedef typename Vector::IndexType Index; // TODO: parallelize with OpenMP for( Index i = begin + 1; i < end; i++ ) v[ i ] += v[ i - 1 ]; } Loading @@ -641,6 +642,7 @@ computeExclusivePrefixSum( Vector& v, typedef typename Vector::IndexType Index; typedef typename Vector::RealType Real; // TODO: parallelize with OpenMP Real aux( v[ begin ] ); v[ begin ] = 0.0; for( Index i = begin + 1; i < end; i++ ) Loading src/TNL/Containers/Algorithms/cuda-prefix-sum.h +6 −3 Original line number Diff line number Diff line Loading @@ -14,8 +14,11 @@ namespace TNL { namespace Containers { namespace Algorithms { enum enumPrefixSumType { exclusivePrefixSum = 0, inclusivePrefixSum }; enum class PrefixSumType { exclusive, inclusive }; template< typename DataType, typename Operation, Loading @@ -25,7 +28,7 @@ bool cudaPrefixSum( const Index size, const DataType *deviceInput, DataType* deviceOutput, const Operation& operation, const enumPrefixSumType prefixSumType = inclusivePrefixSum ); const PrefixSumType prefixSumType = PrefixSumType::inclusive ); } // namespace Algorithms } // namespace Containers Loading src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h +146 −171 Original line number Diff line number Diff line Loading @@ -12,6 +12,7 @@ #include <iostream> #include <TNL/Math.h> #include <TNL/Devices/Cuda.h> #include <TNL/Exceptions/CudaBadAlloc.h> #include <TNL/Containers/Algorithms/reduction-operations.h> Loading @@ -25,7 +26,8 @@ namespace Algorithms { template< typename DataType, typename Operation, typename Index > __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumType, __global__ void cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, Operation operation, const Index size, const Index elementsInBlock, Loading @@ -46,7 +48,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT */ const Index blockOffset = blockIdx.x * elementsInBlock; Index idx = threadIdx.x; if( prefixSumType == exclusivePrefixSum ) if( prefixSumType == PrefixSumType::exclusive ) { if( idx == 0 ) sharedData[ 0 ] = operation.initialValue(); Loading @@ -69,8 +71,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT __syncthreads(); const int chunkSize = elementsInBlock / blockDim.x; const int chunkOffset = threadIdx.x * chunkSize; const int numberOfChunks = lastElementInBlock / chunkSize + ( lastElementInBlock % chunkSize != 0 ); const int numberOfChunks = roundUpDivision( lastElementInBlock, chunkSize ); if( chunkOffset < lastElementInBlock ) { Loading Loading @@ -136,7 +137,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT if( threadIdx.x == 0 ) { if( prefixSumType == exclusivePrefixSum ) if( prefixSumType == PrefixSumType::exclusive ) { /*auxArray[ blockIdx.x ] = operation.commonReductionOnDevice( Devices::Cuda::getInterleaving( lastElementInBlock - 1 ), Devices::Cuda::getInterleaving( lastElementInBlock ), Loading @@ -149,13 +150,13 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT else auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ]; } } template< typename DataType, typename Operation, typename Index > __global__ void cudaSecondPhaseBlockPrefixSum( Operation operation, __global__ void cudaSecondPhaseBlockPrefixSum( Operation operation, const Index size, const Index elementsInBlock, const Index gridShift, Loading @@ -181,7 +182,8 @@ __global__ void cudaSecondPhaseBlockPrefixSum( Operation operation, template< typename DataType, typename Operation, typename Index > bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType, void cudaRecursivePrefixSum( const PrefixSumType prefixSumType, Operation& operation, const Index size, const Index blockSize, Loading @@ -190,75 +192,59 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType, const DataType* input, DataType *output ) { const Index numberOfBlocks = ceil( ( double ) size / ( double ) elementsInBlock ); const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); const Index auxArraySize = numberOfBlocks * sizeof( DataType ); DataType *auxArray1, *auxArray2; if( cudaMalloc( ( void** ) &auxArray1, auxArraySize ) != cudaSuccess || cudaMalloc( ( void** ) &auxArray2, auxArraySize ) != cudaSuccess ) throw Exceptions::CudaBadAlloc(); Array< DataType, Devices::Cuda > auxArray1, auxArray2; auxArray1.setSize( auxArraySize ); auxArray2.setSize( auxArraySize ); /**** * Setup block and grid size. */ dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); cudaBlockSize.x = blockSize; cudaGridSize. x = size / elementsInBlock + ( size % elementsInBlock != 0 ); cudaGridSize.x = roundUpDivision( size, elementsInBlock ); /**** * Run the kernel. */ size_t sharedDataSize = elementsInBlock + const std::size_t sharedDataSize = elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( DataType ); cudaFirstPhaseBlockPrefixSum< DataType, Operation, Index > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( DataType ); cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( prefixSumType, operation, size, elementsInBlock, input, output, auxArray1 ); if( ! TNL_CHECK_CUDA_DEVICE ) { std::cerr << "The CUDA kernel 'cudaFirstPhaseBlockPrefixSum' ended with error." << std::endl; cudaFree( auxArray1 ); cudaFree( auxArray2 ); return false; } auxArray1.getData() ); TNL_CHECK_CUDA_DEVICE; /*** * In auxArray1 there is now a sum of numbers in each block. * We must compute prefix-sum of auxArray1 and then shift * each block. */ if( numberOfBlocks > 1 && ! cudaRecursivePrefixSum< DataType, Operation, Index > ( inclusivePrefixSum, if( numberOfBlocks > 1 ) cudaRecursivePrefixSum( PrefixSumType::inclusive, operation, numberOfBlocks, blockSize, elementsInBlock, 0, auxArray1, auxArray2 ) ) return false; cudaSecondPhaseBlockPrefixSum< DataType, Operation, Index > <<< cudaGridSize, cudaBlockSize >>> ( operation, size, elementsInBlock, gridShift, auxArray2, output ); if( ! TNL_CHECK_CUDA_DEVICE ) { std::cerr << "The CUDA kernel 'cudaSecondPhaseBlockPrefixSum' ended with error." << std::endl; cudaFree( auxArray1 ); cudaFree( auxArray2 ); return false; } cudaFree( auxArray1 ); cudaFree( auxArray2 ); return true; (Index) 0, auxArray1.getData(), auxArray2.getData() ); cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> ( operation, size, elementsInBlock, gridShift, auxArray2.getData(), output ); TNL_CHECK_CUDA_DEVICE; } Loading @@ -266,7 +252,8 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType, template< typename DataType, typename Operation, typename Index > bool cudaGridPrefixSum( enumPrefixSumType prefixSumType, void cudaGridPrefixSum( PrefixSumType prefixSumType, Operation& operation, const Index size, const Index blockSize, Loading @@ -275,76 +262,64 @@ bool cudaGridPrefixSum( enumPrefixSumType prefixSumType, DataType *deviceOutput, Index& gridShift ) { if( ! cudaRecursivePrefixSum< DataType, Operation, Index > ( prefixSumType, cudaRecursivePrefixSum( prefixSumType, operation, size, blockSize, elementsInBlock, gridShift, deviceInput, deviceOutput ) ) return false; if( cudaMemcpy( &gridShift, deviceOutput ); cudaMemcpy( &gridShift, &deviceOutput[ size - 1 ], sizeof( DataType ), cudaMemcpyDeviceToHost ) != cudaSuccess ) { std::cerr << "I am not able to copy data from device to host." << std::endl; return false; } return true; cudaMemcpyDeviceToHost ); TNL_CHECK_CUDA_DEVICE; } template< typename DataType, typename Operation, typename Index > bool cudaPrefixSum( const Index size, void cudaPrefixSum( const Index size, const Index blockSize, const DataType *deviceInput, DataType* deviceOutput, Operation& operation, const enumPrefixSumType prefixSumType ) const PrefixSumType prefixSumType ) { /**** * Compute the number of grids */ const Index elementsInBlock = 8 * blockSize; const Index gridSize = size / elementsInBlock + ( size % elementsInBlock != 0 ); const Index maxGridSize = 65536; const Index gridsNumber = gridSize / maxGridSize + ( gridSize % maxGridSize != 0 ); const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); const auto maxGridSize = Devices::Cuda::getMaxGridSize(); const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize ); /**** * Loop over all grids. */ Index gridShift( 0 ); for( Index gridIdx = 0; gridIdx < gridsNumber; gridIdx ++ ) Index gridShift = 0; for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { /**** * Compute current grid size and size of data to be scanned */ Index gridSize = ( size - gridIdx * maxGridSize * elementsInBlock ) / elementsInBlock; Index currentSize = size - gridIdx * maxGridSize * elementsInBlock; if( gridSize > maxGridSize ) { gridSize = maxGridSize; if( currentSize / elementsInBlock > maxGridSize ) currentSize = maxGridSize * elementsInBlock; } Index gridOffset = gridIdx * maxGridSize * elementsInBlock; if( ! cudaGridPrefixSum< DataType, Operation, Index > ( prefixSumType, const Index gridOffset = gridIdx * maxGridSize * elementsInBlock; cudaGridPrefixSum( prefixSumType, operation, currentSize, blockSize, elementsInBlock, &deviceInput[ gridOffset ], &deviceOutput[ gridOffset ], gridShift ) ) return false; gridShift ); } return true; } #ifdef TEMPLATE_EXPLICIT_INSTANTIATION Loading @@ -353,7 +328,7 @@ extern template bool cudaPrefixSum( const int size, const int *deviceInput, int* deviceOutput, tnlParallelReductionSum< int, int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); extern template bool cudaPrefixSum( const int size, Loading @@ -361,14 +336,14 @@ extern template bool cudaPrefixSum( const int size, const float *deviceInput, float* deviceOutput, tnlParallelReductionSum< float, int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); extern template bool cudaPrefixSum( const int size, const int blockSize, const double *deviceInput, double* deviceOutput, tnlParallelReductionSum< double, int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); #ifdef INSTANTIATE_LONG_DOUBLE extern template bool cudaPrefixSum( const int size, Loading @@ -376,7 +351,7 @@ extern template bool cudaPrefixSum( const int size, const long double *deviceInput, long double* deviceOutput, tnlParallelReductionSum< long double, int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); #endif #ifdef INSTANTIATE_LONG_INT Loading @@ -385,7 +360,7 @@ extern template bool cudaPrefixSum( const long int size, const int *deviceInput, int* deviceOutput, tnlParallelReductionSum< int, long int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); extern template bool cudaPrefixSum( const long int size, Loading @@ -393,14 +368,14 @@ extern template bool cudaPrefixSum( const long int size, const float *deviceInput, float* deviceOutput, tnlParallelReductionSum< float, long int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); extern template bool cudaPrefixSum( const long int size, const long int blockSize, const double *deviceInput, double* deviceOutput, tnlParallelReductionSum< double, long int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); #ifdef INSTANTIATE_LONG_DOUBLE extern template bool cudaPrefixSum( const long int size, Loading @@ -408,7 +383,7 @@ extern template bool cudaPrefixSum( const long int size, const long double *deviceInput, long double* deviceOutput, tnlParallelReductionSum< long double, long int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); #endif #endif Loading Loading
src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h +2 −2 Original line number Diff line number Diff line Loading @@ -608,7 +608,7 @@ computePrefixSum( Vector& v, &v.getData()[ begin ], &v.getData()[ begin ], operation, Algorithms::inclusivePrefixSum ); Algorithms::PrefixSumType::inclusive ); #else throw Exceptions::CudaSupportMissing(); #endif Loading @@ -633,7 +633,7 @@ computeExclusivePrefixSum( Vector& v, &v.getData()[ begin ], &v.getData()[ begin ], operation, Algorithms::exclusivePrefixSum ); Algorithms::PrefixSumType::exclusive ); #endif } Loading
src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +2 −0 Original line number Diff line number Diff line Loading @@ -627,6 +627,7 @@ computePrefixSum( Vector& v, { typedef typename Vector::IndexType Index; // TODO: parallelize with OpenMP for( Index i = begin + 1; i < end; i++ ) v[ i ] += v[ i - 1 ]; } Loading @@ -641,6 +642,7 @@ computeExclusivePrefixSum( Vector& v, typedef typename Vector::IndexType Index; typedef typename Vector::RealType Real; // TODO: parallelize with OpenMP Real aux( v[ begin ] ); v[ begin ] = 0.0; for( Index i = begin + 1; i < end; i++ ) Loading
src/TNL/Containers/Algorithms/cuda-prefix-sum.h +6 −3 Original line number Diff line number Diff line Loading @@ -14,8 +14,11 @@ namespace TNL { namespace Containers { namespace Algorithms { enum enumPrefixSumType { exclusivePrefixSum = 0, inclusivePrefixSum }; enum class PrefixSumType { exclusive, inclusive }; template< typename DataType, typename Operation, Loading @@ -25,7 +28,7 @@ bool cudaPrefixSum( const Index size, const DataType *deviceInput, DataType* deviceOutput, const Operation& operation, const enumPrefixSumType prefixSumType = inclusivePrefixSum ); const PrefixSumType prefixSumType = PrefixSumType::inclusive ); } // namespace Algorithms } // namespace Containers Loading
src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h +146 −171 Original line number Diff line number Diff line Loading @@ -12,6 +12,7 @@ #include <iostream> #include <TNL/Math.h> #include <TNL/Devices/Cuda.h> #include <TNL/Exceptions/CudaBadAlloc.h> #include <TNL/Containers/Algorithms/reduction-operations.h> Loading @@ -25,7 +26,8 @@ namespace Algorithms { template< typename DataType, typename Operation, typename Index > __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumType, __global__ void cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, Operation operation, const Index size, const Index elementsInBlock, Loading @@ -46,7 +48,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT */ const Index blockOffset = blockIdx.x * elementsInBlock; Index idx = threadIdx.x; if( prefixSumType == exclusivePrefixSum ) if( prefixSumType == PrefixSumType::exclusive ) { if( idx == 0 ) sharedData[ 0 ] = operation.initialValue(); Loading @@ -69,8 +71,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT __syncthreads(); const int chunkSize = elementsInBlock / blockDim.x; const int chunkOffset = threadIdx.x * chunkSize; const int numberOfChunks = lastElementInBlock / chunkSize + ( lastElementInBlock % chunkSize != 0 ); const int numberOfChunks = roundUpDivision( lastElementInBlock, chunkSize ); if( chunkOffset < lastElementInBlock ) { Loading Loading @@ -136,7 +137,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT if( threadIdx.x == 0 ) { if( prefixSumType == exclusivePrefixSum ) if( prefixSumType == PrefixSumType::exclusive ) { /*auxArray[ blockIdx.x ] = operation.commonReductionOnDevice( Devices::Cuda::getInterleaving( lastElementInBlock - 1 ), Devices::Cuda::getInterleaving( lastElementInBlock ), Loading @@ -149,13 +150,13 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT else auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ]; } } template< typename DataType, typename Operation, typename Index > __global__ void cudaSecondPhaseBlockPrefixSum( Operation operation, __global__ void cudaSecondPhaseBlockPrefixSum( Operation operation, const Index size, const Index elementsInBlock, const Index gridShift, Loading @@ -181,7 +182,8 @@ __global__ void cudaSecondPhaseBlockPrefixSum( Operation operation, template< typename DataType, typename Operation, typename Index > bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType, void cudaRecursivePrefixSum( const PrefixSumType prefixSumType, Operation& operation, const Index size, const Index blockSize, Loading @@ -190,75 +192,59 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType, const DataType* input, DataType *output ) { const Index numberOfBlocks = ceil( ( double ) size / ( double ) elementsInBlock ); const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); const Index auxArraySize = numberOfBlocks * sizeof( DataType ); DataType *auxArray1, *auxArray2; if( cudaMalloc( ( void** ) &auxArray1, auxArraySize ) != cudaSuccess || cudaMalloc( ( void** ) &auxArray2, auxArraySize ) != cudaSuccess ) throw Exceptions::CudaBadAlloc(); Array< DataType, Devices::Cuda > auxArray1, auxArray2; auxArray1.setSize( auxArraySize ); auxArray2.setSize( auxArraySize ); /**** * Setup block and grid size. */ dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); cudaBlockSize.x = blockSize; cudaGridSize. x = size / elementsInBlock + ( size % elementsInBlock != 0 ); cudaGridSize.x = roundUpDivision( size, elementsInBlock ); /**** * Run the kernel. */ size_t sharedDataSize = elementsInBlock + const std::size_t sharedDataSize = elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( DataType ); cudaFirstPhaseBlockPrefixSum< DataType, Operation, Index > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( DataType ); cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( prefixSumType, operation, size, elementsInBlock, input, output, auxArray1 ); if( ! TNL_CHECK_CUDA_DEVICE ) { std::cerr << "The CUDA kernel 'cudaFirstPhaseBlockPrefixSum' ended with error." << std::endl; cudaFree( auxArray1 ); cudaFree( auxArray2 ); return false; } auxArray1.getData() ); TNL_CHECK_CUDA_DEVICE; /*** * In auxArray1 there is now a sum of numbers in each block. * We must compute prefix-sum of auxArray1 and then shift * each block. */ if( numberOfBlocks > 1 && ! cudaRecursivePrefixSum< DataType, Operation, Index > ( inclusivePrefixSum, if( numberOfBlocks > 1 ) cudaRecursivePrefixSum( PrefixSumType::inclusive, operation, numberOfBlocks, blockSize, elementsInBlock, 0, auxArray1, auxArray2 ) ) return false; cudaSecondPhaseBlockPrefixSum< DataType, Operation, Index > <<< cudaGridSize, cudaBlockSize >>> ( operation, size, elementsInBlock, gridShift, auxArray2, output ); if( ! TNL_CHECK_CUDA_DEVICE ) { std::cerr << "The CUDA kernel 'cudaSecondPhaseBlockPrefixSum' ended with error." << std::endl; cudaFree( auxArray1 ); cudaFree( auxArray2 ); return false; } cudaFree( auxArray1 ); cudaFree( auxArray2 ); return true; (Index) 0, auxArray1.getData(), auxArray2.getData() ); cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> ( operation, size, elementsInBlock, gridShift, auxArray2.getData(), output ); TNL_CHECK_CUDA_DEVICE; } Loading @@ -266,7 +252,8 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType, template< typename DataType, typename Operation, typename Index > bool cudaGridPrefixSum( enumPrefixSumType prefixSumType, void cudaGridPrefixSum( PrefixSumType prefixSumType, Operation& operation, const Index size, const Index blockSize, Loading @@ -275,76 +262,64 @@ bool cudaGridPrefixSum( enumPrefixSumType prefixSumType, DataType *deviceOutput, Index& gridShift ) { if( ! cudaRecursivePrefixSum< DataType, Operation, Index > ( prefixSumType, cudaRecursivePrefixSum( prefixSumType, operation, size, blockSize, elementsInBlock, gridShift, deviceInput, deviceOutput ) ) return false; if( cudaMemcpy( &gridShift, deviceOutput ); cudaMemcpy( &gridShift, &deviceOutput[ size - 1 ], sizeof( DataType ), cudaMemcpyDeviceToHost ) != cudaSuccess ) { std::cerr << "I am not able to copy data from device to host." << std::endl; return false; } return true; cudaMemcpyDeviceToHost ); TNL_CHECK_CUDA_DEVICE; } template< typename DataType, typename Operation, typename Index > bool cudaPrefixSum( const Index size, void cudaPrefixSum( const Index size, const Index blockSize, const DataType *deviceInput, DataType* deviceOutput, Operation& operation, const enumPrefixSumType prefixSumType ) const PrefixSumType prefixSumType ) { /**** * Compute the number of grids */ const Index elementsInBlock = 8 * blockSize; const Index gridSize = size / elementsInBlock + ( size % elementsInBlock != 0 ); const Index maxGridSize = 65536; const Index gridsNumber = gridSize / maxGridSize + ( gridSize % maxGridSize != 0 ); const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); const auto maxGridSize = Devices::Cuda::getMaxGridSize(); const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize ); /**** * Loop over all grids. */ Index gridShift( 0 ); for( Index gridIdx = 0; gridIdx < gridsNumber; gridIdx ++ ) Index gridShift = 0; for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { /**** * Compute current grid size and size of data to be scanned */ Index gridSize = ( size - gridIdx * maxGridSize * elementsInBlock ) / elementsInBlock; Index currentSize = size - gridIdx * maxGridSize * elementsInBlock; if( gridSize > maxGridSize ) { gridSize = maxGridSize; if( currentSize / elementsInBlock > maxGridSize ) currentSize = maxGridSize * elementsInBlock; } Index gridOffset = gridIdx * maxGridSize * elementsInBlock; if( ! cudaGridPrefixSum< DataType, Operation, Index > ( prefixSumType, const Index gridOffset = gridIdx * maxGridSize * elementsInBlock; cudaGridPrefixSum( prefixSumType, operation, currentSize, blockSize, elementsInBlock, &deviceInput[ gridOffset ], &deviceOutput[ gridOffset ], gridShift ) ) return false; gridShift ); } return true; } #ifdef TEMPLATE_EXPLICIT_INSTANTIATION Loading @@ -353,7 +328,7 @@ extern template bool cudaPrefixSum( const int size, const int *deviceInput, int* deviceOutput, tnlParallelReductionSum< int, int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); extern template bool cudaPrefixSum( const int size, Loading @@ -361,14 +336,14 @@ extern template bool cudaPrefixSum( const int size, const float *deviceInput, float* deviceOutput, tnlParallelReductionSum< float, int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); extern template bool cudaPrefixSum( const int size, const int blockSize, const double *deviceInput, double* deviceOutput, tnlParallelReductionSum< double, int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); #ifdef INSTANTIATE_LONG_DOUBLE extern template bool cudaPrefixSum( const int size, Loading @@ -376,7 +351,7 @@ extern template bool cudaPrefixSum( const int size, const long double *deviceInput, long double* deviceOutput, tnlParallelReductionSum< long double, int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); #endif #ifdef INSTANTIATE_LONG_INT Loading @@ -385,7 +360,7 @@ extern template bool cudaPrefixSum( const long int size, const int *deviceInput, int* deviceOutput, tnlParallelReductionSum< int, long int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); extern template bool cudaPrefixSum( const long int size, Loading @@ -393,14 +368,14 @@ extern template bool cudaPrefixSum( const long int size, const float *deviceInput, float* deviceOutput, tnlParallelReductionSum< float, long int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); extern template bool cudaPrefixSum( const long int size, const long int blockSize, const double *deviceInput, double* deviceOutput, tnlParallelReductionSum< double, long int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); #ifdef INSTANTIATE_LONG_DOUBLE extern template bool cudaPrefixSum( const long int size, Loading @@ -408,7 +383,7 @@ extern template bool cudaPrefixSum( const long int size, const long double *deviceInput, long double* deviceOutput, tnlParallelReductionSum< long double, long int >& operation, const enumPrefixSumType prefixSumType ); const PrefixSumType prefixSumType ); #endif #endif Loading