diff --git a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h
index 705369fe0ab0d2a3baca0fb7e7f1964da2372c10..c32f44bbaf3a318a8d43b9116baac563b26cdf5e 100644
--- a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h
+++ b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h
@@ -608,7 +608,7 @@ computePrefixSum( Vector& v,
                       &v.getData()[ begin ],
                       &v.getData()[ begin ],
                       operation,
-                      Algorithms::inclusivePrefixSum );
+                      Algorithms::PrefixSumType::inclusive );
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
@@ -633,7 +633,7 @@ computeExclusivePrefixSum( Vector& v,
                       &v.getData()[ begin ],
                       &v.getData()[ begin ],
                       operation,
-                      Algorithms::exclusivePrefixSum );
+                      Algorithms::PrefixSumType::exclusive );
 #endif
 }
 
diff --git a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h
index b103340c26ee1d5c4b8135a3e53224ca823e6fe4..d8cbca17eae6398235a165455872d7c9284dacd9 100644
--- a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h
+++ b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h
@@ -627,6 +627,7 @@ computePrefixSum( Vector& v,
 {
    typedef typename Vector::IndexType Index;
 
+   // TODO: parallelize with OpenMP
    for( Index i = begin + 1; i < end; i++ )
       v[ i ] += v[ i - 1 ];
 }
@@ -641,6 +642,7 @@ computeExclusivePrefixSum( Vector& v,
    typedef typename Vector::IndexType Index;
    typedef typename Vector::RealType Real;
 
+   // TODO: parallelize with OpenMP
    Real aux( v[ begin ] );
    v[ begin ] = 0.0;
    for( Index i = begin + 1; i < end; i++ )
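
Both host loops above are purely sequential; the TODO markers note that they are candidates for OpenMP parallelization. A minimal sketch of one standard two-pass scheme — illustration only, not TNL code: each thread scans its own chunk, then shifts the chunk by the combined totals of all preceding chunks:

   // Two-pass inclusive scan with OpenMP (hypothetical sketch, not TNL code).
   // Pass 1 scans each thread's chunk and records the chunk total; pass 2
   // shifts every chunk by the exclusive scan of those totals.
   #include <omp.h>
   #include <vector>

   void inclusiveScan( std::vector< double >& v )
   {
      const long long n = v.size();
      std::vector< double > chunkSums;

      #pragma omp parallel
      {
         const int threads = omp_get_num_threads();
         const int tid = omp_get_thread_num();
         #pragma omp single
         chunkSums.assign( threads + 1, 0.0 );   // implicit barrier after single

         // half-open chunk [begin, end) owned by this thread
         const long long begin = n * tid / threads;
         const long long end = n * ( tid + 1 ) / threads;

         // pass 1: sequential scan inside the chunk
         for( long long i = begin + 1; i < end; i++ )
            v[ i ] += v[ i - 1 ];
         if( begin < end )
            chunkSums[ tid + 1 ] = v[ end - 1 ];
         #pragma omp barrier

         // exclusive scan of the few chunk totals, done redundantly per thread
         double shift = 0.0;
         for( int t = 1; t <= tid; t++ )
            shift += chunkSums[ t ];

         // pass 2: shift this thread's chunk
         for( long long i = begin; i < end; i++ )
            v[ i ] += shift;
      }
   }
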
diff --git a/src/TNL/Containers/Algorithms/cuda-prefix-sum.h b/src/TNL/Containers/Algorithms/cuda-prefix-sum.h
index 2c7ec97c5dee2adea84d8c67bafce27fda4fc3aa..37215a99570676c04a9681ec0540f752e15a26f4 100644
--- a/src/TNL/Containers/Algorithms/cuda-prefix-sum.h
+++ b/src/TNL/Containers/Algorithms/cuda-prefix-sum.h
@@ -14,18 +14,21 @@ namespace TNL {
 namespace Containers {
 namespace Algorithms {
 
-enum enumPrefixSumType { exclusivePrefixSum = 0,
-                         inclusivePrefixSum };
+enum class PrefixSumType
+{
+   exclusive,
+   inclusive
+};
 
 template< typename DataType,
           typename Operation,
           typename Index >
-bool cudaPrefixSum( const Index size,
+void cudaPrefixSum( const Index size,
                     const Index blockSize,
                     const DataType *deviceInput,
                     DataType* deviceOutput,
-                    const Operation& operation,
-                    const enumPrefixSumType prefixSumType = inclusivePrefixSum );
+                    Operation& operation,
+                    const PrefixSumType prefixSumType = PrefixSumType::inclusive );
 
 } // namespace Algorithms
 } // namespace Containers
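
The scoped enum replaces the old unscoped enumPrefixSumType, so the enumerators no longer leak into namespace Algorithms and no longer convert implicitly to int. As a reminder of the semantics the two enumerators select (plain C++, independent of TNL): an inclusive scan includes element i in output i, an exclusive scan shifts the sums by one position:

   // Inclusive vs. exclusive scan on a plain array (illustration, not TNL code):
   // input:     { 1, 2, 3, 4 }
   // inclusive: { 1, 3, 6, 10 }   out[ i ] = a[ 0 ] + ... + a[ i ]
   // exclusive: { 0, 1, 3, 6 }    out[ i ] = a[ 0 ] + ... + a[ i - 1 ]
   #include <iostream>

   int main()
   {
      const int a[ 4 ] = { 1, 2, 3, 4 };
      int inclusive[ 4 ], exclusive[ 4 ], sum = 0;
      for( int i = 0; i < 4; i++ )
      {
         exclusive[ i ] = sum;   // a[ i ] not yet accumulated
         sum += a[ i ];
         inclusive[ i ] = sum;   // a[ i ] accumulated
      }
      std::cout << inclusive[ 3 ] << " " << exclusive[ 3 ] << std::endl;   // prints "10 6"
   }
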
diff --git a/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h b/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
index 384a9c8c252713142bdd98bfac26cf7c9bb43b6e..85f8eace6944f27ce84e602b5333b153a0010c0d 100644
--- a/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
+++ b/src/TNL/Containers/Algorithms/cuda-prefix-sum_impl.h
@@ -12,6 +12,8 @@
 
 #include <iostream>
 
+#include <TNL/Math.h>
+#include <TNL/Containers/Array.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Exceptions/CudaBadAlloc.h>
 #include <TNL/Containers/Algorithms/reduction-operations.h>
@@ -25,56 +27,56 @@ namespace Algorithms {
 template< typename DataType,
           typename Operation,
           typename Index >
-__global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumType,
-                                              Operation operation,
-                                              const Index size,
-                                              const Index elementsInBlock,
-                                              const DataType* input,
-                                              DataType* output,
-                                              DataType* auxArray )
+__global__ void
+cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
+                              Operation operation,
+                              const Index size,
+                              const Index elementsInBlock,
+                              const DataType* input,
+                              DataType* output,
+                              DataType* auxArray )
 {
    DataType* sharedData = TNL::Devices::Cuda::getSharedMemory< DataType >();
    DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ];
-   DataType* warpSums = &auxData[ blockDim. x ];
+   DataType* warpSums = &auxData[ blockDim.x ];
 
-   const Index lastElementIdx = size - blockIdx. x * elementsInBlock;
+   const Index lastElementIdx = size - blockIdx.x * elementsInBlock;
    Index lastElementInBlock = ( lastElementIdx < elementsInBlock ?
                                 lastElementIdx : elementsInBlock );
 
    /***
    * Load data into the shared memory.
    */
-   const Index blockOffset = blockIdx. x * elementsInBlock;
-   Index idx = threadIdx. x;
-   if( prefixSumType == exclusivePrefixSum )
+   const Index blockOffset = blockIdx.x * elementsInBlock;
+   Index idx = threadIdx.x;
+   if( prefixSumType == PrefixSumType::exclusive )
    {
       if( idx == 0 )
         sharedData[ 0 ] = operation.initialValue();
      while( idx < elementsInBlock && blockOffset + idx < size )
      {
         sharedData[ Devices::Cuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ];
-        idx += blockDim. x;
+        idx += blockDim.x;
      }
   }
   else
      while( idx < elementsInBlock && blockOffset + idx < size )
      {
         sharedData[ Devices::Cuda::getInterleaving( idx ) ] = input[ blockOffset + idx ];
-        idx += blockDim. x;
+        idx += blockDim.x;
      }
 
    /***
    * Perform the sequential prefix-sum.
    */
    __syncthreads();
-   const int chunkSize = elementsInBlock / blockDim. x;
-   const int chunkOffset = threadIdx. x * chunkSize;
-   const int numberOfChunks = lastElementInBlock / chunkSize +
-                              ( lastElementInBlock % chunkSize != 0 );
+   const int chunkSize = elementsInBlock / blockDim.x;
+   const int chunkOffset = threadIdx.x * chunkSize;
+   const int numberOfChunks = roundUpDivision( lastElementInBlock, chunkSize );
 
    if( chunkOffset < lastElementInBlock )
    {
-      auxData[ threadIdx. x ] =
+      auxData[ threadIdx.x ] =
         sharedData[ Devices::Cuda::getInterleaving( chunkOffset ) ];
    }
@@ -84,22 +86,22 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumT
    {
       operation.commonReductionOnDevice( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ],
                                          sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
-      auxData[ threadIdx. x ] =
+      auxData[ threadIdx.x ] =
         sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ];
-      chunkPointer ++;
+      chunkPointer++;
    }
 
    /***
    * Perform the parallel prefix-sum inside warps.
    */
-   const int threadInWarpIdx = threadIdx. x % Devices::Cuda::getWarpSize();
-   const int warpIdx = threadIdx. x / Devices::Cuda::getWarpSize();
+   const int threadInWarpIdx = threadIdx.x % Devices::Cuda::getWarpSize();
+   const int warpIdx = threadIdx.x / Devices::Cuda::getWarpSize();
    for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 )
-      if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks )
-         operation.commonReductionOnDevice( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );
+      if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks )
+         operation.commonReductionOnDevice( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] );
 
    if( threadInWarpIdx == Devices::Cuda::getWarpSize() - 1 )
-      warpSums[ warpIdx ] = auxData[ threadIdx. x ];
+      warpSums[ warpIdx ] = auxData[ threadIdx.x ];
    __syncthreads();
 
    /****
@@ -115,13 +117,13 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumT
    * Shift the warp prefix-sums.
    */
    if( warpIdx > 0 )
-      operation.commonReductionOnDevice( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );
+      operation.commonReductionOnDevice( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] );
 
    /***
    * Store the result back in global memory.
    */
    __syncthreads();
-   idx = threadIdx. x;
+   idx = threadIdx.x;
    while( idx < elementsInBlock && blockOffset + idx < size )
    {
       const Index chunkIdx = idx / chunkSize;
@@ -130,49 +132,49 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumT
         chunkShift = auxData[ chunkIdx - 1 ];
       operation.commonReductionOnDevice( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift );
       output[ blockOffset + idx ] = sharedData[ Devices::Cuda::getInterleaving( idx ) ];
-      idx += blockDim. x;
+      idx += blockDim.x;
    }
    __syncthreads();
 
-   if( threadIdx. x == 0 )
+   if( threadIdx.x == 0 )
    {
-      if( prefixSumType == exclusivePrefixSum )
+      if( prefixSumType == PrefixSumType::exclusive )
       {
-         /*auxArray[ blockIdx. x ] = operation.commonReductionOnDevice( Devices::Cuda::getInterleaving( lastElementInBlock - 1 ),
+         /*auxArray[ blockIdx.x ] = operation.commonReductionOnDevice( Devices::Cuda::getInterleaving( lastElementInBlock - 1 ),
                                                                         Devices::Cuda::getInterleaving( lastElementInBlock ),
                                                                         sharedData );*/
          DataType aux = operation.initialValue();
         operation.commonReductionOnDevice( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] );
         operation.commonReductionOnDevice( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] );
-         auxArray[ blockIdx. x ] = aux;
+         auxArray[ blockIdx.x ] = aux;
      }
      else
-         auxArray[ blockIdx. x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ];
+         auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ];
   }
-
 }
 
 template< typename DataType,
           typename Operation,
           typename Index >
-__global__ void cudaSecondPhaseBlockPrefixSum( Operation operation,
-                                               const Index size,
-                                               const Index elementsInBlock,
-                                               const Index gridShift,
-                                               const DataType* auxArray,
-                                               DataType* data )
+__global__ void
+cudaSecondPhaseBlockPrefixSum( Operation operation,
+                               const Index size,
+                               const Index elementsInBlock,
+                               const Index gridShift,
+                               const DataType* auxArray,
+                               DataType* data )
 {
-   if( blockIdx. x > 0 )
+   if( blockIdx.x > 0 )
    {
       DataType shift( gridShift );
-      operation.commonReductionOnDevice( shift, auxArray[ blockIdx. x - 1 ] );
+      operation.commonReductionOnDevice( shift, auxArray[ blockIdx.x - 1 ] );
 
-      const Index readOffset = blockIdx. x * elementsInBlock;
-      Index readIdx = threadIdx. x;
+      const Index readOffset = blockIdx.x * elementsInBlock;
+      Index readIdx = threadIdx.x;
       while( readIdx < elementsInBlock && readOffset + readIdx < size )
       {
          operation.commonReductionOnDevice( data[ readIdx + readOffset ], shift );
-         readIdx += blockDim. x;
+         readIdx += blockDim.x;
       }
    }
 }
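
The two kernels above implement the classic block-wise scan: cudaFirstPhaseBlockPrefixSum scans each block independently and writes the block totals to auxArray; the totals are themselves scanned (recursively, in cudaRecursivePrefixSum below); cudaSecondPhaseBlockPrefixSum then shifts every block by the scanned total of all preceding blocks, plus the gridShift carried over from earlier grid launches. A sequential host-side mock of the same three steps (illustration only, not TNL code):

   // Host-side mock of the two-phase block scan (illustration only).
   #include <algorithm>
   #include <vector>

   void twoPhaseInclusiveScan( std::vector< int >& data, const int elementsInBlock )
   {
      const int n = data.size();
      const int numberOfBlocks = ( n + elementsInBlock - 1 ) / elementsInBlock;
      std::vector< int > blockSums( numberOfBlocks );

      // phase 1: scan each block independently and record its total
      for( int b = 0; b < numberOfBlocks; b++ )
      {
         const int begin = b * elementsInBlock;
         const int end = std::min( begin + elementsInBlock, n );
         for( int i = begin + 1; i < end; i++ )
            data[ i ] += data[ i - 1 ];
         blockSums[ b ] = data[ end - 1 ];
      }

      // scan the block totals themselves (the GPU version recurses here
      // whenever the aux array spans more than one block)
      for( int b = 1; b < numberOfBlocks; b++ )
         blockSums[ b ] += blockSums[ b - 1 ];

      // phase 2: shift every block by the total of all preceding blocks
      for( int b = 1; b < numberOfBlocks; b++ )
      {
         const int begin = b * elementsInBlock;
         const int end = std::min( begin + elementsInBlock, n );
         for( int i = begin; i < end; i++ )
            data[ i ] += blockSums[ b - 1 ];
      }
   }
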
@@ -181,84 +183,69 @@ __global__ void cudaSecondPhaseBlockPrefixSum( Operation operation,
 template< typename DataType,
           typename Operation,
           typename Index >
-bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType,
-                             Operation& operation,
-                             const Index size,
-                             const Index blockSize,
-                             const Index elementsInBlock,
-                             const Index gridShift,
-                             const DataType* input,
-                             DataType *output )
+void
+cudaRecursivePrefixSum( const PrefixSumType prefixSumType,
+                        Operation& operation,
+                        const Index size,
+                        const Index blockSize,
+                        const Index elementsInBlock,
+                        const Index gridShift,
+                        const DataType* input,
+                        DataType *output )
 {
-   const Index numberOfBlocks = ceil( ( double ) size / ( double ) elementsInBlock );
-   const Index auxArraySize = numberOfBlocks * sizeof( DataType );
-   DataType *auxArray1, *auxArray2;
-   if( cudaMalloc( ( void** ) &auxArray1, auxArraySize ) != cudaSuccess ||
-       cudaMalloc( ( void** ) &auxArray2, auxArraySize ) != cudaSuccess )
-      throw Exceptions::CudaBadAlloc();
+   const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
+   // Array::setSize takes the number of elements (not bytes)
+   Array< DataType, Devices::Cuda > auxArray1, auxArray2;
+   auxArray1.setSize( numberOfBlocks );
+   auxArray2.setSize( numberOfBlocks );
 
    /****
    * Setup block and grid size.
    */
    dim3 cudaBlockSize( 0 ), cudaGridSize( 0 );
-   cudaBlockSize. x = blockSize;
-   cudaGridSize. x = size / elementsInBlock +
-                     ( size % elementsInBlock != 0 );
+   cudaBlockSize.x = blockSize;
+   cudaGridSize.x = roundUpDivision( size, elementsInBlock );
 
    /****
    * Run the kernel.
    */
-   size_t sharedDataSize = elementsInBlock +
-                           elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2;
-   size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( DataType );
-   cudaFirstPhaseBlockPrefixSum< DataType, Operation, Index >
-                                <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-                                ( prefixSumType,
-                                  operation,
-                                  size,
-                                  elementsInBlock,
-                                  input,
-                                  output,
-                                  auxArray1 );
-   if( ! TNL_CHECK_CUDA_DEVICE )
-   {
-      std::cerr << "The CUDA kernel 'cudaFirstPhaseBlockPrefixSum' ended with error." << std::endl;
-      cudaFree( auxArray1 );
-      cudaFree( auxArray2 );
-      return false;
-   }
+   const std::size_t sharedDataSize = elementsInBlock +
+                                      elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2;
+   const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( DataType );
+   cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+      ( prefixSumType,
+        operation,
+        size,
+        elementsInBlock,
+        input,
+        output,
+        auxArray1.getData() );
+   TNL_CHECK_CUDA_DEVICE;
 
    /***
    * In auxArray1 there is now a sum of numbers in each block.
    * We must compute prefix-sum of auxArray1 and then shift
    * each block.
    */
-   if( numberOfBlocks > 1 &&
-       ! cudaRecursivePrefixSum< DataType, Operation, Index >
-                                ( inclusivePrefixSum,
-                                  operation,
-                                  numberOfBlocks,
-                                  blockSize,
-                                  elementsInBlock,
-                                  0,
-                                  auxArray1,
-                                  auxArray2 ) )
-      return false;
-   cudaSecondPhaseBlockPrefixSum< DataType, Operation, Index >
-                                 <<< cudaGridSize, cudaBlockSize >>>
-                                 ( operation, size, elementsInBlock, gridShift, auxArray2, output );
-
-   if( ! TNL_CHECK_CUDA_DEVICE )
-   {
-      std::cerr << "The CUDA kernel 'cudaSecondPhaseBlockPrefixSum' ended with error." << std::endl;
-      cudaFree( auxArray1 );
-      cudaFree( auxArray2 );
-      return false;
-   }
-   cudaFree( auxArray1 );
-   cudaFree( auxArray2 );
-   return true;
+   if( numberOfBlocks > 1 )
+      cudaRecursivePrefixSum( PrefixSumType::inclusive,
+                              operation,
+                              numberOfBlocks,
+                              blockSize,
+                              elementsInBlock,
+                              (Index) 0,
+                              auxArray1.getData(),
+                              auxArray2.getData() );
+
+   cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>>
+      ( operation,
+        size,
+        elementsInBlock,
+        gridShift,
+        auxArray2.getData(),
+        output );
+   TNL_CHECK_CUDA_DEVICE;
 }
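
Replacing the raw cudaMalloc/cudaFree pair with Containers::Array is what lets the manual clean-up-and-return-false error paths collapse into TNL_CHECK_CUDA_DEVICE: the temporaries are now released on every exit path, including exceptions. Note that setSize() takes an element count rather than the byte count previously passed to cudaMalloc, hence setSize( numberOfBlocks ) above. A minimal sketch of the underlying RAII idea (generic CUDA C++, not the actual TNL::Containers::Array):

   // RAII wrapper for a device allocation (hypothetical illustration).
   #include <cstddef>
   #include <new>
   #include <cuda_runtime.h>

   template< typename T >
   struct DeviceBuffer
   {
      T* data = nullptr;

      explicit DeviceBuffer( const std::size_t elements )   // element count, not bytes
      {
         if( cudaMalloc( (void**) &data, elements * sizeof( T ) ) != cudaSuccess )
            throw std::bad_alloc();
      }

      ~DeviceBuffer()   // runs on every exit path, including exceptions
      {
         cudaFree( data );
      }

      DeviceBuffer( const DeviceBuffer& ) = delete;
      DeviceBuffer& operator=( const DeviceBuffer& ) = delete;
   };
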
@@ -266,85 +253,74 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType,
 template< typename DataType,
           typename Operation,
           typename Index >
-bool cudaGridPrefixSum( enumPrefixSumType prefixSumType,
-                        Operation& operation,
-                        const Index size,
-                        const Index blockSize,
-                        const Index elementsInBlock,
-                        const DataType *deviceInput,
-                        DataType *deviceOutput,
-                        Index& gridShift )
+void
+cudaGridPrefixSum( PrefixSumType prefixSumType,
+                   Operation& operation,
+                   const Index size,
+                   const Index blockSize,
+                   const Index elementsInBlock,
+                   const DataType *deviceInput,
+                   DataType *deviceOutput,
+                   Index& gridShift )
 {
-
-   if( ! cudaRecursivePrefixSum< DataType, Operation, Index >
-            ( prefixSumType,
-              operation,
-              size,
-              blockSize,
-              elementsInBlock,
-              gridShift,
-              deviceInput,
-              deviceOutput ) )
-      return false;
-   if( cudaMemcpy( &gridShift,
-                   &deviceOutput[ size - 1 ],
-                   sizeof( DataType ),
-                   cudaMemcpyDeviceToHost ) != cudaSuccess )
-   {
-      std::cerr << "I am not able to copy data from device to host." << std::endl;
-      return false;
-   }
-   return true;
+   cudaRecursivePrefixSum( prefixSumType,
+                           operation,
+                           size,
+                           blockSize,
+                           elementsInBlock,
+                           gridShift,
+                           deviceInput,
+                           deviceOutput );
+
+   cudaMemcpy( &gridShift,
+               &deviceOutput[ size - 1 ],
+               sizeof( DataType ),
+               cudaMemcpyDeviceToHost );
+   TNL_CHECK_CUDA_DEVICE;
 }
 
 template< typename DataType,
           typename Operation,
           typename Index >
-bool cudaPrefixSum( const Index size,
-                    const Index blockSize,
-                    const DataType *deviceInput,
-                    DataType* deviceOutput,
-                    Operation& operation,
-                    const enumPrefixSumType prefixSumType )
+void
+cudaPrefixSum( const Index size,
+               const Index blockSize,
+               const DataType *deviceInput,
+               DataType* deviceOutput,
+               Operation& operation,
+               const PrefixSumType prefixSumType )
 {
    /****
    * Compute the number of grids
    */
    const Index elementsInBlock = 8 * blockSize;
-   const Index gridSize = size / elementsInBlock + ( size % elementsInBlock != 0 );
-   const Index maxGridSize = 65536;
-   const Index gridsNumber = gridSize / maxGridSize + ( gridSize % maxGridSize != 0 );
+   const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
+   const auto maxGridSize = Devices::Cuda::getMaxGridSize();
+   const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize );
 
    /****
    * Loop over all grids.
    */
-   Index gridShift( 0 );
-   for( Index gridIdx = 0; gridIdx < gridsNumber; gridIdx ++ )
+   Index gridShift = 0;
+   for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ )
    {
      /****
      * Compute current grid size and size of data to be scanned
      */
-      Index gridSize = ( size - gridIdx * maxGridSize * elementsInBlock ) /
-                       elementsInBlock;
       Index currentSize = size - gridIdx * maxGridSize * elementsInBlock;
-      if( gridSize > maxGridSize )
-      {
-         gridSize = maxGridSize;
+      if( currentSize / elementsInBlock > maxGridSize )
         currentSize = maxGridSize * elementsInBlock;
-      }
-      Index gridOffset = gridIdx * maxGridSize * elementsInBlock;
-      if( ! cudaGridPrefixSum< DataType, Operation, Index >
-               ( prefixSumType,
-                 operation,
-                 currentSize,
-                 blockSize,
-                 elementsInBlock,
-                 &deviceInput[ gridOffset ],
-                 &deviceOutput[ gridOffset ],
-                 gridShift ) )
-         return false;
+      const Index gridOffset = gridIdx * maxGridSize * elementsInBlock;
+
+      cudaGridPrefixSum( prefixSumType,
+                         operation,
+                         currentSize,
+                         blockSize,
+                         elementsInBlock,
+                         &deviceInput[ gridOffset ],
+                         &deviceOutput[ gridOffset ],
+                         gridShift );
    }
-   return true;
 }
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
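
A single 1D grid is limited to maxGridSize blocks, so inputs larger than maxGridSize * elementsInBlock elements are scanned with several launches; gridShift carries the last scanned element of one launch into the next via cudaGridPrefixSum. The chunking arithmetic in isolation (illustration only; 65535 stands in for the value returned by Devices::Cuda::getMaxGridSize()):

   // Per-grid chunking as used by cudaPrefixSum (hypothetical illustration).
   #include <cstdio>

   int main()
   {
      const long size = 500000000;           // elements to scan
      const long elementsInBlock = 8 * 256;  // 8 * blockSize
      const long maxGridSize = 65535;
      const long numberOfBlocks = ( size + elementsInBlock - 1 ) / elementsInBlock;
      const long numberOfGrids = ( numberOfBlocks + maxGridSize - 1 ) / maxGridSize;

      for( long gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ )
      {
         long currentSize = size - gridIdx * maxGridSize * elementsInBlock;
         if( currentSize / elementsInBlock > maxGridSize )
            currentSize = maxGridSize * elementsInBlock;
         const long gridOffset = gridIdx * maxGridSize * elementsInBlock;
         std::printf( "grid %ld: offset %ld, size %ld\n", gridIdx, gridOffset, currentSize );
      }
   }
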
@@ -351,6 +327,6 @@
-extern template bool cudaPrefixSum( const int size,
+extern template void cudaPrefixSum( const int size,
                                     const int blockSize,
                                     const int *deviceInput,
                                     int* deviceOutput,
                                     tnlParallelReductionSum< int, int >& operation,
-                                    const enumPrefixSumType prefixSumType );
+                                    const PrefixSumType prefixSumType );
@@ -359,13 +335,13 @@
-extern template bool cudaPrefixSum( const int size,
+extern template void cudaPrefixSum( const int size,
                                     const int blockSize,
                                     const float *deviceInput,
                                     float* deviceOutput,
                                     tnlParallelReductionSum< float, int >& operation,
-                                    const enumPrefixSumType prefixSumType );
+                                    const PrefixSumType prefixSumType );
 
-extern template bool cudaPrefixSum( const int size,
+extern template void cudaPrefixSum( const int size,
                                     const int blockSize,
                                     const double *deviceInput,
                                     double* deviceOutput,
                                     tnlParallelReductionSum< double, int >& operation,
-                                    const enumPrefixSumType prefixSumType );
+                                    const PrefixSumType prefixSumType );
@@ -374,8 +350,8 @@
 #ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool cudaPrefixSum( const int size,
+extern template void cudaPrefixSum( const int size,
                                     const int blockSize,
                                     const long double *deviceInput,
                                     long double* deviceOutput,
                                     tnlParallelReductionSum< long double, int >& operation,
-                                    const enumPrefixSumType prefixSumType );
+                                    const PrefixSumType prefixSumType );
 #endif
@@ -383,8 +359,8 @@
 #ifdef INSTANTIATE_LONG_INT
 
-extern template bool cudaPrefixSum( const long int size,
+extern template void cudaPrefixSum( const long int size,
                                     const long int blockSize,
                                     const int *deviceInput,
                                     int* deviceOutput,
                                     tnlParallelReductionSum< int, long int >& operation,
-                                    const enumPrefixSumType prefixSumType );
+                                    const PrefixSumType prefixSumType );
@@ -391,13 +367,13 @@
-extern template bool cudaPrefixSum( const long int size,
+extern template void cudaPrefixSum( const long int size,
                                     const long int blockSize,
                                     const float *deviceInput,
                                     float* deviceOutput,
                                     tnlParallelReductionSum< float, long int >& operation,
-                                    const enumPrefixSumType prefixSumType );
+                                    const PrefixSumType prefixSumType );
 
-extern template bool cudaPrefixSum( const long int size,
+extern template void cudaPrefixSum( const long int size,
                                     const long int blockSize,
                                     const double *deviceInput,
                                     double* deviceOutput,
                                     tnlParallelReductionSum< double, long int >& operation,
-                                    const enumPrefixSumType prefixSumType );
+                                    const PrefixSumType prefixSumType );
@@ -406,9 +382,9 @@
 #ifdef INSTANTIATE_LONG_DOUBLE
-extern template bool cudaPrefixSum( const long int size,
+extern template void cudaPrefixSum( const long int size,
                                     const long int blockSize,
                                     const long double *deviceInput,
                                     long double* deviceOutput,
                                     tnlParallelReductionSum< long double, long int >& operation,
-                                    const enumPrefixSumType prefixSumType );
+                                    const PrefixSumType prefixSumType );
 #endif
 #endif
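
For reference, a sketch of a call site for the refactored entry point, mirroring the usage in VectorOperationsCuda_impl.h and the instantiations above; the block size constant is an assumption made for this illustration, and with the void return type errors now surface as exceptions via TNL_CHECK_CUDA_DEVICE rather than a bool result:

   // Hypothetical call site (assumes the TNL build environment and nvcc).
   #include <TNL/Containers/Algorithms/cuda-prefix-sum.h>
   #include <TNL/Containers/Algorithms/reduction-operations.h>

   using namespace TNL::Containers;

   void scanOnDevice( double* deviceData, const int size )
   {
      Algorithms::tnlParallelReductionSum< double, int > operation;
      // in-place scan: input and output may alias, as in VectorOperations
      Algorithms::cudaPrefixSum( size,
                                 256,          // CUDA block size (assumed)
                                 deviceData,   // input
                                 deviceData,   // output
                                 operation,
                                 Algorithms::PrefixSumType::inclusive );
   }
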