src/TNL/Containers/Algorithms/CudaReductionKernel.h  +227 −240

@@ -12,20 +12,17 @@
  #include <utility>  // std::pair

- #ifdef HAVE_CUDA
- #include <cuda.h>
- #endif
-
  #include <TNL/Assert.h>
  #include <TNL/Math.h>
  #include <TNL/Devices/CudaDeviceInfo.h>
  #include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
  #include <TNL/Containers/Algorithms/ArrayOperations.h>
+ #include <TNL/Exceptions/CudaSupportMissing.h>

  namespace TNL {
  namespace Containers {
  namespace Algorithms {

- #ifdef HAVE_CUDA
-
  /****
   * The performance of this kernel is very sensitive to register usage.
   * Compile with --ptxas-options=-v and configure these constants for given
@@ -34,6 +31,7 @@
  static constexpr int Reduction_maxThreadsPerBlock = 256;  // must be a power of 2
  static constexpr int Reduction_registersPerThread = 32;   // empirically determined optimal value

+ #ifdef HAVE_CUDA
  // __CUDA_ARCH__ is defined only in device code!
  #if (__CUDA_ARCH__ >= 300 )
  static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
@@ -56,19 +54,16 @@ CudaReductionKernel( const Result zero,
                      const Index size,
                      Result* output )
  {
-    using IndexType = Index;
-    using ResultType = Result;
-
-    ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
+    Result* sdata = Devices::Cuda::getSharedMemory< Result >();

     // Get the thread id (tid), global thread id (gid) and gridSize.
-    const IndexType tid = threadIdx.x;
-    IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;
-    const IndexType gridSize = blockDim.x * gridDim.x;
+    const Index tid = threadIdx.x;
+    Index gid = blockIdx.x * blockDim.x + threadIdx.x;
+    const Index gridSize = blockDim.x * gridDim.x;

     sdata[ tid ] = zero;

-    // Read data into the shared memory. We start with the sequential reduction.
+    // Start with the sequential reduction and push the result into the shared memory.
     while( gid + 4 * gridSize < size ) {
        reduction( sdata[ tid ], dataFetcher( gid ) );
        reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
@@ -111,29 +106,24 @@ CudaReductionKernel( const Result zero,
     // This runs in one warp so it is synchronized implicitly.
     if( tid < 32 ) {
-       volatile ResultType* vsdata = sdata;
-       if( blockSize >= 64 ) {
+       volatile Result* vsdata = sdata;
+       if( blockSize >= 64 )
           volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
-       }
-       // Note that here we do not have to check if tid < 16 etc, because we have
-       // twice as much shared memory, so we do not access out of bounds. The
-       // results for the upper half will be undefined, but unused anyway.
-       if( blockSize >= 32 ) {
+       // 2 * blockSize.x elements of shared memory per block, so we do not
+       // access out of bounds. The results for the upper half will be undefined,
+       // but unused anyway.
+       if( blockSize >= 32 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
-       }
-       if( blockSize >= 16 ) {
+       if( blockSize >= 16 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
-       }
-       if( blockSize >= 8 ) {
+       if( blockSize >= 8 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
-       }
-       if( blockSize >= 4 ) {
+       if( blockSize >= 4 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
-       }
-       if( blockSize >= 2 ) {
+       if( blockSize >= 2 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
-       }
     }
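Note on the pattern this hunk is editing: the kernel does a grid-stride sequential pass into shared memory, a synchronized tree reduction down to one warp, and a warp-synchronous tail on a volatile pointer. A minimal self-contained sketch of the same pattern, with the block size fixed at 256 and plain summation standing in for the Reduction/VolatileReduction functors (so the two-warp shared-memory trick for blocks of 32 or fewer threads does not come into play here):

// minimal sketch, not the library kernel: fixed block size, float summation
template< int blockSize >
__global__ void sumKernel( const float* input, int size, float* output )
{
   __shared__ float sdata[ blockSize ];

   const int tid = threadIdx.x;
   int gid = blockIdx.x * blockDim.x + threadIdx.x;
   const int gridSize = blockDim.x * gridDim.x;

   // grid-stride sequential reduction into shared memory
   sdata[ tid ] = 0;
   while( gid < size ) {
      sdata[ tid ] += input[ gid ];
      gid += gridSize;
   }
   __syncthreads();

   // synchronized tree reduction down to the last warp
   for( int s = blockSize / 2; s > 32; s >>= 1 ) {
      if( tid < s )
         sdata[ tid ] += sdata[ tid + s ];
      __syncthreads();
   }

   // warp tail: volatile keeps partials in shared memory instead of registers,
   // and the single warp runs in lockstep, so no __syncthreads() is needed
   if( tid < 32 ) {
      volatile float* vsdata = sdata;
      vsdata[ tid ] += vsdata[ tid + 32 ];
      vsdata[ tid ] += vsdata[ tid + 16 ];
      vsdata[ tid ] += vsdata[ tid + 8 ];
      vsdata[ tid ] += vsdata[ tid + 4 ];
      vsdata[ tid ] += vsdata[ tid + 2 ];
      vsdata[ tid ] += vsdata[ tid + 1 ];
   }

   // one partial result per block; a second pass reduces these
   if( tid == 0 )
      output[ blockIdx.x ] = sdata[ 0 ];
}

// launched e.g. as: sumKernel< 256 ><<< numBlocks, 256 >>>( d_input, n, d_partials );

The lockstep assumption in the tail is the same one the comment "This runs in one warp so it is synchronized implicitly" makes; it holds for the pre-Volta architectures this code targets.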
     // Store the result back in the global memory.
     if( tid == 0 )

@@ -157,18 +147,15 @@ CudaReductionWithArgumentKernel( const Result zero,
                                  Index* idxOutput,
                                  const Index* idxInput = nullptr )
  {
-    using IndexType = Index;
-    using ResultType = Result;
-
-    ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
-    IndexType* sidx = reinterpret_cast< IndexType* >( &sdata[ blockDim.x ] );
+    Result* sdata = Devices::Cuda::getSharedMemory< Result >();
+    Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] );

     // Get the thread id (tid), global thread id (gid) and gridSize.
-    const IndexType tid = threadIdx.x;
-    IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;
-    const IndexType gridSize = blockDim.x * gridDim.x;
+    const Index tid = threadIdx.x;
+    Index gid = blockIdx.x * blockDim.x + threadIdx.x;
+    const Index gridSize = blockDim.x * gridDim.x;

-    // Read data into the shared memory. We start with the sequential reduction.
+    // Start with the sequential reduction and push the result into the shared memory.
     if( idxInput ) {
        if( gid < size ) {
           sdata[ tid ] = dataFetcher( gid );

@@ -245,30 +232,25 @@ CudaReductionWithArgumentKernel( const Result zero,
     // This runs in one warp so it is synchronized implicitly.
     if( tid < 32 ) {
-       volatile ResultType* vsdata = sdata;
-       volatile IndexType* vsidx = sidx;
-       if( blockSize >= 64 ) {
+       volatile Result* vsdata = sdata;
+       volatile Index* vsidx = sidx;
+       if( blockSize >= 64 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 32 ], vsdata[ tid ], vsdata[ tid + 32 ] );
-       }
-       // Note that here we do not have to check if tid < 16 etc, because we have
-       // twice as much shared memory, so we do not access out of bounds. The
-       // results for the upper half will be undefined, but unused anyway.
-       if( blockSize >= 32 ) {
+       // 2 * blockSize.x elements of shared memory per block, so we do not
+       // access out of bounds. The results for the upper half will be undefined,
+       // but unused anyway.
+       if( blockSize >= 32 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 16 ], vsdata[ tid ], vsdata[ tid + 16 ] );
-       }
-       if( blockSize >= 16 ) {
+       if( blockSize >= 16 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 8 ], vsdata[ tid ], vsdata[ tid + 8 ] );
-       }
-       if( blockSize >= 8 ) {
+       if( blockSize >= 8 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 4 ], vsdata[ tid ], vsdata[ tid + 4 ] );
-       }
-       if( blockSize >= 4 ) {
+       if( blockSize >= 4 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 2 ], vsdata[ tid ], vsdata[ tid + 2 ] );
-       }
-       if( blockSize >= 2 ) {
+       if( blockSize >= 2 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 1 ], vsdata[ tid ], vsdata[ tid + 1 ] );
-       }
     }

     // Store the result back in the global memory.
     if( tid == 0 ) {

@@ -276,15 +258,13 @@ CudaReductionWithArgumentKernel( const Result zero,
        idxOutput[ blockIdx.x ] = sidx[ 0 ];
     }
  }
+ #endif
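Note: the with-argument variant reduces (value, index) pairs, which is why volatileReduction takes four arguments here, the two indices followed by the two values, matching the call shape volatileReduction( vsidx[ tid ], vsidx[ tid + 32 ], vsdata[ tid ], vsdata[ tid + 32 ] ). The functors are supplied by the caller; a sketch of what one might look like for argmin, assuming Index = int and Result = float (hypothetical example, not a definition from the library):

// fold the right operand into the left one, keeping the index of the smaller
// value; the volatile references match the warp-tail call sites above
auto volatileReduction = [] __cuda_callable__ ( volatile int& aIdx, volatile int& bIdx,
                                                volatile float& a, volatile float& b )
{
   if( b < a ) {
      a = b;
      aIdx = bIdx;
   }
};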

  template< typename Index,
            typename Result >
  struct CudaReductionKernelLauncher
  {
-    using IndexType = Index;
-    using ResultType = Result;
-
     // The number of blocks should be a multiple of the number of multiprocessors
     // to ensure optimum balancing of the load. This is very important, because
     // we run the kernel with a fixed number of blocks, so the amount of work per

@@ -315,13 +295,13 @@ struct CudaReductionKernelLauncher
            const VolatileReduction& volatileReduction,
            const DataFetcher& dataFetcher,
            const Result& zero,
-           ResultType*& output )
+           Result*& output )
     {
        // create reference to the reduction buffer singleton and set size
-       const std::size_t buf_size = 2 * desGridSize * sizeof( ResultType );
+       const std::size_t buf_size = 2 * desGridSize * sizeof( Result );
        CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
        cudaReductionBuffer.setSize( buf_size );
-       output = cudaReductionBuffer.template getData< ResultType >();
+       output = cudaReductionBuffer.template getData< Result >();

        this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output );
        return this->reducedSize;

@@ -334,15 +314,15 @@ struct CudaReductionKernelLauncher
            const VolatileReduction& volatileReduction,
            const DataFetcher& dataFetcher,
            const Result& zero,
-           ResultType*& output,
-           IndexType*& idxOutput )
+           Result*& output,
+           Index*& idxOutput )
     {
        // create reference to the reduction buffer singleton and set size
-       const std::size_t buf_size = 2 * desGridSize * ( sizeof( ResultType ) + sizeof( IndexType ) );
+       const std::size_t buf_size = 2 * desGridSize * ( sizeof( Result ) + sizeof( Index ) );
        CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
        cudaReductionBuffer.setSize( buf_size );
-       output = cudaReductionBuffer.template getData< ResultType >();
-       idxOutput = reinterpret_cast< IndexType* >( &output[ 2 * desGridSize ] );
+       output = cudaReductionBuffer.template getData< Result >();
+       idxOutput = reinterpret_cast< Index* >( &output[ 2 * desGridSize ] );

        this->reducedSize = this->launchWithArgument( originalSize, reduction, volatileReduction, dataFetcher, zero, output, idxOutput, nullptr );
        return this->reducedSize;

@@ -357,13 +337,13 @@ struct CudaReductionKernelLauncher
     {
        // Input is the first half of the buffer, output is the second half
        CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
-       ResultType* input = cudaReductionBuffer.template getData< ResultType >();
-       ResultType* output = &input[ desGridSize ];
+       Result* input = cudaReductionBuffer.template getData< Result >();
+       Result* output = &input[ desGridSize ];

        while( this->reducedSize > 1 ) {
           // this lambda has to be defined inside the loop, because the captured variable changes
-          auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
+          auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; };
           this->reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output );
           std::swap( input, output );
        }

@@ -373,7 +353,7 @@ struct CudaReductionKernelLauncher
        std::swap( input, output );

        // Copy result on CPU
-       ResultType result;
+       Result result;
        ArrayOperations< Devices::Host, Devices::Cuda >::copy( &result, output, 1 );
        return result;
     }
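Note: setup() sizes the buffer at 2 * desGridSize results precisely so that finalize() can ping-pong between the two halves: each pass reduces the current partial results into the other half until one value is left, and only that single scalar crosses to the host. A host-side sketch of the same driver loop, with launchPass standing in for this->launch (hypothetical helper, float partials):

#include <utility>        // std::swap
#include <cuda_runtime.h>

int launchPass( const float* input, int size, float* output );  // hypothetical: one kernel pass

float finalizeSketch( float* buffer, int desGridSize, int reducedSize )
{
   float* input = buffer;                    // first half of the reduction buffer
   float* output = &buffer[ desGridSize ];   // second half

   while( reducedSize > 1 ) {
      reducedSize = launchPass( input, reducedSize, output );
      std::swap( input, output );            // this pass's output feeds the next pass
   }

   float result;                             // only the final scalar leaves the GPU
   cudaMemcpy( &result, input, sizeof( float ), cudaMemcpyDeviceToHost );
   return result;
}

This is equivalent to the method above; the original swaps once more after the loop and copies from output, which points at the same final value.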

@@ -387,15 +367,15 @@ struct CudaReductionKernelLauncher
     {
        // Input is the first half of the buffer, output is the second half
        CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
-       ResultType* input = cudaReductionBuffer.template getData< ResultType >();
-       ResultType* output = &input[ desGridSize ];
-       IndexType* idxInput = reinterpret_cast< IndexType* >( &output[ desGridSize ] );
-       IndexType* idxOutput = &idxInput[ desGridSize ];
+       Result* input = cudaReductionBuffer.template getData< Result >();
+       Result* output = &input[ desGridSize ];
+       Index* idxInput = reinterpret_cast< Index* >( &output[ desGridSize ] );
+       Index* idxOutput = &idxInput[ desGridSize ];

        while( this->reducedSize > 1 ) {
           // this lambda has to be defined inside the loop, because the captured variable changes
-          auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
+          auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; };
           this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output, idxOutput, idxInput );
           std::swap( input, output );
           std::swap( idxInput, idxOutput );

@@ -426,15 +406,16 @@ struct CudaReductionKernelLauncher
            const Result& zero,
            Result* output )
     {
+ #ifdef HAVE_CUDA
        dim3 blockSize, gridSize;
        blockSize.x = Reduction_maxThreadsPerBlock;
        gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

        // when there is only one warp per blockSize.x, we need to allocate two warps
        // worth of shared memory so that we don't index shared memory out of bounds
-       const IndexType shmem = (blockSize.x <= 32)
-                             ? 2 * blockSize.x * sizeof( ResultType )
-                             : blockSize.x * sizeof( ResultType );
+       const Index shmem = (blockSize.x <= 32)
+                         ? 2 * blockSize.x * sizeof( Result )
+                         : blockSize.x * sizeof( Result );

        // This is "general", but this method always sets blockSize.x to a specific value,
        // so runtime switch is not necessary - it only prolongs the compilation time.

@@ -515,6 +496,9 @@ struct CudaReductionKernelLauncher
        // Return the size of the output array on the CUDA device
        return gridSize.x;
+ #else
+       throw Exceptions::CudaSupportMissing();
+ #endif
     }
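Note: both launch methods now compile unconditionally and only their bodies are guarded, so a build without CUDA gets a clean runtime exception instead of a compile error. The other detail worth calling out is the shared-memory size: with a single-warp block the warp tail reads up to blockSize.x elements past tid, so two warps' worth is allocated. A condensed sketch of the pattern, where reductionKernelSketch is a hypothetical stand-in for the blockSize-templated kernel dispatch:

#include <algorithm>  // std::min

template< typename Result >
__global__ void reductionKernelSketch( int size, Result* output );  // hypothetical

template< typename Result >
int launchSketch( int size, int desGridSize, Result* output )
{
#ifdef HAVE_CUDA
   dim3 blockSize, gridSize;
   blockSize.x = 256;  // Reduction_maxThreadsPerBlock in the real code
   gridSize.x = std::min( (int) ( ( size + blockSize.x - 1 ) / blockSize.x ), desGridSize );

   // single-warp blocks get two warps' worth of shared memory so the
   // unguarded reads at tid + 16, tid + 8, ... stay in bounds
   const std::size_t shmem = ( blockSize.x <= 32 )
                           ? 2 * blockSize.x * sizeof( Result )
                           :     blockSize.x * sizeof( Result );

   // the real code switches on blockSize.x and launches the matching
   // CudaReductionKernel instantiation with a compile-time blockSize
   reductionKernelSketch< Result ><<< gridSize, blockSize, shmem >>>( size, output );
   return gridSize.x;
#else
   throw Exceptions::CudaSupportMissing();
#endif
}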

  template< typename DataFetcher,

@@ -529,15 +513,16 @@ struct CudaReductionKernelLauncher
            Index* idxOutput,
            const Index* idxInput )
     {
+ #ifdef HAVE_CUDA
        dim3 blockSize, gridSize;
        blockSize.x = Reduction_maxThreadsPerBlock;
        gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

        // when there is only one warp per blockSize.x, we need to allocate two warps
        // worth of shared memory so that we don't index shared memory out of bounds
-       const IndexType shmem = (blockSize.x <= 32)
-                             ? 2 * blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) )
-                             : blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) );
+       const Index shmem = (blockSize.x <= 32)
+                         ? 2 * blockSize.x * ( sizeof( Result ) + sizeof( Index ) )
+                         : blockSize.x * ( sizeof( Result ) + sizeof( Index ) );

        // This is "general", but this method always sets blockSize.x to a specific value,
        // so runtime switch is not necessary - it only prolongs the compilation time.

@@ -618,16 +603,18 @@ struct CudaReductionKernelLauncher
        // return the size of the output array on the CUDA device
        return gridSize.x;
+ #else
+       throw Exceptions::CudaSupportMissing();
+ #endif
     }

     const int activeDevice;
     const int blocksdPerMultiprocessor;
     const int desGridSize;
-    const IndexType originalSize;
-    IndexType reducedSize;
+    const Index originalSize;
+    Index reducedSize;
  };
- #endif

  } // namespace Algorithms
  } // namespace Containers


src/TNL/Containers/Algorithms/Reduction.h  +47 −49

@@ -22,12 +22,11 @@ namespace Containers {
  namespace Algorithms {

  template< typename Device >
- class Reduction;
+ struct Reduction;

  template<>
- class Reduction< Devices::Host >
+ struct Reduction< Devices::Host >
  {
- public:
     template< typename Index,
               typename Result,
               typename ReductionOperation,

@@ -54,9 +53,8 @@ class Reduction< Devices::Host >
  };

  template<>
- class Reduction< Devices::Cuda >
+ struct Reduction< Devices::Cuda >
  {
- public:
     template< typename Index,
               typename Result,
               typename ReductionOperation,


src/TNL/Containers/Algorithms/Reduction.hpp  +53 −97

  (Changes collapsed in the source view: the file exceeded the preview size limit.)
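Note: the class-to-struct change in Reduction.h is cosmetic once every member is public, but the shape of the template is worth noting: the primary template is only declared, never defined, so reducing on an unsupported device fails at compile time rather than silently. A minimal sketch of the same dispatch pattern with a simplified reduce interface (the real one also takes a volatile reduction operation and further parameters):

// stand-ins for TNL's device tags
struct Host {};
struct Cuda {};

// primary template: declared only, so Reduction< UnsupportedDevice > cannot compile
template< typename Device >
struct Reduction;

template<>
struct Reduction< Host >
{
   template< typename Index,
             typename Result,
             typename ReductionOperation,
             typename DataFetcher >
   static Result reduce( const Index size,
                         ReductionOperation reduction,
                         DataFetcher dataFetcher,
                         const Result& zero )
   {
      Result result = zero;
      for( Index i = 0; i < size; i++ )
         reduction( result, dataFetcher( i ) );
      return result;
   }
};

// usage: sum of 0 + 1 + ... + 99 on the host
// auto fetch = [] ( int i ) { return i; };
// auto plus  = [] ( int& a, int b ) { a += b; };
// int sum = Reduction< Host >::reduce( 100, plus, fetch, 0 );  // 4950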