diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp
index 5430f6eacda7c42bc966a7200a74c44aacca4b00..ef3317cb7a0636e3dc5ef32a1798addfdf90b6cb 100644
--- a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp
+++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp
@@ -24,7 +24,7 @@ CommonVectorOperations< Device >::
 getVectorMax( const Vector& v )
 {
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
-   
+
    using RealType = typename Vector::RealType;
    using IndexType = typename Vector::IndexType;
 
@@ -42,7 +42,7 @@ CommonVectorOperations< Device >::
 getVectorMin( const Vector& v )
 {
    TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
-   
+
    using RealType = typename Vector::RealType;
    using IndexType = typename Vector::IndexType;
 
diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
index a90e0a3aab8df3f4b7d60410001f2ae1873a981e..8701b8b7df3ade43bbf9f6cd195142915226d110 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -56,7 +56,7 @@ CudaReductionKernel( const Result zero,
 {
    using IndexType = Index;
    using ResultType = Result;
-   
+
    ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
 
    /***
@@ -178,22 +178,14 @@ CudaReductionKernel( const Result zero,
 }
 
-template< typename DataFetcher,
-          typename Reduction,
-          typename VolatileReduction,
-          typename Index,
-          typename Result >
-int
-CudaReductionKernelLauncher( const Index size,
-                             const Reduction& reduction,
-                             const VolatileReduction& volatileReduction,
-                             const DataFetcher& dataFetcher,
-                             const Result& zero,
-                             Result*& output )
+template< typename Index,
+          typename Result >
+struct CudaReductionKernelLauncher
 {
    using IndexType = Index;
    using ResultType = Result;
 
+   ////
    // The number of blocks should be a multiple of the number of multiprocessors
    // to ensure optimum balancing of the load. This is very important, because
    // we run the kernel with a fixed number of blocks, so the amount of work per
@@ -203,93 +195,159 @@ CudaReductionKernelLauncher( const Index size,
    // block increases with enlarging the problem, so even small imbalance can
    // cost us dearly.
    // Therefore: desGridSize = blocksPerMultiprocessor * numberOfMultiprocessors
    // where blocksPerMultiprocessor is determined according to the number of
    // available registers on the multiprocessor.
    // On Tesla K40c, desGridSize = 8 * 15 = 120.
-   const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice();
-   const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
-                                        / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread );
-   const int desGridSize = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice );
-   dim3 blockSize, gridSize;
-   blockSize.x = Reduction_maxThreadsPerBlock;
-   gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
-
-   // create reference to the reduction buffer singleton and set size
-   const size_t buf_size = desGridSize * sizeof( ResultType );
-   CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
-   cudaReductionBuffer.setSize( buf_size );
-   output = cudaReductionBuffer.template getData< ResultType >();
-
-   // when there is only one warp per blockSize.x, we need to allocate two warps
-   // worth of shared memory so that we don't index shared memory out of bounds
-   const IndexType shmem = (blockSize.x <= 32)
-                           ? 2 * blockSize.x * sizeof( ResultType )
-                           : blockSize.x * sizeof( ResultType );
+   CudaReductionKernelLauncher( const Index size )
+   : activeDevice( Devices::CudaDeviceInfo::getActiveDevice() ),
+     blocksdPerMultiprocessor( Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
+                               / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ) ),
+     desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
+     originalSize( size )
+   {
+   }
 
-   /***
-    * Depending on the blockSize we generate appropriate template instance.
-    */
-   switch( blockSize.x )
+   template< typename DataFetcher,
+             typename Reduction,
+             typename VolatileReduction >
+   int start( const Reduction& reduction,
+              const VolatileReduction& volatileReduction,
+              const DataFetcher& dataFetcher,
+              const Result& zero,
+              ResultType*& output )
    {
-      case 512:
-         CudaReductionKernel< 512 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-         break;
-      case 256:
-         cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel< 256 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-         break;
-      case 128:
-         cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel< 128 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-         break;
-      case 64:
-         cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel< 64 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-         break;
-      case 32:
-         cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel< 32 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-         break;
-      case 16:
-         cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel< 16 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-         break;
-      case 8:
-         cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel< 8 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-         break;
-      case 4:
-         cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel< 4 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-         break;
-      case 2:
-         cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel< 2 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-         break;
-      case 1:
-         TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
-      default:
-         TNL_ASSERT( false, std::cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+      ////
+      // create reference to the reduction buffer singleton and set size
+      const size_t buf_size = 2 * desGridSize * sizeof( ResultType );
+      CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
+      cudaReductionBuffer.setSize( buf_size );
+      output = cudaReductionBuffer.template getData< ResultType >();
+
+      this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output );
+      return this->reducedSize;
    }
 
-   TNL_CHECK_CUDA_DEVICE;
-   // return the size of the output array on the CUDA device
-   return gridSize.x;
-}
+
+   template< typename Reduction,
+             typename VolatileReduction >
+   Result finish( const Reduction& reduction,
+                  const VolatileReduction& volatileReduction,
+                  const Result& zero )
+   {
+      ////
+      // Input is the first half of the buffer, output is the second half
+      CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
+      ResultType* input = cudaReductionBuffer.template getData< ResultType >();
+      ResultType* output = &input[ desGridSize ];
+
+      while( this->reducedSize > 1 )
+      {
+         // the fetcher has to be recreated in every iteration, because the
+         // captured input pointer changes after each swap
+         auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
+         this->reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output );
+         std::swap( input, output );
+      }
+
+      ////
+      // Copy the result to the CPU. After the last swap it is stored in input,
+      // which also covers the case when reducedSize was already 1.
+      ResultType result;
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( &result, input, 1 );
+      return result;
+   }
+
+   protected:
+   template< typename DataFetcher,
+             typename Reduction,
+             typename VolatileReduction >
+   int launch( const Index size,
+               const Reduction& reduction,
+               const VolatileReduction& volatileReduction,
+               const DataFetcher& dataFetcher,
+               const Result& zero,
+               Result* output )
+   {
+      dim3 blockSize, gridSize;
+      blockSize.x = Reduction_maxThreadsPerBlock;
+      gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
+
+      ////
+      // when there is only one warp per blockSize.x, we need to allocate two warps
+      // worth of shared memory so that we don't index shared memory out of bounds
+      const IndexType shmem = (blockSize.x <= 32)
+                              ? 2 * blockSize.x * sizeof( ResultType )
+                              : blockSize.x * sizeof( ResultType );
+
+      /***
+       * Depending on the blockSize we generate appropriate template instance.
+       */
+      switch( blockSize.x )
+      {
+         case 512:
+            CudaReductionKernel< 512 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 256:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 256 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 128:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 128 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 64:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 64 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 32:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 32 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 16:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 16 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 8:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 8 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 4:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 4 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 2:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 2 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 1:
+            TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
+         default:
+            TNL_ASSERT( false, std::cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+      }
+      TNL_CHECK_CUDA_DEVICE;
+
+      ////
+      // return the size of the output array on the CUDA device
+      return gridSize.x;
+   }
+
+   const int activeDevice;
+   const int blocksdPerMultiprocessor;
+   const int desGridSize;
+   const IndexType originalSize;
+   IndexType reducedSize;
+};
 #endif
 
 } // namespace Algorithms
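A minimal sketch (not part of the patch) of how the refactored two-phase launcher above is meant to be driven from host code: start() performs the first pass and leaves one partial result per block in the reduction buffer, finish() keeps re-launching the kernel on those partial results until a single value remains and copies it to the host. The sumOnDevice() wrapper, the deviceData array and the sum functors are illustrative assumptions, not names from the patch; the real call site is the Reduction< Devices::Cuda > specialization changed below.

   // Sketch only: assumes a CUDA build (HAVE_CUDA) and the TNL headers touched by this patch.
   #ifdef HAVE_CUDA
   template< typename Real, typename Index >
   Real sumOnDevice( const Real* deviceData, const Index size )
   {
      using ResultType = Real;
      // fetch one element; combine two partial results; same combine for the volatile shared-memory stage
      auto dataFetcher = [=] __cuda_callable__ ( Index i ) { return deviceData[ i ]; };
      auto reduction = [] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
      auto volatileReduction = [] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };

      CudaReductionKernelLauncher< Index, ResultType > launcher( size );
      ResultType* deviceAux( nullptr );
      // first pass: at most desGridSize partial results, stored in the first half of the reduction buffer
      launcher.start( reduction, volatileReduction, dataFetcher, ( ResultType ) 0, deviceAux );
      // remaining passes ping-pong between the two halves of the buffer and return the final value
      return launcher.finish( reduction, volatileReduction, ( ResultType ) 0 );
   }
   #endif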
diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp
index 22121de25f65d9fc0f966b8822cbfbb30de46adf..31c93504cc18f3616171b855a286345953ca19d0 100644
--- a/src/TNL/Containers/Algorithms/Reduction.hpp
+++ b/src/TNL/Containers/Algorithms/Reduction.hpp
@@ -91,11 +91,13 @@ Reduction< Devices::Cuda >::
       timer.start();
    #endif
 
+   CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size );
+
    /****
    * Reduce the data on the CUDA device.
    */
    ResultType* deviceAux1( 0 );
-   IndexType reducedSize = CudaReductionKernelLauncher( size,
+   IndexType reducedSize = reductionLauncher.start(
                                                         reduction,
                                                         volatileReduction,
                                                         dataFetcher,
@@ -112,7 +114,6 @@ Reduction< Devices::Cuda >::
    /***
    * Transfer the reduced data from device to host.
    */
-   //ResultType* resultArray[ reducedSize ];
    std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] };
    ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(), deviceAux1, reducedSize );
 
@@ -139,15 +140,7 @@ Reduction< Devices::Cuda >::
    /***
    * Data can't be safely reduced on host, so continue with the reduction on the CUDA device.
    */
-   auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; };
-   while( reducedSize > 1 ) {
-      reducedSize = CudaReductionKernelLauncher( reducedSize,
-                                                 reduction,
-                                                 volatileReduction,
-                                                 copyFetch,
-                                                 zero,
-                                                 deviceAux1 );
-   }
+   auto result = reductionLauncher.finish( reduction, volatileReduction, zero );
 
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
@@ -156,14 +149,14 @@ Reduction< Devices::Cuda >::
      std::cout << " Reduction of small data set on GPU took " << timer.getRealTime() << " sec. " << std::endl;
      timer.start();
    #endif
 
-   ResultType resultArray[ 1 ];
-   ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
-   const ResultType result = resultArray[ 0 ];
+   //ResultType resultArray[ 1 ];
+   //ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
+   //const ResultType result = resultArray[ 0 ];
 
-   #ifdef CUDA_REDUCTION_PROFILING
+   /*#ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
       std::cout << " Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl;
-   #endif
+   #endif*/
 
    return result;
 }
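For orientation, here is the occupancy arithmetic behind desGridSize in the launcher's constructor, written out as a standalone C++ sketch with the Tesla K40c figures quoted in the header comment. The register and multiprocessor counts are assumptions about that card, not values read from the patch; in the real code they come from Devices::CudaDeviceInfo.

   #include <algorithm>
   #include <cstdio>

   int main()
   {
      const int registersPerMultiprocessor = 65536;  // assumed for a Tesla K40c
      const int multiprocessors = 15;                // assumed for a Tesla K40c
      const int maxThreadsPerBlock = 256;            // Reduction_maxThreadsPerBlock
      const int registersPerThread = 32;             // Reduction_registersPerThread

      // blocks per multiprocessor limited by the register file: 65536 / (256 * 32) = 8
      const int blocksPerMultiprocessor = registersPerMultiprocessor
                                        / ( maxThreadsPerBlock * registersPerThread );
      const int desGridSize = blocksPerMultiprocessor * multiprocessors;  // 8 * 15 = 120

      // the first pass launches at most desGridSize blocks, so it leaves at most desGridSize
      // partial results; finish() then needs only a launch or two more, and the reduction
      // buffer holds 2 * desGridSize elements so those passes can ping-pong between its halves
      const long size = 100000000;
      const long blocksNeeded = ( size + maxThreadsPerBlock - 1 ) / maxThreadsPerBlock;
      const int firstPassResults = std::min< long >( blocksNeeded, desGridSize );
      std::printf( "desGridSize = %d, partial results after the first pass = %d\n",
                   desGridSize, firstPassResults );
   }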