This patch refactors CudaReductionKernelLauncher from a one-shot function template into a stateful struct with separate start() and finish() phases, so that the multi-pass CUDA reduction can keep its state (launch configuration, reduced size, buffer halves) between calls.

src/TNL/Containers/Algorithms/CudaReductionKernel.h  +154 −96

@@ -178,22 +178,14 @@ CudaReductionKernel( const Result zero,
 }
 
-template< typename DataFetcher,
-          typename Reduction,
-          typename VolatileReduction,
-          typename Index,
+template< typename Index,
           typename Result >
-int
-CudaReductionKernelLauncher( const Index size,
-                             const Reduction& reduction,
-                             const VolatileReduction& volatileReduction,
-                             const DataFetcher& dataFetcher,
-                             const Result& zero,
-                             Result*& output )
+struct CudaReductionKernelLauncher
 {
+   using IndexType = Index;
+   using ResultType = Result;
+
    ////
    // The number of blocks should be a multiple of the number of multiprocessors
    // to ensure optimum balancing of the load. This is very important, because
    // we run the kernel with a fixed number of blocks, so the amount of work per
@@ -203,20 +195,78 @@ CudaReductionKernelLauncher( const Index size,
    // where blocksPerMultiprocessor is determined according to the number of
    // available registers on the multiprocessor.
    // On Tesla K40c, desGridSize = 8 * 15 = 120.
-   const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice();
-   const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
-                                        / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread );
-   const int desGridSize = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice );
-   dim3 blockSize, gridSize;
-   blockSize.x = Reduction_maxThreadsPerBlock;
-   gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
+   CudaReductionKernelLauncher( const Index size )
+   : activeDevice( Devices::CudaDeviceInfo::getActiveDevice() ),
+     blocksdPerMultiprocessor( Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
+                               / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ) ),
+     desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
+     originalSize( size )
+   { }
+
+   template< typename DataFetcher,
+             typename Reduction,
+             typename VolatileReduction >
+   int start( const Reduction& reduction,
+              const VolatileReduction& volatileReduction,
+              const DataFetcher& dataFetcher,
+              const Result& zero,
+              ResultType*& output )
+   {
       ////
       // create reference to the reduction buffer singleton and set size
-   const size_t buf_size = desGridSize * sizeof( ResultType );
+      const size_t buf_size = 2 * desGridSize * sizeof( ResultType );
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
       cudaReductionBuffer.setSize( buf_size );
       output = cudaReductionBuffer.template getData< ResultType >();
+
+      this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output );
+      return this->reducedSize;
+   }
+
+   template< typename Reduction,
+             typename VolatileReduction >
+   Result finish( const Reduction& reduction,
+                  const VolatileReduction& volatileReduction,
+                  const Result& zero )
+   {
+      ////
+      // Input is the first half of the buffer, output is the second half
+      CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
+      ResultType* input = cudaReductionBuffer.template getData< ResultType >();
+      ResultType* output = &input[ desGridSize ];
+
+      while( this->reducedSize > 1 )
+      {
+         // the lambda must be recreated in every iteration, because it
+         // captures the input pointer, which changes with each swap
+         auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
+         this->reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output );
+         std::swap( input, output );
+      }
+
+      ////
+      // Copy result on CPU; after the final swap (and also when reducedSize
+      // was 1 from the start) the single value lives in the input half
+      ResultType result;
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( &result, input, 1 );
+      return result;
+   }
+
+protected:
+   template< typename DataFetcher,
+             typename Reduction,
+             typename VolatileReduction >
+   int launch( const Index size,
+               const Reduction& reduction,
+               const VolatileReduction& volatileReduction,
+               const DataFetcher& dataFetcher,
+               const Result& zero,
+               Result* output )
+   {
+      dim3 blockSize, gridSize;
+      blockSize.x = Reduction_maxThreadsPerBlock;
+      gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
 
       ////
       // when there is only one warp per blockSize.x, we need to allocate two warps
       // worth of shared memory so that we don't index shared memory out of bounds
       const IndexType shmem = (blockSize.x <= 32)
@@ -287,9 +337,17 @@ CudaReductionKernelLauncher( const Index size,
       }
       TNL_CHECK_CUDA_DEVICE;
 
       ////
       // return the size of the output array on the CUDA device
       return gridSize.x;
    }
+
+   const int activeDevice;
+   const int blocksdPerMultiprocessor;
+   const int desGridSize;
+
+   const IndexType originalSize;
+   IndexType reducedSize;
+};
 #endif
 
 } // namespace Algorithms
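In the new struct, the constructor caches the device-dependent launch configuration once (active device, blocks per multiprocessor, desired grid size), start() runs the first pass over the caller's dataFetcher and leaves one partial result per block in the first half of the reduction buffer (hence the doubled buf_size), and finish() ping-pongs the remaining passes between the two buffer halves before copying the single remaining value to the host. To make the grid-sizing formula concrete, here is the Tesla K40c arithmetic from the comment above as a worked example; the values of the two Reduction_* constants are assumptions (they are defined elsewhere in the header, not in this diff):

   // Worked example of the desGridSize computation for a Tesla K40c,
   // which has 15 multiprocessors and 65536 registers per multiprocessor.
   // The two Reduction_* values below are assumed, not shown in this diff.
   const int registersPerMultiprocessor = 65536;
   const int multiprocessors = 15;
   const int Reduction_maxThreadsPerBlock = 256;
   const int Reduction_registersPerThread = 32;

   const int blocksPerMultiprocessor = registersPerMultiprocessor
      / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread );   // = 8
   const int desGridSize = blocksPerMultiprocessor * multiprocessors;      // = 8 * 15 = 120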
src/TNL/Containers/Algorithms/Reduction.hpp  +9 −16

@@ -91,11 +91,13 @@ Reduction< Devices::Cuda >::
    timer.start();
 #endif
 
+   CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size );
+
    /****
    * Reduce the data on the CUDA device.
    */
    ResultType* deviceAux1( 0 );
-   IndexType reducedSize = CudaReductionKernelLauncher( size,
+   IndexType reducedSize = reductionLauncher.start(
                                                         reduction,
                                                         volatileReduction,
                                                         dataFetcher,
@@ -112,7 +114,6 @@ Reduction< Devices::Cuda >::
    /***
    * Transfer the reduced data from device to host.
    */
-   //ResultType* resultArray[ reducedSize ];
    std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] };
    ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(),
                                                                 deviceAux1,
                                                                 reducedSize );
@@ -139,15 +140,7 @@ Reduction< Devices::Cuda >::
    /***
    * Data can't be safely reduced on host, so continue with the reduction on the CUDA device.
    */
-   auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; };
-   while( reducedSize > 1 ) {
-      reducedSize = CudaReductionKernelLauncher( reducedSize,
-                                                 reduction,
-                                                 volatileReduction,
-                                                 copyFetch,
-                                                 zero,
-                                                 deviceAux1 );
-   }
+   auto result = reductionLauncher.finish( reduction, volatileReduction, zero );
 
 #ifdef CUDA_REDUCTION_PROFILING
    timer.stop();
@@ -156,14 +149,14 @@ Reduction< Devices::Cuda >::
    timer.start();
 #endif
 
-   ResultType resultArray[ 1 ];
-   ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
-   const ResultType result = resultArray[ 0 ];
+   //ResultType resultArray[ 1 ];
+   //ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
+   //const ResultType result = resultArray[ 0 ];
 
-#ifdef CUDA_REDUCTION_PROFILING
+/*#ifdef CUDA_REDUCTION_PROFILING
    timer.stop();
    std::cout << " Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl;
-#endif
+#endif*/
 
    return result;
 }

src/TNL/Containers/Algorithms/CommonVectorOperations.hpp  +2 −2
File changed; contains only whitespace changes.
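On the caller's side, the whole reduction now reads as a start()/finish() pair. A minimal sketch of driving the new interface, assuming a plain sum reduction (deviceData, size, and the functor bodies are illustrative; only the launcher calls mirror the patch):

   // Minimal sketch of the two-phase interface, assuming a sum reduction.
   using IndexType = int;
   using ResultType = double;
   const ResultType zero = 0.0;

   // Illustrative functors; TNL's callers pass these in from outside.
   auto dataFetcher = [=] __cuda_callable__ ( IndexType i ) { return deviceData[ i ]; };
   auto reduction = [] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto volatileReduction = [] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };

   CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size );
   ResultType* deviceAux1( nullptr );

   // First pass: one partial result per CUDA block, written to the reduction buffer.
   IndexType reducedSize = reductionLauncher.start( reduction, volatileReduction, dataFetcher, zero, deviceAux1 );

   // Either copy the reducedSize partials to the host and fold them there
   // (see the host-side sketch below), or finish on the device:
   ResultType result = reductionLauncher.finish( reduction, volatileReduction, zero );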
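For completeness, the host-side finishing branch that Reduction.hpp keeps alongside finish() looks roughly as follows. Only the unique_ptr allocation and the copyMemory call appear in the diff; the sequential fold loop is an assumed completion of that branch:

   // Host-side finishing path (sketch). The fold loop is an assumption; the
   // allocation and the device-to-host copy are taken from the diff above.
   std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] };
   ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(),
                                                                deviceAux1,
                                                                reducedSize );
   ResultType result = zero;
   for( IndexType i = 0; i < reducedSize; i++ )
      reduction( result, resultArray[ i ] );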