Loading src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +12 −2 Original line number Diff line number Diff line /*************************************************************************** CudaMultireductionKernel.h - description ------------------- begin : Oct 4, 2016 copyright : (C) 2016 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #ifdef HAVE_CUDA Loading Loading @@ -216,7 +226,7 @@ CudaMultireductionKernelLauncher( Operation& operation, throw 1; } // create reference to the reduction buffer singleton and set default size // create reference to the reduction buffer singleton and set size // (make an overestimate to avoid reallocation on every call if n is increased by 1 each time) const size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( ResultType ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); Loading src/TNL/Containers/Algorithms/CudaReductionKernel.h +64 −13 Original line number Diff line number Diff line Loading @@ -10,15 +10,35 @@ #pragma once #ifdef HAVE_CUDA #include <cuda.h> #endif #include <TNL/Assert.h> #include <TNL/Math.h> #include <TNL/Devices/CudaDeviceInfo.h> #include <TNL/Containers/Algorithms/CudaReductionBuffer.h> namespace TNL { namespace Containers { namespace Algorithms { #ifdef HAVE_CUDA /**** * The performance of this kernel is very sensitive to register usage. * Compile with --ptxas-options=-v and configure these constants for given * architecture so that there are no local memory spills. */ static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of 2 #if (__CUDA_ARCH__ >= 300 ) static constexpr int Reduction_minBlocksPerMultiprocessor = 6; #else static constexpr int Reduction_minBlocksPerMultiprocessor = 4; #endif template< typename Operation, int blockSize > __global__ void __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( Operation& operation, const typename Operation::IndexType size, const typename Operation::RealType* input1, Loading Loading @@ -164,19 +184,32 @@ CudaReductionKernelLauncher( Operation& operation, typedef typename Operation::RealType RealType; typedef typename Operation::ResultType ResultType; // TODO: optimize similarly to multireduction const int minGPUReductionDataSize = 256; const IndexType desGridSize( minGPUReductionDataSize ); // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per // block increases with enlarging the problem, so even small imbalance can // cost us dearly. // On Tesla K40c, desGridSize = 4 * 6 * 15 = 360. // const IndexType desGridSize = 4 * Reduction_minBlocksPerMultiprocessor // * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); // On Tesla K40c, desGridSize = 6 * 15 = 90. const IndexType desGridSize = Reduction_minBlocksPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); dim3 blockSize( 256 ), gridSize( 0 ); gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // create reference to the reduction buffer singleton and set default size CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance( 8 * minGPUReductionDataSize ); if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) ) return false; // create reference to the reduction buffer singleton and set size const size_t buf_size = desGridSize * sizeof( ResultType ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); if( ! cudaReductionBuffer.setSize( buf_size ) ) throw 1; output = cudaReductionBuffer.template getData< ResultType >(); IndexType shmem = blockSize.x * sizeof( ResultType ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds const IndexType shmem = (blockSize.x <= 32) ? 2 * blockSize.x * sizeof( ResultType ) : blockSize.x * sizeof( ResultType ); /*** * Depending on the blockSize we generate appropriate template instance. Loading @@ -188,34 +221,50 @@ CudaReductionKernelLauncher( Operation& operation, <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 256: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 256 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 256 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 128: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 128 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 128 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 64: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 64 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 64 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 32: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 32 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 32 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 16: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 16 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 16 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 8: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 8 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 8 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 4: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 4 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 4 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 2: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 2 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 2 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; Loading @@ -224,7 +273,9 @@ CudaReductionKernelLauncher( Operation& operation, default: Assert( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); } //checkCudaDevice; checkCudaDevice; // return the size of the output array on the CUDA device return gridSize.x; } #endif Loading src/TNL/Containers/Algorithms/Multireduction.h +10 −0 Original line number Diff line number Diff line /*************************************************************************** Multireduction.h - description ------------------- begin : Oct 4, 2016 copyright : (C) 2016 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Devices/Host.h> Loading src/TNL/Containers/Algorithms/Multireduction_impl.h +10 −0 Original line number Diff line number Diff line /*************************************************************************** Multireduction_impl.h - description ------------------- begin : Oct 4, 2016 copyright : (C) 2016 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include "Multireduction.h" Loading src/TNL/Containers/Algorithms/Reduction_impl.h +2 −5 Original line number Diff line number Diff line Loading @@ -10,16 +10,13 @@ #pragma once #include "Reduction.h" //#define CUDA_REDUCTION_PROFILING #ifdef HAVE_CUDA #include <cuda.h> #endif #include <TNL/Assert.h> #include <TNL/Containers/Algorithms/reduction-operations.h> #include <TNL/Containers/ArrayOperations.h> #include <TNL/Math.h> #include <TNL/Containers/Algorithms/CudaReductionBuffer.h> #include <TNL/Containers/Algorithms/CudaReductionKernel.h> #ifdef CUDA_REDUCTION_PROFILING Loading Loading
src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +12 −2 Original line number Diff line number Diff line /*************************************************************************** CudaMultireductionKernel.h - description ------------------- begin : Oct 4, 2016 copyright : (C) 2016 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #ifdef HAVE_CUDA Loading Loading @@ -216,7 +226,7 @@ CudaMultireductionKernelLauncher( Operation& operation, throw 1; } // create reference to the reduction buffer singleton and set default size // create reference to the reduction buffer singleton and set size // (make an overestimate to avoid reallocation on every call if n is increased by 1 each time) const size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( ResultType ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); Loading
src/TNL/Containers/Algorithms/CudaReductionKernel.h +64 −13 Original line number Diff line number Diff line Loading @@ -10,15 +10,35 @@ #pragma once #ifdef HAVE_CUDA #include <cuda.h> #endif #include <TNL/Assert.h> #include <TNL/Math.h> #include <TNL/Devices/CudaDeviceInfo.h> #include <TNL/Containers/Algorithms/CudaReductionBuffer.h> namespace TNL { namespace Containers { namespace Algorithms { #ifdef HAVE_CUDA /**** * The performance of this kernel is very sensitive to register usage. * Compile with --ptxas-options=-v and configure these constants for given * architecture so that there are no local memory spills. */ static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of 2 #if (__CUDA_ARCH__ >= 300 ) static constexpr int Reduction_minBlocksPerMultiprocessor = 6; #else static constexpr int Reduction_minBlocksPerMultiprocessor = 4; #endif template< typename Operation, int blockSize > __global__ void __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( Operation& operation, const typename Operation::IndexType size, const typename Operation::RealType* input1, Loading Loading @@ -164,19 +184,32 @@ CudaReductionKernelLauncher( Operation& operation, typedef typename Operation::RealType RealType; typedef typename Operation::ResultType ResultType; // TODO: optimize similarly to multireduction const int minGPUReductionDataSize = 256; const IndexType desGridSize( minGPUReductionDataSize ); // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per // block increases with enlarging the problem, so even small imbalance can // cost us dearly. // On Tesla K40c, desGridSize = 4 * 6 * 15 = 360. // const IndexType desGridSize = 4 * Reduction_minBlocksPerMultiprocessor // * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); // On Tesla K40c, desGridSize = 6 * 15 = 90. const IndexType desGridSize = Reduction_minBlocksPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); dim3 blockSize( 256 ), gridSize( 0 ); gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // create reference to the reduction buffer singleton and set default size CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance( 8 * minGPUReductionDataSize ); if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) ) return false; // create reference to the reduction buffer singleton and set size const size_t buf_size = desGridSize * sizeof( ResultType ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); if( ! cudaReductionBuffer.setSize( buf_size ) ) throw 1; output = cudaReductionBuffer.template getData< ResultType >(); IndexType shmem = blockSize.x * sizeof( ResultType ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds const IndexType shmem = (blockSize.x <= 32) ? 2 * blockSize.x * sizeof( ResultType ) : blockSize.x * sizeof( ResultType ); /*** * Depending on the blockSize we generate appropriate template instance. Loading @@ -188,34 +221,50 @@ CudaReductionKernelLauncher( Operation& operation, <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 256: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 256 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 256 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 128: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 128 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 128 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 64: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 64 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 64 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 32: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 32 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 32 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 16: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 16 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 16 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 8: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 8 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 8 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 4: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 4 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 4 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; case 2: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 2 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 2 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); break; Loading @@ -224,7 +273,9 @@ CudaReductionKernelLauncher( Operation& operation, default: Assert( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); } //checkCudaDevice; checkCudaDevice; // return the size of the output array on the CUDA device return gridSize.x; } #endif Loading
src/TNL/Containers/Algorithms/Multireduction.h +10 −0 Original line number Diff line number Diff line /*************************************************************************** Multireduction.h - description ------------------- begin : Oct 4, 2016 copyright : (C) 2016 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Devices/Host.h> Loading
src/TNL/Containers/Algorithms/Multireduction_impl.h +10 −0 Original line number Diff line number Diff line /*************************************************************************** Multireduction_impl.h - description ------------------- begin : Oct 4, 2016 copyright : (C) 2016 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include "Multireduction.h" Loading
src/TNL/Containers/Algorithms/Reduction_impl.h +2 −5 Original line number Diff line number Diff line Loading @@ -10,16 +10,13 @@ #pragma once #include "Reduction.h" //#define CUDA_REDUCTION_PROFILING #ifdef HAVE_CUDA #include <cuda.h> #endif #include <TNL/Assert.h> #include <TNL/Containers/Algorithms/reduction-operations.h> #include <TNL/Containers/ArrayOperations.h> #include <TNL/Math.h> #include <TNL/Containers/Algorithms/CudaReductionBuffer.h> #include <TNL/Containers/Algorithms/CudaReductionKernel.h> #ifdef CUDA_REDUCTION_PROFILING Loading