Commit e470040a authored by Jakub Klinkovský, committed by Jakub Klinkovský

Style changes in the code for reduction

parent 1a82b047
+227 −240
@@ -12,20 +12,17 @@

+#include <utility>  // std::pair
 
-#ifdef HAVE_CUDA
-#include <cuda.h>
-#endif
-
 #include <TNL/Assert.h>
 #include <TNL/Math.h>
 #include <TNL/Devices/CudaDeviceInfo.h>
 #include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
 #include <TNL/Containers/Algorithms/ArrayOperations.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
 
 namespace TNL {
 namespace Containers {
 namespace Algorithms {
 
-#ifdef HAVE_CUDA
 /****
  * The performance of this kernel is very sensitive to register usage.
  * Compile with --ptxas-options=-v and configure these constants for given
@@ -34,6 +31,7 @@ namespace Algorithms {
 static constexpr int Reduction_maxThreadsPerBlock = 256;  // must be a power of 2
 static constexpr int Reduction_registersPerThread = 32;   // empirically determined optimal value
 
+#ifdef HAVE_CUDA
 // __CUDA_ARCH__ is defined only in device code!
 #if (__CUDA_ARCH__ >= 300 )
    static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
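For context: tuning constants like these are typically enforced with `__launch_bounds__`, which tells `ptxas` the maximum block size and the desired number of resident blocks per multiprocessor, so the compiler caps registers per thread instead of spilling silently. A minimal sketch (illustrative names, not the diff's code):

```cpp
// Hypothetical kernel showing how such tuning constants are usually applied.
static constexpr int maxThreadsPerBlock = 256;        // must be a power of 2
static constexpr int minBlocksPerMultiprocessor = 8;  // tuned per architecture

template< typename Result >
__global__ void
__launch_bounds__( maxThreadsPerBlock, minBlocksPerMultiprocessor )
exampleReductionKernel( const Result* input, Result* output, int size )
{
   // ptxas now limits register usage so that minBlocksPerMultiprocessor
   // blocks of maxThreadsPerBlock threads can be resident at once; verify
   // the actual usage with: nvcc --ptxas-options=-v
}
```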
@@ -56,19 +54,16 @@ CudaReductionKernel( const Result zero,
                      const Index size,
                      Result* output )
 {
-   using IndexType = Index;
-   using ResultType = Result;
-
-   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
+   Result* sdata = Devices::Cuda::getSharedMemory< Result >();
 
    // Get the thread id (tid), global thread id (gid) and gridSize.
-   const IndexType tid = threadIdx.x;
-         IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;
-   const IndexType gridSize = blockDim.x * gridDim.x;
+   const Index tid = threadIdx.x;
+         Index gid = blockIdx.x * blockDim.x + threadIdx.x;
+   const Index gridSize = blockDim.x * gridDim.x;
 
    sdata[ tid ] = zero;
 
-   // Read data into the shared memory. We start with the sequential reduction.
+   // Start with the sequential reduction and push the result into the shared memory.
    while( gid + 4 * gridSize < size ) {
       reduction( sdata[ tid ], dataFetcher( gid ) );
       reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
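The loop above is the first, sequential phase: each thread privately reduces a strided slice of the input (unrolled four-fold) before any inter-thread combination happens. The same pattern with a plain sum, as a self-contained sketch (names are illustrative, not TNL's):

```cpp
// Grid-stride sequential phase of a sum reduction (illustrative sketch).
// Assumes blockResults was zero-initialized before the launch.
__global__ void partialSums( const float* input, float* blockResults, int size )
{
   float acc = 0;
   int gid = blockIdx.x * blockDim.x + threadIdx.x;
   const int gridSize = blockDim.x * gridDim.x;

   // unrolled four-fold, mirroring the loop in the diff
   while( gid + 4 * gridSize < size ) {
      acc += input[ gid ];
      acc += input[ gid + gridSize ];
      acc += input[ gid + 2 * gridSize ];
      acc += input[ gid + 3 * gridSize ];
      gid += 4 * gridSize;
   }
   // remainder, one stride at a time
   while( gid < size ) {
      acc += input[ gid ];
      gid += gridSize;
   }
   // acc would next go to shared memory for the tree phase; here we simply
   // fold it into one result per block to keep the sketch short
   atomicAdd( &blockResults[ blockIdx.x ], acc );
}
```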
@@ -111,29 +106,24 @@ CudaReductionKernel( const Result zero,

    // This runs in one warp so it is synchronized implicitly.
    if( tid < 32 ) {
-      volatile ResultType* vsdata = sdata;
-      if( blockSize >= 64 ) {
+      volatile Result* vsdata = sdata;
+      if( blockSize >= 64 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
-      }
       // Note that here we do not have to check if tid < 16 etc, because we have
-      // twice as much shared memory, so we do not access out of bounds. The
-      // results for the upper half will be undefined, but unused anyway.
-      if( blockSize >= 32 ) {
+      // 2 * blockSize.x elements of shared memory per block, so we do not
+      // access out of bounds. The results for the upper half will be undefined,
+      // but unused anyway.
+      if( blockSize >= 32 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
-      }
-      if( blockSize >= 16 ) {
+      if( blockSize >= 16 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
-      }
-      if( blockSize >=  8 ) {
+      if( blockSize >=  8 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
-      }
-      if( blockSize >=  4 ) {
+      if( blockSize >=  4 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
-      }
-      if( blockSize >=  2 ) {
+      if( blockSize >=  2 )
          volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
-      }
    }
 
    // Store the result back in the global memory.
    if( tid == 0 )
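The `tid < 32` block above is the classic warp-synchronous tail: once only a single warp is active, `__syncthreads()` can be skipped because (on the pre-Volta execution model) the warp runs in lockstep, and `volatile` stops the compiler from caching shared-memory values in registers between steps. Stripped of TNL's functors, the idiom looks like this:

```cpp
// Warp-synchronous tail of a shared-memory sum reduction (illustrative).
// volatile forces every access to actually touch shared memory, which the
// lockstep execution of a single warp relies on.
__device__ void warpReduceSum( volatile float* sdata, int tid )
{
   sdata[ tid ] += sdata[ tid + 32 ];
   sdata[ tid ] += sdata[ tid + 16 ];
   sdata[ tid ] += sdata[ tid +  8 ];
   sdata[ tid ] += sdata[ tid +  4 ];
   sdata[ tid ] += sdata[ tid +  2 ];
   sdata[ tid ] += sdata[ tid +  1 ];
}
```

Note that Volta's independent thread scheduling later weakened the implicit lockstep guarantee, so newer code tends to use `__syncwarp()` or warp shuffle intrinsics instead.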
@@ -157,18 +147,15 @@ CudaReductionWithArgumentKernel( const Result zero,
                                  Index* idxOutput,
                                  const Index* idxInput = nullptr )
 {
-   using IndexType = Index;
-   using ResultType = Result;
-
-   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
-   IndexType* sidx = reinterpret_cast< IndexType* >( &sdata[ blockDim.x ] );
+   Result* sdata = Devices::Cuda::getSharedMemory< Result >();
+   Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] );
 
    // Get the thread id (tid), global thread id (gid) and gridSize.
-   const IndexType tid = threadIdx.x;
-         IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;
-   const IndexType gridSize = blockDim.x * gridDim.x;
+   const Index tid = threadIdx.x;
+         Index gid = blockIdx.x * blockDim.x + threadIdx.x;
+   const Index gridSize = blockDim.x * gridDim.x;
 
-   // Read data into the shared memory. We start with the sequential reduction.
+   // Start with the sequential reduction and push the result into the shared memory.
    if( idxInput ) {
       if( gid < size ) {
          sdata[ tid ] = dataFetcher( gid );
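The with-argument variant tracks the index of the winning element alongside its value, so one dynamic shared-memory allocation is carved into two arrays: `blockDim.x` results followed by `blockDim.x` indices. A standalone sketch of that carving (safe as long as `alignof( Result ) >= alignof( Index )`, which holds for the usual float/double value plus int index combinations):

```cpp
// Carving one dynamic shared-memory block into a value array and an index
// array, as the kernel above does via getSharedMemory + reinterpret_cast.
extern __shared__ unsigned char smemPool[];

template< typename Result, typename Index >
__device__ void carveSharedMemory( Result*& sdata, Index*& sidx )
{
   sdata = reinterpret_cast< Result* >( smemPool );
   sidx  = reinterpret_cast< Index* >( &sdata[ blockDim.x ] );
}
```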
@@ -245,30 +232,25 @@ CudaReductionWithArgumentKernel( const Result zero,

    // This runs in one warp so it is synchronized implicitly.
    if( tid < 32 ) {
-      volatile ResultType* vsdata = sdata;
-      volatile IndexType* vsidx = sidx;
-      if( blockSize >= 64 ) {
+      volatile Result* vsdata = sdata;
+      volatile Index* vsidx = sidx;
+      if( blockSize >= 64 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 32 ], vsdata[ tid ], vsdata[ tid + 32 ] );
-      }
       // Note that here we do not have to check if tid < 16 etc, because we have
-      // twice as much shared memory, so we do not access out of bounds. The
-      // results for the upper half will be undefined, but unused anyway.
-      if( blockSize >= 32 ) {
+      // 2 * blockSize.x elements of shared memory per block, so we do not
+      // access out of bounds. The results for the upper half will be undefined,
+      // but unused anyway.
+      if( blockSize >= 32 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 16 ], vsdata[ tid ], vsdata[ tid + 16 ] );
-      }
-      if( blockSize >= 16 ) {
+      if( blockSize >= 16 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 8 ], vsdata[ tid ], vsdata[ tid + 8 ] );
-      }
-      if( blockSize >=  8 ) {
+      if( blockSize >=  8 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 4 ], vsdata[ tid ], vsdata[ tid + 4 ] );
-      }
-      if( blockSize >=  4 ) {
+      if( blockSize >=  4 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 2 ], vsdata[ tid ], vsdata[ tid + 2 ] );
-      }
-      if( blockSize >=  2 ) {
+      if( blockSize >=  2 )
          volatileReduction( vsidx[ tid ], vsidx[ tid + 1 ], vsdata[ tid ], vsdata[ tid + 1 ] );
-      }
    }
 
    // Store the result back in the global memory.
    if( tid == 0 ) {
@@ -276,15 +258,13 @@ CudaReductionWithArgumentKernel( const Result zero,
       idxOutput[ blockIdx.x ] = sidx[ 0 ];
    }
 }
+#endif
 
 
 template< typename Index,
           typename Result >
 struct CudaReductionKernelLauncher
 {
-   using IndexType = Index;
-   using ResultType = Result;
-
    // The number of blocks should be a multiple of the number of multiprocessors
    // to ensure optimum balancing of the load. This is very important, because
    // we run the kernel with a fixed number of blocks, so the amount of work per
@@ -315,13 +295,13 @@ struct CudaReductionKernelLauncher
               const VolatileReduction& volatileReduction,
               const DataFetcher& dataFetcher,
               const Result& zero,
-              ResultType*& output )
+              Result*& output )
    {
       // create reference to the reduction buffer singleton and set size
-      const std::size_t buf_size = 2 * desGridSize * sizeof( ResultType );
+      const std::size_t buf_size = 2 * desGridSize * sizeof( Result );
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
       cudaReductionBuffer.setSize( buf_size );
-      output = cudaReductionBuffer.template getData< ResultType >();
+      output = cudaReductionBuffer.template getData< Result >();
 
       this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output );
       return this->reducedSize;
@@ -334,15 +314,15 @@ struct CudaReductionKernelLauncher
                           const VolatileReduction& volatileReduction,
                           const DataFetcher& dataFetcher,
                           const Result& zero,
-                          ResultType*& output,
-                          IndexType*& idxOutput )
+                          Result*& output,
+                          Index*& idxOutput )
    {
       // create reference to the reduction buffer singleton and set size
-      const std::size_t buf_size = 2 * desGridSize * ( sizeof( ResultType ) + sizeof( IndexType ) );
+      const std::size_t buf_size = 2 * desGridSize * ( sizeof( Result ) + sizeof( Index ) );
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
       cudaReductionBuffer.setSize( buf_size );
-      output = cudaReductionBuffer.template getData< ResultType >();
-      idxOutput = reinterpret_cast< IndexType* >( &output[ 2 * desGridSize ] );
+      output = cudaReductionBuffer.template getData< Result >();
+      idxOutput = reinterpret_cast< Index* >( &output[ 2 * desGridSize ] );
 
       this->reducedSize = this->launchWithArgument( originalSize, reduction, volatileReduction, dataFetcher, zero, output, idxOutput, nullptr );
       return this->reducedSize;
@@ -357,13 +337,13 @@ struct CudaReductionKernelLauncher
    {
       // Input is the first half of the buffer, output is the second half
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
-      ResultType* input = cudaReductionBuffer.template getData< ResultType >();
-      ResultType* output = &input[ desGridSize ];
+      Result* input = cudaReductionBuffer.template getData< Result >();
+      Result* output = &input[ desGridSize ];
 
       while( this->reducedSize > 1 )
       {
          // this lambda has to be defined inside the loop, because the captured variable changes
-         auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
+         auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; };
          this->reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output );
          std::swap( input, output );
       }
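`finish` thus reduces the partial results in place: each pass launches the kernel with a trivial fetcher that just reads the previous pass's output, and the two halves of the reduction buffer alternate roles until a single value is left. A simplified host-side version of the loop, with a hypothetical `launchReductionKernel` standing in for `this->launch`:

```cpp
#include <utility>         // std::swap
#include <cuda_runtime.h>  // cudaMemcpy

// Hypothetical stand-in: reduces n floats from in into at most gridSize
// partial results written to out, returning how many partials were produced.
int launchReductionKernel( const float* in, float* out, int n );

float finishSum( float* input, float* output, int reducedSize )
{
   while( reducedSize > 1 ) {
      reducedSize = launchReductionKernel( input, output, reducedSize );
      std::swap( input, output );   // last output becomes next input
   }
   // after the final swap the result sits in input[ 0 ]; copy it to the host
   float result;
   cudaMemcpy( &result, input, sizeof( float ), cudaMemcpyDeviceToHost );
   return result;
}
```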
@@ -373,7 +353,7 @@ struct CudaReductionKernelLauncher
       std::swap( input, output );
 
       // Copy result on CPU
-      ResultType result;
+      Result result;
       ArrayOperations< Devices::Host, Devices::Cuda >::copy( &result, output, 1 );
       return result;
    }
@@ -387,15 +367,15 @@ struct CudaReductionKernelLauncher
    {
       // Input is the first half of the buffer, output is the second half
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
-      ResultType* input = cudaReductionBuffer.template getData< ResultType >();
-      ResultType* output = &input[ desGridSize ];
-      IndexType* idxInput = reinterpret_cast< IndexType* >( &output[ desGridSize ] );
-      IndexType* idxOutput = &idxInput[ desGridSize ];
+      Result* input = cudaReductionBuffer.template getData< Result >();
+      Result* output = &input[ desGridSize ];
+      Index* idxInput = reinterpret_cast< Index* >( &output[ desGridSize ] );
+      Index* idxOutput = &idxInput[ desGridSize ];
 
       while( this->reducedSize > 1 )
       {
          // this lambda has to be defined inside the loop, because the captured variable changes
-         auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
+         auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; };
          this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output, idxOutput, idxInput );
          std::swap( input, output );
          std::swap( idxInput, idxOutput );
@@ -426,15 +406,16 @@ struct CudaReductionKernelLauncher
                   const Result& zero,
                   Result* output )
       {
+#ifdef HAVE_CUDA
          dim3 blockSize, gridSize;
          blockSize.x = Reduction_maxThreadsPerBlock;
          gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
 
          // when there is only one warp per blockSize.x, we need to allocate two warps
          // worth of shared memory so that we don't index shared memory out of bounds
-         const IndexType shmem = (blockSize.x <= 32)
-                  ? 2 * blockSize.x * sizeof( ResultType )
-                  : blockSize.x * sizeof( ResultType );
+         const Index shmem = (blockSize.x <= 32)
+                  ? 2 * blockSize.x * sizeof( Result )
+                  : blockSize.x * sizeof( Result );
 
          // This is "general", but this method always sets blockSize.x to a specific value,
          // so runtime switch is not necessary - it only prolongs the compilation time.
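The ternary above implements the same guarantee the kernel comments rely on: a block of at most 32 threads (a single warp) still reads `sdata[ tid + 32 ]` in the warp-synchronous tail, so two warps' worth of shared memory must be allocated. As a standalone rule (sketch):

```cpp
#include <cstddef>

// Shared-memory sizing rule used above: with one warp or less per block,
// allocate twice the elements so reads past blockDim.x stay in bounds.
template< typename Result >
constexpr std::size_t reductionShmemSize( unsigned blockSizeX )
{
   return ( blockSizeX <= 32 )
        ? 2 * blockSizeX * sizeof( Result )
        :     blockSizeX * sizeof( Result );
}

// e.g. reductionShmemSize< double >( 32 ) == 512 bytes, not 256
```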
@@ -515,6 +496,9 @@ struct CudaReductionKernelLauncher

          // Return the size of the output array on the CUDA device
          return gridSize.x;
+#else
+         throw Exceptions::CudaSupportMissing();
+#endif
       }
 
       template< typename DataFetcher,
@@ -529,15 +513,16 @@ struct CudaReductionKernelLauncher
                               Index* idxOutput,
                               const Index* idxInput )
       {
+#ifdef HAVE_CUDA
          dim3 blockSize, gridSize;
          blockSize.x = Reduction_maxThreadsPerBlock;
          gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
 
          // when there is only one warp per blockSize.x, we need to allocate two warps
          // worth of shared memory so that we don't index shared memory out of bounds
-         const IndexType shmem = (blockSize.x <= 32)
-                  ? 2 * blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) )
-                  : blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) );
+         const Index shmem = (blockSize.x <= 32)
+                  ? 2 * blockSize.x * ( sizeof( Result ) + sizeof( Index ) )
+                  : blockSize.x * ( sizeof( Result ) + sizeof( Index ) );
 
          // This is "general", but this method always sets blockSize.x to a specific value,
          // so runtime switch is not necessary - it only prolongs the compilation time.
@@ -618,16 +603,18 @@ struct CudaReductionKernelLauncher

          // return the size of the output array on the CUDA device
          return gridSize.x;
+#else
+         throw Exceptions::CudaSupportMissing();
+#endif
       }
 
 
       const int activeDevice;
       const int blocksdPerMultiprocessor;
       const int desGridSize;
-      const IndexType originalSize;
-      IndexType reducedSize;
+      const Index originalSize;
+      Index reducedSize;
 };
-#endif
 
 } // namespace Algorithms
 } // namespace Containers
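The launcher's comment (in the hunk introducing the struct) explains why `desGridSize` should be a multiple of the multiprocessor count: the kernel runs with a fixed number of blocks, so an uneven split leaves some SMs idle. TNL derives this through its `CudaDeviceInfo` wrapper; with the plain CUDA runtime API the same quantity could be computed like this (a sketch):

```cpp
#include <cuda_runtime.h>

// Pick a grid size that divides evenly among the device's multiprocessors,
// so every SM receives the same share of the fixed number of blocks.
int desiredGridSize( int blocksPerMultiprocessor )
{
   int device = 0;
   int smCount = 0;
   cudaGetDevice( &device );
   cudaDeviceGetAttribute( &smCount, cudaDevAttrMultiProcessorCount, device );
   return blocksPerMultiprocessor * smCount;
}
```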
+47 −49
@@ -22,12 +22,11 @@ namespace Containers {
 namespace Algorithms {
 
 template< typename Device >
-class Reduction;
+struct Reduction;
 
 template<>
-class Reduction< Devices::Host >
+struct Reduction< Devices::Host >
 {
-   public:
    template< typename Index,
              typename Result,
              typename ReductionOperation,
@@ -54,9 +53,8 @@ class Reduction< Devices::Host >
 };
 
 template<>
-class Reduction< Devices::Cuda >
+struct Reduction< Devices::Cuda >
 {
-   public:
    template< typename Index,
              typename Result,
              typename ReductionOperation,
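This second file's change is purely cosmetic: a `class` whose first member section is `public:` is interchangeable with a `struct`, whose members are public by default, so dropping the access specifier changes nothing about the generated code. For example:

```cpp
// These two declarations are equivalent: struct members are public by default.
class ReductionA
{
   public:
   static constexpr int value = 1;
};

struct ReductionB
{
   static constexpr int value = 1;
};

static_assert( ReductionA::value == ReductionB::value, "identical interfaces" );
```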
+53 −97

(third file's diff collapsed: preview size limit exceeded)