Commit b74a24d2 authored by Jakub Klinkovský's avatar Jakub Klinkovský Committed by Jakub Klinkovský
Browse files

Rewritten multireduction with lambda functions

parent e470040a
Loading
Loading
Loading
Loading
+108 −141
Original line number Diff line number Diff line
@@ -12,14 +12,11 @@

#pragma once

#ifdef HAVE_CUDA
#include <cuda.h>
#endif

#include <TNL/Assert.h>
#include <TNL/Math.h>
#include <TNL/Devices/CudaDeviceInfo.h>
#include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
#include <TNL/Exceptions/CudaSupportMissing.h>

namespace TNL {
namespace Containers {
@@ -32,157 +29,128 @@ namespace Algorithms {
 * architecture so that there are no local memory spills.
 */
static constexpr int Multireduction_maxThreadsPerBlock = 256;  // must be a power of 2
static constexpr int Multireduction_registersPerThread = 38;   // empirically determined optimal value
static constexpr int Multireduction_registersPerThread = 32;   // empirically determined optimal value

// __CUDA_ARCH__ is defined only in device code!
#if (__CUDA_ARCH__ >= 300 )
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 6;
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 8;
#else
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 4;
#endif

template< int blockSizeX, typename Operation, typename Index >
template< int blockSizeX,
          typename Result,
          typename DataFetcher,
          typename Reduction,
          typename VolatileReduction,
          typename Index >
__global__ void
__launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor )
CudaMultireductionKernel( Operation operation,
                          const int n,
CudaMultireductionKernel( const Result zero,
                          DataFetcher dataFetcher,
                          const Reduction reduction,
                          const VolatileReduction volatileReduction,
                          const Index size,
                          const typename Operation::DataType1* input1,
                          const Index ldInput1,
                          const typename Operation::DataType2* input2,
                          typename Operation::ResultType* output )
                          const int n,
                          Result* output )
{
   typedef Index IndexType;
   typedef typename Operation::ResultType ResultType;
   Result* sdata = Devices::Cuda::getSharedMemory< Result >();

   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
   // Get the thread id (tid), global thread id (gid) and gridSize.
   const Index tid = threadIdx.y * blockDim.x + threadIdx.x;
         Index gid = blockIdx.x * blockDim.x + threadIdx.x;
   const Index gridSizeX = blockDim.x * gridDim.x;

   /***
    * Get thread id (tid) and global element id (gid).
    * gridSizeX is the number of elements in the direction of x-axis
    * processed by all blocks at the same time.
    */
   const IndexType tid = threadIdx.y * blockDim.x + threadIdx.x;
         IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;
   const IndexType gridSizeX = blockDim.x * gridDim.x;
   // Get the dataset index.
   const int y = blockIdx.y * blockDim.y + threadIdx.y;
   if( y >= n ) return;

   /***
    * Shift input1 and output pointers.
    */
   const IndexType y = blockIdx.y * blockDim.y + threadIdx.y;
   if( y < n ) {
      input1 += y * ldInput1;
      output += y * gridDim.x;
   }
   else
      return;
   sdata[ tid ] = zero;

   /***
    * Start with the sequential reduction and push the
    * result into the shared memory.
    */
   sdata[ tid ] = operation.initialValue();
   while( gid + 4 * gridSizeX < size )
   {
      operation.firstReduction( sdata[ tid ], gid,                 input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + gridSizeX,     input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 );
   // Start with the sequential reduction and push the result into the shared memory.
   while( gid + 4 * gridSizeX < size ) {
      reduction( sdata[ tid ], dataFetcher( gid,                 y ) );
      reduction( sdata[ tid ], dataFetcher( gid + gridSizeX,     y ) );
      reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSizeX, y ) );
      reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSizeX, y ) );
      gid += 4 * gridSizeX;
   }
   while( gid + 2 * gridSizeX < size )
   {
      operation.firstReduction( sdata[ tid ], gid,                 input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + gridSizeX,     input1, input2 );
   while( gid + 2 * gridSizeX < size ) {
      reduction( sdata[ tid ], dataFetcher( gid, y ) );
      reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) );
      gid += 2 * gridSizeX;
   }
   while( gid < size )
   {
      operation.firstReduction( sdata[ tid ], gid,                 input1, input2 );
   while( gid < size ) {
      reduction( sdata[ tid ], dataFetcher( gid, y ) );
      gid += gridSizeX;
   }
   __syncthreads();


   //printf( "1: tid %d data %f \n", tid, sdata[ tid ] );

   /***
    *  Perform the parallel reduction.
    */
   // Perform the parallel reduction.
   if( blockSizeX >= 1024 ) {
      if( threadIdx.x < 512 ) {
         operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] );
      }
      if( threadIdx.x < 512 )
         reduction( sdata[ tid ], sdata[ tid + 512 ] );
      __syncthreads();
   }
   if( blockSizeX >= 512 ) {
      if( threadIdx.x < 256 ) {
         operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] );
      }
      if( threadIdx.x < 256 )
         reduction( sdata[ tid ], sdata[ tid + 256 ] );
      __syncthreads();
   }
   if( blockSizeX >= 256 ) {
      if( threadIdx.x < 128 ) {
         operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] );
      }
      if( threadIdx.x < 128 )
         reduction( sdata[ tid ], sdata[ tid + 128 ] );
      __syncthreads();
   }
   if( blockSizeX >= 128 ) {
      if( threadIdx.x <  64 ) {
         operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] );
      }
      if( threadIdx.x <  64 )
         reduction( sdata[ tid ], sdata[ tid + 64 ] );
      __syncthreads();
   }

   /***
    * This runs in one warp so it is synchronized implicitly.
    *
    * When the blockSizeX is less then or equal to the warp size, the shared memory
    * must be at least 2 * blockSizeX elements per block, otherwise unallocated memory
    * will be accessed !!!
    */
   // This runs in one warp so it is synchronized implicitly.
   if( threadIdx.x < 32 ) {
      volatile ResultType* vsdata = sdata;
      if( blockSizeX >= 64 ) {
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
      }
      if( blockSizeX >= 32 ) {
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
      }
      if( blockSizeX >= 16 ) {
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
      }
      if( blockSizeX >=  8 ) {
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
      }
      if( blockSizeX >=  4 ) {
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
      }
      if( blockSizeX >=  2 ) {
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
      }
      volatile Result* vsdata = sdata;
      if( blockSizeX >= 64 )
         volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
      // Note that here we do not have to check if tid < 16 etc, because we have
      // 2 * blockSize.x elements of shared memory per block, so we do not
      // access out of bounds. The results for the upper half will be undefined,
      // but unused anyway.
      if( blockSizeX >= 32 )
         volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
      if( blockSizeX >= 16 )
         volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
      if( blockSizeX >=  8 )
         volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
      if( blockSizeX >=  4 )
         volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
      if( blockSizeX >=  2 )
         volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
   }

   /***
    * Store the result back in the global memory.
    */
   // Store the result back in the global memory.
   if( threadIdx.x == 0 ) {
      output[ blockIdx.x ] = sdata[ tid ];
      output[ blockIdx.x + y * gridDim.x ] = sdata[ tid ];
   }
}
#endif

template< typename Operation, typename Index >
template< typename Result,
          typename DataFetcher,
          typename Reduction,
          typename VolatileReduction,
          typename Index >
int
CudaMultireductionKernelLauncher( Operation& operation,
                                  const int n,
CudaMultireductionKernelLauncher( const Result zero,
                                  DataFetcher dataFetcher,
                                  const Reduction reduction,
                                  const VolatileReduction volatileReduction,
                                  const Index size,
                                  const typename Operation::DataType1* input1,
                                  const Index ldInput1,
                                  const typename Operation::DataType2* input2,
                                  typename Operation::ResultType*& output )
                                  const int n,
                                  Result*& output )
{
   typedef typename Operation::ResultType ResultType;

#ifdef HAVE_CUDA
   // The number of blocks should be a multiple of the number of multiprocessors
   // to ensure optimum balancing of the load. This is very important, because
   // we run the kernel with a fixed number of blocks, so the amount of work per
@@ -231,87 +199,86 @@ CudaMultireductionKernelLauncher( Operation& operation,

   // create reference to the reduction buffer singleton and set size
   // (make an overestimate to avoid reallocation on every call if n is increased by 1 each time)
   const size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( ResultType );
   const std::size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( Result );
   CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
   cudaReductionBuffer.setSize( buf_size );
   output = cudaReductionBuffer.template getData< ResultType >();
   output = cudaReductionBuffer.template getData< Result >();

   // when there is only one warp per blockSize.x, we need to allocate two warps
   // worth of shared memory so that we don't index shared memory out of bounds
   const Index shmem = (blockSize.x <= 32)
            ? 2 * blockSize.x * blockSize.y * sizeof( ResultType )
            : blockSize.x * blockSize.y * sizeof( ResultType );

   //cout << "Multireduction of " << n << " datasets, block size (" << blockSize.x << "," << blockSize.y << "), grid size (" << gridSize.x << "," << gridSize.y << "), shmem " << shmem <<std::endl;
            ? 2 * blockSize.x * blockSize.y * sizeof( Result )
            : blockSize.x * blockSize.y * sizeof( Result );

   /***
    * Depending on the blockSize we generate appropriate template instance.
    */
   // Depending on the blockSize we generate appropriate template instance.
   switch( blockSize.x )
   {
      case 512:
         CudaMultireductionKernel< 512 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
         break;
      case 256:
         cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel< 256 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
         break;
      case 128:
         cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel< 128 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
         break;
      case  64:
         cudaFuncSetCacheConfig(CudaMultireductionKernel<  64, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel<  64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel<  64 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
         break;
      case  32:
         cudaFuncSetCacheConfig(CudaMultireductionKernel<  32, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel<  32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel<  32 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
         break;
      case  16:
         cudaFuncSetCacheConfig(CudaMultireductionKernel<  16, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel<  16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel<  16 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
         break;
     case   8:
         cudaFuncSetCacheConfig(CudaMultireductionKernel<   8, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel<   8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel<   8 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
         break;
      case   4:
         cudaFuncSetCacheConfig(CudaMultireductionKernel<   4, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel<   4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel<   4 >
        <<< gridSize, blockSize, shmem >>>( operation,  n,size, input1, ldInput1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
        break;
      case   2:
         cudaFuncSetCacheConfig(CudaMultireductionKernel<   2, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel<   2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel<   2 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
         break;
      case   1:
         throw std::logic_error( "blockSize should not be 1." );
      default:
         throw std::logic_error( "Block size is " + std::to_string(blockSize.x) + " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
   }
   cudaStreamSynchronize(0);
   TNL_CHECK_CUDA_DEVICE;

   // return the size of the output array on the CUDA device
   return gridSize.x;
}
#else
   throw Exceptions::CudaSupportMissing();
#endif
}

} // namespace Algorithms
} // namespace Containers
+50 −20
Original line number Diff line number Diff line
@@ -20,40 +20,70 @@ namespace Containers {
namespace Algorithms {

template< typename Device >
class Multireduction;
struct Multireduction;

template<>
class Multireduction< Devices::Cuda >
struct Multireduction< Devices::Host >
{
public:
   template< typename Operation, typename Index >
   /**
    * Parameters:
    *    zero: starting value for reduction
    *    dataFetcher: callable object such that `dataFetcher( i, j )` yields
    *                 the i-th value to be reduced from the j-th dataset
    *                 (i = 0,...,size-1; j = 0,...,n-1)
    *    reduction: callable object representing the reduction operation
    *    volatileReduction: callable object representing the reduction operation
    *    size: the size of each dataset
    *    n: number of datasets to be reduced
    *    result: output array of size = n
    */
   template< typename Result,
             typename DataFetcher,
             typename Reduction,
             typename VolatileReduction,
             typename Index >
   static void
   reduce( Operation& operation,
           const int n,
   reduce( const Result zero,
           DataFetcher dataFetcher,
           const Reduction reduction,
           const VolatileReduction volatileReduction,
           const Index size,
           const typename Operation::DataType1* deviceInput1,
           const Index ldInput1,
           const typename Operation::DataType2* deviceInput2,
           typename Operation::ResultType* hostResult );
           const int n,
           Result* result );
};

template<>
class Multireduction< Devices::Host >
struct Multireduction< Devices::Cuda >
{
public:
   template< typename Operation, typename Index >
   /**
    * Parameters:
    *    zero: starting value for reduction
    *    dataFetcher: callable object such that `dataFetcher( i, j )` yields
    *                 the i-th value to be reduced from the j-th dataset
    *                 (i = 0,...,size-1; j = 0,...,n-1)
    *    reduction: callable object representing the reduction operation
    *    volatileReduction: callable object representing the reduction operation
    *    size: the size of each dataset
    *    n: number of datasets to be reduced
    *    hostResult: output array of size = n
    */
   template< typename Result,
             typename DataFetcher,
             typename Reduction,
             typename VolatileReduction,
             typename Index >
   static void
   reduce( Operation& operation,
           const int n,
   reduce( const Result zero,
           DataFetcher dataFetcher,
           const Reduction reduction,
           const VolatileReduction volatileReduction,
           const Index size,
           const typename Operation::DataType1* deviceInput1,
           const Index ldInput1,
           const typename Operation::DataType2* deviceInput2,
           typename Operation::ResultType* hostResult );
           const int n,
           Result* hostResult );
};

} // namespace Algorithms
} // namespace Containers
} // namespace TNL

#include "Multireduction_impl.h"
#include "Multireduction.hpp"
+7 −0
Original line number Diff line number Diff line
@@ -84,9 +84,16 @@ protected:
   void hauseholder_cwy( VectorViewType v,
                         const int i );

// nvcc allows __cuda_callable__ lambdas only in public methods
#ifdef __NVCC__
public:
#endif
   void hauseholder_cwy_transposed( VectorViewType z,
                                    const int i,
                                    ConstVectorViewType w );
#ifdef __NVCC__
protected:
#endif

   template< typename Vector >
   void update( const int k,
+40 −13
Original line number Diff line number Diff line
@@ -15,7 +15,6 @@
#include <type_traits>
#include <cmath>

#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Containers/Algorithms/Multireduction.h>
#include <TNL/Matrices/MatrixOperations.h>

@@ -427,14 +426,28 @@ hauseholder_generate( const int i,
   if( i > 0 ) {
      // aux = Y_{i-1}^T * y_i
      RealType aux[ i ];
      Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct;
//      Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct;
//      Containers::Algorithms::Multireduction< DeviceType >::reduce
//               ( scalarProduct,
//                 i,
//                 size,
//                 Y.getData(),
//                 ldSize,
//                 Traits::getConstLocalView( y_i ).getData(),
//                 aux );
      const RealType* _Y = Y.getData();
      const RealType* _y_i = Traits::getConstLocalView( y_i ).getData();
      const IndexType ldSize = this->ldSize;
      auto fetch = [_Y, _y_i, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _y_i[ idx ]; };
      auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; };
      auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; };
      Containers::Algorithms::Multireduction< DeviceType >::reduce
               ( scalarProduct,
                 i,
               ( (RealType) 0,
                 fetch,
                 reduction,
                 volatileReduction,
                 size,
                 Y.getData(),
                 ldSize,
                 Traits::getLocalView( y_i ).getData(),
                 i,
                 aux );
      // no-op if the problem is not distributed
      CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
@@ -525,14 +538,28 @@ hauseholder_cwy_transposed( VectorViewType z,
{
   // aux = Y_i^T * w
   RealType aux[ i + 1 ];
   Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct;
//   Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct;
//   Containers::Algorithms::Multireduction< DeviceType >::reduce
//            ( scalarProduct,
//              i + 1,
//              size,
//              Y.getData(),
//              ldSize,
//              Traits::getConstLocalView( w ).getData(),
//              aux );
   const RealType* _Y = Y.getData();
   const RealType* _w = Traits::getConstLocalView( w ).getData();
   const IndexType ldSize = this->ldSize;
   auto fetch = [_Y, _w, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _w[ idx ]; };
   auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; };
   auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; };
   Containers::Algorithms::Multireduction< DeviceType >::reduce
            ( scalarProduct,
              i + 1,
            ( (RealType) 0,
              fetch,
              reduction,
              volatileReduction,
              size,
              Y.getData(),
              ldSize,
              Traits::getConstLocalView( w ).getData(),
              i + 1,
              aux );
   // no-op if the problem is not distributed
   Traits::CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
Loading