Commit 174ad5fd authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

PrefixSum: separate first and second phase for OpenMP implementation and...

PrefixSum: separate first and second phase for OpenMP implementation and expose performFirstPhase and performSecondPhase methods
parent 2c40015f
Loading
Loading
Loading
Loading
+82 −46
Original line number Original line Diff line number Diff line
@@ -26,17 +26,16 @@ enum class PrefixSumType {


template< typename Device,
template< typename Device,
           PrefixSumType Type = PrefixSumType::Inclusive >
           PrefixSumType Type = PrefixSumType::Inclusive >
class PrefixSum;
struct PrefixSum;


template< typename Device,
template< typename Device,
           PrefixSumType Type = PrefixSumType::Inclusive >
           PrefixSumType Type = PrefixSumType::Inclusive >
class SegmentedPrefixSum;
struct SegmentedPrefixSum;




template< PrefixSumType Type >
template< PrefixSumType Type >
class PrefixSum< Devices::Host, Type >
struct PrefixSum< Devices::Host, Type >
{
{
   public:
   template< typename Vector,
   template< typename Vector,
             typename Reduction >
             typename Reduction >
   static void
   static void
@@ -44,13 +43,32 @@ class PrefixSum< Devices::Host, Type >
            const typename Vector::IndexType begin,
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const Reduction& reduction,
               const typename Vector::RealType& zero );
            const typename Vector::RealType zero );

   template< typename Vector,
             typename Reduction >
   static auto
   performFirstPhase( Vector& v,
                      const typename Vector::IndexType begin,
                      const typename Vector::IndexType end,
                      const Reduction& reduction,
                      const typename Vector::RealType zero );

   template< typename Vector,
             typename BlockShifts,
             typename Reduction >
   static void
   performSecondPhase( Vector& v,
                       const BlockShifts& blockShifts,
                       const typename Vector::IndexType begin,
                       const typename Vector::IndexType end,
                       const Reduction& reduction,
                       const typename Vector::RealType shift );
};
};


template< PrefixSumType Type >
template< PrefixSumType Type >
class PrefixSum< Devices::Cuda, Type >
struct PrefixSum< Devices::Cuda, Type >
{
{
   public:
   template< typename Vector,
   template< typename Vector,
             typename Reduction >
             typename Reduction >
   static void
   static void
@@ -58,13 +76,32 @@ class PrefixSum< Devices::Cuda, Type >
            const typename Vector::IndexType begin,
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const Reduction& reduction,
               const typename Vector::RealType& zero );
            const typename Vector::RealType zero );

   template< typename Vector,
             typename Reduction >
   static auto
   performFirstPhase( Vector& v,
                      const typename Vector::IndexType begin,
                      const typename Vector::IndexType end,
                      const Reduction& reduction,
                      const typename Vector::RealType zero );

   template< typename Vector,
             typename BlockShifts,
             typename Reduction >
   static void
   performSecondPhase( Vector& v,
                       const BlockShifts& blockShifts,
                       const typename Vector::IndexType begin,
                       const typename Vector::IndexType end,
                       const Reduction& reduction,
                       const typename Vector::RealType shift );
};
};


template< PrefixSumType Type >
template< PrefixSumType Type >
class SegmentedPrefixSum< Devices::Host, Type >
struct SegmentedPrefixSum< Devices::Host, Type >
{
{
   public:
   template< typename Vector,
   template< typename Vector,
             typename Reduction,
             typename Reduction,
             typename Flags >
             typename Flags >
@@ -74,13 +111,12 @@ class SegmentedPrefixSum< Devices::Host, Type >
            const typename Vector::IndexType begin,
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const Reduction& reduction,
               const typename Vector::RealType& zero );
            const typename Vector::RealType zero );
};
};


template< PrefixSumType Type >
template< PrefixSumType Type >
class SegmentedPrefixSum< Devices::Cuda, Type >
struct SegmentedPrefixSum< Devices::Cuda, Type >
{
{
   public:
   template< typename Vector,
   template< typename Vector,
             typename Reduction,
             typename Reduction,
             typename Flags >
             typename Flags >
@@ -90,7 +126,7 @@ class SegmentedPrefixSum< Devices::Cuda, Type >
            const typename Vector::IndexType begin,
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const Reduction& reduction,
               const typename Vector::RealType& zero );
            const typename Vector::RealType zero );
};
};


} // namespace Algorithms
} // namespace Algorithms
+131 −59
Original line number Original line Diff line number Diff line
@@ -12,36 +12,18 @@


#pragma once
#pragma once


#include <memory>  // std::unique_ptr

#include "PrefixSum.h"
#include "PrefixSum.h"


//#define CUDA_REDUCTION_PROFILING

#include <TNL/Assert.h>
#include <TNL/Assert.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Containers/Array.h>
#include <TNL/Containers/Algorithms/ArrayOperations.h>
#include <TNL/Containers/Algorithms/CudaPrefixSumKernel.h>
#include <TNL/Containers/Algorithms/CudaPrefixSumKernel.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Exceptions/NotImplementedError.h>
#include <TNL/Exceptions/NotImplementedError.h>


#ifdef CUDA_REDUCTION_PROFILING
#include <iostream>
#include <TNL/Timer.h>
#endif

namespace TNL {
namespace TNL {
namespace Containers {
namespace Containers {
namespace Algorithms {
namespace Algorithms {


/****
 * Arrays smaller than the following constant
 * are reduced on CPU. The constant must not be larger
 * than maximal CUDA grid size.
 */
static constexpr int PrefixSum_minGpuDataSize = 256;//65536; //16384;//1024;//256;

////
// PrefixSum on host
template< PrefixSumType Type >
template< PrefixSumType Type >
   template< typename Vector,
   template< typename Vector,
             typename Reduction >
             typename Reduction >
@@ -51,30 +33,37 @@ perform( Vector& v,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const Reduction& reduction,
         const typename Vector::RealType& zero )
         const typename Vector::RealType zero )
{
#ifdef HAVE_OPENMP
   const auto blockShifts = performFirstPhase( v, begin, end, reduction, zero );
   performSecondPhase( v, blockShifts, begin, end, reduction, zero );
#else
   // sequential prefix-sum does not need a second phase
   performFirstPhase( v, begin, end, reduction, zero );
#endif
}

template< PrefixSumType Type >
   template< typename Vector,
             typename Reduction >
auto
PrefixSum< Devices::Host, Type >::
performFirstPhase( Vector& v,
                   const typename Vector::IndexType begin,
                   const typename Vector::IndexType end,
                   const Reduction& reduction,
                   const typename Vector::RealType zero )
{
{
   using RealType = typename Vector::RealType;
   using RealType = typename Vector::RealType;
   using IndexType = typename Vector::IndexType;
   using IndexType = typename Vector::IndexType;


#ifdef HAVE_OPENMP
#ifdef HAVE_OPENMP
   const int threads = Devices::Host::getMaxThreadsCount();
   const int threads = Devices::Host::getMaxThreadsCount();
   std::unique_ptr< RealType[] > block_sums{
   Array< RealType, Devices::Host > block_sums( threads + 1 );
      // Workaround for nvcc 10.1.168 - it would modifie the simple expression
      // `new RealType[reducedSize]` in the source code to `new (RealType[reducedSize])`
      // which is not correct - see e.g. https://stackoverflow.com/a/39671946
      // Thus, the host compiler would spit out hundreds of warnings...
      // Funnily enough, nvcc's behaviour depends on the context rather than the
      // expression, because exactly the same simple expression in different places
      // does not produce warnings.
      #ifdef __NVCC__
      new RealType[ static_cast<const int&>(threads) + 1 ]
      #else
      new RealType[ threads + 1 ]
      #endif
   };
   block_sums[ 0 ] = zero;
   block_sums[ 0 ] = zero;


   #pragma omp parallel
   #pragma omp parallel num_threads(threads)
   {
   {
      // init
      // init
      const int thread_idx = omp_get_thread_num();
      const int thread_idx = omp_get_thread_num();
@@ -99,18 +88,15 @@ perform( Vector& v,


      // write the block sums into the buffer
      // write the block sums into the buffer
      block_sums[ thread_idx + 1 ] = block_sum;
      block_sums[ thread_idx + 1 ] = block_sum;
      #pragma omp barrier
   }


      // calculate per-block offsets
   // block_sums now contains sums of numbers in each block. The first phase
      RealType offset = 0;
   // ends by computing prefix-sum of this array.
      for( int i = 0; i < thread_idx + 1; i++ )
   for( int i = 1; i < threads + 1; i++ )
         offset = reduction( offset, block_sums[ i ] );
      block_sums[ i ] = reduction( block_sums[ i ], block_sums[ i - 1 ] );


      // shift intermediate results by the offset
   // block_sums now contains shift values for each block - to be used in the second phase
      #pragma omp for schedule(static)
   return block_sums;
      for( IndexType i = begin; i < end; i++ )
         v[ i ] = reduction( v[ i ], offset );
   }
#else
#else
   if( Type == PrefixSumType::Inclusive ) {
   if( Type == PrefixSumType::Inclusive ) {
      for( IndexType i = begin + 1; i < end; i++ )
      for( IndexType i = begin + 1; i < end; i++ )
@@ -125,11 +111,47 @@ perform( Vector& v,
         aux = reduction( aux, x );
         aux = reduction( aux, x );
      }
      }
   }
   }

   return 0;
#endif
}

template< PrefixSumType Type >
   template< typename Vector,
             typename BlockShifts,
             typename Reduction >
void
PrefixSum< Devices::Host, Type >::
performSecondPhase( Vector& v,
                    const BlockShifts& blockShifts,
                    const typename Vector::IndexType begin,
                    const typename Vector::IndexType end,
                    const Reduction& reduction,
                    const typename Vector::RealType shift )
{
   using RealType = typename Vector::RealType;
   using IndexType = typename Vector::IndexType;

#ifdef HAVE_OPENMP
   const int threads = blockShifts.getSize() - 1;

   // launch exactly the same number of threads as in the first phase
   #pragma omp parallel num_threads(threads)
   {
      const int thread_idx = omp_get_thread_num();
      const RealType offset = reduction( blockShifts[ thread_idx ], shift );

      // shift intermediate results by the offset
      #pragma omp for schedule(static)
      for( IndexType i = begin; i < end; i++ )
         v[ i ] = reduction( v[ i ], offset );
   }
#else
   for( IndexType i = begin; i < end; i++ )
      v[ i ] = reduction( v[ i ], shift );
#endif
#endif
}
}


////
// PrefixSum on CUDA device
template< PrefixSumType Type >
template< PrefixSumType Type >
   template< typename Vector,
   template< typename Vector,
             typename Reduction >
             typename Reduction >
@@ -139,7 +161,7 @@ perform( Vector& v,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const Reduction& reduction,
         const typename Vector::RealType& zero )
         const typename Vector::RealType zero )
{
{
#ifdef HAVE_CUDA
#ifdef HAVE_CUDA
   using RealType = typename Vector::RealType;
   using RealType = typename Vector::RealType;
@@ -156,9 +178,61 @@ perform( Vector& v,
#endif
#endif
}
}


template< PrefixSumType Type >
   template< typename Vector,
             typename Reduction >
auto
PrefixSum< Devices::Cuda, Type >::
performFirstPhase( Vector& v,
                   const typename Vector::IndexType begin,
                   const typename Vector::IndexType end,
                   const Reduction& reduction,
                   const typename Vector::RealType zero )
{
#ifdef HAVE_CUDA
   using RealType = typename Vector::RealType;
   using IndexType = typename Vector::IndexType;

   return CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::performFirstPhase(
      end - begin,
      &v[ begin ],  // input
      &v[ begin ],  // output
      reduction,
      zero );
#else
   throw Exceptions::CudaSupportMissing();
#endif
}

template< PrefixSumType Type >
   template< typename Vector,
             typename BlockShifts,
             typename Reduction >
void
PrefixSum< Devices::Cuda, Type >::
performSecondPhase( Vector& v,
                    const BlockShifts& blockShifts,
                    const typename Vector::IndexType begin,
                    const typename Vector::IndexType end,
                    const Reduction& reduction,
                    const typename Vector::RealType shift )
{
#ifdef HAVE_CUDA
   using RealType = typename Vector::RealType;
   using IndexType = typename Vector::IndexType;

   CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::performSecondPhase(
      end - begin,
      &v[ begin ],  // output
      blockShifts.getData(),
      reduction,
      shift );
#else
   throw Exceptions::CudaSupportMissing();
#endif
}



////
// PrefixSum on host
template< PrefixSumType Type >
template< PrefixSumType Type >
   template< typename Vector,
   template< typename Vector,
             typename Reduction,
             typename Reduction,
@@ -170,7 +244,7 @@ perform( Vector& v,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const Reduction& reduction,
         const typename Vector::RealType& zero )
         const typename Vector::RealType zero )
{
{
   using RealType = typename Vector::RealType;
   using RealType = typename Vector::RealType;
   using IndexType = typename Vector::IndexType;
   using IndexType = typename Vector::IndexType;
@@ -197,8 +271,6 @@ perform( Vector& v,
   }
   }
}
}


////
// PrefixSum on CUDA device
template< PrefixSumType Type >
template< PrefixSumType Type >
   template< typename Vector,
   template< typename Vector,
             typename Reduction,
             typename Reduction,
@@ -210,7 +282,7 @@ perform( Vector& v,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const Reduction& reduction,
         const typename Vector::RealType& zero )
         const typename Vector::RealType zero )
{
{
#ifdef HAVE_CUDA
#ifdef HAVE_CUDA
   using RealType = typename Vector::RealType;
   using RealType = typename Vector::RealType;