Commit 1a64a618 authored by Jakub Klinkovský

Added specialization of CudaBlockScan using __shfl instructions

parent 4bbf495c
reduction.h (deleted)  +0 −138
/***************************************************************************
                          reduction.h  -  description
                             -------------------
    begin                : Jul 13, 2021
    copyright            : (C) 2021 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Xuan Thang Nguyen

#pragma once

namespace TNL {
    namespace Algorithms {
        namespace Sorting {

#ifdef HAVE_CUDA

/**
 * https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/
 * */


__device__ int warpReduceSum(int initVal)
{
    const unsigned int fullMask = 0xffffffff; // all lanes of the warp participate in the shuffle
    // butterfly reduction: each step exchanges values between lanes whose IDs differ in one bit
    for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1)
        initVal += __shfl_xor_sync(fullMask, initVal, mask);

    return initVal;
}

__device__ int blockReduceSum(int val)
{
    static __shared__ int shared[32]; // one partial sum per warp (at most 32 warps per block)
    int lane = threadIdx.x & (warpSize - 1);
    int wid = threadIdx.x / warpSize;

    // reduce within each warp
    val = warpReduceSum(val);

    // the first lane of each warp stores its partial sum
    if (lane == 0)
        shared[wid] = val;
    __syncthreads();

    // load the per-warp partial sums; 0 is the identity for addition
    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;

    // the first warp reduces the partial sums
    if (wid == 0)
        val = warpReduceSum(val);

    // broadcast the block-wide sum to all threads
    if (threadIdx.x == 0)
        shared[0] = val;
    __syncthreads();

    return shared[0];
}
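
// For orientation, a hypothetical usage sketch (not part of this commit):
// a kernel that reduces an array with blockReduceSum, writing one partial sum
// per block. It assumes blockDim.x is a multiple of warpSize (so every warp
// is complete) and at most 1024 (so the partial sums fit into the 32-slot buffer).
__global__ void sumKernel(const int* input, int* blockSums, int size)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    int val = (i < size) ? input[i] : 0;  // 0 is the identity for addition
    val = blockReduceSum(val);            // cooperative: every thread must call it
    if (threadIdx.x == 0)
        blockSums[blockIdx.x] = val;
}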

//-------------------------------------------------------------------------------

__device__ int warpInclusivePrefixSum(int value)
{
    int laneId = threadIdx.x & (warpSize - 1);

    #pragma unroll
    for (int i = 1; i * 2 <= 32; i *= 2) // 32 is the warp size, written as a literal so the loop can unroll
    {
        // fetch the value of the lane `i` positions below; lanes 0..i-1 keep their value
        int n = __shfl_up_sync(0xffffffff, value, i);
        if (laneId >= i)
            value += n;
    }

    return value;
}

__device__ int blockInclusivePrefixSum(int value)
{
    static __shared__ int shared[32]; // one warp total per warp (at most 32 warps per block)
    int lane = threadIdx.x & (warpSize - 1);
    int wid = threadIdx.x / warpSize;

    // inclusive scan within each warp
    int tmp = warpInclusivePrefixSum(value);

    // the last lane of each warp stores the warp's total
    if (lane == warpSize - 1)
        shared[wid] = tmp;
    __syncthreads();

    // the first warp turns the warp totals into exclusive prefix sums
    int tmp2 = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
    if (wid == 0)
        shared[lane] = warpInclusivePrefixSum(tmp2) - tmp2;
    __syncthreads();

    // shift each thread's warp-local result by the total of the preceding warps
    tmp += shared[wid];
    return tmp;
}
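
// Another hypothetical sketch (not part of this commit), under the same
// complete-warps assumption: a per-block inclusive prefix sum of an array.
__global__ void prefixSumKernel(const int* input, int* output, int size)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    int val = (i < size) ? input[i] : 0;   // 0 is the identity for addition
    val = blockInclusivePrefixSum(val);    // cooperative: all threads participate
    if (i < size)
        output[i] = val;
}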

//--------------------------------------------------------------------

template<typename Operator>
__device__ int warpCmpReduce(int initVal, const Operator & Cmp)
{
    const unsigned int fullMask = 0xffffffff; // all lanes of the warp participate in the shuffle
    for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1)
        initVal = Cmp(initVal, __shfl_xor_sync(fullMask, initVal, mask));

    return initVal;
}

template<typename Operator>
__device__ int blockCmpReduce(int val, const Operator & Cmp)
{
    static __shared__ int shared[32]; // one partial result per warp (at most 32 warps per block)
    int lane = threadIdx.x & (warpSize - 1);
    int wid = threadIdx.x / warpSize;

    // reduce within each warp
    val = warpCmpReduce(val, Cmp);

    if (lane == 0)
        shared[wid] = val;
    __syncthreads();

    // pad unused slots with shared[0]; this is safe only for idempotent
    // operators such as min or max, where duplicating a value is harmless
    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : shared[0];

    // the first warp reduces the partial results
    if (wid == 0)
        val = warpCmpReduce(val, Cmp);

    // broadcast the block-wide result to all threads
    if (threadIdx.x == 0)
        shared[0] = val;
    __syncthreads();

    return shared[0];
}
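
// A hypothetical example (not part of this commit) of the comparison-based
// reduction: a block-wide maximum. Because inactive slots are padded with
// shared[0] rather than an identity, the functor must tolerate duplicated
// values, which holds for min, max and similar idempotent operations.
struct Max
{
    __device__ int operator()(int a, int b) const { return a > b ? a : b; }
};

__global__ void maxKernel(const int* input, int* blockMaxima, int size)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    // pad out-of-range threads with an element already present in the data,
    // which is safe for idempotent operations like max
    int val = (i < size) ? input[i] : input[0];
    val = blockCmpReduce(val, Max{});  // cooperative call
    if (threadIdx.x == 0)
        blockMaxima[blockIdx.x] = val;
}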

#endif

        } // namespace Sorting
    } // namespace Algorithms
} // namespace TNL
 No newline at end of file
+173 −15
@@ -68,43 +68,43 @@ struct CudaBlockScan
      static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(),
                     "blockSize is too large, it would not be possible to scan warpResults using one warp" );

-      // Store the threadValue in the shared memory.
+      // store the threadValue in the shared memory
      const int chunkResultIdx = Cuda::getInterleaving( tid );
      storage.chunkResults[ chunkResultIdx ] = threadValue;
      __syncthreads();

-      // Perform the parallel scan on chunkResults inside warps.
-      const int threadInWarpIdx = tid % Cuda::getWarpSize();
-      const int warpIdx = tid / Cuda::getWarpSize();
+      // perform the parallel scan on chunkResults inside warps
+      const int lane_id = tid % Cuda::getWarpSize();
+      const int warp_id = tid / Cuda::getWarpSize();
      #pragma unroll
      for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
-         if( threadInWarpIdx >= stride ) {
+         if( lane_id >= stride ) {
            storage.chunkResults[ chunkResultIdx ] = reduction( storage.chunkResults[ chunkResultIdx ], storage.chunkResults[ Cuda::getInterleaving( tid - stride ) ] );
         }
         __syncwarp();
      }
      threadValue = storage.chunkResults[ chunkResultIdx ];

-      // The last thread in warp stores the intermediate result in warpResults.
-      if( threadInWarpIdx == Cuda::getWarpSize() - 1 )
-         storage.warpResults[ warpIdx ] = threadValue;
+      // the last thread in warp stores the intermediate result in warpResults
+      if( lane_id == Cuda::getWarpSize() - 1 )
+         storage.warpResults[ warp_id ] = threadValue;
      __syncthreads();

-      // Perform the scan of warpResults using one warp.
-      if( warpIdx == 0 )
+      // perform the scan of warpResults using one warp
+      if( warp_id == 0 )
         #pragma unroll
         for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) {
-            if( threadInWarpIdx >= stride )
+            if( lane_id >= stride )
               storage.warpResults[ tid ] = reduction( storage.warpResults[ tid ], storage.warpResults[ tid - stride ] );
            __syncwarp();
         }
      __syncthreads();

-      // Shift threadValue by the warpResults.
-      if( warpIdx > 0 )
-         threadValue = reduction( threadValue, storage.warpResults[ warpIdx - 1 ] );
+      // shift threadValue by the warpResults
+      if( warp_id > 0 )
+         threadValue = reduction( threadValue, storage.warpResults[ warp_id - 1 ] );

-      // Shift the result for exclusive scan.
+      // shift the result for exclusive scan
      if( scanType == ScanType::Exclusive ) {
         storage.chunkResults[ chunkResultIdx ] = threadValue;
         __syncthreads();
@@ -116,6 +116,164 @@ struct CudaBlockScan
   }
};

template< ScanType scanType,
          int __unused,  // the __shfl implementation does not depend on the blockSize
          typename Reduction,
          typename ValueType >
struct CudaBlockScanShfl
{
   // storage to be allocated in shared memory
   struct Storage
   {
      ValueType warpResults[ Cuda::getWarpSize() ];
   };

   /* Cooperative scan across the CUDA block - each thread will get the
    * result of the scan according to its ID.
    *
    * \param reduction    The binary reduction functor.
    * \param identity     Neutral element for given reduction operation, i.e.
    *                     value such that `reduction(identity, x) == x` for any `x`.
    * \param threadValue  Value of the calling thread to be included in the scan.
    * \param tid          Index of the calling thread (usually `threadIdx.x`,
    *                     unless you know what you are doing).
    * \param storage      Auxiliary storage (must be allocated as a __shared__
    *                     variable).
    */
   __device__ static
   ValueType
   scan( const Reduction& reduction,
         ValueType identity,
         ValueType threadValue,
         int tid,
         Storage& storage )
   {
      const int lane_id = tid % Cuda::getWarpSize();
      const int warp_id = tid / Cuda::getWarpSize();

      // perform the parallel scan across warps
      ValueType total;
      threadValue = warpScan< scanType >( reduction, identity, threadValue, lane_id, total );

      // the last thread in warp stores the result of inclusive scan in warpResults
      if( lane_id == Cuda::getWarpSize() - 1 )
         storage.warpResults[ warp_id ] = total;
      __syncthreads();

      // the first warp performs the scan of warpResults
      if( warp_id == 0 ) {
         // read from shared memory only if that warp existed
         if( tid < blockDim.x / Cuda::getWarpSize() )
            total = storage.warpResults[ lane_id ];
         else
            total = identity;
         storage.warpResults[ lane_id ] = warpScan< ScanType::Inclusive >( reduction, identity, total, lane_id, total );
      }
      __syncthreads();

      // shift threadValue by the warpResults
      if( warp_id > 0 )
         threadValue = reduction( threadValue, storage.warpResults[ warp_id - 1 ] );

      __syncthreads();
      return threadValue;
   }

   /* Helper function.
    * Cooperative scan across the warp - each thread will get the result of the
    * scan according to its ID.
    * return value = thread's result of the *warpScanType* scan
    * total = thread's result of the *inclusive* scan
    */
   template< ScanType warpScanType >
   __device__ static
   ValueType
   warpScan( const Reduction& reduction,
             ValueType identity,
             ValueType threadValue,
             int lane_id,
             ValueType& total )
   {
      constexpr unsigned mask = 0xffffffff;

      // perform an inclusive scan
      #pragma unroll
      for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
         const ValueType otherValue = __shfl_up_sync( mask, threadValue, stride );
         if( lane_id >= stride )
            threadValue = reduction( threadValue, otherValue );
      }

      // set the result of the inclusive scan
      total = threadValue;

      // shift the result for exclusive scan
      if( warpScanType == ScanType::Exclusive ) {
         threadValue = __shfl_up_sync( mask, threadValue, 1 );
         if( lane_id == 0 )
            threadValue = identity;
      }

      return threadValue;
   }
};
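
// A hypothetical kernel sketch (not part of this commit) showing how the
// struct is meant to be driven: the Storage lives in shared memory and every
// thread of the block calls scan(). The Add functor is a stand-in for any
// associative operation with a known identity.
struct Add
{
   __device__ int operator()( int a, int b ) const { return a + b; }
};

template< int blockSize >
__global__ void exclusiveScanKernel( const int* input, int* output, int size )
{
   using BlockScan = CudaBlockScanShfl< ScanType::Exclusive, blockSize, Add, int >;
   __shared__ typename BlockScan::Storage storage;

   const int i = blockIdx.x * blockDim.x + threadIdx.x;
   const int value = ( i < size ) ? input[ i ] : 0;
   // cooperative call: all threads of the block must participate
   const int result = BlockScan::scan( Add{}, 0, value, threadIdx.x, storage );
   if( i < size )
      output[ i ] = result;
}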

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, int >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, int >
{};

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, unsigned int >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned int >
{};

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, long >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, long >
{};

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, unsigned long >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned long >
{};

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, long long >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, long long >
{};

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, unsigned long long >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned long long >
{};

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, float >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, float >
{};

template< ScanType scanType,
          int blockSize,
          typename Reduction >
struct CudaBlockScan< scanType, blockSize, Reduction, double >
: public CudaBlockScanShfl< scanType, blockSize, Reduction, double >
{};
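
// Net effect of the specializations above: instantiating CudaBlockScan with
// any of these eight fundamental types now resolves to the __shfl-based
// implementation, e.g. (hypothetical instantiation, reusing the Add functor
// from the sketch above):
//
//    using IntScan = CudaBlockScan< ScanType::Inclusive, 256, Add, int >;
//
// Other value types keep the generic shared-memory implementation.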

/* Template for cooperative scan of a data tile in the global memory.
 * It is a *cooperative* operation - all threads must call the operation,
 * otherwise it will deadlock!