Commit afc75e25 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Refuciktoring of adaptive CSR kernel.

parent 0c4eacdb
Loading
Loading
Loading
Loading
+11 −133
Original line number Diff line number Diff line
@@ -23,35 +23,6 @@ namespace TNL {
   namespace Algorithms {
      namespace Segments {

#ifdef HAVE_CUDA

template< int CudaBlockSize,
          int warpSize,
          int WARPS,
          int SHARED_PER_WARP,
          int MAX_ELEM_PER_WARP,
          typename BlocksView,
          typename Offsets,
          typename Index,
          typename Fetch,
          typename Reduction,
          typename ResultKeeper,
          typename Real,
          typename... Args >
__global__ void
segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                    int gridIdx,
                                    Offsets offsets,
                                    Index first,
                                    Index last,
                                    Fetch fetch,
                                    Reduction reduce,
                                    ResultKeeper keep,
                                    Real zero,
                                    Args... args );
#endif


template< typename Index,
          typename Device >
struct CSRAdaptiveKernel
@@ -63,114 +34,24 @@ struct CSRAdaptiveKernel
   using BlocksType = typename ViewType::BlocksType;
   using BlocksView = typename BlocksType::ViewType;

   static TNL::String getKernelType()
   {
      return ViewType::getKernelType();
   };

    static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;

   /* How many shared memory use per block in CSR Adaptive kernel */
   static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO:

   /* Number of elements in shared memory */
   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);

   /* Number of warps in block for CSR Adaptive */
   static constexpr Index WARPS = THREADS_ADAPTIVE / 32;
   static TNL::String getKernelType();

   /* Number of elements in shared memory per one warp */
   static constexpr Index SHARED_PER_WARP = SHARED / WARPS;

   /* Max length of row to process one warp for CSR Light, MultiVector */
   static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

   /* Max length of row to process one warp for CSR Adaptive */
   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
   
   template< typename Offsets >
   Index findLimit( const Index start,
                    const Offsets& offsets,
                    const Index size,
                    details::Type &type,
                    Index &sum )
   {
      sum = 0;
      for (Index current = start; current < size - 1; current++ )
      {
         Index elements = offsets[ current + 1 ] - offsets[ current ];
         sum += elements;
         if( sum > SHARED_PER_WARP )
         {
            if( current - start > 0 ) // extra row
            {
               type = details::Type::STREAM;
               return current;
            }
            else
            {                  // one long row
               if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT )
                  type = details::Type::VECTOR;
               else
                  type = details::Type::LONG;
               return current + 1;
            }
         }
      }
      type = details::Type::STREAM;
      return size - 1; // return last row pointer
    }
                    Index &sum );

   template< typename Offsets >
   void init( const Offsets& offsets )
   {
      using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >;
      HostOffsetsType hostOffsets( offsets );
      const Index rows = offsets.getSize();
      Index sum, start( 0 ), nextStart( 0 );
   void init( const Offsets& offsets );

      // Fill blocks
      std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks;
      inBlocks.reserve( rows );
   void reset();

      while( nextStart != rows - 1 )
      {
         details::Type type;
         nextStart = findLimit( start, hostOffsets, rows, type, sum );
   ViewType getView();

         if( type == details::Type::LONG )
         {
            const Index blocksCount = inBlocks.size();
            const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize();
            Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
            if( warpsLeft == 0 )
               warpsLeft = warpsPerCudaBlock;
            for( Index index = 0; index < warpsLeft; index++ )
               inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft );
         }
         else
         {
            inBlocks.emplace_back(start, type,
                  nextStart,
                  offsets.getElement(nextStart),
                  offsets.getElement(start) );
         }
         start = nextStart;
      }
      inBlocks.emplace_back(nextStart);
      this->blocks = inBlocks;
      this->view.setBlocks( blocks );
   }

   void reset()
   {
      this->blocks.reset();
      this->view.setBlocks( blocks );
   }

   ViewType getView() { return this->view; };

   ConstViewType getConstView() const { return this->view; };
   ConstViewType getConstView() const;

   template< typename OffsetsView,
              typename Fetch,
@@ -185,10 +66,7 @@ struct CSRAdaptiveKernel
                        const Reduction& reduction,
                        ResultKeeper& keeper,
                        const Real& zero,
                        Args... args ) const
   {
      view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
   }
                        Args... args ) const;

   protected:
      BlocksType blocks;
+142 −1
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@
#include <TNL/Containers/VectorView.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>
#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>

@@ -22,6 +23,146 @@ namespace TNL {
   namespace Algorithms {
      namespace Segments {

template< typename Index,
          typename Device >
TNL::String
CSRAdaptiveKernel< Index, Device >::
getKernelType()
{
   return ViewType::getKernelType();
}

template< typename Index,
          typename Device >
   template< typename Offsets >
Index
CSRAdaptiveKernel< Index, Device >::
findLimit( const Index start,
           const Offsets& offsets,
           const Index size,
           details::Type &type,
           Index &sum )
{
   sum = 0;
   for (Index current = start; current < size - 1; current++ )
   {
      Index elements = offsets[ current + 1 ] - offsets[ current ];
      sum += elements;
      if( sum > details::CSRAdaptiveKernelParameters< double >::StreamedSharedElementsPerWarp() )
      {
         if( current - start > 0 ) // extra row
         {
            type = details::Type::STREAM;
            return current;
         }
         else
         {                  // one long row
            if( sum <= 2 * details::CSRAdaptiveKernelParameters< double >::MaxAdaptiveElementsPerWarp() )
               type = details::Type::VECTOR;
            else
               type = details::Type::LONG;
            return current + 1;
         }
      }
   }
   type = details::Type::STREAM;
   return size - 1; // return last row pointer
   }

template< typename Index,
          typename Device >
   template< typename Offsets >
void
CSRAdaptiveKernel< Index, Device >::
init( const Offsets& offsets )
{
   using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >;
   HostOffsetsType hostOffsets( offsets );
   const Index rows = offsets.getSize();
   Index sum, start( 0 ), nextStart( 0 );

   // Fill blocks
   std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks;
   inBlocks.reserve( rows );

   while( nextStart != rows - 1 )
   {
      details::Type type;
      nextStart = findLimit( start, hostOffsets, rows, type, sum );

      if( type == details::Type::LONG )
      {
         const Index blocksCount = inBlocks.size();
         const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< double >::CudaBlockSize() / Cuda::getWarpSize();
         Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
         if( warpsLeft == 0 )
            warpsLeft = warpsPerCudaBlock;
         for( Index index = 0; index < warpsLeft; index++ )
            inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft );
      }
      else
      {
         inBlocks.emplace_back(start, type,
               nextStart,
               offsets.getElement(nextStart),
               offsets.getElement(start) );
      }
      start = nextStart;
   }
   inBlocks.emplace_back(nextStart);
   this->blocks = inBlocks;
   this->view.setBlocks( blocks );
}

template< typename Index,
          typename Device >
void
CSRAdaptiveKernel< Index, Device >::
reset()
{
   this->blocks.reset();
   this->view.setBlocks( blocks );
}

template< typename Index,
          typename Device >
auto
CSRAdaptiveKernel< Index, Device >::
getView() -> ViewType
{
   return this->view;
}

template< typename Index,
          typename Device >
auto
CSRAdaptiveKernel< Index, Device >::
getConstView() const -> ConstViewType
{
   return this->view;
};

template< typename Index,
          typename Device >
   template< typename OffsetsView,
               typename Fetch,
               typename Reduction,
               typename ResultKeeper,
               typename Real,
               typename... Args >
void
CSRAdaptiveKernel< Index, Device >::
segmentsReduction( const OffsetsView& offsets,
                   Index first,
                   Index last,
                   Fetch& fetch,
                   const Reduction& reduction,
                   ResultKeeper& keeper,
                   const Real& zero,
                   Args... args ) const
{
   view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
}

      } // namespace Segments
   }  // namespace Algorithms
+1 −0
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@

#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>

namespace TNL {
   namespace Algorithms {
+30 −77
Original line number Diff line number Diff line
@@ -25,12 +25,7 @@ namespace TNL {

#ifdef HAVE_CUDA

template< int CudaBlockSize,
          int warpSize,
          int WARPS,
          int SHARED_PER_WARP,
          int MAX_ELEM_PER_WARP,
          typename BlocksView,
template< typename BlocksView,
          typename Offsets,
          typename Index,
          typename Fetch,
@@ -50,10 +45,18 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                    Real zero,
                                    Args... args )
{
   __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
   __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
   constexpr size_t MAX_X_DIM = 2147483647;
   const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
   constexpr size_t CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize();
   constexpr int WarpSize = Cuda::getWarpSize();
   constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount();
   constexpr size_t StreamedSharedElementsPerWarp  = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp();

   /////
   // Note, for some large large Value types the following allocation of the shared memory might fail.
   // We might add some check here.
   __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
   __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];

   const Index index = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   const Index blockIdx = index / warpSize;
   if( blockIdx >= blocks.getSize() - 1 )
      return;
@@ -113,32 +116,17 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
   }
   else // blockType == Type::LONG - several warps per segment
   {
      // Number of elements processed by previous warps
      //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
      ///   block.getWarpIdx() * MAX_ELEM_PER_WARP;
      //Index to = begin + (block.getWarpIdx()  + 1) * MAX_ELEM_PER_WARP;
      const Index segmentIdx = block.getFirstSegment();//block.index[0];
      //minID = offsets[block.index[0] ];
      const Index segmentIdx = block.getFirstSegment();
      const Index end = offsets[ segmentIdx + 1 ];
      //const int tid = threadIdx.x;
      //const int inBlockWarpIdx = block.getWarpIdx();

      //if( to > end )
      //   to = end;
      TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
      result = zero;
      //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx()  );
      for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
           globalIdx < end;
           globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
      {
         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
         //if( laneIdx == 0 )
         //   printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end,
         //    details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) );
         //result += values[i] * inVector[columnIndexes[i]];
      }
      //printf( "tid %d -> %d \n", tid, result );

      result += __shfl_down_sync( 0xFFFFFFFF, result, 16 );
      result += __shfl_down_sync( 0xFFFFFFFF, result,  8 );
@@ -146,9 +134,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
      result += __shfl_down_sync( 0xFFFFFFFF, result,  2 );
      result += __shfl_down_sync( 0xFFFFFFFF, result,  1 );

      //if( laneIdx == 0 )
      //   printf( "WARP RESULT: tid %d -> %d \n", tid, result );

      const Index warpID = threadIdx.x / 32;
      if( laneIdx == 0 )
         multivectorShared[ warpID ] = result;
@@ -157,7 +142,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
      // Reduction in multivectorShared
      if( block.getWarpIdx() == 0 && laneIdx < 16 )
      {
         constexpr int totalWarps = CudaBlockSize / warpSize;
         constexpr int totalWarps = CudaBlockSize / WarpSize;
         if( totalWarps >= 32 )
         {
            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
@@ -184,13 +169,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
            __syncwarp();
         }
         if( laneIdx == 0 )
         {
            //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] );
            keep( segmentIdx, multivectorShared[ 0 ] );
      }
   }
}
}
#endif

template< typename Index,
@@ -264,58 +246,29 @@ segmentsReduction( const OffsetsView& offsets,
      return;
   }

   static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
   //static constexpr Index THREADS_SCALAR = 128;
   //static constexpr Index THREADS_VECTOR = 128;
   //static constexpr Index THREADS_LIGHT = 128;

   /* Max length of row to process one warp for CSR Light, MultiVector */
   //static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

   /* Max length of row to process one warp for CSR Adaptive */
   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;

   /* How many shared memory use per block in CSR Adaptive kernel */
   static constexpr Index SHARED_PER_BLOCK = 24576;

   /* Number of elements in shared memory */
   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);

   /* Number of warps in block for CSR Adaptive */
   static constexpr Index WARPS = THREADS_ADAPTIVE / 32;

   /* Number of elements in shared memory per one warp */
   static constexpr Index SHARED_PER_WARP = SHARED / WARPS;

   constexpr int warpSize = 32;
   //constexpr int warpSize = 32;

   Index blocksCount;

   const Index threads = THREADS_ADAPTIVE;
   constexpr size_t MAX_X_DIM = 2147483647;
   const Index threads = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize();

   /* Fill blocks */
   size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block
   size_t neededThreads = this->blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block
   /* Execute kernels on device */
   for( Index gridIdx = 0; neededThreads != 0; gridIdx++ )
   {
      if (MAX_X_DIM * threads >= neededThreads)
      if( Cuda::getMaxGridSize() * threads >= neededThreads )
      {
         blocksCount = roundUpDivision( neededThreads, threads );
         neededThreads = 0;
      }
      else
      {
         blocksCount = MAX_X_DIM;
         neededThreads -= MAX_X_DIM * threads;
         blocksCount = TNL::Cuda::getMaxGridSize();
         neededThreads -= TNL::Cuda::getMaxGridSize() * threads;
      }

      segmentsReductionCSRAdaptiveKernel<
            THREADS_ADAPTIVE,
            warpSize,
            WARPS,
            SHARED_PER_WARP,
            MAX_ELEMENTS_PER_WARP_ADAPT,
            BlocksView,
            OffsetsView,
            Index, Fetch, Reduction, ResultKeeper, Real, Args... >
+70 −0
Original line number Diff line number Diff line
/***************************************************************************
                          CSRAdaptiveKernelBlockDescriptor.h -  description
                             -------------------
    begin                : Jan 25, 2021 -> Joe Biden inauguration
    copyright            : (C) 2021 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

namespace TNL {
   namespace Algorithms {
      namespace Segments {
         namespace details {

template< typename Value >
struct CSRAdaptiveKernelParameters
{
   /**
    * \brief Computes number of CUDA threads per block depending on Value type.
    *
    * \return CUDA block size.
    */
   static constexpr int CudaBlockSize() { return std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); };

   /**
    * \brief Returns amount of shared memory dedicated for stream CSR kernel.
    *
    * \return Stream shared memory.
    */
   static constexpr size_t StreamedSharedMemory() { return 40960; };

   /**
    * \brief Number of elements fitting into streamed shared memory.
    */
   static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); };

   /**
    * \brief Computes number of warps in one CUDA block.
    */
   static constexpr size_t WarpsCount() { return CudaBlockSize() / Cuda::getWarpSize(); };

   /**
    * \brief Computes number of elements to be streamed into the shared memory.
    *
    * \return Number of elements to be streamed into the shared memory.
    */
   static constexpr size_t StreamedSharedElementsPerWarp() { return StreamedSharedElementsCount() / WarpsCount(); };

   /**
    * \brief Returns maximum number of elements per warp for vector and hybrid kernel.
    *
    * \return Maximum number of elements per warp for vector and hybrid kernel.
    */
   static constexpr int MaxVectorElementsPerWarp() { return 384; };

   /**
    * \brief Returns maximum number of elements per warp for adaptive kernel.
    *
    * \return Maximum number of elements per warp for adaptive kernel.
    */
   static constexpr int MaxAdaptiveElementsPerWarp() { return 512; };
};

         } // namespace details
      } // namespace Segments
   }  // namespace Algorithms
} // namespace TNL