Commit 0785c1da authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Adaptive CSR kernels adapts to different Value/Real types.

parent 8d5b7fb2
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -65,6 +65,8 @@ struct CSRAdaptiveKernel

   static constexpr int MaxValueSizeLog() { return ViewType::MaxValueSizeLog; };

   static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); };

   static TNL::String getKernelType();

   template< typename Offsets >
+2 −5
Original line number Diff line number Diff line
@@ -165,7 +165,7 @@ initValueSize( const Offsets& offsets )
      if( type == details::Type::LONG )
      {
         const Index blocksCount = inBlocks.size();
         const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize() / TNL::Cuda::getWarpSize();
         const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< SizeOfValue >::CudaBlockSize() / TNL::Cuda::getWarpSize();
         Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
         if( warpsLeft == 0 )
            warpsLeft = warpsPerCudaBlock;
@@ -182,10 +182,7 @@ initValueSize( const Offsets& offsets )
      start = nextStart;
   }
   inBlocks.emplace_back(nextStart);
   //std::cerr << "Setting blocks to " << std::log2( SizeOfValue ) << std::endl;
   TNL_ASSERT_LT( std::log2( SizeOfValue ), MaxValueSizeLog(), "" );
   TNL_ASSERT_GE( std::log2( SizeOfValue ), 0, "" );
   this->blocksArray[ (int ) std::log2( SizeOfValue ) ] = inBlocks;
   this->blocksArray[ getSizeValueLog( SizeOfValue ) ] = inBlocks;
}

      } // namespace Segments
+4 −1
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@

#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>

namespace TNL {
   namespace Algorithms {
@@ -28,7 +29,9 @@ struct CSRAdaptiveKernelView
   using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
   using BlocksView = typename BlocksType::ViewType;

   static constexpr int MaxValueSizeLog = 6;
   static constexpr int MaxValueSizeLog = details::CSRAdaptiveKernelParameters<>::MaxValueSizeLog;

   static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); };

   CSRAdaptiveKernelView() = default;

+5 −23
Original line number Diff line number Diff line
@@ -113,32 +113,17 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
   }
   else // blockType == Type::LONG - several warps per segment
   {
      // Number of elements processed by previous warps
      //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
      ///   block.getWarpIdx() * MAX_ELEM_PER_WARP;
      //Index to = begin + (block.getWarpIdx()  + 1) * MAX_ELEM_PER_WARP;
      const Index segmentIdx = block.getFirstSegment();//block.index[0];
      //minID = offsets[block.index[0] ];
      const Index end = offsets[segmentIdx + 1];
      //const int tid = threadIdx.x;
      //const int inBlockWarpIdx = block.getWarpIdx();

      //if( to > end )
      //   to = end;
      TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
      result = zero;
      //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx()  );
      for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
           globalIdx < end;
           globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
      {
         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
         //if( laneIdx == 0 )
         //   printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end,
         //    details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) );
         //result += values[i] * inVector[columnIndexes[i]];
      }
      //printf( "tid %d -> %d \n", tid, result );

      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
@@ -146,9 +131,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
      result += __shfl_down_sync(0xFFFFFFFF, result, 1);

      //if( laneIdx == 0 )
      //   printf( "WARP RESULT: tid %d -> %d \n", tid, result );

      const Index warpID = threadIdx.x / 32;
      if( laneIdx == 0 )
         multivectorShared[ warpID ] = result;
@@ -249,9 +231,9 @@ segmentsReduction( const OffsetsView& offsets,
                   Args... args ) const
{
#ifdef HAVE_CUDA
   int valueSizeLog = std::ceil( log2f( ( double ) sizeof( Real ) ) );
   int valueSizeLog = getSizeValueLog( sizeof( Real ) );

   if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog > MaxValueSizeLog )
   if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog >= MaxValueSizeLog )
   {
      TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
         segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
@@ -261,7 +243,7 @@ segmentsReduction( const OffsetsView& offsets,
   Index blocksCount;

   const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
   constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); //2147483647;
   constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize();

   // Fill blocks
   size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * TNL::Cuda::getWarpSize(); // one warp per block
+38 −3
Original line number Diff line number Diff line
@@ -15,17 +15,26 @@ namespace TNL {
      namespace Segments {
         namespace details {

template< int SizeOfValue,
static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 };

template< int SizeOfValue = 1,
          int StreamedSharedMemory_ = 24576 >
struct CSRAdaptiveKernelParameters
{
   static constexpr int MaxValueSizeLog = 6;

   static constexpr int getSizeValueLogConstexpr( const int i );

   static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue );
   static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." );

   /**
    * \brief Computes number of CUDA threads per block depending on Value type.
    *
    * \return CUDA block size.
    */
   static constexpr int CudaBlockSize() { return 128; }; //sizeof( Value ) == 8 ? 128 : 256; };
    //std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); };
   static constexpr int CudaBlockSize() { return CSRAdaptiveKernelParametersCudaBlockSizes[ SizeOfValueLog ]; };
   //{ return SizeOfValue == 8 ? 128 : 256; };

   /**
    * \brief Returns amount of shared memory dedicated for stream CSR kernel.
@@ -64,6 +73,32 @@ struct CSRAdaptiveKernelParameters
    * \return Maximum number of elements per warp for adaptive kernel.
    */
   static constexpr int MaxAdaptiveElementsPerWarp() { return 512; };

   static int getSizeValueLog( const int i )
   {
      if( i ==  1 ) return 0;
      if( i ==  2 ) return 1;
      if( i <=  4 ) return 2;
      if( i <=  8 ) return 3;
      if( i <= 16 ) return 4;
      return 5;
   }
};


template< int SizeOfValue,
          int StreamedSharedMemory_ >
constexpr int 
CSRAdaptiveKernelParameters< SizeOfValue, StreamedSharedMemory_ >::
getSizeValueLogConstexpr( const int i )
{
   if( i ==  1 ) return 0;
   if( i ==  2 ) return 1;
   if( i <=  4 ) return 2;
   if( i <=  8 ) return 3;
   if( i <= 16 ) return 4;
   if( i <= 32 ) return 5;
   return 6;
};

         } // namespace details