From 0785c1dac5e0b6dad7f1ee39eccca92696aaf1bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 10 Feb 2021 15:50:44 +0100
Subject: [PATCH] Adaptive CSR kernel adapts to different Value/Real types.
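
The adaptive CSR kernel previously hard-coded MaxValueSizeLog in the view and
computed the index into blocksArray with std::log2/log2f at runtime. The
mapping from sizeof(Value) to its log index now lives in
details::CSRAdaptiveKernelParameters, which also selects the CUDA block size
per value size: 256 threads for 1-4 byte values, 128 threads for 8-32 byte
values. Value types whose size log reaches MaxValueSizeLog fall back to the
scalar CSR kernel.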

---
 .../Algorithms/Segments/CSRAdaptiveKernel.h   |  2 +
 .../Algorithms/Segments/CSRAdaptiveKernel.hpp |  7 +---
 .../Segments/CSRAdaptiveKernelView.h          |  5 ++-
 .../Segments/CSRAdaptiveKernelView.hpp        | 28 +++----------
 .../details/CSRAdaptiveKernelParameters.h     | 41 +++++++++++++++++--
 5 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
index 22cf447ecb..d6c3f2b92d 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
@@ -65,6 +65,8 @@ struct CSRAdaptiveKernel
 
    static constexpr int MaxValueSizeLog() { return ViewType::MaxValueSizeLog; };
 
+   static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); };
+
    static TNL::String getKernelType();
 
    template< typename Offsets >
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
index 13c653c6c5..7bcb66c28c 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
@@ -165,7 +165,7 @@ initValueSize( const Offsets& offsets )
       if( type == details::Type::LONG )
       {
          const Index blocksCount = inBlocks.size();
-         const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize() / TNL::Cuda::getWarpSize();
+         const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< SizeOfValue >::CudaBlockSize() / TNL::Cuda::getWarpSize();
          Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
          if( warpsLeft == 0 )
             warpsLeft = warpsPerCudaBlock;
@@ -182,10 +182,7 @@ initValueSize( const Offsets& offsets )
       start = nextStart;
    }
    inBlocks.emplace_back(nextStart);
-   //std::cerr << "Setting blocks to " << std::log2( SizeOfValue ) << std::endl;
-   TNL_ASSERT_LT( std::log2( SizeOfValue ), MaxValueSizeLog(), "" );
-   TNL_ASSERT_GE( std::log2( SizeOfValue ), 0, "" );
-   this->blocksArray[ (int ) std::log2( SizeOfValue ) ] = inBlocks;
+   this->blocksArray[ getSizeValueLog( SizeOfValue ) ] = inBlocks;
 }
 
       } // namespace Segments
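
[Note on the hunk above] The replaced std::log2-based indexing went through
floating point and was guarded only by runtime asserts; getSizeValueLog makes
the size-to-index mapping explicit in integer arithmetic. A minimal standalone
sketch of that mapping (the main() harness is illustrative, not part of TNL):

   #include <cassert>

   // Maps sizeof(Value) to the blocksArray index:
   // 1 -> 0, 2 -> 1, 3..4 -> 2, 5..8 -> 3, 9..16 -> 4, larger -> 5.
   int getSizeValueLog( const int i )
   {
      if( i ==  1 ) return 0;
      if( i ==  2 ) return 1;
      if( i <=  4 ) return 2;
      if( i <=  8 ) return 3;
      if( i <= 16 ) return 4;
      return 5;
   }

   int main()
   {
      assert( getSizeValueLog( sizeof( float ) )  == 2 );  // 4-byte Value
      assert( getSizeValueLog( sizeof( double ) ) == 3 );  // 8-byte Value
   }
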
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
index 113008ad08..b81d360278 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
@@ -12,6 +12,7 @@
 
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -28,7 +29,9 @@ struct CSRAdaptiveKernelView
    using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
    using BlocksView = typename BlocksType::ViewType;
 
-   static constexpr int MaxValueSizeLog = 6;
+   static constexpr int MaxValueSizeLog = details::CSRAdaptiveKernelParameters<>::MaxValueSizeLog;
+
+   static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); };
 
    CSRAdaptiveKernelView() = default;
 
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
index 427b5eba7b..2ddfcd65c8 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -113,32 +113,17 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    }
    else // blockType == Type::LONG - several warps per segment
    {
-      // Number of elements processed by previous warps
-      //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
-      ///   block.getWarpIdx() * MAX_ELEM_PER_WARP;
-      //Index to = begin + (block.getWarpIdx()  + 1) * MAX_ELEM_PER_WARP;
       const Index segmentIdx = block.getFirstSegment();//block.index[0];
-      //minID = offsets[block.index[0] ];
       const Index end = offsets[segmentIdx + 1];
-      //const int tid = threadIdx.x;
-      //const int inBlockWarpIdx = block.getWarpIdx();
 
-      //if( to > end )
-      //   to = end;
       TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
       result = zero;
-      //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx()  );
       for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
            globalIdx < end;
            globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
       {
          result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
-         //if( laneIdx == 0 )
-         //   printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end,
-         //    details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) );
-         //result += values[i] * inVector[columnIndexes[i]];
       }
-      //printf( "tid %d -> %d \n", tid, result );
 
       result += __shfl_down_sync(0xFFFFFFFF, result, 16);
       result += __shfl_down_sync(0xFFFFFFFF, result, 8);
@@ -146,9 +131,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
       result += __shfl_down_sync(0xFFFFFFFF, result, 2);
       result += __shfl_down_sync(0xFFFFFFFF, result, 1);
 
-      //if( laneIdx == 0 )
-      //   printf( "WARP RESULT: tid %d -> %d \n", tid, result );
-
       const Index warpID = threadIdx.x / 32;
       if( laneIdx == 0 )
          multivectorShared[ warpID ] = result;
@@ -249,9 +231,9 @@ segmentsReduction( const OffsetsView& offsets,
                    Args... args ) const
 {
 #ifdef HAVE_CUDA
-   int valueSizeLog = std::ceil( log2f( ( double ) sizeof( Real ) ) );
+   int valueSizeLog = getSizeValueLog( sizeof( Real ) );
 
-   if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog > MaxValueSizeLog )
+   if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog >= MaxValueSizeLog )
    {
       TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
          segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
@@ -261,11 +243,11 @@ segmentsReduction( const OffsetsView& offsets,
    Index blocksCount;
 
    const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
-   constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); //2147483647;
+   constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize();
 
-   // Fill blocks 
+   // Fill blocks
    size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * TNL::Cuda::getWarpSize(); // one warp per block
-   // Execute kernels on device 
+   // Execute kernels on device
    for (Index gridIdx = 0; neededThreads != 0; gridIdx++ )
    {
       if( maxGridSize * threads >= neededThreads )
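
[Note on the hunks above] The __shfl_down_sync ladder retained in the LONG
branch is the standard warp-level tree reduction: each step folds the upper
half of the active lanes into the lower half, so after five steps lane 0 holds
the sum over the whole 32-lane warp. A minimal standalone CUDA sketch of the
same pattern (kernel name and output layout are illustrative only; it assumes
blockDim.x is a multiple of the warp size):

   __global__ void warpSumDemo( const float* in, float* out )
   {
      float value = in[ blockIdx.x * blockDim.x + threadIdx.x ];
      // Fold lanes 16..31 into 0..15, then 8..15 into 0..7, and so on.
      for( int offset = 16; offset > 0; offset /= 2 )
         value += __shfl_down_sync( 0xFFFFFFFF, value, offset );
      if( threadIdx.x % 32 == 0 )          // lane 0 of each warp
         out[ ( blockIdx.x * blockDim.x + threadIdx.x ) / 32 ] = value;
   }
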
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
index 56f203a74b..3fa0855cb6 100644
--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
@@ -15,17 +15,26 @@ namespace TNL {
       namespace Segments {
          namespace details {
 
-template< int SizeOfValue,
+static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 };
+
+template< int SizeOfValue = 1,
           int StreamedSharedMemory_ = 24576 >
 struct CSRAdaptiveKernelParameters
 {
+   static constexpr int MaxValueSizeLog = 6;
+
+   static constexpr int getSizeValueLogConstexpr( const int i );
+
+   static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue );
+   static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." );
+
    /**
     * \brief Computes number of CUDA threads per block depending on Value type.
     *
     * \return CUDA block size.
     */
-   static constexpr int CudaBlockSize() { return 128; }; //sizeof( Value ) == 8 ? 128 : 256; };
-    //std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); };
+   static constexpr int CudaBlockSize() { return CSRAdaptiveKernelParametersCudaBlockSizes[ SizeOfValueLog ]; };
+   //{ return SizeOfValue == 8 ? 128 : 256; };
 
    /**
     * \brief Returns amount of shared memory dedicated for stream CSR kernel.
@@ -64,6 +73,32 @@ struct CSRAdaptiveKernelParameters
     * \return Maximum number of elements per warp for adaptive kernel.
     */
    static constexpr int MaxAdaptiveElementsPerWarp() { return 512; };
+
+   static int getSizeValueLog( const int i )
+   {
+      if( i ==  1 ) return 0;
+      if( i ==  2 ) return 1;
+      if( i <=  4 ) return 2;
+      if( i <=  8 ) return 3;
+      if( i <= 16 ) return 4;
+      return 5;
+   }
+};
+
+
+template< int SizeOfValue,
+          int StreamedSharedMemory_ >
+constexpr int 
+CSRAdaptiveKernelParameters< SizeOfValue, StreamedSharedMemory_ >::
+getSizeValueLogConstexpr( const int i )
+{
+   if( i ==  1 ) return 0;
+   if( i ==  2 ) return 1;
+   if( i <=  4 ) return 2;
+   if( i <=  8 ) return 3;
+   if( i <= 16 ) return 4;
+   if( i <= 32 ) return 5;
+   return 6;
 };
 
          } // namespace details
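
[Note on the hunk above] With the block-size table, CudaBlockSize() resolves at
compile time to a per-size constant, and the constexpr variant of the mapping
returns MaxValueSizeLog for oversized value types so that the static_assert in
the struct fires. An illustrative instantiation (the expected values follow
directly from the table):

   #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>

   using namespace TNL::Algorithms::Segments::details;

   // 4-byte values (e.g. float):  log index 2 -> 256 threads per block.
   static_assert( CSRAdaptiveKernelParameters< sizeof( float ) >::CudaBlockSize() == 256, "" );
   // 8-byte values (e.g. double): log index 3 -> 128 threads per block.
   static_assert( CSRAdaptiveKernelParameters< sizeof( double ) >::CudaBlockSize() == 128, "" );
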
-- 
GitLab