From 936f2c3213616ecafebc039e25d8964149f1d4e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 12 Feb 2021 20:45:07 +0100
Subject: [PATCH] Refactoring of adaptive CSR kernel.

---
 .../Segments/CSRAdaptiveKernelView.hpp        | 23 +++++++++----------
 .../CSRAdaptiveKernelBlockDescriptor.h        | 14 ++++++++++-
 .../details/CSRAdaptiveKernelParameters.h     |  5 +++-
 3 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
index 40700c50f2..a9f921c73c 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -53,16 +53,16 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    constexpr size_t StreamedSharedElementsPerWarp  = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
 
    __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
-   //__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
-   //__shared__ BlockType sharedBlocks[ WarpsCount ];
+   __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
+   __shared__ BlockType sharedBlocks[ WarpsCount ];
 
    const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
    const Index blockIdx = index / WarpSize;
    if( blockIdx >= blocks.getSize() - 1 )
       return;
 
-   //if( threadIdx.x < CudaBlockSize / WarpSize )
-   //   multivectorShared[ threadIdx.x ] = zero;
+   if( threadIdx.x < CudaBlockSize / WarpSize )
+      multivectorShared[ threadIdx.x ] = zero;
    Real result = zero;
    bool compute( true );
    const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
@@ -71,7 +71,8 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    __syncthreads();
    const auto& block = sharedBlocks[ warpIdx ];*/
    const BlockType block = blocks[ blockIdx ];
-   const Index begin = offsets[ block.getFirstSegment() ];
+   const Index firstSegmentIdx = block.getFirstSegment();
+   const Index begin = offsets[ firstSegmentIdx ];
 
    if( block.getType() == details::Type::STREAM ) // Stream kernel - many short segments per warp
    {
@@ -80,12 +81,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
 
       // Stream data to shared memory
       for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize )
-      {
          streamShared[ warpIdx ][ globalIdx - begin ] = fetch( globalIdx, compute );
-      }
-      //const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
+      const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
 
-      /*for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
+      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
       {
          const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
          result = zero;
@@ -93,9 +92,9 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
          for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
             result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
          keep( i, result );
-      }*/
+      }
    }
-   /*else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
+   else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
    {
       const Index end = begin + block.getSize();
       const Index segmentIdx = block.getFirstSegment();
@@ -172,7 +171,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
             keep( segmentIdx, multivectorShared[ 0 ] );
          }
       }
-   }*/
+   }
 }
 #endif
 
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
index 96f1899b26..d2be896645 100644
--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
@@ -22,11 +22,13 @@ enum class Type {
    VECTOR = 2
 };
 
+//#define CSR_ADAPTIVE_UNION
+
 #ifdef CSR_ADAPTIVE_UNION
 template< typename Index >
 union CSRAdaptiveKernelBlockDescriptor
 {
-   CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept
+   CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0, uint8_t warpsCount = 0) noexcept
    {
       this->index[0] = row;
       this->index[1] = index;
@@ -80,6 +82,16 @@ union CSRAdaptiveKernelBlockDescriptor
       return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
    }
 
+   __cuda_callable__ uint8_t getWarpIdx() const
+   {
+      return index[ 1 ];
+   }
+
+   __cuda_callable__ uint8_t getWarpsCount() const
+   {
+      return 1;
+   }
+
    void print( std::ostream& str ) const
    {
       Type type = this->getType();
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
index 3fa0855cb6..0f00fbd808 100644
--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
@@ -15,7 +15,7 @@ namespace TNL {
       namespace Segments {
          namespace details {
 
-static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 };
+static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 256, 256, 256 };
 
 template< int SizeOfValue = 1,
           int StreamedSharedMemory_ = 24576 >
@@ -25,7 +25,10 @@ struct CSRAdaptiveKernelParameters
 
    static constexpr int getSizeValueLogConstexpr( const int i );
 
+   static constexpr int getSizeOfValue() { return SizeOfValue; };
+
    static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue );
+
    static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." );
 
    /**
-- 
GitLab