From 46482276b60fb41514f4448aef2951718b5e1696 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 7 Feb 2021 20:48:05 +0100
Subject: [PATCH] Renaming CSRKernelAdaptive to CSRAdaptiveKernel.

---
 src/TNL/Algorithms/Segments/CSR.h             |   2 +-
 ...SRKernelAdaptive.h => CSRAdaptiveKernel.h} | 171 ++-------------
 .../Algorithms/Segments/CSRAdaptiveKernel.hpp | 197 ++++++++++++++++++
 src/TNL/Algorithms/Segments/CSRView.h         |   4 +-
 4 files changed, 215 insertions(+), 159 deletions(-)
 rename src/TNL/Algorithms/Segments/{CSRKernelAdaptive.h => CSRAdaptiveKernel.h} (59%)
 create mode 100644 src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp

diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index 188960b6d4..394d4dbade 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -155,7 +155,7 @@ using CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAll
 template< typename Device,
           typename Index,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
-using CSRAdaptive = CSR< Device, Index, CSRKernelAdaptive< Index, Device >, IndexAllocator >;
+using CSRAdaptive = CSR< Device, Index, CSRAdaptiveKernel< Index, Device >, IndexAllocator >;

 template< typename Device,
           typename Index,
diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
similarity index 59%
rename from src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
rename to src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
index 84f1cc4376..0336b06e1f 100644
--- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          CSRKernels.h - description
+                          CSRAdaptiveKernel.h - description
                             -------------------
    begin                : Jan 20, 2021 -> Joe Biden inauguration
    copyright            : (C) 2021 by Tomas Oberhuber
@@ -15,7 +15,7 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
-#include <TNL/Algorithms/Segments/CSRKernelScalar.h>
+#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
 #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>

 namespace TNL {
@@ -47,166 +47,23 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                     Reduction reduce,
                                     ResultKeeper keep,
                                     Real zero,
-                                    Args... args )
-{
-   __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
-   __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
-   constexpr size_t MAX_X_DIM = 2147483647;
-   const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const Index blockIdx = index / warpSize;
-   if( blockIdx >= blocks.getSize() - 1 )
-      return;
-
-   if( threadIdx.x < CudaBlockSize / warpSize )
-      multivectorShared[ threadIdx.x ] = zero;
-   Real result = zero;
-   bool compute( true );
-   const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
-   const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ];
-   const Index& firstSegmentIdx = block.getFirstSegment();
-   const Index begin = offsets[ firstSegmentIdx ];
-
-   const auto blockType = block.getType();
-   if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp
-   {
-      const Index warpIdx = threadIdx.x / 32;
-      const Index end = begin + block.getSize();
-
-      // Stream data to shared memory
-      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize )
-      {
-         streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute );
-            details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
-         // TODO:: fix this by template specialization so that we can assume fetch lambda
-         // with short parameters
-      }
-
-      const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
-
-      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize )
-      {
-         const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
-         result = zero;
-         // Scalar reduction
-         for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
-            result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
-         keep( i, result );
-      }
-   }
-   else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp
-   {
-      const Index end = begin + block.getSize();
-      const Index segmentIdx = block.getFirstSegment();
-
-      for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize )
-         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx
-
-      // Parallel reduction
-      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
-      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
-      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
-      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
-      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
-      if( laneIdx == 0 )
-         keep( segmentIdx, result );
-   }
-   else // blockType == Type::LONG - several warps per segment
-   {
-      // Number of elements processed by previous warps
-      //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
-      ///   block.getWarpIdx() * MAX_ELEM_PER_WARP;
-      //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP;
-      const Index segmentIdx = block.getFirstSegment();//block.index[0];
-      //minID = offsets[block.index[0] ];
-      const Index end = offsets[segmentIdx + 1];
-      //const int tid = threadIdx.x;
-      //const int inBlockWarpIdx = block.getWarpIdx();
-
-      //if( to > end )
-      //   to = end;
-      TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
-      result = zero;
-      //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() );
-      for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
-           globalIdx < end;
-           globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
-      {
-         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
-         //if( laneIdx == 0 )
-         //   printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end,
-         //      details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) );
-         //result += values[i] * inVector[columnIndexes[i]];
-      }
-      //printf( "tid %d -> %d \n", tid, result );
-
-      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
-
-      //if( laneIdx == 0 )
-      //   printf( "WARP RESULT: tid %d -> %d \n", tid, result );
-
-      const Index warpID = threadIdx.x / 32;
-      if( laneIdx == 0 )
-         multivectorShared[ warpID ] = result;
-
-      __syncthreads();
-      // Reduction in multivectorShared
-      if( block.getWarpIdx() == 0 && laneIdx < 16 )
-      {
-         constexpr int totalWarps = CudaBlockSize / warpSize;
-         if( totalWarps >= 32 )
-         {
-            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
-            __syncwarp();
-         }
-         if( totalWarps >= 16 )
-         {
-            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] );
-            __syncwarp();
-         }
-         if( totalWarps >= 8 )
-         {
-            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] );
-            __syncwarp();
-         }
-         if( totalWarps >= 4 )
-         {
-            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] );
-            __syncwarp();
-         }
-         if( totalWarps >= 2 )
-         {
-            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] );
-            __syncwarp();
-         }
-         if( laneIdx == 0 )
-         {
-            //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] );
-            keep( segmentIdx, multivectorShared[ 0 ] );
-         }
-      }
-   }
-}
+                                    Args... args );
 #endif

-
 template< typename Index,
           typename Device >
-struct CSRKernelAdaptiveView
+struct CSRAdaptiveKernelView
 {
    using IndexType = Index;
    using DeviceType = Device;
-   using ViewType = CSRKernelAdaptiveView< Index, Device >;
-   using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
+   using ViewType = CSRAdaptiveKernelView< Index, Device >;
+   using ConstViewType = CSRAdaptiveKernelView< Index, Device >;
    using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
    using BlocksView = typename BlocksType::ViewType;

-   CSRKernelAdaptiveView() = default;
+   CSRAdaptiveKernelView() = default;

-   CSRKernelAdaptiveView( BlocksType& blocks )
+   CSRAdaptiveKernelView( BlocksType& blocks )
    {
       this->blocks.bind( blocks );
    };
@@ -243,7 +100,7 @@ struct CSRKernelAdaptiveView
 #ifdef HAVE_CUDA
       if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() )
      {
-         TNL::Algorithms::Segments::CSRKernelScalar< Index, Device >::
+         TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
            segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
         return;
      }
@@ -318,7 +175,7 @@ struct CSRKernelAdaptiveView
 #endif
    }

-   CSRKernelAdaptiveView& operator=( const CSRKernelAdaptiveView< Index, Device >& kernelView )
+   CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView )
    {
       this->blocks.bind( kernelView.blocks );
      return *this;
   }
@@ -340,12 +197,12 @@ struct CSRKernelAdaptiveView

 template< typename Index,
           typename Device >
-struct CSRKernelAdaptive
+struct CSRAdaptiveKernel
 {
    using IndexType = Index;
    using DeviceType = Device;
-   using ViewType = CSRKernelAdaptiveView< Index, Device >;
-   using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
+   using ViewType = CSRAdaptiveKernelView< Index, Device >;
+   using ConstViewType = CSRAdaptiveKernelView< Index, Device >;
    using BlocksType = typename ViewType::BlocksType;
    using BlocksView = typename BlocksType::ViewType;

@@ -485,3 +342,5 @@ struct CSRKernelAdaptive
       } // namespace Segments
    } // namespace Algorithms
 } // namespace TNL
+
+#include <TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp>
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
new file mode 100644
index 0000000000..1557628b8e
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
@@ -0,0 +1,197 @@
+/***************************************************************************
+                          CSRAdaptiveKernel.hpp - description
+                             -------------------
+    begin                : Feb 7, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
+#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+#ifdef HAVE_CUDA
+
+template< int CudaBlockSize,
+          int warpSize,
+          int WARPS,
+          int SHARED_PER_WARP,
+          int MAX_ELEM_PER_WARP,
+          typename BlocksView,
+          typename Offsets,
+          typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real,
+          typename... Args >
+__global__ void
+segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
+                                    int gridIdx,
+                                    Offsets offsets,
+                                    Index first,
+                                    Index last,
+                                    Fetch fetch,
+                                    Reduction reduce,
+                                    ResultKeeper keep,
+                                    Real zero,
+                                    Args... args )
+{
+   __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
+   __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
+   constexpr size_t MAX_X_DIM = 2147483647;
+   const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const Index blockIdx = index / warpSize;
+   if( blockIdx >= blocks.getSize() - 1 )
+      return;
+
+   if( threadIdx.x < CudaBlockSize / warpSize )
+      multivectorShared[ threadIdx.x ] = zero;
+   Real result = zero;
+   bool compute( true );
+   const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
+   const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ];
+   const Index& firstSegmentIdx = block.getFirstSegment();
+   const Index begin = offsets[ firstSegmentIdx ];
+
+   const auto blockType = block.getType();
+   if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp
+   {
+      const Index warpIdx = threadIdx.x / 32;
+      const Index end = begin + block.getSize();
+
+      // Stream data to shared memory
+      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize )
+      {
+         streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute );
+            details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
+         // TODO:: fix this by template specialization so that we can assume fetch lambda
+         // with short parameters
+      }
+
+      const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
+
+      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize )
+      {
+         const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
+         result = zero;
+         // Scalar reduction
+         for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
+            result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
+         keep( i, result );
+      }
+   }
+   else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp
+   {
+      const Index end = begin + block.getSize();
+      const Index segmentIdx = block.getFirstSegment();
+
+      for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize )
+         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx
+
+      // Parallel reduction
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
+      if( laneIdx == 0 )
+         keep( segmentIdx, result );
+   }
+   else // blockType == Type::LONG - several warps per segment
+   {
+      // Number of elements processed by previous warps
+      //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
+      ///   block.getWarpIdx() * MAX_ELEM_PER_WARP;
+      //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP;
+      const Index segmentIdx = block.getFirstSegment();//block.index[0];
+      //minID = offsets[block.index[0] ];
+      const Index end = offsets[segmentIdx + 1];
+      //const int tid = threadIdx.x;
+      //const int inBlockWarpIdx = block.getWarpIdx();
+
+      //if( to > end )
+      //   to = end;
+      TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
+      result = zero;
+      //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() );
+      for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
+           globalIdx < end;
+           globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
+      {
+         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
+         //if( laneIdx == 0 )
+         //   printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end,
+         //      details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) );
+         //result += values[i] * inVector[columnIndexes[i]];
+      }
+      //printf( "tid %d -> %d \n", tid, result );
+
+      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+
+      //if( laneIdx == 0 )
+      //   printf( "WARP RESULT: tid %d -> %d \n", tid, result );
+
+      const Index warpID = threadIdx.x / 32;
+      if( laneIdx == 0 )
+         multivectorShared[ warpID ] = result;
+
+      __syncthreads();
+      // Reduction in multivectorShared
+      if( block.getWarpIdx() == 0 && laneIdx < 16 )
+      {
+         constexpr int totalWarps = CudaBlockSize / warpSize;
+         if( totalWarps >= 32 )
+         {
+            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
+            __syncwarp();
+         }
+         if( totalWarps >= 16 )
+         {
+            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] );
+            __syncwarp();
+         }
+         if( totalWarps >= 8 )
+         {
+            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] );
+            __syncwarp();
+         }
+         if( totalWarps >= 4 )
+         {
+            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] );
+            __syncwarp();
+         }
+         if( totalWarps >= 2 )
+         {
+            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] );
+            __syncwarp();
+         }
+         if( laneIdx == 0 )
+         {
+            //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] );
+            keep( segmentIdx, multivectorShared[ 0 ] );
+         }
+      }
+   }
+}
+#endif
+
+      } // namespace Segments
+   } // namespace Algorithms
+} // namespace TNL
\ No newline at end of file
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 5d8ebeeb91..a97d784536 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -17,7 +17,7 @@
 #include <TNL/Algorithms/Segments/CSRScalarKernel.h>
 #include <TNL/Algorithms/Segments/CSRVectorKernel.h>
 #include <TNL/Algorithms/Segments/CSRHybridKernel.h>
-#include <TNL/Algorithms/Segments/CSRKernelAdaptive.h>
+#include <TNL/Algorithms/Segments/CSRAdaptiveKernel.h>

 namespace TNL {
    namespace Algorithms {
@@ -146,7 +146,7 @@ using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device > >

 template< typename Device,
           typename Index >
-using CSRViewAdaptive = CSRView< Device, Index, CSRKernelAdaptive< Index, Device > >;
+using CSRViewAdaptive = CSRView< Device, Index, CSRAdaptiveKernel< Index, Device > >;

 template< typename Device,
           typename Index >
-- 
GitLab
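
Usage sketch (illustrative only, not applied by this patch): after the rename, downstream code refers to
CSRAdaptiveKernel instead of CSRKernelAdaptive, either directly or through the CSRAdaptive and CSRViewAdaptive
aliases updated above. The device and index types below are assumptions chosen for the example; the type,
alias, and header names are taken from the hunks in this patch.

   #include <TNL/Algorithms/Segments/CSR.h>
   #include <TNL/Algorithms/Segments/CSRView.h>
   #include <TNL/Algorithms/Segments/CSRAdaptiveKernel.h>

   // Assumed example types; any device/index combination supported by TNL is used the same way.
   using Device = TNL::Devices::Cuda;
   using Index  = int;

   // The renamed kernel type itself (formerly CSRKernelAdaptive< Index, Device >).
   using AdaptiveKernel = TNL::Algorithms::Segments::CSRAdaptiveKernel< Index, Device >;

   // Convenience aliases touched by this patch, now backed by CSRAdaptiveKernel.
   using AdaptiveSegments     = TNL::Algorithms::Segments::CSRAdaptive< Device, Index >;
   using AdaptiveSegmentsView = TNL::Algorithms::Segments::CSRViewAdaptive< Device, Index >;

Only the spelling of the kernel changes; the segmentsReduction interface and the kernel's behaviour are left
untouched by this commit (the kernel body is moved verbatim from the header into the new CSRAdaptiveKernel.hpp).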