From 1da0935b395fed29f5637ec288dc7334052e7721 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 9 Feb 2021 22:45:24 +0100
Subject: [PATCH] Refactoring CSR Adaptive kernel.

---
 src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h | 17 +++++++++--------
 .../Segments/CSRAdaptiveKernelView.hpp          |  6 +++---
 .../details/CSRAdaptiveKernelParameters.h       |  4 ++--
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
index 6314ecef5f..46d323f023 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
@@ -65,25 +65,26 @@ struct CSRAdaptiveKernel
 
    static TNL::String getKernelType();
 
+
    static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
 
-   /* How many shared memory use per block in CSR Adaptive kernel */
-   static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO:
+   // Amount of shared memory (in bytes) used per block in the CSR Adaptive kernel
+   static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO:
 
-   /* Number of elements in shared memory */
+   // Number of elements in shared memory
    static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);
 
-   /* Number of warps in block for CSR Adaptive */
+   // Number of warps per block for CSR Adaptive
    static constexpr Index WARPS = THREADS_ADAPTIVE / 32;
 
-   /* Number of elements in shared memory per one warp */
+   // Number of elements in shared memory per warp
    static constexpr Index SHARED_PER_WARP = SHARED / WARPS;
 
-   /* Max length of row to process one warp for CSR Light, MultiVector */
+   // Max length of a row processed by one warp for CSR Light, MultiVector
    static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
 
-   /* Max length of row to process one warp for CSR Adaptive */
-   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
+   // Max length of a row processed by one warp for CSR Adaptive
+   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp();
 
    template< typename Offsets >
    Index findLimit( const Index start,
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
index 48867aa816..743f0902f5 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -276,10 +276,10 @@ segmentsReduction( const OffsetsView& offsets,
    //static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
 
    /* Max length of row to process one warp for CSR Adaptive */
-   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
+   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp();
 
    /* How many shared memory use per block in CSR Adaptive kernel */
-   static constexpr Index SHARED_PER_BLOCK = 24576;
+   static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory();
 
    /* Number of elements in shared memory */
    static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
@@ -317,7 +317,7 @@ segmentsReduction( const OffsetsView& offsets,
             warpSize,
             WARPS,
             SHARED_PER_WARP,
-            MAX_ELEMENTS_PER_WARP_ADAPT,
+            details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(),
             BlocksView,
             OffsetsView,
             Index, Fetch, Reduction, ResultKeeper, Real, Args... >
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
index 83fe3e4bc3..f9dedbaf0b 100644
--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
@@ -15,10 +15,10 @@ namespace TNL {
       namespace Segments {
          namespace details {
 
-template< typename Value >
+template< typename Value,
+          int StreamedSharedMemory_ = 24576 >
 struct CSRAdaptiveKernelParameters
 {
-   static const int StreamedSharedMemory_ = 20000;
    /**
     * \brief Computes number of CUDA threads per block depending on Value type.
     *
-- 
GitLab