From 25e559134e90a78fd7b6d63820786ab7ee98cd72 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 9 Feb 2021 21:48:26 +0100
Subject: [PATCH] Added CSRAdaptiveKernelParameters.

---
 .../Algorithms/Segments/CSRAdaptiveKernel.h   |  2 +-
 .../Segments/CSRAdaptiveKernelView.hpp        | 16 +++--
 .../details/CSRAdaptiveKernelParameters.h     | 72 +++++++++++++++++++
 3 files changed, 82 insertions(+), 8 deletions(-)
 create mode 100644 src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h

diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
index 6b64f1a851..6314ecef5f 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
@@ -65,7 +65,7 @@ struct CSRAdaptiveKernel
 
    static TNL::String getKernelType();
 
-    static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
+   static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
 
    /* How many shared memory use per block in CSR Adaptive kernel */
    static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO:
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
index e9e1badbab..48867aa816 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -18,6 +18,7 @@
 #include <TNL/Algorithms/Segments/CSRScalarKernel.h>
 #include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h>
 #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -25,8 +26,7 @@ namespace TNL {
 
 #ifdef HAVE_CUDA
 
-template< int CudaBlockSize,
-          int warpSize,
+template< int warpSize,
           int WARPS,
           int SHARED_PER_WARP,
           int MAX_ELEM_PER_WARP,
@@ -50,6 +50,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                     Real zero,
                                     Args... args )
 {
+   static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize();
+   constexpr int WarpSize = Cuda::getWarpSize();
+   constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount();
+   constexpr size_t StreamedSharedElementsPerWarp  = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp();
+
+
    __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
    __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
    constexpr size_t MAX_X_DIM = 2147483647;
@@ -264,10 +270,7 @@ segmentsReduction( const OffsetsView& offsets,
       return;
    }
 
-   static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
-   //static constexpr Index THREADS_SCALAR = 128;
-   //static constexpr Index THREADS_VECTOR = 128;
-   //static constexpr Index THREADS_LIGHT = 128;
+   static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
 
    /* Max length of row to process one warp for CSR Light, MultiVector */
    //static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
@@ -311,7 +314,6 @@ segmentsReduction( const OffsetsView& offsets,
       }
 
       segmentsReductionCSRAdaptiveKernel<
-            THREADS_ADAPTIVE,
             warpSize,
             WARPS,
             SHARED_PER_WARP,
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
new file mode 100644
index 0000000000..83fe3e4bc3
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
@@ -0,0 +1,72 @@
+/***************************************************************************
+                          CSRAdaptiveKernelBlockDescriptor.h -  description
+                             -------------------
+    begin                : Jan 25, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+         namespace details {
+
+template< typename Value >
+struct CSRAdaptiveKernelParameters
+{
+   static const int StreamedSharedMemory_ = 20000;
+   /**
+    * \brief Computes number of CUDA threads per block depending on Value type.
+    *
+    * \return CUDA block size.
+    */
+   static constexpr int CudaBlockSize() { return 256; }; //sizeof( Value ) == 8 ? 128 : 256; };
+    //std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); };
+
+   /**
+    * \brief Returns amount of shared memory dedicated for stream CSR kernel.
+    *
+    * \return Stream shared memory.
+    */
+   static constexpr size_t StreamedSharedMemory() { return StreamedSharedMemory_; };
+
+   /**
+    * \brief Number of elements fitting into streamed shared memory.
+    */
+   static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); };
+
+   /**
+    * \brief Computes number of warps in one CUDA block.
+    */
+   static constexpr size_t WarpsCount() { return CudaBlockSize() / Cuda::getWarpSize(); };
+
+   /**
+    * \brief Computes number of elements to be streamed into the shared memory.
+    *
+    * \return Number of elements to be streamed into the shared memory.
+    */
+   static constexpr size_t StreamedSharedElementsPerWarp() { return StreamedSharedElementsCount() / WarpsCount(); };
+
+   /**
+    * \brief Returns maximum number of elements per warp for vector and hybrid kernel.
+    *
+    * \return Maximum number of elements per warp for vector and hybrid kernel.
+    */
+   static constexpr int MaxVectorElementsPerWarp() { return 384; };
+
+   /**
+    * \brief Returns maximum number of elements per warp for adaptive kernel.
+    *
+    * \return Maximum number of elements per warp for adaptive kernel.
+    */
+   static constexpr int MaxAdaptiveElementsPerWarp() { return 512; };
+};
+
+         } // namespace details
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
-- 
GitLab