Refactoring CSR Adaptive kernel.

1da0935b · Tomáš Oberhuber · 25e55913 · 1da0935b · 1da0935b · 1da0935b
Commit 1da0935b authored 4 years ago by Tomáš Oberhuber
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
@@ -65,25 +65,26 @@ struct CSRAdaptiveKernel
   static TNL::String getKernelType();
   static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
-   /* How many shared memory use per block in CSR Adaptive kernel */
+   // How much shared memory (in bytes) to use per block in the CSR Adaptive kernel
-   static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO:
+   static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO:
-   /* Number of elements in shared memory */
+   // Number of elements in shared memory 
   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);
-   /* Number of warps in block for CSR Adaptive */
+   // Number of warps in block for CSR Adaptive 
   static constexpr Index WARPS = THREADS_ADAPTIVE / 32;
-   /* Number of elements in shared memory per one warp */
+   // Number of elements in shared memory per one warp 
   static constexpr Index SHARED_PER_WARP = SHARED / WARPS;
-   /* Max length of row to process one warp for CSR Light, MultiVector */
+   // Max length of a row processed by one warp for CSR Light, MultiVector
   static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
-   /* Max length of row to process one warp for CSR Adaptive */
+   // Max length of a row processed by one warp for CSR Adaptive
-   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
+   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp();
   template< typename Offsets >
   Index findLimit( const Index start,

--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -276,10 +276,10 @@ segmentsReduction( const OffsetsView& offsets,
   //static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
   /* Max length of row to process one warp for CSR Adaptive */
-   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
+   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp();
   /* How many shared memory use per block in CSR Adaptive kernel */
-   static constexpr Index SHARED_PER_BLOCK = 24576;
+   static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory();
   /* Number of elements in shared memory */
   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
@@ -317,7 +317,7 @@ segmentsReduction( const OffsetsView& offsets,
            warpSize,
            WARPS,
            SHARED_PER_WARP,
-            MAX_ELEMENTS_PER_WARP_ADAPT,
+            details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(),
            BlocksView,
            OffsetsView,
            Index, Fetch, Reduction, ResultKeeper, Real, Args... >

--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
@@ -15,10 +15,10 @@ namespace TNL {
      namespace Segments {
         namespace details {
-template< typename Value >
+template< typename Value,
+          int StreamedSharedMemory_ = 24576 >
 struct CSRAdaptiveKernelParameters
 {
-   static const int StreamedSharedMemory_ = 20000;
   /**
    * \brief Computes number of CUDA threads per block depending on Value type.
    *