From 1da0935b395fed29f5637ec288dc7334052e7721 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com> Date: Tue, 9 Feb 2021 22:45:24 +0100 Subject: [PATCH] Refactoring CSR Adaptive kernel. --- src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h | 17 +++++++++-------- .../Segments/CSRAdaptiveKernelView.hpp | 6 +++--- .../details/CSRAdaptiveKernelParameters.h | 4 ++-- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h index 6314ecef5f..46d323f023 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h @@ -65,25 +65,26 @@ struct CSRAdaptiveKernel static TNL::String getKernelType(); + static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; - /* How many shared memory use per block in CSR Adaptive kernel */ - static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO: + // How many shared memory use per block in CSR Adaptive kernel + static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO: - /* Number of elements in shared memory */ + // Number of elements in shared memory static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); - /* Number of warps in block for CSR Adaptive */ + // Number of warps in block for CSR Adaptive static constexpr Index WARPS = THREADS_ADAPTIVE / 32; - /* Number of elements in shared memory per one warp */ + // Number of elements in shared memory per one warp static constexpr Index SHARED_PER_WARP = SHARED / WARPS; - /* Max length of row to process one warp for CSR Light, MultiVector */ + // Max length of row to process one warp for CSR Light, MultiVector static constexpr Index MAX_ELEMENTS_PER_WARP = 384; - /* Max length of row to process one warp for CSR Adaptive */ - static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; + // Max length of row to process one warp for CSR Adaptive + static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp(); template< typename Offsets > Index findLimit( const Index start, diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp index 48867aa816..743f0902f5 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp @@ -276,10 +276,10 @@ segmentsReduction( const OffsetsView& offsets, //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ - static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; + static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(); /* How many shared memory use per block in CSR Adaptive kernel */ - static constexpr Index SHARED_PER_BLOCK = 24576; + static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory(); /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); @@ -317,7 +317,7 @@ segmentsReduction( const OffsetsView& offsets, warpSize, WARPS, SHARED_PER_WARP, - MAX_ELEMENTS_PER_WARP_ADAPT, + details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(), BlocksView, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... > diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h index 83fe3e4bc3..f9dedbaf0b 100644 --- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h @@ -15,10 +15,10 @@ namespace TNL { namespace Segments { namespace details { -template< typename Value > +template< typename Value, + int StreamedSharedMemory_ = 24576 > struct CSRAdaptiveKernelParameters { - static const int StreamedSharedMemory_ = 20000; /** * \brief Computes number of CUDA threads per block depending on Value type. * -- GitLab