diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h index 6314ecef5f602b9747a332c4429a47e521f19505..46d323f0235fe48926b87294f3efa0cc42b91e6d 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h @@ -65,25 +65,26 @@ struct CSRAdaptiveKernel static TNL::String getKernelType(); + static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; - /* How many shared memory use per block in CSR Adaptive kernel */ - static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO: + // How many shared memory use per block in CSR Adaptive kernel + static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO: - /* Number of elements in shared memory */ + // Number of elements in shared memory static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); - /* Number of warps in block for CSR Adaptive */ + // Number of warps in block for CSR Adaptive static constexpr Index WARPS = THREADS_ADAPTIVE / 32; - /* Number of elements in shared memory per one warp */ + // Number of elements in shared memory per one warp static constexpr Index SHARED_PER_WARP = SHARED / WARPS; - /* Max length of row to process one warp for CSR Light, MultiVector */ + // Max length of row to process one warp for CSR Light, MultiVector static constexpr Index MAX_ELEMENTS_PER_WARP = 384; - /* Max length of row to process one warp for CSR Adaptive */ - static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; + // Max length of row to process one warp for CSR Adaptive + static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp(); template< typename Offsets > Index findLimit( const Index start, diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp index 48867aa8164194ebfc31f350147f979c1647557f..743f0902f5694aba54647e017f712ceb5c689fae 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp @@ -276,10 +276,10 @@ segmentsReduction( const OffsetsView& offsets, //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ - static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; + static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(); /* How many shared memory use per block in CSR Adaptive kernel */ - static constexpr Index SHARED_PER_BLOCK = 24576; + static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory(); /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); @@ -317,7 +317,7 @@ segmentsReduction( const OffsetsView& offsets, warpSize, WARPS, SHARED_PER_WARP, - MAX_ELEMENTS_PER_WARP_ADAPT, + details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(), BlocksView, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... > diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h index 83fe3e4bc37a95b0b138b1c7fee99139eb401f96..f9dedbaf0bbafb5f346102a0de5abdd8a80ab05a 100644 --- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h @@ -15,10 +15,10 @@ namespace TNL { namespace Segments { namespace details { -template< typename Value > +template< typename Value, + int StreamedSharedMemory_ = 24576 > struct CSRAdaptiveKernelParameters { - static const int StreamedSharedMemory_ = 20000; /** * \brief Computes number of CUDA threads per block depending on Value type. *