From 25e559134e90a78fd7b6d63820786ab7ee98cd72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com> Date: Tue, 9 Feb 2021 21:48:26 +0100 Subject: [PATCH] Added CSRAdaptiveKernelParameters. --- .../Algorithms/Segments/CSRAdaptiveKernel.h | 2 +- .../Segments/CSRAdaptiveKernelView.hpp | 16 +++-- .../details/CSRAdaptiveKernelParameters.h | 72 +++++++++++++++++++ 3 files changed, 82 insertions(+), 8 deletions(-) create mode 100644 src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h index 6b64f1a851..6314ecef5f 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h @@ -65,7 +65,7 @@ struct CSRAdaptiveKernel static TNL::String getKernelType(); - static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; + static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO: diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp index e9e1badbab..48867aa816 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp @@ -18,6 +18,7 @@ #include <TNL/Algorithms/Segments/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> +#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h> namespace TNL { namespace Algorithms { @@ -25,8 +26,7 @@ namespace TNL { #ifdef HAVE_CUDA -template< int CudaBlockSize, - int warpSize, +template< int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP, @@ -50,6 +50,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Real zero, Args... args ) { + static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); + constexpr int WarpSize = Cuda::getWarpSize(); + constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount(); + constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp(); + + __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; constexpr size_t MAX_X_DIM = 2147483647; @@ -264,10 +270,7 @@ segmentsReduction( const OffsetsView& offsets, return; } - static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; - //static constexpr Index THREADS_SCALAR = 128; - //static constexpr Index THREADS_VECTOR = 128; - //static constexpr Index THREADS_LIGHT = 128; + static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; /* Max length of row to process one warp for CSR Light, MultiVector */ //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; @@ -311,7 +314,6 @@ segmentsReduction( const OffsetsView& offsets, } segmentsReductionCSRAdaptiveKernel< - THREADS_ADAPTIVE, warpSize, WARPS, SHARED_PER_WARP, diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h new file mode 100644 index 0000000000..83fe3e4bc3 --- /dev/null +++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h @@ -0,0 +1,72 @@ +/*************************************************************************** + CSRAdaptiveKernelBlockDescriptor.h - description + ------------------- + begin : Jan 25, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +namespace TNL { + namespace Algorithms { + namespace Segments { + namespace details { + +template< typename Value > +struct CSRAdaptiveKernelParameters +{ + static const int StreamedSharedMemory_ = 20000; + /** + * \brief Computes number of CUDA threads per block depending on Value type. + * + * \return CUDA block size. + */ + static constexpr int CudaBlockSize() { return 256; }; //sizeof( Value ) == 8 ? 128 : 256; }; + //std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); }; + + /** + * \brief Returns amount of shared memory dedicated for stream CSR kernel. + * + * \return Stream shared memory. + */ + static constexpr size_t StreamedSharedMemory() { return StreamedSharedMemory_; }; + + /** + * \brief Number of elements fitting into streamed shared memory. + */ + static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); }; + + /** + * \brief Computes number of warps in one CUDA block. + */ + static constexpr size_t WarpsCount() { return CudaBlockSize() / Cuda::getWarpSize(); }; + + /** + * \brief Computes number of elements to be streamed into the shared memory. + * + * \return Number of elements to be streamed into the shared memory. + */ + static constexpr size_t StreamedSharedElementsPerWarp() { return StreamedSharedElementsCount() / WarpsCount(); }; + + /** + * \brief Returns maximum number of elements per warp for vector and hybrid kernel. + * + * \return Maximum number of elements per warp for vector and hybrid kernel. + */ + static constexpr int MaxVectorElementsPerWarp() { return 384; }; + + /** + * \brief Returns maximum number of elements per warp for adaptive kernel. + * + * \return Maximum number of elements per warp for adaptive kernel. + */ + static constexpr int MaxAdaptiveElementsPerWarp() { return 512; }; +}; + + } // namespace details + } // namespace Segments + } // namespace Algorithms +} // namespace TNL -- GitLab