Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +1 −1 Original line number Diff line number Diff line Loading @@ -65,7 +65,7 @@ struct CSRAdaptiveKernel static TNL::String getKernelType(); static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO: Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +9 −7 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ #include <TNL/Algorithms/Segments/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h> namespace TNL { namespace Algorithms { Loading @@ -25,8 +26,7 @@ namespace TNL { #ifdef HAVE_CUDA template< int CudaBlockSize, int warpSize, template< int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP, Loading @@ -50,6 +50,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Real zero, Args... args ) { static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); constexpr int WarpSize = Cuda::getWarpSize(); constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount(); constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp(); __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; constexpr size_t MAX_X_DIM = 2147483647; Loading Loading @@ -264,10 +270,7 @@ segmentsReduction( const OffsetsView& offsets, return; } static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; //static constexpr Index THREADS_LIGHT = 128; static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; /* Max length of row to process one warp for CSR Light, MultiVector */ //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; Loading Loading @@ -311,7 +314,6 @@ segmentsReduction( const OffsetsView& offsets, } segmentsReductionCSRAdaptiveKernel< THREADS_ADAPTIVE, warpSize, WARPS, SHARED_PER_WARP, Loading src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h 0 → 100644 +72 −0 Original line number Diff line number Diff line /*************************************************************************** CSRAdaptiveKernelBlockDescriptor.h - description ------------------- begin : Jan 25, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once namespace TNL { namespace Algorithms { namespace Segments { namespace details { template< typename Value > struct CSRAdaptiveKernelParameters { static const int StreamedSharedMemory_ = 20000; /** * \brief Computes number of CUDA threads per block depending on Value type. * * \return CUDA block size. */ static constexpr int CudaBlockSize() { return 256; }; //sizeof( Value ) == 8 ? 128 : 256; }; //std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); }; /** * \brief Returns amount of shared memory dedicated for stream CSR kernel. * * \return Stream shared memory. */ static constexpr size_t StreamedSharedMemory() { return StreamedSharedMemory_; }; /** * \brief Number of elements fitting into streamed shared memory. */ static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); }; /** * \brief Computes number of warps in one CUDA block. */ static constexpr size_t WarpsCount() { return CudaBlockSize() / Cuda::getWarpSize(); }; /** * \brief Computes number of elements to be streamed into the shared memory. * * \return Number of elements to be streamed into the shared memory. */ static constexpr size_t StreamedSharedElementsPerWarp() { return StreamedSharedElementsCount() / WarpsCount(); }; /** * \brief Returns maximum number of elements per warp for vector and hybrid kernel. * * \return Maximum number of elements per warp for vector and hybrid kernel. */ static constexpr int MaxVectorElementsPerWarp() { return 384; }; /** * \brief Returns maximum number of elements per warp for adaptive kernel. * * \return Maximum number of elements per warp for adaptive kernel. */ static constexpr int MaxAdaptiveElementsPerWarp() { return 512; }; }; } // namespace details } // namespace Segments } // namespace Algorithms } // namespace TNL Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +1 −1 Original line number Diff line number Diff line Loading @@ -65,7 +65,7 @@ struct CSRAdaptiveKernel static TNL::String getKernelType(); static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO: Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +9 −7 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ #include <TNL/Algorithms/Segments/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h> namespace TNL { namespace Algorithms { Loading @@ -25,8 +26,7 @@ namespace TNL { #ifdef HAVE_CUDA template< int CudaBlockSize, int warpSize, template< int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP, Loading @@ -50,6 +50,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Real zero, Args... args ) { static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); constexpr int WarpSize = Cuda::getWarpSize(); constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount(); constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp(); __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; constexpr size_t MAX_X_DIM = 2147483647; Loading Loading @@ -264,10 +270,7 @@ segmentsReduction( const OffsetsView& offsets, return; } static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; //static constexpr Index THREADS_LIGHT = 128; static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; /* Max length of row to process one warp for CSR Light, MultiVector */ //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; Loading Loading @@ -311,7 +314,6 @@ segmentsReduction( const OffsetsView& offsets, } segmentsReductionCSRAdaptiveKernel< THREADS_ADAPTIVE, warpSize, WARPS, SHARED_PER_WARP, Loading
src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h 0 → 100644 +72 −0 Original line number Diff line number Diff line /*************************************************************************** CSRAdaptiveKernelBlockDescriptor.h - description ------------------- begin : Jan 25, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once namespace TNL { namespace Algorithms { namespace Segments { namespace details { template< typename Value > struct CSRAdaptiveKernelParameters { static const int StreamedSharedMemory_ = 20000; /** * \brief Computes number of CUDA threads per block depending on Value type. * * \return CUDA block size. */ static constexpr int CudaBlockSize() { return 256; }; //sizeof( Value ) == 8 ? 128 : 256; }; //std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); }; /** * \brief Returns amount of shared memory dedicated for stream CSR kernel. * * \return Stream shared memory. */ static constexpr size_t StreamedSharedMemory() { return StreamedSharedMemory_; }; /** * \brief Number of elements fitting into streamed shared memory. */ static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); }; /** * \brief Computes number of warps in one CUDA block. */ static constexpr size_t WarpsCount() { return CudaBlockSize() / Cuda::getWarpSize(); }; /** * \brief Computes number of elements to be streamed into the shared memory. * * \return Number of elements to be streamed into the shared memory. */ static constexpr size_t StreamedSharedElementsPerWarp() { return StreamedSharedElementsCount() / WarpsCount(); }; /** * \brief Returns maximum number of elements per warp for vector and hybrid kernel. * * \return Maximum number of elements per warp for vector and hybrid kernel. */ static constexpr int MaxVectorElementsPerWarp() { return 384; }; /** * \brief Returns maximum number of elements per warp for adaptive kernel. * * \return Maximum number of elements per warp for adaptive kernel. */ static constexpr int MaxAdaptiveElementsPerWarp() { return 512; }; }; } // namespace details } // namespace Segments } // namespace Algorithms } // namespace TNL