Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +9 −8 Original line number Diff line number Diff line Loading @@ -65,25 +65,26 @@ struct CSRAdaptiveKernel static TNL::String getKernelType(); static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO: // How many shared memory use per block in CSR Adaptive kernel static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO: /* Number of elements in shared memory */ // Number of elements in shared memory static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); /* Number of warps in block for CSR Adaptive */ // Number of warps in block for CSR Adaptive static constexpr Index WARPS = THREADS_ADAPTIVE / 32; /* Number of elements in shared memory per one warp */ // Number of elements in shared memory per one warp static constexpr Index SHARED_PER_WARP = SHARED / WARPS; /* Max length of row to process one warp for CSR Light, MultiVector */ // Max length of row to process one warp for CSR Light, MultiVector static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; // Max length of row to process one warp for CSR Adaptive static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp(); template< typename Offsets > Index findLimit( const Index start, Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +3 −3 Original line number Diff line number Diff line Loading @@ -276,10 +276,10 @@ segmentsReduction( const OffsetsView& offsets, //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(); /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 24576; static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory(); /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); Loading Loading @@ -317,7 +317,7 @@ segmentsReduction( const OffsetsView& offsets, warpSize, WARPS, SHARED_PER_WARP, MAX_ELEMENTS_PER_WARP_ADAPT, details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(), BlocksView, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... > Loading src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +2 −2 Original line number Diff line number Diff line Loading @@ -15,10 +15,10 @@ namespace TNL { namespace Segments { namespace details { template< typename Value > template< typename Value, int StreamedSharedMemory_ = 24576 > struct CSRAdaptiveKernelParameters { static const int StreamedSharedMemory_ = 20000; /** * \brief Computes number of CUDA threads per block depending on Value type. * Loading Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +9 −8 Original line number Diff line number Diff line Loading @@ -65,25 +65,26 @@ struct CSRAdaptiveKernel static TNL::String getKernelType(); static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO: // How many shared memory use per block in CSR Adaptive kernel static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO: /* Number of elements in shared memory */ // Number of elements in shared memory static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); /* Number of warps in block for CSR Adaptive */ // Number of warps in block for CSR Adaptive static constexpr Index WARPS = THREADS_ADAPTIVE / 32; /* Number of elements in shared memory per one warp */ // Number of elements in shared memory per one warp static constexpr Index SHARED_PER_WARP = SHARED / WARPS; /* Max length of row to process one warp for CSR Light, MultiVector */ // Max length of row to process one warp for CSR Light, MultiVector static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; // Max length of row to process one warp for CSR Adaptive static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp(); template< typename Offsets > Index findLimit( const Index start, Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +3 −3 Original line number Diff line number Diff line Loading @@ -276,10 +276,10 @@ segmentsReduction( const OffsetsView& offsets, //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(); /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 24576; static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory(); /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); Loading Loading @@ -317,7 +317,7 @@ segmentsReduction( const OffsetsView& offsets, warpSize, WARPS, SHARED_PER_WARP, MAX_ELEMENTS_PER_WARP_ADAPT, details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(), BlocksView, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... > Loading
src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +2 −2 Original line number Diff line number Diff line Loading @@ -15,10 +15,10 @@ namespace TNL { namespace Segments { namespace details { template< typename Value > template< typename Value, int StreamedSharedMemory_ = 24576 > struct CSRAdaptiveKernelParameters { static const int StreamedSharedMemory_ = 20000; /** * \brief Computes number of CUDA threads per block depending on Value type. * Loading