Skip to content
Snippets Groups Projects
Commit 1da0935b authored by Tomáš Oberhuber
Browse files

Refactoring CSR Adaptive kernel.

parent 25e55913
No related branches found
No related tags found
1 merge request: !89 "To/matrices adaptive csr"
...@@ -65,25 +65,26 @@ struct CSRAdaptiveKernel ...@@ -65,25 +65,26 @@ struct CSRAdaptiveKernel
static TNL::String getKernelType(); static TNL::String getKernelType();
static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
/* How many shared memory use per block in CSR Adaptive kernel */ // How many shared memory use per block in CSR Adaptive kernel
static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO: static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO:
/* Number of elements in shared memory */ // Number of elements in shared memory
static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);
/* Number of warps in block for CSR Adaptive */ // Number of warps in block for CSR Adaptive
static constexpr Index WARPS = THREADS_ADAPTIVE / 32; static constexpr Index WARPS = THREADS_ADAPTIVE / 32;
/* Number of elements in shared memory per one warp */ // Number of elements in shared memory per one warp
static constexpr Index SHARED_PER_WARP = SHARED / WARPS; static constexpr Index SHARED_PER_WARP = SHARED / WARPS;
/* Max length of row to process one warp for CSR Light, MultiVector */ // Max length of row to process one warp for CSR Light, MultiVector
static constexpr Index MAX_ELEMENTS_PER_WARP = 384; static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
/* Max length of row to process one warp for CSR Adaptive */ // Max length of row to process one warp for CSR Adaptive
static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp();
template< typename Offsets > template< typename Offsets >
Index findLimit( const Index start, Index findLimit( const Index start,
......
...@@ -276,10 +276,10 @@ segmentsReduction( const OffsetsView& offsets, ...@@ -276,10 +276,10 @@ segmentsReduction( const OffsetsView& offsets,
//static constexpr Index MAX_ELEMENTS_PER_WARP = 384; //static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
/* Max length of row to process one warp for CSR Adaptive */ /* Max length of row to process one warp for CSR Adaptive */
static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp();
/* How many shared memory use per block in CSR Adaptive kernel */ /* How many shared memory use per block in CSR Adaptive kernel */
static constexpr Index SHARED_PER_BLOCK = 24576; static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory();
/* Number of elements in shared memory */ /* Number of elements in shared memory */
static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
...@@ -317,7 +317,7 @@ segmentsReduction( const OffsetsView& offsets, ...@@ -317,7 +317,7 @@ segmentsReduction( const OffsetsView& offsets,
warpSize, warpSize,
WARPS, WARPS,
SHARED_PER_WARP, SHARED_PER_WARP,
MAX_ELEMENTS_PER_WARP_ADAPT, details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(),
BlocksView, BlocksView,
OffsetsView, OffsetsView,
Index, Fetch, Reduction, ResultKeeper, Real, Args... > Index, Fetch, Reduction, ResultKeeper, Real, Args... >
......
...@@ -15,10 +15,10 @@ namespace TNL { ...@@ -15,10 +15,10 @@ namespace TNL {
namespace Segments { namespace Segments {
namespace details { namespace details {
template< typename Value > template< typename Value,
int StreamedSharedMemory_ = 24576 >
struct CSRAdaptiveKernelParameters struct CSRAdaptiveKernelParameters
{ {
static const int StreamedSharedMemory_ = 20000;
/** /**
* \brief Computes number of CUDA threads per block depending on Value type. * \brief Computes number of CUDA threads per block depending on Value type.
* *
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment