Commit 1da0935b authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Refactoring CSR Adaptive kernel.

parent 25e55913
Loading
Loading
Loading
Loading
+9 −8
Original line number Diff line number Diff line
@@ -65,25 +65,26 @@ struct CSRAdaptiveKernel

   static TNL::String getKernelType();


   static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;

   /* Amount of shared memory (in bytes) used per block in the CSR Adaptive kernel */
   static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO:
   // Amount of shared memory (in bytes) used per block in the CSR Adaptive kernel
   static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO:

   /* Number of elements in shared memory */
   // Number of elements in shared memory 
   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);

   /* Number of warps in block for CSR Adaptive */
   // Number of warps in block for CSR Adaptive 
   static constexpr Index WARPS = THREADS_ADAPTIVE / 32;

   /* Number of elements in shared memory per one warp */
   // Number of elements in shared memory per one warp 
   static constexpr Index SHARED_PER_WARP = SHARED / WARPS;

   /* Maximum length of a row processed by one warp for CSR Light, MultiVector */
   // Maximum length of a row processed by one warp for CSR Light, MultiVector
   static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

   /* Maximum length of a row processed by one warp for CSR Adaptive */
   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
   // Maximum length of a row processed by one warp for CSR Adaptive
   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp();

   template< typename Offsets >
   Index findLimit( const Index start,
+3 −3
Original line number Diff line number Diff line
@@ -276,10 +276,10 @@ segmentsReduction( const OffsetsView& offsets,
   //static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

   /* Maximum length of a row processed by one warp for CSR Adaptive */
   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp();

   /* Amount of shared memory (in bytes) used per block in the CSR Adaptive kernel */
   static constexpr Index SHARED_PER_BLOCK = 24576;
   static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory();

   /* Number of elements in shared memory */
   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
@@ -317,7 +317,7 @@ segmentsReduction( const OffsetsView& offsets,
            warpSize,
            WARPS,
            SHARED_PER_WARP,
            MAX_ELEMENTS_PER_WARP_ADAPT,
            details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(),
            BlocksView,
            OffsetsView,
            Index, Fetch, Reduction, ResultKeeper, Real, Args... >
+2 −2
Original line number Diff line number Diff line
@@ -15,10 +15,10 @@ namespace TNL {
      namespace Segments {
         namespace details {

template< typename Value >
template< typename Value,
          int StreamedSharedMemory_ = 24576 >
struct CSRAdaptiveKernelParameters
{
   static const int StreamedSharedMemory_ = 20000;
   /**
    * \brief Computes number of CUDA threads per block depending on Value type.
    *