Commit 25e55913 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Added CSRAdaptiveKernelParameters.

parent 91806e1d
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -65,7 +65,7 @@ struct CSRAdaptiveKernel

   static TNL::String getKernelType();

    static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
   static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;

   /* How many shared memory use per block in CSR Adaptive kernel */
   static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO:
+9 −7
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h>
#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>

namespace TNL {
   namespace Algorithms {
@@ -25,8 +26,7 @@ namespace TNL {

#ifdef HAVE_CUDA

template< int CudaBlockSize,
          int warpSize,
template< int warpSize,
          int WARPS,
          int SHARED_PER_WARP,
          int MAX_ELEM_PER_WARP,
@@ -50,6 +50,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                    Real zero,
                                    Args... args )
{
   static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize();
   constexpr int WarpSize = Cuda::getWarpSize();
   constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount();
   constexpr size_t StreamedSharedElementsPerWarp  = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp();


   __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
   __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
   constexpr size_t MAX_X_DIM = 2147483647;
@@ -264,10 +270,7 @@ segmentsReduction( const OffsetsView& offsets,
      return;
   }

   static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
   //static constexpr Index THREADS_SCALAR = 128;
   //static constexpr Index THREADS_VECTOR = 128;
   //static constexpr Index THREADS_LIGHT = 128;
   static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;

   /* Max length of row to process one warp for CSR Light, MultiVector */
   //static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
@@ -311,7 +314,6 @@ segmentsReduction( const OffsetsView& offsets,
      }

      segmentsReductionCSRAdaptiveKernel<
            THREADS_ADAPTIVE,
            warpSize,
            WARPS,
            SHARED_PER_WARP,
+72 −0
Original line number Diff line number Diff line
/***************************************************************************
                          CSRAdaptiveKernelBlockDescriptor.h -  description
                             -------------------
    begin                : Jan 25, 2021 -> Joe Biden inauguration
    copyright            : (C) 2021 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

namespace TNL {
   namespace Algorithms {
      namespace Segments {
         namespace details {

template< typename Value >
struct CSRAdaptiveKernelParameters
{
   static const int StreamedSharedMemory_ = 20000;
   /**
    * \brief Computes number of CUDA threads per block depending on Value type.
    *
    * \return CUDA block size.
    */
   static constexpr int CudaBlockSize() { return 256; }; //sizeof( Value ) == 8 ? 128 : 256; };
    //std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); };

   /**
    * \brief Returns amount of shared memory dedicated for stream CSR kernel.
    *
    * \return Stream shared memory.
    */
   static constexpr size_t StreamedSharedMemory() { return StreamedSharedMemory_; };

   /**
    * \brief Number of elements fitting into streamed shared memory.
    */
   static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); };

   /**
    * \brief Computes number of warps in one CUDA block.
    */
   static constexpr size_t WarpsCount() { return CudaBlockSize() / Cuda::getWarpSize(); };

   /**
    * \brief Computes number of elements to be streamed into the shared memory.
    *
    * \return Number of elements to be streamed into the shared memory.
    */
   static constexpr size_t StreamedSharedElementsPerWarp() { return StreamedSharedElementsCount() / WarpsCount(); };

   /**
    * \brief Returns maximum number of elements per warp for vector and hybrid kernel.
    *
    * \return Maximum number of elements per warp for vector and hybrid kernel.
    */
   static constexpr int MaxVectorElementsPerWarp() { return 384; };

   /**
    * \brief Returns maximum number of elements per warp for adaptive kernel.
    *
    * \return Maximum number of elements per warp for adaptive kernel.
    */
   static constexpr int MaxAdaptiveElementsPerWarp() { return 512; };
};

         } // namespace details
      } // namespace Segments
   }  // namespace Algorithms
} // namespace TNL