Commit 440c35b9 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Added blocks in CSR adaptive kernel for different Value/Real type sizes.

parent 1da0935b
Loading
Loading
Loading
Loading
+20 −11
Original line number Diff line number Diff line
@@ -63,13 +63,15 @@ struct CSRAdaptiveKernel
   using BlocksType = typename ViewType::BlocksType;
   using BlocksView = typename BlocksType::ViewType;

   static constexpr int MaxValueSizeLog() { return ViewType::MaxValueSizeLog; };

   static TNL::String getKernelType();


   static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
   static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;

   // How many shared memory use per block in CSR Adaptive kernel
   static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO:
   static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::StreamedSharedMemory(); //20000; //24576; TODO:

   // Number of elements in shared memory 
   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);
@@ -84,14 +86,7 @@ struct CSRAdaptiveKernel
   static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

   // Max length of row to process one warp for CSR Adaptive 
   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp();

   template< typename Offsets >
   Index findLimit( const Index start,
                    const Offsets& offsets,
                    const Index size,
                    details::Type &type,
                    Index &sum );
   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::MaxAdaptiveElementsPerWarp();

   template< typename Offsets >
   void init( const Offsets& offsets );
@@ -118,7 +113,21 @@ struct CSRAdaptiveKernel
                        Args... args ) const;

   protected:
      BlocksType blocks;
      template< int SizeOfValue, typename Offsets >
      Index findLimit( const Index start,
                     const Offsets& offsets,
                     const Index size,
                     details::Type &type,
                     Index &sum );

      template< int SizeOfValue,
                typename Offsets >
      void initValueSize( const Offsets& offsets );

      /**
       * \brief  blocksArray[ i ] stores blocks for sizeof( Value ) == 2^i.
       */
      BlocksType blocksArray[ MaxValueSizeLog() ];

      ViewType view;
};
+83 −59
Original line number Diff line number Diff line
@@ -31,10 +31,81 @@ getKernelType()
   return ViewType::getKernelType();
};


template< typename Index,
          typename Device >
   template< typename Offsets >
void
CSRAdaptiveKernel< Index, Device >::
init( const Offsets& offsets )
{
   this->template initValueSize<  1 >( offsets );
   this->template initValueSize<  2 >( offsets );
   this->template initValueSize<  4 >( offsets );
   this->template initValueSize<  8 >( offsets );
   this->template initValueSize< 16 >( offsets );
   this->template initValueSize< 32 >( offsets );
   for( int i = 0; i < MaxValueSizeLog(); i++ )
      this->view.setBlocks( blocksArray[ i ], i );
}


template< typename Index,
          typename Device >
void
CSRAdaptiveKernel< Index, Device >::
reset()
{
   for( int i = 0; i < MaxValueSizeLog(); i++ )
   {
      this->blocksArray[ i ].reset();
      this->view.setBlocks( this->blocksArray[ i ], i );
   }
}

template< typename Index,
          typename Device >
auto
CSRAdaptiveKernel< Index, Device >::
getView() -> ViewType
{
   return this->view;
}

template< typename Index,
          typename Device >
auto
CSRAdaptiveKernel< Index, Device >::
getConstView() const -> ConstViewType
{
   return this->view;
};

template< typename Index,
          typename Device >
   template< typename OffsetsView,
               typename Fetch,
               typename Reduction,
               typename ResultKeeper,
               typename Real,
               typename... Args >
void
CSRAdaptiveKernel< Index, Device >::
segmentsReduction( const OffsetsView& offsets,
                   Index first,
                   Index last,
                   Fetch& fetch,
                   const Reduction& reduction,
                   ResultKeeper& keeper,
                   const Real& zero,
                   Args... args ) const
{
   view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
}

template< typename Index,
          typename Device >
   template< int SizeOfValue,
             typename Offsets >
Index
CSRAdaptiveKernel< Index, Device >::
findLimit( const Index start,
@@ -48,7 +119,7 @@ findLimit( const Index start,
   {
      Index elements = offsets[ current + 1 ] - offsets[ current ];
      sum += elements;
      if( sum > SHARED_PER_WARP )
      if( sum > details::CSRAdaptiveKernelParameters< SizeOfValue >::StreamedSharedElementsPerWarp() )
      {
         if( current - start > 0 ) // extra row
         {
@@ -57,7 +128,7 @@ findLimit( const Index start,
         }
         else
         {                  // one long row
            if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT )
            if( sum <= 2 * details::CSRAdaptiveKernelParameters< SizeOfValue >::MaxAdaptiveElementsPerWarp() ) //MAX_ELEMENTS_PER_WARP_ADAPT )
               type = details::Type::VECTOR;
            else
               type = details::Type::LONG;
@@ -71,10 +142,11 @@ findLimit( const Index start,

template< typename Index,
          typename Device >
   template< typename Offsets >
   template< int SizeOfValue,
             typename Offsets >
void
CSRAdaptiveKernel< Index, Device >::
init( const Offsets& offsets )
initValueSize( const Offsets& offsets )
{
   using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >;
   HostOffsetsType hostOffsets( offsets );
@@ -88,7 +160,7 @@ init( const Offsets& offsets )
   while( nextStart != rows - 1 )
   {
      details::Type type;
      nextStart = findLimit( start, hostOffsets, rows, type, sum );
      nextStart = findLimit< SizeOfValue >( start, hostOffsets, rows, type, sum );

      if( type == details::Type::LONG )
      {
@@ -110,58 +182,10 @@ init( const Offsets& offsets )
      start = nextStart;
   }
   inBlocks.emplace_back(nextStart);
   this->blocks = inBlocks;
   this->view.setBlocks( blocks );
}

template< typename Index,
          typename Device >
void
CSRAdaptiveKernel< Index, Device >::
reset()
{
   this->blocks.reset();
   this->view.setBlocks( blocks );
}

template< typename Index,
          typename Device >
auto
CSRAdaptiveKernel< Index, Device >::
getView() -> ViewType
{
   return this->view;
}

template< typename Index,
          typename Device >
auto
CSRAdaptiveKernel< Index, Device >::
getConstView() const -> ConstViewType
{
   return this->view;
};

template< typename Index,
          typename Device >
   template< typename OffsetsView,
               typename Fetch,
               typename Reduction,
               typename ResultKeeper,
               typename Real,
               typename... Args >
void
CSRAdaptiveKernel< Index, Device >::
segmentsReduction( const OffsetsView& offsets,
                   Index first,
                   Index last,
                   Fetch& fetch,
                   const Reduction& reduction,
                   ResultKeeper& keeper,
                   const Real& zero,
                   Args... args ) const
{
   view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
   //std::cerr << "Setting blocks to " << std::log2( SizeOfValue ) << std::endl;
   TNL_ASSERT_LT( std::log2( SizeOfValue ), MaxValueSizeLog(), "" );
   TNL_ASSERT_GE( std::log2( SizeOfValue ), 0, "" );
   this->blocksArray[ (int ) std::log2( SizeOfValue ) ] = inBlocks;
}

      } // namespace Segments
+6 −4
Original line number Diff line number Diff line
@@ -28,11 +28,13 @@ struct CSRAdaptiveKernelView
   using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
   using BlocksView = typename BlocksType::ViewType;

   static constexpr int MaxValueSizeLog = 6;

   CSRAdaptiveKernelView() = default;

   CSRAdaptiveKernelView( BlocksType& blocks );

   void setBlocks( BlocksType& blocks );
   void setBlocks( BlocksType& blocks, const int idx );

   ViewType getView();

@@ -57,10 +59,10 @@ struct CSRAdaptiveKernelView

   CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView );

   void printBlocks() const;
   void printBlocks( int idx ) const;

   protected:
      BlocksView blocks;
      BlocksView blocksArray[ MaxValueSizeLog ];
};

      } // namespace Segments
+21 −17
Original line number Diff line number Diff line
@@ -50,10 +50,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                    Real zero,
                                    Args... args )
{
   static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize();
   constexpr int WarpSize = Cuda::getWarpSize();
   constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount();
   constexpr size_t StreamedSharedElementsPerWarp  = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp();
   static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
   //constexpr int WarpSize = Cuda::getWarpSize();
   //constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount();
   //constexpr size_t StreamedSharedElementsPerWarp  = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();


   __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
@@ -199,21 +199,21 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
}
#endif

template< typename Index,
/*template< typename Index,
          typename Device >
CSRAdaptiveKernelView< Index, Device >::
CSRAdaptiveKernelView( BlocksType& blocks )
{
   this->blocks.bind( blocks );
}
}*/

template< typename Index,
          typename Device >
void
CSRAdaptiveKernelView< Index, Device >::
setBlocks( BlocksType& blocks )
setBlocks( BlocksType& blocks, const int idx )
{
   this->blocks.bind( blocks );
   this->blocksArray[ idx ].bind( blocks );
}

template< typename Index,
@@ -263,23 +263,25 @@ segmentsReduction( const OffsetsView& offsets,
                   Args... args ) const
{
#ifdef HAVE_CUDA
   if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() )
   int valueSizeLog = std::ceil( log2f( ( double ) sizeof( Real ) ) );

   if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog > MaxValueSizeLog )
   {
      TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
         segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
      return;
   }

   static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
   static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;

   /* Max length of row to process one warp for CSR Light, MultiVector */
   //static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

   /* Max length of row to process one warp for CSR Adaptive */
   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp();
   //static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp();

   /* How many shared memory use per block in CSR Adaptive kernel */
   static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory();
   static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedMemory();

   /* Number of elements in shared memory */
   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
@@ -298,7 +300,7 @@ segmentsReduction( const OffsetsView& offsets,
   constexpr size_t MAX_X_DIM = 2147483647;

   /* Fill blocks */
   size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block
   size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * warpSize; // one warp per block
   /* Execute kernels on device */
   for (Index gridIdx = 0; neededThreads != 0; gridIdx++ )
   {
@@ -317,12 +319,12 @@ segmentsReduction( const OffsetsView& offsets,
            warpSize,
            WARPS,
            SHARED_PER_WARP,
            details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(),
            details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp(),
            BlocksView,
            OffsetsView,
            Index, Fetch, Reduction, ResultKeeper, Real, Args... >
         <<<blocksCount, threads>>>(
            this->blocks,
            this->blocksArray[ valueSizeLog ],
            gridIdx,
            offsets,
            first,
@@ -342,7 +344,8 @@ CSRAdaptiveKernelView< Index, Device >&
CSRAdaptiveKernelView< Index, Device >::
operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView )
{
   this->blocks.bind( kernelView.blocks );
   for( int i = 0; i < MaxValueSizeLog; i++ )
      this->blocksArray[ i ].bind( kernelView.blocksArray[ i ] );
   return *this;
}

@@ -350,8 +353,9 @@ template< typename Index,
          typename Device >
void
CSRAdaptiveKernelView< Index, Device >::
printBlocks() const
printBlocks( int idx ) const
{
   auto& blocks = this->blocksArray[ idx ];
   for( Index i = 0; i < this->blocks.getSize(); i++ )
   {
      auto block = blocks.getElement( i );
+2 −2
Original line number Diff line number Diff line
@@ -15,7 +15,7 @@ namespace TNL {
      namespace Segments {
         namespace details {

template< typename Value,
template< int SizeOfValue,
          int StreamedSharedMemory_ = 24576 >
struct CSRAdaptiveKernelParameters
{
@@ -37,7 +37,7 @@ struct CSRAdaptiveKernelParameters
   /**
    * \brief Number of elements fitting into streamed shared memory.
    */
   static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); };
   static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / SizeOfValue; };

   /**
    * \brief Computes number of warps in one CUDA block.