Commit f389535b authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Added Adaptove CSR kernel dispatcher.

parent afc75e25
Loading
Loading
Loading
Loading
+145 −68
Original line number Diff line number Diff line
@@ -176,85 +176,73 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
#endif

template< typename Index,
          typename Device >
CSRAdaptiveKernelView< Index, Device >::
CSRAdaptiveKernelView( BlocksType& blocks )
{
   this->blocks.bind( blocks );
}

template< typename Index,
          typename Device >
void
CSRAdaptiveKernelView< Index, Device >::
setBlocks( BlocksType& blocks )
{
   this->blocks.bind( blocks );
}
          typename Device,
          typename Fetch,
          typename Reduction,
          typename ResultKeeper,
          int StreamedMemory,
          bool DispatchScalarCSR =
            details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ||
            std::is_same< Device, Devices::Host >::value >
struct CSRAdaptiveKernelSegmentsReductionDispatcher;

template< typename Index,
          typename Device >
auto
CSRAdaptiveKernelView< Index, Device >::
getView() -> ViewType
          typename Device,
          typename Fetch,
          typename Reduction,
          typename ResultKeeper,
          int StreamedMemory >
struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, StreamedMemory, true >
{
   return *this;
};

template< typename Index,
          typename Device >
auto
CSRAdaptiveKernelView< Index, Device >::
getConstView() const -> ConstViewType
{
   return *this;
}

template< typename Index,
          typename Device >
TNL::String
CSRAdaptiveKernelView< Index, Device >::
getKernelType()
   template< typename BlocksView,
             typename Offsets,
             typename Real,
             typename... Args >
   static void reduce( const Offsets& offsets,
                       const BlocksView& blocks,
                       Index first,
                       Index last,
                       Fetch& fetch,
                       const Reduction& reduction,
                       ResultKeeper& keeper,
                       const Real& zero,
                       Args... args)
   {
   return "Adaptive";
      TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
         segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
   }
};

template< typename Index,
          typename Device >
   template< typename OffsetsView,
          typename Device,
          typename Fetch,
          typename Reduction,
          typename ResultKeeper,
          int StreamedMemory >
struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, StreamedMemory, false >
{
   template< typename BlocksView,
             typename Offsets,
             typename Real,
             typename... Args >
void
CSRAdaptiveKernelView< Index, Device >::
segmentsReduction( const OffsetsView& offsets,
   static void reduce( const Offsets& offsets,
                       const BlocksView& blocks,
                       Index first,
                       Index last,
                       Fetch& fetch,
                       const Reduction& reduction,
                       ResultKeeper& keeper,
                       const Real& zero,
                   Args... args ) const
                       Args... args)
   {
#ifdef HAVE_CUDA
   if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() )
   {
      TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
         segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
      return;
   }

   //constexpr int warpSize = 32;

      Index blocksCount;
      const Index threads = details::CSRAdaptiveKernelParameters< Real, StreamedMemory >::CudaBlockSize();

   const Index threads = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize();

   /* Fill blocks */
   size_t neededThreads = this->blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block
   /* Execute kernels on device */
      // Fill blocks
      size_t neededThreads = blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block
      // Execute kernels on device
      for( Index gridIdx = 0; neededThreads != 0; gridIdx++ )
      {
         if( Cuda::getMaxGridSize() * threads >= neededThreads )
@@ -269,11 +257,12 @@ segmentsReduction( const OffsetsView& offsets,
         }

         segmentsReductionCSRAdaptiveKernel<
               StreamedMemory,
               BlocksView,
            OffsetsView,
               Offsets,
               Index, Fetch, Reduction, ResultKeeper, Real, Args... >
            <<<blocksCount, threads>>>(
            this->blocks,
               blocks,
               gridIdx,
               offsets,
               first,
@@ -286,11 +275,99 @@ segmentsReduction( const OffsetsView& offsets,
      }
#endif
   }
};



template< typename Index,
          typename Device >
CSRAdaptiveKernelView< Index, Device >&
CSRAdaptiveKernelView< Index, Device >::
          typename Device,
          int StreamedMemory >
CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
CSRAdaptiveKernelView( BlocksType& blocks )
{
   this->blocks.bind( blocks );
}

template< typename Index,
          typename Device,
          int StreamedMemory >
void
CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
setBlocks( BlocksType& blocks )
{
   this->blocks.bind( blocks );
}

template< typename Index,
          typename Device,
          int StreamedMemory >
auto
CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
getBlocks() const -> const BlocksView& 
{
   return this->blocks;
}

template< typename Index,
          typename Device,
          int StreamedMemory >
auto
CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
getView() -> ViewType
{
   return *this;
};

template< typename Index,
          typename Device,
          int StreamedMemory >
auto
CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
getConstView() const -> ConstViewType
{
   return *this;
}

template< typename Index,
          typename Device,
          int StreamedMemory >
TNL::String
CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
getKernelType()
{
   return "Adaptive";
}

template< typename Index,
          typename Device,
          int StreamedMemory >
   template< typename Offsets,
             typename Fetch,
             typename Reduction,
             typename ResultKeeper,
             typename Real,
             typename... Args >
void
CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
segmentsReduction( const Offsets& offsets,
                   Index first,
                   Index last,
                   Fetch& fetch,
                   const Reduction& reduction,
                   ResultKeeper& keeper,
                   const Real& zero,
                   Args... args ) const
{

   CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, StreamedMemory  >::template
      reduce< BlocksView, Offsets, Real, Args... >( offsets, this->getBlocks(), first, last, fetch, reduction, keeper, zero, args... );
}

template< typename Index,
          typename Device,
          int StreamedMemory >
CSRAdaptiveKernelView< Index, Device, StreamedMemory >&
CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView )
{
   this->blocks.bind( kernelView.blocks );