Commit 91806e1d authored by Tomáš Oberhuber
Browse files

Moving implementation of CSRAdaptiveKernel to .hpp file.

parent 37632fd9
Loading
Loading
Loading
Loading
+10 −84
Original line number Diff line number Diff line
@@ -63,10 +63,7 @@ struct CSRAdaptiveKernel
   using BlocksType = typename ViewType::BlocksType;
   using BlocksView = typename BlocksType::ViewType;

   // Returns the kernel type string; simply forwards to ViewType::getKernelType().
   static TNL::String getKernelType()
   {
      return ViewType::getKernelType();
   };
   static TNL::String getKernelType();

    static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;

@@ -93,84 +90,16 @@ struct CSRAdaptiveKernel
                    const Offsets& offsets,
                    const Index size,
                    details::Type &type,
                    Index &sum )
   {
      sum = 0;
      for (Index current = start; current < size - 1; current++ )
      {
         Index elements = offsets[ current + 1 ] - offsets[ current ];
         sum += elements;
         if( sum > SHARED_PER_WARP )
         {
            if( current - start > 0 ) // extra row
            {
               type = details::Type::STREAM;
               return current;
            }
            else
            {                  // one long row
               if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT )
                  type = details::Type::VECTOR;
               else
                  type = details::Type::LONG;
               return current + 1;
            }
         }
      }
      type = details::Type::STREAM;
      return size - 1; // return last row pointer
    }
                    Index &sum );

   template< typename Offsets >
   void init( const Offsets& offsets )
   {
      using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >;
      HostOffsetsType hostOffsets( offsets );
      const Index rows = offsets.getSize();
      Index sum, start( 0 ), nextStart( 0 );
   void init( const Offsets& offsets );

      // Fill blocks
      std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks;
      inBlocks.reserve( rows );
   void reset();

      while( nextStart != rows - 1 )
      {
         details::Type type;
         nextStart = findLimit( start, hostOffsets, rows, type, sum );
   ViewType getView();

         if( type == details::Type::LONG )
         {
            const Index blocksCount = inBlocks.size();
            const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize();
            Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
            if( warpsLeft == 0 )
               warpsLeft = warpsPerCudaBlock;
            for( Index index = 0; index < warpsLeft; index++ )
               inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft );
         }
         else
         {
            inBlocks.emplace_back(start, type,
                  nextStart,
                  offsets.getElement(nextStart),
                  offsets.getElement(start) );
         }
         start = nextStart;
      }
      inBlocks.emplace_back(nextStart);
      this->blocks = inBlocks;
      this->view.setBlocks( blocks );
   }

   // Resets the stored block descriptors and hands the (now reset) container
   // to the view again so that the view does not keep referring to the old blocks.
   void reset()
   {
      this->blocks.reset();
      this->view.setBlocks( blocks );
   }

   // Returns the kernel view held by this object (returned by value).
   ViewType getView() { return this->view; };

   // Returns the kernel view held by this object as a ConstViewType (returned by value).
   ConstViewType getConstView() const { return this->view; };
   ConstViewType getConstView() const;

   template< typename OffsetsView,
              typename Fetch,
@@ -185,10 +114,7 @@ struct CSRAdaptiveKernel
                        const Reduction& reduction,
                        ResultKeeper& keeper,
                        const Real& zero,
                        Args... args ) const
   {
      view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
   }
                        Args... args ) const;

   protected:
      BlocksType blocks;
+142 −1
Original line number Diff line number Diff line
@@ -22,6 +22,147 @@ namespace TNL {
   namespace Algorithms {
      namespace Segments {

template< typename Index,
          typename Device >
TNL::String
CSRAdaptiveKernel< Index, Device >::
getKernelType()
{
   return ViewType::getKernelType();
};


// Finds the end of the block of rows beginning at row `start`.
//
// Row lengths (offsets[ r + 1 ] - offsets[ r ]) are accumulated into `sum`
// until the running total exceeds SHARED_PER_WARP or row `size - 1` is reached.
//
//  - Overflow after at least one complete row: `type` is STREAM and the index
//    of the overflowing row is returned (that row starts the next block).
//  - Overflow on the very first row: `type` is VECTOR when the row length is
//    at most 2 * MAX_ELEMENTS_PER_WARP_ADAPT and LONG otherwise; `start + 1`
//    is returned.
//  - No overflow: `type` is STREAM and `size - 1` is returned.
//
// `sum` is an output parameter; on overflow it includes the overflowing row.
template< typename Index,
          typename Device >
   template< typename Offsets >
Index
CSRAdaptiveKernel< Index, Device >::
findLimit( const Index start,
           const Offsets& offsets,
           const Index size,
           details::Type &type,
           Index &sum )
{
   sum = 0;
   for( Index row = start; row < size - 1; row++ )
   {
      const Index rowLength = offsets[ row + 1 ] - offsets[ row ];
      sum += rowLength;
      if( sum <= SHARED_PER_WARP )
         continue;   // the row still fits, try to append the next one

      if( row > start )
      {
         // At least one whole row fits; the overflowing row is left for the next block.
         type = details::Type::STREAM;
         return row;
      }

      // A single row that does not fit on its own.
      type = ( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) ? details::Type::VECTOR
                                                        : details::Type::LONG;
      return row + 1;
   }

   // All remaining rows fit into one block.
   type = details::Type::STREAM;
   return size - 1; // last row pointer
}

// Builds the list of block descriptors for the given row offsets and passes
// it to the view.
//
// `offsets` is copied into a host vector first. The rows are then split into
// consecutive blocks by repeated calls to findLimit():
//  - for a LONG row, several descriptors (start, LONG, index, warpsLeft) are
//    emitted for the same row,
//  - for STREAM/VECTOR blocks one descriptor holding the first row, the type,
//    the next block's first row and the two corresponding offsets is emitted.
// A final descriptor constructed from the last row pointer terminates the list.
//
// Empty `offsets` leave the kernel in the same state as reset().
template< typename Index,
          typename Device >
   template< typename Offsets >
void
CSRAdaptiveKernel< Index, Device >::
init( const Offsets& offsets )
{
   const Index rows = offsets.getSize();
   if( rows == 0 )
   {
      // Without this guard `rows - 1` is negative, the loop below is entered
      // and offsets are read out of bounds.
      this->blocks.reset();
      this->view.setBlocks( blocks );
      return;
   }

   using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >;
   HostOffsetsType hostOffsets( offsets );
   Index sum, start( 0 ), nextStart( 0 );

   // Fill blocks
   std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks;
   inBlocks.reserve( rows );

   while( nextStart != rows - 1 )
   {
      details::Type type;
      nextStart = findLimit( start, hostOffsets, rows, type, sum );

      if( type == details::Type::LONG )
      {
         // Number of descriptors needed to round the current block count up
         // to a multiple of warpsPerCudaBlock; a full warpsPerCudaBlock when
         // the count is already a multiple.
         const Index blocksCount = inBlocks.size();
         const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize();
         Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
         if( warpsLeft == 0 )
            warpsLeft = warpsPerCudaBlock;
         for( Index index = 0; index < warpsLeft; index++ )
            inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft );
      }
      else
      {
         // Read from the host copy: element-wise access to `offsets` itself
         // may trigger a separate transfer for every element when `offsets`
         // is not allocated on the host.
         inBlocks.emplace_back(start, type,
               nextStart,
               hostOffsets[ nextStart ],
               hostOffsets[ start ] );
      }
      start = nextStart;
   }
   inBlocks.emplace_back(nextStart);
   this->blocks = inBlocks;
   this->view.setBlocks( blocks );
}

// Resets the stored block descriptors and passes the reset container to the
// view again so that the view no longer refers to the previous blocks.
template< typename Index,
          typename Device >
void
CSRAdaptiveKernel< Index, Device >::
reset()
{
   blocks.reset();
   view.setBlocks( this->blocks );
}

template< typename Index,
          typename Device >
auto
CSRAdaptiveKernel< Index, Device >::
getView() -> ViewType
{
   return this->view;
}

template< typename Index,
          typename Device >
auto
CSRAdaptiveKernel< Index, Device >::
getConstView() const -> ConstViewType
{
   return this->view;
};

// Performs the segments reduction by delegating to the kernel view.
// All arguments, including the optional trailing `args`, are passed on to
// view.segmentsReduction() unchanged.
template< typename Index,
          typename Device >
   template< typename OffsetsView,
               typename Fetch,
               typename Reduction,
               typename ResultKeeper,
               typename Real,
               typename... Args >
void
CSRAdaptiveKernel< Index, Device >::
segmentsReduction( const OffsetsView& offsets,
                   Index first,
                   Index last,
                   Fetch& fetch,
                   const Reduction& reduction,
                   ResultKeeper& keeper,
                   const Real& zero,
                   Args... args ) const
{
   this->view.segmentsReduction( offsets,
                                 first,
                                 last,
                                 fetch,
                                 reduction,
                                 keeper,
                                 zero,
                                 args... );
}

      } // namespace Segments
   }  // namespace Algorithms