Commit 45ad3fa7 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber Committed by Jakub Klinkovský
Browse files

Implementing Light CSR kernel for segments.

parent 14060b9f
Loading
Loading
Loading
Loading
+5 −1
Original line number Diff line number Diff line
@@ -546,6 +546,11 @@ template< typename Device,
          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
using CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAllocator >;

template< typename Device,
          typename Index,
          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >;

template< typename Device,
          typename Index,
          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
@@ -556,7 +561,6 @@ template< typename Device,
          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
using CSRDefault = CSRScalar< Device, Index, IndexAllocator >;


      } // namespace Segments
   }  // namespace Algorithms
} // namespace TNL
+7 −1
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@
#include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h>
#include <TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h>
#include <TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h>
#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h>
#include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.h>
#include <TNL/Algorithms/Segments/SegmentsPrinting.h>

@@ -163,9 +164,14 @@ template< typename Device,
          typename Index >
using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< Index, Device > >;

template< typename Device,
          typename Index,
          int ThreadsInBlock = 256 >
using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device, ThreadsInBlock > >;

template< typename Device,
          typename Index >
using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device > >;
using CSRViewLight = CSRView< Device, Index, CSRLightKernel< Index, Device > >;

template< typename Device,
          typename Index >
+64 −0
Original line number Diff line number Diff line
/***************************************************************************
                          CSRLightKernel.h -  description
                             -------------------
    begin                : Jun 9, 2021 -> Joe Biden inauguration
    copyright            : (C) 2021 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

#include <TNL/Assert.h>
#include <TNL/Cuda/LaunchHelpers.h>
#include <TNL/Containers/VectorView.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>

namespace TNL {
   namespace Algorithms {
      namespace Segments {

template< typename Index,
          typename Device >
struct CSRLightKernel
{
   using IndexType = Index;
   using DeviceType = Device;
   using ViewType = CSRLightKernel< Index, Device >;
   using ConstViewType = CSRLightKernel< Index, Device >;

   template< typename Offsets >
   void init( const Offsets& offsets );

   void reset();

   ViewType getView();

   ConstViewType getConstView() const;

   static TNL::String getKernelType();

   template< typename OffsetsView,
             typename Fetch,
             typename Reduction,
             typename ResultKeeper,
             typename Real >
   void reduceSegments( const OffsetsView& offsets,
                        Index first,
                        Index last,
                        Fetch& fetch,
                        const Reduction& reduction,
                        ResultKeeper& keeper,
                        const Real& zero ) const;

   protected:
      int threadsPerSegment = 0;
};

      } // namespace Segments
   }  // namespace Algorithms
} // namespace TNL

#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp>
+422 −0
Original line number Diff line number Diff line
/***************************************************************************
                          CSRLightKernel.hpp -  description
                             -------------------
    begin                : Jun 9, 2021 -> Joe Biden inauguration
    copyright            : (C) 2021 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

#include <TNL/Assert.h>
#include <TNL/Cuda/LaunchHelpers.h>
#include <TNL/Containers/VectorView.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h>

namespace TNL {
   namespace Algorithms {
      namespace Segments {

#ifdef HAVE_CUDA
template< typename Real,
          typename Index,
          typename OffsetsView,
          typename Fetch,
          typename Reduce,
          typename Keep >
__global__
void SpMVCSRLightWithoutAtomic2( OffsetsView offsets,
                                 const Index first,
                                 const Index last,
                                 Fetch fetch,
                                 Reduce reduce,
                                 Keep keep,
                                 const Real zero,
                                 const Index gridID)
{
   const Index segmentIdx =
      first + ( ( gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x ) / 2;
   if( segmentIdx >= last )
      return;

   const Index inGroupID = threadIdx.x & 1; // & is cheaper than %
   const Index maxID = offsets[ segmentIdx  + 1];

   Real result = zero;
   for( Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 2)
      result = reduce( result, fetch( i, true ) );

   /* Parallel reduction */
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );

   /* Write result */
   if( inGroupID == 0 )
      keep( segmentIdx, result );
}

template< typename Real,
          typename Index,
          typename OffsetsView,
          typename Fetch,
          typename Reduce,
          typename Keep >
__global__
void SpMVCSRLightWithoutAtomic4( OffsetsView offsets,
                                 const Index first,
                                 const Index last,
                                 Fetch fetch,
                                 Reduce reduce,
                                 Keep keep,
                                 const Real zero,
                                 const Index gridID )
{
   const Index segmentIdx =
      first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / 4;
   if (segmentIdx >= last)
      return;

   const Index inGroupID = threadIdx.x & 3; // & is cheaper than %
   const Index maxID = offsets[segmentIdx + 1];

   Real result = zero;
   for (Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 4)
      result = reduce( result, fetch( i, true ) );

   /* Parallel reduction */
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) );
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );

   /* Write result */
   if( inGroupID == 0 )
      keep( segmentIdx, result );

}

template< typename Real,
          typename Index,
          typename OffsetsView,
          typename Fetch,
          typename Reduce,
          typename Keep >
__global__
void SpMVCSRLightWithoutAtomic8( OffsetsView offsets,
                                 const Index first,
                                 const Index last,
                                 Fetch fetch,
                                 Reduce reduce,
                                 Keep keep,
                                 const Real zero,
                                 const Index gridID)
{
   const Index segmentIdx =
      first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / 8;
   if (segmentIdx >= last)
      return;

   Index i;
   const Index inGroupID = threadIdx.x & 7; // & is cheaper than %
   const Index maxID = offsets[segmentIdx + 1];

   Real result = zero;
   for (i = offsets[segmentIdx] + inGroupID; i < maxID; i += 8)
      result = reduce( result, fetch( i, true ) );

   /* Parallel reduction */
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) );
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) );
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );

   /* Write result */
   if( inGroupID == 0 )
      keep( segmentIdx, result );
}

template< typename Real,
          typename Index,
          typename OffsetsView,
          typename Fetch,
          typename Reduce,
          typename Keep >
__global__
void SpMVCSRLightWithoutAtomic16( OffsetsView offsets,
                                  const Index first,
                                  const Index last,
                                  Fetch fetch,
                                  Reduce reduce,
                                  Keep keep,
                                  const Real zero,
                                  const Index gridID )
{
   const Index segmentIdx =
      first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x ) / 16;
   if( segmentIdx >= last )
      return;


   Index i;
   const Index inGroupID = threadIdx.x & 15; // & is cheaper than %
   const Index maxID = offsets[segmentIdx + 1];

   Real result = zero;
   for( i = offsets[segmentIdx] + inGroupID; i < maxID; i += 16 )
      result = reduce( result, fetch( i, true ) );

   /* Parallel reduction */
   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );

   /* Write result */
   if( inGroupID == 0 )
      keep( segmentIdx, result );
}

template< typename Real,
          typename Index,
          typename OffsetsView,
          typename Fetch,
          typename Reduce,
          typename Keep >
__global__
void SpMVCSRVector( OffsetsView offsets,
                    const Index first,
                    const Index last,
                    Fetch fetch,
                    Reduce reduce,
                    Keep keep,
                    const Real zero,
                    const Index gridID )
{
   const int warpSize = 32;
   const Index warpID = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize;
   if (warpID >= last)
      return;

   Real result = zero;
   const Index laneID = threadIdx.x & 31; // & is cheaper than %
   Index endID = offsets[warpID + 1];

   /* Calculate result */
   for (Index i = offsets[warpID] + laneID; i < endID; i += warpSize)
      result = reduce( result, fetch( i, true ) );

   /* Reduction */
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) );
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  8 ) );
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
   /* Write result */
   if( laneID == 0 )
      keep( warpID, result );
}
#endif
template< typename Index,
          typename Device,
          typename Fetch,
          typename Reduce,
          typename Keep,
          bool DispatchScalarCSR =
            details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ||
            std::is_same< Device, Devices::Host >::value >
struct CSRLightKernelreduceSegmentsDispatcher;

template< typename Index,
          typename Device,
          typename Fetch,
          typename Reduction,
          typename ResultKeeper >
struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true >
{

   template< typename Offsets,
             typename Real >
   static void reduce( const Offsets& offsets,
                       Index first,
                       Index last,
                       Fetch& fetch,
                       const Reduction& reduce,
                       ResultKeeper& keep,
                       const Real& zero,
                       const Index threadsPerSegment )
   {
      TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
         reduceSegments( offsets, first, last, fetch, reduce, keep, zero );
   }
};

template< typename Index,
          typename Device,
          typename Fetch,
          typename Reduce,
          typename Keep >
struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep, false >
{
   template< typename OffsetsView,
             typename Real >
   static void reduce( const OffsetsView& offsets,
                       Index first,
                       Index last,
                       Fetch& fetch,
                       const Reduce& reduce,
                       Keep& keep,
                       const Real& zero,
                       const Index threadsPerSegment )
   {
#ifdef HAVE_CUDA
      const int threads = 128;
      Index blocks, groupSize;
      //if (KernelType == CSRLightWithoutAtomic)
      int  neededThreads = threadsPerSegment * ( last - first );
      //else
      //   neededThreads = rows * (threadsPerSegment > 32 ? 32 : threadsPerSegment);

      /* Execute kernels on device */
      for (Index grid = 0; neededThreads != 0; ++grid)
      {
         if( TNL::Cuda::getMaxGridXSize() * threads >= neededThreads)
         {
            blocks = roundUpDivision(neededThreads, threads);
            neededThreads = 0;
         }
         else
         {
            blocks = TNL::Cuda::getMaxGridXSize();
            neededThreads -= TNL::Cuda::getMaxGridXSize() * threads;
         }

         if (threadsPerSegment == 2)
            SpMVCSRLightWithoutAtomic2<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
               offsets, first, last, fetch, reduce, keep, zero, grid );
         else if (threadsPerSegment == 4)
            SpMVCSRLightWithoutAtomic4<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
               offsets, first, last, fetch, reduce, keep, zero, grid );
         else if (threadsPerSegment == 8)
            SpMVCSRLightWithoutAtomic8<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
               offsets, first, last, fetch, reduce, keep, zero, grid );
         else if (threadsPerSegment == 16)
            SpMVCSRLightWithoutAtomic16<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
               offsets, first, last, fetch, reduce, keep, zero, grid );
         else // if (threadsPerSegment == 32)
         { // CSR SpMV Light with threadsPerSegment = 32 is CSR Vector
            SpMVCSRVector<Real, Index, OffsetsView, Fetch, Reduce, Keep, warpSize ><<<blocks, threads>>>(
               offsets, first, last, fetch, reduce, keep, zero, grid );
         }
         /*else
         { // Execute CSR MultiVector
            SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>(
                     inVector, outVector, matrix.getoffsets().getData(),
                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
                     rows, threadsPerSegment / 32, grid
            );
         }*/

      }
#endif

   }
};


template< typename Index,
          typename Device >
    template< typename Offsets >
void
CSRLightKernel< Index, Device >::
init( const Offsets& offsets )
{
   //const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount );
   //this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ) ); //TNL::Cuda::getWarpSize() );

   const Index segmentsCount = offsets.getSize() - 1;
   //const Index threads = 128; // !!!!!!!!!!!!!!!!!!!!!! block size
   size_t neededThreads = segmentsCount * 32;//warpSize;
   Index blocks, threadsPerSegment;

   const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row
   if( elementsInSegment <= 2 )
      threadsPerSegment = 2;
   else if( elementsInSegment <= 4 )
      threadsPerSegment = 4;
   else if( elementsInSegment <= 8 )
      threadsPerSegment = 8;
   else if( elementsInSegment <= 16 )
      threadsPerSegment = 16;
   else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
      threadsPerSegment = 32; // CSR Vector
   //else
   //   threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector

   TNL_ASSERT_GE( threadsPerSegment, 0, "" );
   TNL_ASSERT_LE( threadsPerSegment, 33, "" );

}

template< typename Index,
          typename Device >
void
CSRLightKernel< Index, Device >::
reset()
{
    this->threadsPerSegment = 0;
}

template< typename Index,
          typename Device >
auto
CSRLightKernel< Index, Device >::
getView() -> ViewType
{
    return *this;
}

template< typename Index,
          typename Device >
TNL::String
CSRLightKernel< Index, Device >::
getKernelType()
{
    return "Light";
}

template< typename Index,
          typename Device >
auto
CSRLightKernel< Index, Device >::
getConstView() const -> ConstViewType
{
    return *this;
};


template< typename Index,
          typename Device >
    template< typename OffsetsView,
              typename Fetch,
              typename Reduce,
              typename Keep,
              typename Real >
void
CSRLightKernel< Index, Device >::
reduceSegments( const OffsetsView& offsets,
                Index first,
                Index last,
                Fetch& fetch,
                const Reduce& reduce,
                Keep& keep,
                const Real& zero ) const
{
   TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
   TNL_ASSERT_LE( this->threadsPerSegment, 33, "" );
   CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep >::reduce(
      offsets, first, last, fetch, reduce, keep, zero, this->threadsPerSegment );
}

      } // namespace Segments
   }  // namespace Algorithms
} // namespace TNL
+2 −0
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@ set( COMMON_TESTS
            SparseMatrixTest_CSRScalar
            SparseMatrixTest_CSRVector
            SparseMatrixTest_CSRHybrid
            SparseMatrixTest_CSRLight
            SparseMatrixTest_CSRAdaptive
            SparseMatrixTest_Ellpack
            SparseMatrixTest_SlicedEllpack
@@ -16,6 +17,7 @@ set( COMMON_TESTS
            SparseMatrixVectorProductTest_CSRScalar
            SparseMatrixVectorProductTest_CSRVector
            SparseMatrixVectorProductTest_CSRHybrid
            SparseMatrixVectorProductTest_CSRLight
            SparseMatrixVectorProductTest_CSRAdaptive
            SparseMatrixVectorProductTest_Ellpack
            SparseMatrixVectorProductTest_SlicedEllpack
Loading