Commit 60161718 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Renaming CSRLightKernel to CSRKernelHyrbid and making separate source files.

parent 3e9f89a0
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -143,7 +143,7 @@ using CSRVector = CSR< Device, Index, CSRKernelVector< Index, Device >, IndexAll
template< typename Device,
          typename Index,
          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >;
using CSRHybrid = CSR< Device, Index, CSRKernelHybrid< Index, Device >, IndexAllocator >;

template< typename Device,
          typename Index,
+65 −0
Original line number Diff line number Diff line
/***************************************************************************
                          CSRKernelHybrid.h -  description
                             -------------------
    begin                : Jan 23, 2021 -> Joe Biden inauguration
    copyright            : (C) 2021 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

#include <TNL/Assert.h>
#include <TNL/Cuda/LaunchHelpers.h>
#include <TNL/Containers/VectorView.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>

namespace TNL {
   namespace Algorithms {
      namespace Segments {

template< typename Index,
          typename Device >
struct CSRKernelHybrid
{
   using IndexType = Index;
   using DeviceType = Device;
   using ViewType = CSRKernelHybrid< Index, Device >;
   using ConstViewType = CSRKernelHybrid< Index, Device >;

   template< typename Offsets >
   void init( const Offsets& offsets );

   void reset();

   ViewType getView();

   ConstViewType getConstView() const;


   template< typename OffsetsView,
             typename Fetch,
             typename Reduction,
             typename ResultKeeper,
             typename Real,
             typename... Args >
   void segmentsReduction( const OffsetsView& offsets,
                                  Index first,
                                  Index last,
                                  Fetch& fetch,
                                  const Reduction& reduction,
                                  ResultKeeper& keeper,
                                  const Real& zero,
                                  Args... args ) const;

   protected:
      int threadsPerSegment;
};

      } // namespace Segments
   }  // namespace Algorithms
} // namespace TNL

#include <TNL/Algorithms/Segments/CSRKernelHybrid.hpp>
+195 −0
Original line number Diff line number Diff line
/***************************************************************************
                          CSRKernelHybrid.hpp -  description
                             -------------------
    begin                : Jan 23, 2021 -> Joe Biden inauguration
    copyright            : (C) 2021 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

#include <TNL/Assert.h>
#include <TNL/Cuda/LaunchHelpers.h>
#include <TNL/Containers/VectorView.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
#include <TNL/Algorithms/Segments/CSRKernelHybrid.h>

namespace TNL {
   namespace Algorithms {
      namespace Segments {

#ifdef HAVE_CUDA
template< int ThreadsPerSegment,
          typename Offsets,
          typename Index,
          typename Fetch,
          typename Reduction,
          typename ResultKeeper,
          typename Real,
          typename... Args >
__global__
void segmentsReductionCSRHybridKernel(
    int gridIdx,
    const Offsets offsets,
    Index first,
    Index last,
    Fetch fetch,
    const Reduction reduce,
    ResultKeeper keep,
    const Real zero,
    Args... args )
{
    /***
     * We map one warp to each segment
     */
    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first;
    if( segmentIdx >= last )
        return;

    const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
    Index endIdx = offsets[ segmentIdx + 1] ;

    Index localIdx( laneIdx );
    Real aux = zero;
    bool compute( true );
    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment )
    {
      aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
      localIdx += TNL::Cuda::getWarpSize();
    }

    /****
     * Reduction in each segment.
     */
    if( ThreadsPerSegment == 32 )
        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) );
    if( ThreadsPerSegment >= 16 )
        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  8 ) );
    if( ThreadsPerSegment >= 8 )
        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  4 ) );
    if( ThreadsPerSegment >= 4 )
        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  2 ) );
    if( ThreadsPerSegment >= 2 )
        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  1 ) );

    if( laneIdx == 0 )
        keep( segmentIdx, aux );
}
#endif



template< typename Index,
          typename Device >
    template< typename Offsets >
void
CSRKernelHybrid< Index, Device >::
init( const Offsets& offsets )
{
    const Index segmentsCount = offsets.getSize() - 1;
    const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount );
    this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() );
    TNL_ASSERT_GE( threadsPerSegment, 0, "" );
    TNL_ASSERT_LE( threadsPerSegment, 32, "" );
}

template< typename Index,
          typename Device >
void
CSRKernelHybrid< Index, Device >::
reset()
{
    this->threadsPerSegment = 0;
}

template< typename Index,
          typename Device >
auto
CSRKernelHybrid< Index, Device >::
getView() -> ViewType
{
    return *this;
}

template< typename Index,
          typename Device >
auto
CSRKernelHybrid< Index, Device >::
getConstView() const -> ConstViewType
{
    return *this;
};


template< typename Index,
          typename Device >
    template< typename OffsetsView,
              typename Fetch,
              typename Reduction,
              typename ResultKeeper,
              typename Real,
              typename... Args >
void
CSRKernelHybrid< Index, Device >::
segmentsReduction( const OffsetsView& offsets,
                         Index first,
                         Index last,
                         Fetch& fetch,
                         const Reduction& reduction,
                         ResultKeeper& keeper,
                         const Real& zero,
                         Args... args ) const
{
    TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
    TNL_ASSERT_LE( this->threadsPerSegment, 32, "" );

#ifdef HAVE_CUDA
    const size_t threadsCount = this->threadsPerSegment * ( last - first );
    dim3 blocksCount, gridsCount, blockSize( 256 );
    TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
    //std::cerr << " this->threadsPerSegment = " << this->threadsPerSegment << " offsets = " << offsets << std::endl;
    for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
    {
        dim3 gridSize;
        TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
        switch( this->threadsPerSegment )
        {
            case 0:      // this means zero/empty matrix
                break;
            case 1:
                segmentsReductionCSRHybridKernel<  1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                    break;
            case 2:
                segmentsReductionCSRHybridKernel<  2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                    break;
            case 4:
                segmentsReductionCSRHybridKernel<  4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                    break;
            case 8:
                segmentsReductionCSRHybridKernel<  8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                    break;
            case 16:
                segmentsReductionCSRHybridKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                    break;
            case 32:
                segmentsReductionCSRHybridKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                    break;
            default:
                throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) );
        }
    }
#endif
}

      } // namespace Segments
   }  // namespace Algorithms
} // namespace TNL
+0 −152
Original line number Diff line number Diff line
@@ -21,158 +21,6 @@ namespace TNL {
      namespace Segments {



#ifdef HAVE_CUDA
template< int ThreadsPerSegment,
          typename Offsets,
          typename Index,
          typename Fetch,
          typename Reduction,
          typename ResultKeeper,
          typename Real,
          typename... Args >
__global__
void segmentsReductionCSRLightKernel(
    int gridIdx,
    const Offsets offsets,
    Index first,
    Index last,
    Fetch fetch,
    const Reduction reduce,
    ResultKeeper keep,
    const Real zero,
    Args... args )
{
    /***
     * We map one warp to each segment
     */
    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first;
    if( segmentIdx >= last )
        return;

    const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
    Index endIdx = offsets[ segmentIdx + 1] ;

    Index localIdx( laneIdx );
    Real aux = zero;
    bool compute( true );
    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment )
    {
      aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
      localIdx += TNL::Cuda::getWarpSize();
    }

    /****
     * Reduction in each segment.
     */
    if( ThreadsPerSegment == 32 )
        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) );
    if( ThreadsPerSegment >= 16 )
        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  8 ) );
    if( ThreadsPerSegment >= 8 )
        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  4 ) );
    if( ThreadsPerSegment >= 4 )
        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  2 ) );
    if( ThreadsPerSegment >= 2 )
        aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  1 ) );

    if( laneIdx == 0 )
        keep( segmentIdx, aux );
}
#endif

template< typename Index,
          typename Device >
struct CSRLightKernel
{
    using IndexType = Index;
    using DeviceType = Device;
    using ViewType = CSRLightKernel< Index, Device >;
    using ConstViewType = CSRLightKernel< Index, Device >;

    template< typename Offsets >
    void init( const Offsets& offsets )
    {
        const Index segmentsCount = offsets.getSize() - 1;
        const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount );
        this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() );
        TNL_ASSERT_GE( threadsPerSegment, 0, "" );
        TNL_ASSERT_LE( threadsPerSegment, 32, "" );
    };

    void reset() { this->threadsPerSegment = 0; }

    ViewType getView() { return *this; };

    ConstViewType getConstView() const { return *this; };

    template< typename OffsetsView,
              typename Fetch,
              typename Reduction,
              typename ResultKeeper,
              typename Real,
              typename... Args >
    void segmentsReduction( const OffsetsView& offsets,
                        Index first,
                        Index last,
                        Fetch& fetch,
                        const Reduction& reduction,
                        ResultKeeper& keeper,
                        const Real& zero,
                        Args... args ) const
    {
        TNL_ASSERT_GE( threadsPerSegment, 0, "" );
        TNL_ASSERT_LE( threadsPerSegment, 32, "" );

#ifdef HAVE_CUDA
        const size_t threadsCount = this->threadsPerSegment * ( last - first );
        dim3 blocksCount, gridsCount, blockSize( 256 );
        TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
        //std::cerr << " this->threadsPerSegment = " << this->threadsPerSegment << " offsets = " << offsets << std::endl;
        for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
        {
            dim3 gridSize;
            TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
            switch( this->threadsPerSegment )
            {
                case 0:      // this means zero/empty matrix
                    break;
                case 1:
                    segmentsReductionCSRLightKernel<  1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                        break;
                case 2:
                    segmentsReductionCSRLightKernel<  2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                        break;
                case 4:
                    segmentsReductionCSRLightKernel<  4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                        break;
                case 8:
                    segmentsReductionCSRLightKernel<  8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                        break;
                case 16:
                    segmentsReductionCSRLightKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                        break;
                case 32:
                    segmentsReductionCSRLightKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
                        gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
                        break;
                default:
                    throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) );
            }
        }
#endif
    }

    protected:
        int threadsPerSegment;
};


template< typename Index,
          typename Device >
struct CSRAdaptiveKernelView
+2 −1
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@
#include <TNL/Algorithms/Segments/SegmentView.h>
#include <TNL/Algorithms/Segments/CSRKernelScalar.h>
#include <TNL/Algorithms/Segments/CSRKernelVector.h>
#include <TNL/Algorithms/Segments/CSRKernelHybrid.h>
#include <TNL/Algorithms/Segments/CSRKernels.h>

namespace TNL {
@@ -141,7 +142,7 @@ using CSRViewVector = CSRView< Device, Index, CSRKernelVector< Index, Device > >

template< typename Device,
          typename Index >
using CSRViewLight = CSRView< Device, Index, CSRLightKernel< Index, Device > >;
using CSRViewHybrid = CSRView< Device, Index, CSRKernelHybrid< Index, Device > >;

template< typename Device,
          typename Index >
Loading