Commit 3e9f89a0 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Renaming CSRVectorKernel to CSRKernelVector and maiking separate source files.

parent 69c12aea
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -138,7 +138,7 @@ using CSRScalar = CSR< Device, Index, CSRKernelScalar< Index, Device >, IndexAll
template< typename Device,
          typename Index,
          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
using CSRVector = CSR< Device, Index, CSRVectorKernel< Index, Device >, IndexAllocator >;
using CSRVector = CSR< Device, Index, CSRKernelVector< Index, Device >, IndexAllocator >;

template< typename Device,
          typename Index,
+62 −0
Original line number Diff line number Diff line
/***************************************************************************
                          CSRKernelVector.h -  description
                             -------------------
    begin                : Jan 23, 2021 -> Joe Biden inauguration
    copyright            : (C) 2021 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

#include <TNL/Assert.h>
#include <TNL/Cuda/LaunchHelpers.h>
#include <TNL/Containers/VectorView.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>

namespace TNL {
   namespace Algorithms {
      namespace Segments {

template< typename Index,
          typename Device >
struct CSRKernelVector
{
    using IndexType = Index;
    using DeviceType = Device;
    using ViewType = CSRKernelVector< Index, Device >;
    using ConstViewType = CSRKernelVector< Index, Device >;

    template< typename Offsets >
    void init( const Offsets& offsets );

    void reset();

    ViewType getView();

    ConstViewType getConstView() const;


    template< typename OffsetsView,
              typename Fetch,
              typename Reduction,
              typename ResultKeeper,
              typename Real,
              typename... Args >
    static void segmentsReduction( const OffsetsView& offsets,
                                   Index first,
                                   Index last,
                                   Fetch& fetch,
                                   const Reduction& reduction,
                                   ResultKeeper& keeper,
                                   const Real& zero,
                                   Args... args );
};

      } // namespace Segments
   }  // namespace Algorithms
} // namespace TNL

#include <TNL/Algorithms/Segments/CSRKernelVector.hpp>
+152 −0
Original line number Diff line number Diff line
/***************************************************************************
                          CSRKernelVector.hpp -  description
                             -------------------
    begin                : Jan 23, 2021 -> Joe Biden inauguration
    copyright            : (C) 2021 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

#include <TNL/Assert.h>
#include <TNL/Cuda/LaunchHelpers.h>
#include <TNL/Containers/VectorView.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
#include <TNL/Algorithms/Segments/CSRKernelVector.h>

namespace TNL {
   namespace Algorithms {
      namespace Segments {

#ifdef HAVE_CUDA
template< typename Offsets,
          typename Index,
          typename Fetch,
          typename Reduction,
          typename ResultKeeper,
          typename Real,
          typename... Args >
__global__
void segmentsReductionCSRKernelVector(
    int gridIdx,
    const Offsets offsets,
    Index first,
    Index last,
    Fetch fetch,
    const Reduction reduce,
    ResultKeeper keep,
    const Real zero,
    Args... args )
{
    /***
     * We map one warp to each segment
     */
    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first;
    if( segmentIdx >= last )
        return;

    const int laneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 ); // & is cheaper than %
    TNL_ASSERT_LT( segmentIdx + 1, offsets.getSize(), "" );
    Index endIdx = offsets[ segmentIdx + 1 ];

    Index localIdx( laneIdx );
    Real aux = zero;
    bool compute( true );
    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() )
    {
        TNL_ASSERT_LT( globalIdx, endIdx, "" );
        aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
        localIdx += TNL::Cuda::getWarpSize();
    }

   /****
    * Reduction in each warp which means in each segment.
    */
   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) );
   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  8 ) );
   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  4 ) );
   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  2 ) );
   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  1 ) );

   if( laneIdx == 0 )
     keep( segmentIdx, aux );
}
#endif

template< typename Index,
          typename Device >
    template< typename Offsets >
void
CSRKernelVector< Index, Device >::
init( const Offsets& offsets )
{
}

template< typename Index,
          typename Device >
void
CSRKernelVector< Index, Device >::
reset()
{
}

template< typename Index,
          typename Device >
auto
CSRKernelVector< Index, Device >::
getView() -> ViewType
{
    return *this;
}

template< typename Index,
          typename Device >
auto
CSRKernelVector< Index, Device >::
getConstView() const -> ConstViewType
{
    return *this;
};


template< typename Index,
          typename Device >
    template< typename OffsetsView,
              typename Fetch,
              typename Reduction,
              typename ResultKeeper,
              typename Real,
              typename... Args >
void
CSRKernelVector< Index, Device >::
segmentsReduction( const OffsetsView& offsets,
                         Index first,
                         Index last,
                         Fetch& fetch,
                         const Reduction& reduction,
                         ResultKeeper& keeper,
                         const Real& zero,
                         Args... args )
{
#ifdef HAVE_CUDA
    const Index warpsCount = last - first;
    const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize();
    dim3 blocksCount, gridsCount, blockSize( 256 );
    TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
    dim3 gridIdx;
    for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x ++ )
    {
        dim3 gridSize;
        TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
        segmentsReductionCSRKernelVector< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... >
        <<< gridSize, blockSize >>>(
            gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... );
    };
#endif
}
      } // namespace Segments
   }  // namespace Algorithms
} // namespace TNL
+0 −106
Original line number Diff line number Diff line
@@ -20,112 +20,6 @@ namespace TNL {
   namespace Algorithms {
      namespace Segments {

#ifdef HAVE_CUDA
template< typename Offsets,
          typename Index,
          typename Fetch,
          typename Reduction,
          typename ResultKeeper,
          typename Real,
          typename... Args >
__global__
void segmentsReductionCSRVectorKernel(
    int gridIdx,
    const Offsets offsets,
    Index first,
    Index last,
    Fetch fetch,
    const Reduction reduce,
    ResultKeeper keep,
    const Real zero,
    Args... args )
{
    /***
     * We map one warp to each segment
     */
    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first;
    if( segmentIdx >= last )
        return;

    const int laneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 ); // & is cheaper than %
    TNL_ASSERT_LT( segmentIdx + 1, offsets.getSize(), "" );
    Index endIdx = offsets[ segmentIdx + 1 ];

    Index localIdx( laneIdx );
    Real aux = zero;
    bool compute( true );
    for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() )
    {
        TNL_ASSERT_LT( globalIdx, endIdx, "" );
        aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
        localIdx += TNL::Cuda::getWarpSize();
    }

   /****
    * Reduction in each warp which means in each segment.
    */
   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) );
   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  8 ) );
   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  4 ) );
   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  2 ) );
   aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux,  1 ) );

   if( laneIdx == 0 )
     keep( segmentIdx, aux );
}
#endif

template< typename Index,
          typename Device >
struct CSRVectorKernel
{
    using IndexType = Index;
    using DeviceType = Device;
    using ViewType = CSRVectorKernel< Index, Device >;
    using ConstViewType = CSRVectorKernel< Index, Device >;

    template< typename Offsets >
    void init( const Offsets& offsets ) {};

    void reset(){};

    ViewType getView() { return *this; };

    ConstViewType getConstView() const { return *this; };


    template< typename OffsetsView,
              typename Fetch,
              typename Reduction,
              typename ResultKeeper,
              typename Real,
              typename... Args >
    static void segmentsReduction( const OffsetsView& offsets,
                               Index first,
                               Index last,
                               Fetch& fetch,
                               const Reduction& reduction,
                               ResultKeeper& keeper,
                               const Real& zero,
                               Args... args )
    {
#ifdef HAVE_CUDA
        const Index warpsCount = last - first;
        const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize();
        dim3 blocksCount, gridsCount, blockSize( 256 );
        TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
        dim3 gridIdx;
        for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x ++ )
        {
            dim3 gridSize;
            TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
            segmentsReductionCSRVectorKernel< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... >
            <<< gridSize, blockSize >>>(
                gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... );
        };
#endif
    }
};


#ifdef HAVE_CUDA
+2 −1
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/Segments/SegmentView.h>
#include <TNL/Algorithms/Segments/CSRKernelScalar.h>
#include <TNL/Algorithms/Segments/CSRKernelVector.h>
#include <TNL/Algorithms/Segments/CSRKernels.h>

namespace TNL {
@@ -136,7 +137,7 @@ using CSRViewScalar = CSRView< Device, Index, CSRKernelScalar< Index, Device > >

template< typename Device,
          typename Index >
using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< Index, Device > >;
using CSRViewVector = CSRView< Device, Index, CSRKernelVector< Index, Device > >;

template< typename Device,
          typename Index >