Commit 46482276 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Renaming CSRKernelAdaptive to CSRAdaptiveKernel.

parent c1072c2b
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -155,7 +155,7 @@ using CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAll
template< typename Device,
          typename Index,
          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
using CSRAdaptive = CSR< Device, Index, CSRKernelAdaptive< Index, Device >, IndexAllocator >;
using CSRAdaptive = CSR< Device, Index, CSRAdaptiveKernel< Index, Device >, IndexAllocator >;

template< typename Device,
          typename Index,
+15 −156
Original line number Diff line number Diff line
/***************************************************************************
                          CSRKernels.h -  description
                          CSRAdaptiveKernel.h -  description
                             -------------------
    begin                : Jan 20, 2021 -> Joe Biden inauguration
    copyright            : (C) 2021 by Tomas Oberhuber
@@ -15,7 +15,7 @@
#include <TNL/Containers/VectorView.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
#include <TNL/Algorithms/Segments/CSRKernelScalar.h>
#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>

namespace TNL {
@@ -47,166 +47,23 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                    Reduction reduce,
                                    ResultKeeper keep,
                                    Real zero,
                                    Args... args )
{
   __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
   __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
   constexpr size_t MAX_X_DIM = 2147483647;
   const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
   const Index blockIdx = index / warpSize;
   if( blockIdx >= blocks.getSize() - 1 )
      return;

   if( threadIdx.x < CudaBlockSize / warpSize )
      multivectorShared[ threadIdx.x ] = zero;
   Real result = zero;
   bool compute( true );
   const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
   const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ];
   const Index& firstSegmentIdx = block.getFirstSegment();
   const Index begin = offsets[ firstSegmentIdx ];

   const auto blockType = block.getType();
   if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp
   {
      const Index warpIdx = threadIdx.x / 32;
      const Index end = begin + block.getSize();

      // Stream data to shared memory
      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize )
      {
         streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute );
            details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
         // TODO:: fix this by template specialization so that we can assume fetch lambda
         // with short parameters
      }

      const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();

      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize )
      {
         const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
         result = zero;
         // Scalar reduction
         for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
            result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
         keep( i, result );
      }
   }
   else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp
   {
      const Index end = begin + block.getSize();
      const Index segmentIdx = block.getFirstSegment();

      for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize )
         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx

      // Parallel reduction
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  8 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  4 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  2 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  1 ) );
      if( laneIdx == 0 )
         keep( segmentIdx, result );
   }
   else // blockType == Type::LONG - several warps per segment
   {
      // Number of elements processed by previous warps
      //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
      ///   block.getWarpIdx() * MAX_ELEM_PER_WARP;
      //Index to = begin + (block.getWarpIdx()  + 1) * MAX_ELEM_PER_WARP;
      const Index segmentIdx = block.getFirstSegment();//block.index[0];
      //minID = offsets[block.index[0] ];
      const Index end = offsets[segmentIdx + 1];
      //const int tid = threadIdx.x;
      //const int inBlockWarpIdx = block.getWarpIdx();

      //if( to > end )
      //   to = end;
      TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
      result = zero;
      //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx()  );
      for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
           globalIdx < end;
           globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
      {
         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
         //if( laneIdx == 0 )
         //   printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end,
         //    details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) );
         //result += values[i] * inVector[columnIndexes[i]];
      }
      //printf( "tid %d -> %d \n", tid, result );

      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
      result += __shfl_down_sync(0xFFFFFFFF, result, 1);

      //if( laneIdx == 0 )
      //   printf( "WARP RESULT: tid %d -> %d \n", tid, result );

      const Index warpID = threadIdx.x / 32;
      if( laneIdx == 0 )
         multivectorShared[ warpID ] = result;

      __syncthreads();
      // Reduction in multivectorShared
      if( block.getWarpIdx() == 0 && laneIdx < 16 )
      {
         constexpr int totalWarps = CudaBlockSize / warpSize;
         if( totalWarps >= 32 )
         {
            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
            __syncwarp();
         }
         if( totalWarps >= 16 )
         {
            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  8 ] );
            __syncwarp();
         }
         if( totalWarps >= 8 )
         {
            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  4 ] );
            __syncwarp();
         }
         if( totalWarps >= 4 )
         {
            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  2 ] );
            __syncwarp();
         }
         if( totalWarps >= 2 )
         {
            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  1 ] );
            __syncwarp();
         }
         if( laneIdx == 0 )
         {
            //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] );
            keep( segmentIdx, multivectorShared[ 0 ] );
         }
      }
   }
}
                                    Args... args );
#endif


template< typename Index,
          typename Device >
struct CSRKernelAdaptiveView
struct CSRAdaptiveKernelView
{
   using IndexType = Index;
   using DeviceType = Device;
   using ViewType = CSRKernelAdaptiveView< Index, Device >;
   using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
   using ViewType = CSRAdaptiveKernelView< Index, Device >;
   using ConstViewType = CSRAdaptiveKernelView< Index, Device >;
   using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
   using BlocksView = typename BlocksType::ViewType;

   CSRKernelAdaptiveView() = default;
   CSRAdaptiveKernelView() = default;

   CSRKernelAdaptiveView( BlocksType& blocks )
   CSRAdaptiveKernelView( BlocksType& blocks )
   {
      this->blocks.bind( blocks );
   };
@@ -243,7 +100,7 @@ struct CSRKernelAdaptiveView
#ifdef HAVE_CUDA
      if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() )
      {
         TNL::Algorithms::Segments::CSRKernelScalar< Index, Device >::
         TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
            segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
         return;
      }
@@ -318,7 +175,7 @@ struct CSRKernelAdaptiveView
#endif
   }

   CSRKernelAdaptiveView& operator=( const CSRKernelAdaptiveView< Index, Device >& kernelView )
   CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView )
   {
      this->blocks.bind( kernelView.blocks );
      return *this;
@@ -340,12 +197,12 @@ struct CSRKernelAdaptiveView

template< typename Index,
          typename Device >
struct CSRKernelAdaptive
struct CSRAdaptiveKernel
{
   using IndexType = Index;
   using DeviceType = Device;
   using ViewType = CSRKernelAdaptiveView< Index, Device >;
   using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
   using ViewType = CSRAdaptiveKernelView< Index, Device >;
   using ConstViewType = CSRAdaptiveKernelView< Index, Device >;
   using BlocksType = typename ViewType::BlocksType;
   using BlocksView = typename BlocksType::ViewType;

@@ -485,3 +342,5 @@ struct CSRKernelAdaptive
      } // namespace Segments
   }  // namespace Algorithms
} // namespace TNL

#include <TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp>
+197 −0
Original line number Diff line number Diff line
/***************************************************************************
                          CSRAdaptiveKernel.hpp -  description
                             -------------------
    begin                : Feb 7, 2021 -> Joe Biden inauguration
    copyright            : (C) 2021 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

#include <TNL/Assert.h>
#include <TNL/Cuda/LaunchHelpers.h>
#include <TNL/Containers/VectorView.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>

namespace TNL {
   namespace Algorithms {
      namespace Segments {

#ifdef HAVE_CUDA

/**
 * \brief CUDA kernel performing a reduction within CSR segments using
 * precomputed block descriptors.
 *
 * Every warp picks one descriptor from \e blocks (descriptor index = global
 * thread index / warpSize) and, depending on the descriptor type, runs one of
 * three strategies:
 *  - STREAM: the warp copies the fetched elements of the block into
 *    \e streamShared and then each lane reduces one whole segment sequentially,
 *  - VECTOR: the whole warp reduces a single segment using warp shuffles,
 *  - LONG (any other type): several warps share one segment; per-warp partial
 *    results are combined through \e multivectorShared.
 *
 * The lane index is computed with `& 31` and the shuffles use a full
 * 0xFFFFFFFF mask, i.e. the code assumes warps of 32 threads.
 *
 * \tparam CudaBlockSize      number of threads per CUDA block; sizes \e multivectorShared.
 * \tparam warpSize           number of threads per warp.
 * \tparam WARPS              first dimension of the \e streamShared buffer.
 * \tparam SHARED_PER_WARP    number of \e Real elements of \e streamShared per warp.
 * \tparam MAX_ELEM_PER_WARP  not used in the kernel body.
 *
 * \param blocks   view of block descriptors; the last entry is not processed.
 * \param gridIdx  index of the CUDA grid, used to offset the global thread index.
 * \param offsets  segment offsets; segment \e i spans offsets[ i ] .. offsets[ i + 1 ].
 * \param first    not used in the kernel body.
 * \param last     not used in the kernel body.
 * \param fetch    lambda fetching an element, called via details::FetchLambdaAdapter.
 * \param reduce   binary reduction operation.
 * \param keep     lambda storing the result, called as keep( segmentIdx, value ).
 * \param zero     initial value of each reduction.
 * \param args     not used in the kernel body.
 *
 * NOTE(review): the LONG branch calls __syncthreads() inside a branch selected
 * per warp and after a possible early return of other warps. This is only
 * well-defined if all warps of the CUDA block take the LONG branch together -
 * presumably guaranteed by the way the descriptors are generated; confirm
 * against the host-side launch code.
 */
template< int CudaBlockSize,
          int warpSize,
          int WARPS,
          int SHARED_PER_WARP,
          int MAX_ELEM_PER_WARP,
          typename BlocksView,
          typename Offsets,
          typename Index,
          typename Fetch,
          typename Reduction,
          typename ResultKeeper,
          typename Real,
          typename... Args >
__global__ void
segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                    int gridIdx,
                                    Offsets offsets,
                                    Index first,
                                    Index last,
                                    Fetch fetch,
                                    Reduction reduce,
                                    ResultKeeper keep,
                                    Real zero,
                                    Args... args )
{
   __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
   __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
   constexpr size_t MAX_X_DIM = 2147483647;
   const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
   // One descriptor per warp. The name deliberately differs from the CUDA
   // built-in `blockIdx` to avoid shadowing it.
   const Index descriptorIdx = index / warpSize;
   if( descriptorIdx >= blocks.getSize() - 1 )
      return;

   // NOTE(review): there is no barrier between this initialization and the
   // per-warp writes to multivectorShared in the LONG branch below.
   if( threadIdx.x < CudaBlockSize / warpSize )
      multivectorShared[ threadIdx.x ] = zero;
   Real result = zero;
   bool compute( true );
   const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
   const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ descriptorIdx ];
   const Index& firstSegmentIdx = block.getFirstSegment();
   const Index begin = offsets[ firstSegmentIdx ];

   const auto blockType = block.getType();
   if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp
   {
      const Index warpIdx = threadIdx.x / 32;
      const Index end = begin + block.getSize();

      // Stream data to shared memory
      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize )
      {
         streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute );
            details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
         // TODO:: fix this by template specialization so that we can assume fetch lambda
         // with short parameters
      }

      // Each lane is about to read elements stored by other lanes of the same
      // warp, so the shared-memory writes above must be made visible first.
      __syncwarp();

      const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();

      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize )
      {
         const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
         result = zero;
         // Scalar reduction
         for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
            result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
         keep( i, result );
      }
   }
   else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp
   {
      const Index end = begin + block.getSize();
      const Index segmentIdx = block.getFirstSegment();

      for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize )
         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx

      // Parallel reduction
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  8 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  4 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  2 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  1 ) );
      if( laneIdx == 0 )
         keep( segmentIdx, result );
   }
   else // blockType == Type::LONG - several warps per segment
   {
      const Index segmentIdx = block.getFirstSegment();
      const Index end = offsets[ segmentIdx + 1 ];

      TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
      result = zero;
      // The warps sharing this segment interleave their accesses: each warp
      // starts at its own warp-sized chunk and strides over the chunks of the
      // remaining warps.
      for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
           globalIdx < end;
           globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
      {
         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
      }

      // Reduction within the warp. The user-supplied operation must be used
      // here (not operator+=), otherwise non-additive reductions are wrong.
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  8 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  4 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  2 ) );
      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  1 ) );

      const Index warpID = threadIdx.x / 32;
      if( laneIdx == 0 )
         multivectorShared[ warpID ] = result;

      __syncthreads();
      // Reduction in multivectorShared. The whole warp enters this branch so
      // that every lane reaches __syncwarp(); at each step only the lanes
      // below the current stride combine values, which keeps all accesses
      // inside the array and separates reads from writes of the same entry.
      if( block.getWarpIdx() == 0 )
      {
         constexpr int totalWarps = CudaBlockSize / warpSize;
         if( totalWarps >= 32 )
         {
            if( laneIdx < 16 )
               multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
            __syncwarp();
         }
         if( totalWarps >= 16 )
         {
            if( laneIdx < 8 )
               multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  8 ] );
            __syncwarp();
         }
         if( totalWarps >= 8 )
         {
            if( laneIdx < 4 )
               multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  4 ] );
            __syncwarp();
         }
         if( totalWarps >= 4 )
         {
            if( laneIdx < 2 )
               multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  2 ] );
            __syncwarp();
         }
         if( totalWarps >= 2 )
         {
            if( laneIdx < 1 )
               multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  1 ] );
            __syncwarp();
         }
         if( laneIdx == 0 )
            keep( segmentIdx, multivectorShared[ 0 ] );
      }
   }
}
#endif

      } // namespace Segments
   }  // namespace Algorithms
} // namespace TNL
 No newline at end of file
+2 −2
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@
#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
#include <TNL/Algorithms/Segments/CSRVectorKernel.h>
#include <TNL/Algorithms/Segments/CSRHybridKernel.h>
#include <TNL/Algorithms/Segments/CSRKernelAdaptive.h>
#include <TNL/Algorithms/Segments/CSRAdaptiveKernel.h>

namespace TNL {
   namespace Algorithms {
@@ -146,7 +146,7 @@ using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device > >

template< typename Device,
          typename Index >
using CSRViewAdaptive = CSRView< Device, Index, CSRKernelAdaptive< Index, Device > >;
using CSRViewAdaptive = CSRView< Device, Index, CSRAdaptiveKernel< Index, Device > >;

template< typename Device,
          typename Index >