Commit e93c933e authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Changed smart lambdas in segments, implemented fast segments reduction in ChunkedEllpack.

parent 49853fdf
Loading
Loading
Loading
Loading
+18 −0
Original line number Diff line number Diff line
@@ -148,6 +148,21 @@ class ChunkedEllpackView
   protected:

#ifdef HAVE_CUDA
      template< typename Fetch,
                typename Reduction,
                typename ResultKeeper,
                typename Real,
                typename... Args >
      __device__
      void segmentsReductionKernelWithAllParameters( IndexType gridIdx,
                                                     IndexType first,
                                                     IndexType last,
                                                     Fetch fetch,
                                                     Reduction reduction,
                                                     ResultKeeper keeper,
                                                     Real zero,
                                                     Args... args ) const;

      template< typename Fetch,
                typename Reduction,
                typename ResultKeeper,
@@ -206,6 +221,9 @@ class ChunkedEllpackView
                                                  ResultKeeper_ keeper,
                                                  Real_ zero,
                                                  Args_... args );

      template< typename Index_, typename Fetch_, bool B_ >
      friend struct details::ChunkedEllpackSegmentsReductionDispatcher;
#endif
};
      } // namespace Segements
+130 −92
Original line number Diff line number Diff line
@@ -20,30 +20,6 @@ namespace TNL {
   namespace Containers {
      namespace Segments {

#ifdef HAVE_CUDA
template< typename View,
          typename Index,
          typename Fetch,
          typename Reduction,
          typename ResultKeeper,
          typename Real,
          typename... Args >
__global__
void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack,
                                            Index gridIdx,
                                            Index first,
                                            Index last,
                                            Fetch fetch,
                                            Reduction reduction,
                                            ResultKeeper keeper,
                                            Real zero,
                                            Args... args )
{
   chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
}
#endif


template< typename Device,
          typename Index,
          bool RowMajorOrder >
@@ -460,9 +436,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
      {
         if( gridIdx == cudaGrids - 1 )
            cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
         //ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
         //   <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
         //   ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
         details::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
            <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
      }
#endif
   }
@@ -571,7 +547,7 @@ template< typename Device,
__device__
void
ChunkedEllpackView< Device, Index, RowMajorOrder >::
segmentsReductionKernel( IndexType gridIdx,
segmentsReductionKernelWithAllParameters( IndexType gridIdx,
                                          IndexType first,
                                          IndexType last,
                                          Fetch fetch,
@@ -582,17 +558,14 @@ segmentsReductionKernel( IndexType gridIdx,
{
   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );

   const IndexType firstSlice = rowToChunkMapping[ first ] / chunksInSlice;
   const IndexType lastSlice = rowToChunkMapping[ last - 1 ] / chunksInSlice;
   //for( IndexType sliceIdx = firstSlice; sliceIdx < lastSlice; sliceIdx++ )
   {
      const IndexType sliceIdx = gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
      if( sliceIdx >= lastSlice )
   const IndexType firstSlice = rowToSliceMapping[ first ];
   const IndexType lastSlice = rowToSliceMapping[ last - 1 ];

   const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
   if( sliceIdx > lastSlice )
      return;

   RealType* chunksResults = Cuda::getSharedMemory< RealType >();
      //for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ )
      //{
   __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
   if( threadIdx.x == 0 )
      sliceInfo = this->slices[ sliceIdx ];
@@ -616,23 +589,89 @@ segmentsReductionKernel( IndexType gridIdx,
      IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
      IndexType end = begin + chunkSize;
      for( IndexType j = begin; j < end && compute; j++ )
               reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) );
         reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
   }
   else
   {
      const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
      const IndexType end = begin + chunksInSlice * chunkSize;
         for( IndexType j = begin; j < end && compute; j += chunksInSlice )
            reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
   }
   __syncthreads();
   if( threadIdx.x < sliceInfo.size )
   {
      const IndexType row = sliceInfo.firstSegment + threadIdx.x;
      IndexType chunkIndex( 0 );
      if( threadIdx.x != 0 )
         chunkIndex = this->rowToChunkMapping[ row - 1 ];
      const IndexType lastChunk = this->rowToChunkMapping[ row ];
      RealType result( zero );
      while( chunkIndex < lastChunk )
         reduction( result,  chunksResults[ chunkIndex++ ] );
      if( row >= first && row < last )
         keeper( row, result );
   }
}

template< typename Device,
          typename Index,
          bool RowMajorOrder >
   template< typename Fetch,
             typename Reduction,
             typename ResultKeeper,
             typename Real,
             typename... Args >
__device__
void
ChunkedEllpackView< Device, Index, RowMajorOrder >::
segmentsReductionKernel( IndexType gridIdx,
                         IndexType first,
                         IndexType last,
                         Fetch fetch,
                         Reduction reduction,
                         ResultKeeper keeper,
                         Real zero,
                         Args... args ) const
{
   using RealType = decltype( fetch( IndexType(), std::declval< bool& >(), args... ) );

   const IndexType firstSlice = rowToSliceMapping[ first ];
   const IndexType lastSlice = rowToSliceMapping[ last - 1 ];

   const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
   if( sliceIdx > lastSlice )
      return;

   RealType* chunksResults = Cuda::getSharedMemory< RealType >();
   __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;

   if( threadIdx.x == 0 )
      sliceInfo = this->slices[ sliceIdx ];
   chunksResults[ threadIdx.x ] = zero;
   __syncthreads();

   const IndexType sliceOffset = sliceInfo.pointer;
   const IndexType chunkSize = sliceInfo.chunkSize;
   const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x;
   bool compute( true );

   if( RowMajorOrder )
   {
      IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
      IndexType end = begin + chunkSize;
      for( IndexType j = begin; j < end && compute; j++ )
         reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
   }
   else
   {
      const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
      const IndexType end = begin + chunksInSlice * chunkSize;
         for( IndexType j = begin; j < end && compute; j += chunksInSlice )
                  reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) );
            reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
   }
   __syncthreads();
      //}

      //details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
      //for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ )
      //{
         //if( threadIdx == 0 )
         //   sliceInfo = this->slices[ sliceIdx ];
   if( threadIdx.x < sliceInfo.size )
   {
      const IndexType row = sliceInfo.firstSegment + threadIdx.x;
@@ -643,10 +682,9 @@ segmentsReductionKernel( IndexType gridIdx,
      RealType result( zero );
      while( chunkIndex < lastChunk )
         reduction( result,  chunksResults[ chunkIndex++ ] );
      if( row >= first && row < last )
         keeper( row, result );
   }
      //} // threadIdx
   } // sliceIdx
}
#endif

+7 −48
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@ namespace TNL {

template< typename Index,
          typename Lambda >
class CheckFetchLambdaAcceptsSegmentIdxAndCompute
class CheckFetchLambda
{
   private:
      typedef char YesType[1];
@@ -27,52 +27,11 @@ class CheckFetchLambdaAcceptsSegmentIdxAndCompute
      template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), Index(), std::declval< bool& >() ) ) );
      template< typename C > static NoType& test(...);

   public:
      static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) );
};

template< typename Index,
          typename Lambda >
class CheckFetchLambdaAcceptsSegmentIdx
{
   private:
       typedef char YesType[1];
       typedef char NoType[2];

       template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), Index() ) ) );
       template< typename C > static NoType& test(...);

   public:
       static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) );
};

template< typename Index,
          typename Lambda >
class CheckFetchLambdaAcceptsCompute
{
   private:
       typedef char YesType[1];
       typedef char NoType[2];

       template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), std::declval< bool& >() ) ) );
       template< typename C > static NoType& test(...);

   public:
       static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) );
};


template< typename Index,
          typename Lambda >
class CheckFetchLambda
{
   static constexpr bool AcceptsSegmentIdxAndCompute = CheckFetchLambdaAcceptsSegmentIdxAndCompute< Index, Lambda >::value;
   static constexpr bool AcceptsSegmentIdx = CheckFetchLambdaAcceptsSegmentIdx< Index, Lambda >::value;
   static constexpr bool AcceptsCompute = CheckFetchLambdaAcceptsCompute< Index, Lambda >::value;

   public:
      static constexpr bool acceptsSegmentIdx() { return AcceptsSegmentIdxAndCompute || AcceptsSegmentIdx; };
      static constexpr bool acceptsCompute() { return AcceptsSegmentIdxAndCompute || AcceptsCompute; };
      static constexpr bool hasAllParameters() { return value; };
};

         } // namespace details
+76 −0
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@
#include <type_traits>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Segments/ChunkedEllpackSegmentView.h>
#include <TNL/Containers/Segments/details/CheckLambdas.h>

namespace TNL {
   namespace Containers {
@@ -223,6 +224,81 @@ class ChunkedEllpack
                                    chunksInSlice );
      }
};

#ifdef HAVE_CUDA
template< typename Index,
          typename Fetch,
          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
struct ChunkedEllpackSegmentsReductionDispatcher{};

template< typename Index, typename Fetch >
struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, true >
{
   template< typename View,
             typename Reduction,
             typename ResultKeeper,
             typename Real,
             typename... Args >
   __device__
   static void exec( View chunkedEllpack,
                     Index gridIdx,
                     Index first,
                     Index last,
                     Fetch fetch,
                     Reduction reduction,
                     ResultKeeper keeper,
                     Real zero,
                     Args... args )
   {
      chunkedEllpack.segmentsReductionKernelWithAllParameters( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
   }
};

template< typename Index, typename Fetch >
struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, false >
{
   template< typename View,
             typename Reduction,
             typename ResultKeeper,
             typename Real,
             typename... Args >
   __device__
   static void exec( View chunkedEllpack,
                     Index gridIdx,
                     Index first,
                     Index last,
                     Fetch fetch,
                     Reduction reduction,
                     ResultKeeper keeper,
                     Real zero,
                     Args... args )
   {
      chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
   }
};

template< typename View,
          typename Index,
          typename Fetch,
          typename Reduction,
          typename ResultKeeper,
          typename Real,
          typename... Args >
__global__
void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack,
                                            Index gridIdx,
                                            Index first,
                                            Index last,
                                            Fetch fetch,
                                            Reduction reduction,
                                            ResultKeeper keeper,
                                            Real zero,
                                            Args... args )
{
   ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch >::exec( chunkedEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
}
#endif

         } //namespace details
      } //namespace Segments
   } //namespace Containers
+3 −3
Original line number Diff line number Diff line
@@ -21,7 +21,7 @@ namespace TNL {

template< typename Index,
          typename Lambda,
          bool AcceptsSegmentIdx = CheckFetchLambda< Index, Lambda >::acceptsSegmentIdx() >
          bool AllParameters = CheckFetchLambda< Index, Lambda >::hasAllParameters() >
struct FetchLambdaAdapter
{
};
@@ -42,10 +42,10 @@ template< typename Index,
          typename Lambda >
struct FetchLambdaAdapter< Index, Lambda, false >
{
   using ReturnType = decltype( std::declval< Lambda >()( Index(), Index(), std::declval< bool& >() ) );
   using ReturnType = decltype( std::declval< Lambda >()( Index(), std::declval< bool& >() ) );
   static ReturnType call( Lambda& f, Index segmentIdx, Index localIdx, Index globalIdx, bool& compute )
   {
      return f( localIdx, globalIdx, compute );
      return f( globalIdx, compute );
   }
};

Loading