Commit 936f2c32 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Refactoring of adaptive CSR kernel.

parent 9b08c76c
Loading
Loading
Loading
Loading
+11 −12
Original line number Diff line number Diff line
@@ -53,16 +53,16 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
   constexpr size_t StreamedSharedElementsPerWarp  = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();

   __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
   //__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
   //__shared__ BlockType sharedBlocks[ WarpsCount ];
   __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
   __shared__ BlockType sharedBlocks[ WarpsCount ];

   const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
   const Index blockIdx = index / WarpSize;
   if( blockIdx >= blocks.getSize() - 1 )
      return;

   //if( threadIdx.x < CudaBlockSize / WarpSize )
   //   multivectorShared[ threadIdx.x ] = zero;
   if( threadIdx.x < CudaBlockSize / WarpSize )
      multivectorShared[ threadIdx.x ] = zero;
   Real result = zero;
   bool compute( true );
   const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
@@ -71,7 +71,8 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
   __syncthreads();
   const auto& block = sharedBlocks[ warpIdx ];*/
   const BlockType block = blocks[ blockIdx ];
   const Index begin = offsets[ block.getFirstSegment() ];
   const Index firstSegmentIdx = block.getFirstSegment();
   const Index begin = offsets[ firstSegmentIdx ];

   if( block.getType() == details::Type::STREAM ) // Stream kernel - many short segments per warp
   {
@@ -80,12 +81,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,

      // Stream data to shared memory
      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize )
      {
         streamShared[ warpIdx ][ globalIdx - begin ] = fetch( globalIdx, compute );
      }
      //const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
      const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();

      /*for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
      {
         const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
         result = zero;
@@ -93,9 +92,9 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
         for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
            result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
         keep( i, result );
      }*/
      }
   /*else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
   }
   else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
   {
      const Index end = begin + block.getSize();
      const Index segmentIdx = block.getFirstSegment();
@@ -172,7 +171,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
            keep( segmentIdx, multivectorShared[ 0 ] );
         }
      }
   }*/
   }
}
#endif

+13 −1
Original line number Diff line number Diff line
@@ -22,11 +22,13 @@ enum class Type {
   VECTOR = 2
};

//#define CSR_ADAPTIVE_UNION

#ifdef CSR_ADAPTIVE_UNION
template< typename Index >
union CSRAdaptiveKernelBlockDescriptor
{
   CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept
   CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0, uint8_t warpsCount = 0) noexcept
   {
      this->index[0] = row;
      this->index[1] = index;
@@ -80,6 +82,16 @@ union CSRAdaptiveKernelBlockDescriptor
      return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
   }

   __cuda_callable__ uint8_t getWarpIdx() const
   {
      return index[ 1 ];
   }

   __cuda_callable__ uint8_t getWarpsCount() const
   {
      return 1;
   }

   void print( std::ostream& str ) const
   {
      Type type = this->getType();
+4 −1
Original line number Diff line number Diff line
@@ -15,7 +15,7 @@ namespace TNL {
      namespace Segments {
         namespace details {

static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 };
static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 256, 256, 256 };

template< int SizeOfValue = 1,
          int StreamedSharedMemory_ = 24576 >
@@ -25,7 +25,10 @@ struct CSRAdaptiveKernelParameters

   static constexpr int getSizeValueLogConstexpr( const int i );

   static constexpr int getSizeOfValue() { return SizeOfValue; };

   static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue );

   static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." );

   /**