Refactoring of adaptive CSR kernel.

936f2c32 · Tomáš Oberhuber · 9b08c76c · 936f2c32 · 936f2c32 · 936f2c32
Commit 936f2c32 authored 4 years ago by Tomáš Oberhuber
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -53,16 +53,16 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
   constexpr size_t StreamedSharedElementsPerWarp  = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();

   __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
-   //__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
-   //__shared__ BlockType sharedBlocks[ WarpsCount ];
+   __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
+   __shared__ BlockType sharedBlocks[ WarpsCount ];

   const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
   const Index blockIdx = index / WarpSize;
   if( blockIdx >= blocks.getSize() - 1 )
      return;

-   //if( threadIdx.x < CudaBlockSize / WarpSize )
-   //   multivectorShared[ threadIdx.x ] = zero;
+   if( threadIdx.x < CudaBlockSize / WarpSize )
+      multivectorShared[ threadIdx.x ] = zero;
   Real result = zero;
   bool compute( true );
   const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
@@ -71,7 +71,8 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
   __syncthreads();
   const auto& block = sharedBlocks[ warpIdx ];*/
   const BlockType block = blocks[ blockIdx ];
-   const Index begin = offsets[ block.getFirstSegment() ];
+   const Index firstSegmentIdx = block.getFirstSegment();
+   const Index begin = offsets[ firstSegmentIdx ];

   if( block.getType() == details::Type::STREAM ) // Stream kernel - many short segments per warp
   {
@@ -80,12 +81,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,

      // Stream data to shared memory
      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize )
-      {
         streamShared[ warpIdx ][ globalIdx - begin ] = fetch( globalIdx, compute );
-      }
-      //const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
+      const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();

-      /*for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
+      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
      {
         const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
         result = zero;
@@ -93,9 +92,9 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
         for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
            result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
         keep( i, result );
-      }*/
+      }
   }
-   /*else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
+   else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
   {
      const Index end = begin + block.getSize();
      const Index segmentIdx = block.getFirstSegment();
@@ -172,7 +171,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
            keep( segmentIdx, multivectorShared[ 0 ] );
         }
      }
-   }*/
+   }
 }
 #endif


--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
@@ -22,11 +22,13 @@ enum class Type {
   VECTOR = 2
 };

+//#define CSR_ADAPTIVE_UNION
+
 #ifdef CSR_ADAPTIVE_UNION
 template< typename Index >
 union CSRAdaptiveKernelBlockDescriptor
 {
-   CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept
+   CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0, uint8_t warpsCount = 0) noexcept
   {
      this->index[0] = row;
      this->index[1] = index;
@@ -80,6 +82,16 @@ union CSRAdaptiveKernelBlockDescriptor
      return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
   }

+   __cuda_callable__ uint8_t getWarpIdx() const
+   {
+      return index[ 1 ];
+   }
+
+   __cuda_callable__ uint8_t getWarpsCount() const
+   {
+      return 1;
+   }
+
   void print( std::ostream& str ) const
   {
      Type type = this->getType();

--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
+++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
@@ -15,7 +15,7 @@ namespace TNL {
      namespace Segments {
         namespace details {

-static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 };
+static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 256, 256, 256 };

 template< int SizeOfValue = 1,
          int StreamedSharedMemory_ = 24576 >
@@ -25,7 +25,10 @@ struct CSRAdaptiveKernelParameters

   static constexpr int getSizeValueLogConstexpr( const int i );

+   static constexpr int getSizeOfValue() { return SizeOfValue; };
+
   static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue );
+
   static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." );

   /**