Commit e9f25336 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Refactoring BiEllpack SpMV CUDA kernel.

parent a92f219d
Loading
Loading
Loading
Loading
+44 −53
Original line number Diff line number Diff line
@@ -364,17 +364,14 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
   if( std::is_same< DeviceType, Devices::Cuda >::value )
   {
#ifdef HAVE_CUDA
      //printStructure( std::cerr );
      //for( IndexType i = first; i < last; i += getWarpSize() )
      {
         //IndexType first = i;
         //IndexType last = TNL::min( this->getSize(), i + getWarpSize() );
         constexpr int BlockDim = getWarpSize();
      constexpr int BlockDim = 256;//getWarpSize();
      dim3 cudaBlockSize = BlockDim;
      const IndexType stripsCount = roundUpDivision( last - first, getWarpSize() );
      const IndexType cudaBlocks = roundUpDivision( stripsCount * getWarpSize(), cudaBlockSize.x );
      const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
         const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType );
      IndexType sharedMemory = 0;
      if( ! RowMajorOrder )
         sharedMemory = cudaBlockSize.x * sizeof( RealType );

      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
      {
@@ -387,7 +384,6 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
         cudaThreadSynchronize();
         TNL_CHECK_CUDA_DEVICE;
      }
      }
#endif
   }
}
@@ -493,7 +489,6 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
   const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize();
   const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getActiveGroupsCountDirect( rowPermArray, segmentIdx );
   IndexType groupHeight = getWarpSize();
   //printf( "segmentIdx = %d strip = %d firstGroupInStrip = %d rowStripPerm = %d groupsCount = %d \n", segmentIdx, strip, firstGroupInStrip, rowStripPerm, groupsCount );
   bool compute( true );
   IndexType localIdx( 0 );
   RealType result( zero );
@@ -501,23 +496,15 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
   {
      IndexType groupOffset = groupPointers[ groupIdx ];
      const IndexType groupSize = groupPointers[ groupIdx + 1 ] - groupOffset;
      //printf( "groupSize = %d \n", groupSize );
      if( groupSize )
      {
         const IndexType groupWidth = groupSize / groupHeight;
         for( IndexType i = 0; i < groupWidth; i++ )
         {
            if( RowMajorOrder )
            {
               reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute ) );
            }
            else
            {
               /*printf( "segmentIdx = %d localIdx = %d globalIdx = %d groupIdx = %d groupSize = %d groupWidth = %d\n",
                  segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight,
                  groupIdx, groupSize, groupWidth );*/
               reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute ) );
            }
            localIdx++;
         }
      }
@@ -561,9 +548,6 @@ segmentsReductionKernel( IndexType gridIdx,
   IndexType groupHeight = getWarpSize();
   IndexType firstGroupIdx = strip * ( getLogWarpSize() + 1 );

   RealType* temp( nullptr );
   if( ! RowMajorOrder )
      temp = Cuda::getSharedMemory< RealType >();
   __shared__ RealType results[ BlockDim ];
   results[ threadIdx.x ] = zero;
   __shared__ IndexType sharedGroupPointers[ 7 ]; // TODO: getLogWarpSize() + 1 ];
@@ -573,14 +557,15 @@ segmentsReductionKernel( IndexType gridIdx,
   __syncthreads();

   bool compute( true );
   if( RowMajorOrder )
   {
      for( IndexType group = 0; group < getLogWarpSize() + 1; group++ )
      {
         IndexType groupBegin = sharedGroupPointers[ group ];
         IndexType groupEnd = sharedGroupPointers[ group + 1 ];
         if( groupEnd - groupBegin > 0 )
         {
         if( RowMajorOrder )
         {

               if( inWarpIdx < groupHeight )
               {
                  const IndexType groupWidth = ( groupEnd - groupBegin ) / groupHeight;
@@ -589,17 +574,23 @@ segmentsReductionKernel( IndexType gridIdx,
                     reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) );
               }
            }
         groupHeight >>= 1;
      }
   }
   else
   {
      RealType* temp = Cuda::getSharedMemory< RealType >();
      for( IndexType group = 0; group < getLogWarpSize() + 1; group++ )
      {
         IndexType groupBegin = sharedGroupPointers[ group ];
         IndexType groupEnd = sharedGroupPointers[ group + 1 ];
         if( groupEnd - groupBegin > 0 )
         {
            temp[ threadIdx.x ] = zero;
            IndexType globalIdx = groupBegin + inWarpIdx;
            while( globalIdx < groupEnd )
            {
               reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) );
               /*printf( "FETCH: globalIdx = %d fetch = %d result = %d groupEnd = %d \n", 
                  globalIdx,
                  ( int ) fetch( globalIdx, compute ),
                  ( int ) temp[ threadIdx.x ], groupEnd );*/
               globalIdx += getWarpSize();
            }
            // TODO: reduction via templates
@@ -613,9 +604,9 @@ segmentsReductionKernel( IndexType gridIdx,
            if( inWarpIdx < groupHeight )
               reduction( results[ threadIdx.x ], temp[ threadIdx.x ] );
         }
      }
         groupHeight >>= 1;
      }
   }
   __syncthreads();
   if( warpStart + inWarpIdx >= last )
      return;