Commit c37987b1 authored by Jakub Klinkovský

Refactored CUDA parallel scan kernel

The input values are first copied into shared memory, reduced
sequentially within per-thread chunks, and scanned only at the end of
the kernel. This follows the upsweep-downsweep approach of Blelloch,
which is more work-efficient. Also, the distinction between exclusive
and inclusive scan now appears only at the end of the kernel, which
avoids the weird "+2" in the shared memory size.
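
As a minimal sketch of this structure, the toy kernel below performs a
single-block scan with operator+ as the reduction. The names (toyScan,
VALUES_PER_THREAD) and the fixed 32-thread block are illustrative
assumptions, not TNL's interface; the real kernel is generic over the
reduction operation and handles partially filled blocks.

    #include <cstdio>

    constexpr int VALUES_PER_THREAD = 4;

    // toy single-warp scan kernel (illustrative, not TNL's implementation)
    template< bool exclusive >
    __global__ void toyScan( const int* input, int* output )
    {
       __shared__ int data[ 32 * VALUES_PER_THREAD ];
       __shared__ int chunkResults[ 32 ];

       // load this thread's chunk into shared memory
       const int chunkOffset = threadIdx.x * VALUES_PER_THREAD;
       for( int i = 0; i < VALUES_PER_THREAD; i++ )
          data[ chunkOffset + i ] = input[ chunkOffset + i ];
       __syncthreads();

       // upsweep: sequential reduction of the chunk
       int chunkResult = data[ chunkOffset ];
       for( int i = 1; i < VALUES_PER_THREAD; i++ )
          chunkResult += data[ chunkOffset + i ];
       chunkResults[ threadIdx.x ] = chunkResult;
       __syncthreads();

       // spine scan: inclusive warp-scan of the chunk results (the read and
       // the write are separated by __syncwarp() to avoid a data race)
       for( int stride = 1; stride < 32; stride *= 2 ) {
          int addend = 0;
          if( threadIdx.x >= stride )
             addend = chunkResults[ threadIdx.x - stride ];
          __syncwarp();
          chunkResults[ threadIdx.x ] += addend;
          __syncwarp();
       }

       // downsweep: rescan the chunk, seeded by the previous chunk's total;
       // the exclusive/inclusive distinction appears only here
       int value = ( threadIdx.x > 0 ) ? chunkResults[ threadIdx.x - 1 ] : 0;
       for( int i = 0; i < VALUES_PER_THREAD; i++ ) {
          const int inputValue = data[ chunkOffset + i ];
          if( exclusive )
             output[ chunkOffset + i ] = value;
          value += inputValue;
          if( ! exclusive )
             output[ chunkOffset + i ] = value;
       }
    }

    int main()
    {
       constexpr int n = 32 * VALUES_PER_THREAD;
       int host[ n ];
       for( int i = 0; i < n; i++ )
          host[ i ] = 1;
       int *d_in, *d_out;
       cudaMalloc( &d_in, n * sizeof( int ) );
       cudaMalloc( &d_out, n * sizeof( int ) );
       cudaMemcpy( d_in, host, n * sizeof( int ), cudaMemcpyHostToDevice );
       toyScan< true ><<< 1, 32 >>>( d_in, d_out );  // exclusive scan
       cudaMemcpy( host, d_out, n * sizeof( int ), cudaMemcpyDeviceToHost );
       printf( "%d %d %d\n", host[ 0 ], host[ 1 ], host[ n - 1 ] );  // 0 1 127
       cudaFree( d_in );
       cudaFree( d_out );
    }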

Also used Cuda::getInterleaving() for the indices when accessing the
chunkResults array, which avoids shared memory bank conflicts in the
spine-scan phase.
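
The diff below does not show the definition of Cuda::getInterleaving();
the following is the standard padding idiom, which is consistent with
how the kernel sizes its shared arrays (n + n / Cuda::getNumberOfSharedMemoryBanks()
elements). The constant and the free-standing function here are
assumptions for illustration only.

    // 32 banks on current NVIDIA GPUs; shared memory words whose indices
    // differ by a multiple of 32 map to the same bank
    constexpr int numberOfSharedMemoryBanks = 32;

    // insert one padding element after every 32 elements, so that logical
    // indices which differ by a multiple of 32 land in different banks
    __host__ __device__ constexpr int getInterleaving( const int i )
    {
       return i + i / numberOfSharedMemoryBanks;
    }

This is also why the shared arrays in the diff reserve the extra
n / Cuda::getNumberOfSharedMemoryBanks() elements: they make room for
the padding slots that the index mapping skips over.
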
parent 7a688833
+55 −63
@@ -57,75 +57,63 @@ CudaScanKernelFirstPhase( const InputView input,
    outputBegin += threadOffset;
 
    // allocate shared memory
-   constexpr int shmemElements = maxElementsInBlock + maxElementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2;
+   constexpr int shmemElements = maxElementsInBlock + maxElementsInBlock / Cuda::getNumberOfSharedMemoryBanks();
    __shared__ ValueType sharedData[ shmemElements ];  // accessed via Cuda::getInterleaving()
-   __shared__ ValueType chunkResults[ blockSize ];
+   __shared__ ValueType chunkResults[ blockSize + blockSize / Cuda::getNumberOfSharedMemoryBanks() ];  // accessed via Cuda::getInterleaving()
    __shared__ ValueType warpResults[ Cuda::getWarpSize() ];
 
-   /***
-    * Load data into the shared memory.
-    */
-   int idx = threadIdx.x;
-   if( scanType == ScanType::Exclusive )
+   // Load data into the shared memory.
    {
-      if( idx == 0 )
-         sharedData[ 0 ] = zero;
+      int idx = threadIdx.x;
       while( idx < elementsInBlock )
       {
-         sharedData[ Cuda::getInterleaving( idx + 1 ) ] = input[ begin ];
+         sharedData[ Cuda::getInterleaving( idx ) ] = input[ begin ];
          begin += blockDim.x;
          idx += blockDim.x;
       }
-   }
-   else
-   {
-      while( idx < elementsInBlock )
+
+      // fill the remaining (maxElementsInBlock - elementsInBlock) values with zero
+      // (this helps to avoid divergent branches in the blocks below)
+      while( idx < maxElementsInBlock )
       {
-         sharedData[ Cuda::getInterleaving( idx ) ] = input[ begin ];
-         begin += blockDim.x;
+         sharedData[ Cuda::getInterleaving( idx ) ] = zero;
          idx += blockDim.x;
       }
    }
    __syncthreads();
 
-   /***
-    * Perform the sequential scan of the chunk in shared memory.
-    */
+   // Perform sequential reduction of the chunk in shared memory.
    const int chunkOffset = threadIdx.x * valuesPerThread;
-   const int numberOfChunks = roundUpDivision( elementsInBlock, valuesPerThread );
 
-   int chunkPointer = 1;
-   while( chunkPointer < valuesPerThread && chunkOffset + chunkPointer < elementsInBlock )
+   const int chunkResultIdx = Cuda::getInterleaving( threadIdx.x );
    {
-      sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ] =
-         reduction( sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ],
-                    sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
-      chunkPointer++;
-   }
-   chunkResults[ threadIdx.x ] = sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ];
+      ValueType chunkResult = sharedData[ Cuda::getInterleaving( chunkOffset ) ];
+      #pragma unroll
+      for( int i = 1; i < valuesPerThread; i++ )
+         chunkResult = reduction( chunkResult, sharedData[ Cuda::getInterleaving( chunkOffset + i ) ] );
+
+      // store the result of the sequential reduction of the chunk in chunkResults
+      chunkResults[ chunkResultIdx ] = chunkResult;
+   }
    __syncthreads();
 
-   /***
-    * Perform the parallel scan on chunkResults inside warps.
-    */
+   // Perform the parallel scan on chunkResults inside warps.
    const int threadInWarpIdx = threadIdx.x % Cuda::getWarpSize();
    const int warpIdx = threadIdx.x / Cuda::getWarpSize();
    #pragma unroll
    for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
-      if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks )
-         chunkResults[ threadIdx.x ] = reduction( chunkResults[ threadIdx.x ], chunkResults[ threadIdx.x - stride ] );
+      if( threadInWarpIdx >= stride ) {
+         chunkResults[ chunkResultIdx ] = reduction( chunkResults[ chunkResultIdx ], chunkResults[ Cuda::getInterleaving( threadIdx.x - stride ) ] );
+      }
       __syncwarp();
    }
 
    // The last thread in warp stores the intermediate result in warpResults.
    if( threadInWarpIdx == Cuda::getWarpSize() - 1 )
-      warpResults[ warpIdx ] = chunkResults[ threadIdx.x ];
+      warpResults[ warpIdx ] = chunkResults[ chunkResultIdx ];
    __syncthreads();
 
-   /****
-    * Perform the scan of warpResults using one warp.
-    */
+   // Perform the scan of warpResults using one warp.
    if( warpIdx == 0 )
       #pragma unroll
       for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
         if( threadInWarpIdx >= stride )
            warpResults[ threadIdx.x ] = reduction( warpResults[ threadIdx.x ], warpResults[ threadIdx.x - stride ] );
@@ -133,40 +121,44 @@ CudaScanKernelFirstPhase( const InputView input,
      }
   __syncthreads();
 
-   /****
-    * Shift chunkResults by the warpResults.
-    */
+   // Shift chunkResults by the warpResults.
   if( warpIdx > 0 )
-      chunkResults[ threadIdx.x ] = reduction( chunkResults[ threadIdx.x ], warpResults[ warpIdx - 1 ] );
+      chunkResults[ chunkResultIdx ] = reduction( chunkResults[ chunkResultIdx ], warpResults[ warpIdx - 1 ] );
   __syncthreads();
 
-   /***
-    * Store the result back in global memory.
-    */
-   idx = threadIdx.x;
-   while( idx < elementsInBlock )
+   // Downsweep step: scan the chunks and use the chunk result as the initial value.
   {
-      const int chunkIdx = idx / valuesPerThread;
-      ValueType chunkShift = zero;
-      if( chunkIdx > 0 )
-         chunkShift = chunkResults[ chunkIdx - 1 ];
-      output[ outputBegin ] =
-      sharedData[ Cuda::getInterleaving( idx ) ] =
-         reduction( sharedData[ Cuda::getInterleaving( idx ) ], chunkShift );
-      outputBegin += blockDim.x;
-      idx += blockDim.x;
+      ValueType value = zero;
+      if( threadIdx.x > 0 )
+         value = chunkResults[ Cuda::getInterleaving( threadIdx.x - 1 ) ];
+
+      #pragma unroll
+      for( int i = 0; i < valuesPerThread; i++ )
+      {
+         const int sharedIdx = Cuda::getInterleaving( chunkOffset + i );
+         const ValueType inputValue = sharedData[ sharedIdx ];
+         if( scanType == ScanType::Exclusive )
+            sharedData[ sharedIdx ] = value;
+         value = reduction( value, inputValue );
+         if( scanType == ScanType::Inclusive )
+            sharedData[ sharedIdx ] = value;
+      }
+
+      // The last thread of the block stores the block result in the global memory.
+      if( blockResults && threadIdx.x == blockDim.x - 1 )
+         blockResults[ blockIdx.x ] = value;
   }
   __syncthreads();
 
-   if( threadIdx.x == 0 )
+   // Store the result back in the global memory.
   {
-      if( scanType == ScanType::Exclusive )
+      int idx = threadIdx.x;
+      while( idx < elementsInBlock )
      {
-         blockResults[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( elementsInBlock - 1 ) ],
-                                                 sharedData[ Cuda::getInterleaving( elementsInBlock ) ] );
+         output[ outputBegin ] = sharedData[ Cuda::getInterleaving( idx ) ];
+         outputBegin += blockDim.x;
+         idx += blockDim.x;
      }
-      else
-         blockResults[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( elementsInBlock - 1 ) ];
   }
 }