Commit e36a896a authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Optimizing CUDA parallel reduction.

parent 398d4b1a
Loading
Loading
Loading
Loading
+15 −10
Original line number Diff line number Diff line
@@ -86,7 +86,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   while( chunkPointer < chunkSize &&
          chunkOffset + chunkPointer < lastElementInBlock )
   {
      operation.performInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
      operation.commonReductionOnDevice( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
                                         sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
      auxData[ threadIdx. x ] =
         sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer  ) ];
@@ -100,7 +100,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   const int warpIdx = threadIdx. x / tnlCuda::getWarpSize();
   for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
      if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks )
         operation.performInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );
         operation.commonReductionOnDevice( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );

   if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 )
      warpSums[ warpIdx ] = auxData[ threadIdx. x ];
@@ -112,14 +112,14 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   if( warpIdx == 0 )
      for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
         if( threadInWarpIdx >= stride )
            operation.performInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
            operation.commonReductionOnDevice( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
   __syncthreads();

   /****
    * Shift the warp prefix-sums.
    */
   if( warpIdx > 0 )
      operation.performInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );
      operation.commonReductionOnDevice( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );

   /***
    *  Store the result back in global memory.
@@ -132,7 +132,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
      DataType chunkShift( operation.initialValue() );
      if( chunkIdx > 0 )
         chunkShift = auxData[ chunkIdx - 1 ];
      operation.performInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
      operation.commonReductionOnDevice( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
      output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ];
      idx += blockDim. x;
   }
@@ -141,10 +141,15 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   if( threadIdx. x == 0 )
   {
      if( prefixSumType == exclusivePrefixSum )
         auxArray[ blockIdx. x ] =
            operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ),
      {
         /*auxArray[ blockIdx. x ] = operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ),
                                                                      tnlCuda::getInterleaving( lastElementInBlock ),
                                               sharedData );
                                                                      sharedData );*/
         DataType aux = operation.initialValue();
         operation.commonReductionOnDevice( aux, sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ] );
         operation.commonReductionOnDevice( aux, sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] );
         auxArray[ blockIdx. x ] = aux;
      }
      else
         auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ];
   }
+83 −146
Original line number Diff line number Diff line
@@ -37,27 +37,25 @@

using namespace std;


/****
 * Arrays smaller than the following constant
 * are reduced on CPU. The constant must not be larger
 * than maximal CUDA grid size.
 */
const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256;
const int minGPUReductionDataSize = 128;//65536; //16384;//1024;//256;

static tnlCudaReductionBuffer cudaReductionBuffer( 8 * minGPUReductionDataSize );

#ifdef HAVE_CUDA


template< typename Operation, int blockSize, bool isSizePow2 >
template< typename Operation, int blockSize >
__global__ void tnlCUDAReductionKernel( const Operation operation,
                                        const typename Operation :: IndexType size,
                                        const typename Operation :: RealType* input1,
                                        const typename Operation :: RealType* input2,
                                        typename Operation :: ResultType* output )
{
   typedef tnlCUDAReduction< Operation, blockSize, isSizePow2 > Reduction;
   typedef tnlCUDAReduction< Operation, blockSize > Reduction;
   Reduction::reduce( operation, size, input1, input2, output );
};

@@ -74,63 +72,52 @@ typename Operation::IndexType reduceOnCudaDevice( const Operation& operation,
   
   const IndexType desGridSize( minGPUReductionDataSize );   
   dim3 blockSize( 256 ), gridSize( 0 );   
   
   gridSize. x = Min( tnlCuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
   
   /*#ifdef CUDA_REDUCTION_PROFILING
      tnlTimerRT timer;
      timer.reset();
      timer.start();
   #endif */     
   
   if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) )
      return false;
   output = cudaReductionBuffer.template getData< ResultType >();      
      

   IndexType shmem = blockSize.x * sizeof( ResultType );
   
   /***
    * Depending on the blockSize we generate appropriate template instance.
    */

   if( isPow2( size ) )
   {      
   switch( blockSize.x )         
   {
      case 512:
            tnlCUDAReductionKernel< Operation, 512, true >
         tnlCUDAReductionKernel< Operation, 512 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case 256:
            tnlCUDAReductionKernel< Operation, 256, true >
         tnlCUDAReductionKernel< Operation, 256 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case 128:
            tnlCUDAReductionKernel< Operation, 128, true >
         tnlCUDAReductionKernel< Operation, 128 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  64:
            tnlCUDAReductionKernel< Operation,  64, true >
         tnlCUDAReductionKernel< Operation,  64 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  32:
            tnlCUDAReductionKernel< Operation,  32, true >
         tnlCUDAReductionKernel< Operation,  32 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  16:
            tnlCUDAReductionKernel< Operation,  16, true >
         tnlCUDAReductionKernel< Operation,  16 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
     case   8:
            tnlCUDAReductionKernel< Operation,   8, true >
         tnlCUDAReductionKernel< Operation,   8 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   4:
            tnlCUDAReductionKernel< Operation,   4, true >
         tnlCUDAReductionKernel< Operation,   4 >
        <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
        break;
      case   2:
            tnlCUDAReductionKernel< Operation,   2, true >
         tnlCUDAReductionKernel< Operation,   2 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   1:
@@ -138,59 +125,7 @@ typename Operation::IndexType reduceOnCudaDevice( const Operation& operation,
      default:
         tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
   }
   }
   else
   {
      switch( blockSize.x )
      {
         case 512:
            tnlCUDAReductionKernel< Operation, 512, false >
            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
            break;
         case 256:
            tnlCUDAReductionKernel< Operation, 256, false >
            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
            break;
         case 128:
            tnlCUDAReductionKernel< Operation, 128, false >
            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
            break;
         case  64:
            tnlCUDAReductionKernel< Operation,  64, false >
            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
            break;
         case  32:
            tnlCUDAReductionKernel< Operation,  32, false >
            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
            break;
         case  16:
            tnlCUDAReductionKernel< Operation,  16, false >
            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
            break;
        case   8:
            tnlCUDAReductionKernel< Operation,   8, false >
            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
            break;
         case   4:
            tnlCUDAReductionKernel< Operation,   4, false >
           <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
           break;
         case   2:
            tnlCUDAReductionKernel< Operation,   2, false >
            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
            break;
         case   1:
            tnlAssert( false, cerr << "blockSize should not be 1." << endl );
         default:
            tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
      }
   }
   //checkCudaDevice;
   /*#ifdef CUDA_REDUCTION_PROFILING
      //cudaThreadSynchronize();
      timer.stop();
      cout << "   Main reduction on GPU took " << timer.getTime() << " sec. " << endl;
   #endif   */      
   return gridSize. x;
}
#endif
@@ -222,20 +157,21 @@ bool reductionOnCudaDevice( const Operation& operation,
      if( deviceInput2 && ! 
          tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, size ) )
         return false;
      result = operation. initialValueOnHost( 0, hostArray1, hostArray2 );
      for( IndexType i = 1; i < size; i ++ )
      result = operation.initialValue();
      for( IndexType i = 0; i < size; i ++ )
         result = operation.reduceOnHost( i, result, hostArray1, hostArray2 );
      return true;
   }

   /****
    * Reduce the data on the CUDA device.
    */
   #ifdef CUDA_REDUCTION_PROFILING
      tnlTimerRT timer;
      timer.reset();
      timer.start();
   #endif   

   /****
    * Reduce the data on the CUDA device.
    */      
   ResultType* deviceAux1( 0 );
   IndexType reducedSize = reduceOnCudaDevice( operation,
                                               size,
@@ -245,39 +181,40 @@ bool reductionOnCudaDevice( const Operation& operation,
   #ifdef CUDA_REDUCTION_PROFILING
      timer.stop();
      cout << "   Reduction on GPU to size " << reducedSize << " took " << timer.getTime() << " sec. " << endl;
      timer.reset();
      timer.start();
   #endif   

   /***
    * Transfer the reduced data from device to host.
    */
#ifdef CUDA_REDUCTION_PROFILING
   timer.reset();
   timer.start();
#endif   
   ResultType resultArray[ minGPUReductionDataSize ];
   if( ! tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) )
      return false;
   
   #ifdef CUDA_REDUCTION_PROFILING   
      timer.stop();
      cout << "   Transferring data to CPU took " << timer.getTime() << " sec. " << endl;
   #endif   

   /***
    * Reduce the data on the host system.
    */
   LaterReductionOperation laterReductionOperation;
   #ifdef CUDA_REDUCTION_PROFILING
      timer.reset();
      timer.start();
   #endif      
   result = laterReductionOperation. initialValueOnHost( 0, resultArray, ( ResultType* ) 0 );
   for( IndexType i = 1; i < reducedSize; i ++ )
   
   /***
    * Reduce the data on the host system.
    */    
   LaterReductionOperation laterReductionOperation;
   result = laterReductionOperation. initialValue();
   for( IndexType i = 0; i < reducedSize; i ++ )
      result = laterReductionOperation.reduceOnHost( i, result, resultArray, ( ResultType*) 0 );
   
   #ifdef CUDA_REDUCTION_PROFILING
   cudaThreadSynchronize();
      timer.stop();
      cout << "   Reduction of small data set on CPU took " << timer.getTime() << " sec. " << endl;
   #endif 
   
   return checkCudaDevice;
#else
   tnlCudaSupportMissingMessage;;
+83 −1264

File changed.

Preview size limit exceeded, changes collapsed.

+3 −3
Original line number Diff line number Diff line
@@ -20,7 +20,7 @@

#ifdef HAVE_CUDA

template< typename Operation, int blockSize, bool isSizePow2 >
template< typename Operation, int blockSize >
class tnlCUDAReduction
{
   public:
@@ -37,8 +37,8 @@ class tnlCUDAReduction
                                     ResultType* output );
};
      
/*template< typename Real, typename Index, int blockSize, bool isSizePow2 >
class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize, isSizePow2 >
/*template< typename Real, typename Index, int blockSize >
class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >
{
   public:
      
+4 −4
Original line number Diff line number Diff line
@@ -18,10 +18,10 @@
#ifndef TNLCUDAREDUCTION_IMPL_H
#define	TNLCUDAREDUCTION_IMPL_H

template< typename Operation, int blockSize, bool isSizePow2 >      
template< typename Operation, int blockSize >      
__device__
void
tnlCUDAReduction< Operation, blockSize, isSizePow2 >::
tnlCUDAReduction< Operation, blockSize >::
reduce( const Operation operation,
        const IndexType size,
        const RealType* input1,
@@ -154,10 +154,10 @@ reduce( const Operation operation,

#ifdef UNDEF

template< typename Real, typename Index, int blockSize, bool isSizePow2 >      
template< typename Real, typename Index, int blockSize >      
__device__
void
tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize, isSizePow2 >::
tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >::
reduce( const Operation operation,
        const IndexType size,
        const RealType* input1,
Loading