Commit 398d4b1a authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Optimizing parallel reduction in CUDA.

parent 816b1ec8
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -3,6 +3,8 @@ set( headers cuda-prefix-sum.h
             cuda-reduction.h             
             cuda-reduction_impl.h
             reduction-operations.h
             tnlCudaReduction.h
             tnlCudaReduction_impl.h
             tnlCublasWrapper.h )

SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/core/cuda ) 
+3 −2
Original line number Diff line number Diff line
@@ -163,13 +163,14 @@ __global__ void cudaSecondPhaseBlockPrefixSum( const Operation operation,
{
   if( blockIdx. x > 0 )
   {
      const DataType shift = operation.commonReductionOnDevice( gridShift, auxArray[ blockIdx. x - 1 ] );
      DataType shift( gridShift );
      operation.commonReductionOnDevice( shift, auxArray[ blockIdx. x - 1 ] );

      const Index readOffset = blockIdx. x * elementsInBlock;
      Index readIdx = threadIdx. x;
      while( readIdx < elementsInBlock && readOffset + readIdx < size )
      {
         operation.performInPlace( data[ readIdx + readOffset ], shift );
         operation.commonReductionOnDevice( data[ readIdx + readOffset ], shift );
         readIdx += blockDim. x;
      }
   }
+19 −181
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@
#include <core/arrays/tnlArrayOperations.h>
#include <core/mfuncs.h>
#include <core/cuda/tnlCudaReductionBuffer.h>
#include <core/cuda/tnlCudaReduction.h>

#ifdef CUDA_REDUCTION_PROFILING
#include <core/tnlTimerRT.h>
@@ -42,184 +43,23 @@ using namespace std;
 * are reduced on CPU. The constant must not be larger
 * than maximal CUDA grid size.
 */
const int minGPUReductionDataSize = 256; //2048;//65536; //16384;//1024;//256;
const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256;

static tnlCudaReductionBuffer cudaReductionBuffer( 8 * minGPUReductionDataSize );

#ifdef HAVE_CUDA

/***
 * The parallel reduction of one vector.
 *
 * WARNING: This kernel only reduce data in one block. Use rather tnlCUDASimpleReduction2
 *          to call this kernel then doing it by yourself.
 *          This kernel is very inefficient. It is here only for educative and testing reasons.
 *          Please use tnlCUDAReduction instead.
 *
 * The kernel parameters:
 * @param size is the number of all element to reduce - not just in one block.
 * @param deviceInput input data which we want to reduce
 * @param deviceOutput an array to which we write the result of reduction.
 *                     Each block of the grid writes one element in this array
 *                     (i.e. the size of this array equals the number of CUDA blocks).
 */

template< typename Operation, int blockSize, bool isSizePow2 >
__global__ void tnlCUDAReductionKernel( const Operation operation,
                                        const typename Operation :: IndexType size,
                                        const typename Operation :: RealType* deviceInput,
                                        const typename Operation :: RealType* deviceInput2,
                                        typename Operation :: ResultType* deviceOutput )
{
   extern __shared__ __align__ ( 8 ) char __sdata[];
   
   typedef typename Operation :: IndexType IndexType;
   typedef typename Operation :: RealType RealType;
   typedef typename Operation :: ResultType ResultType;

   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );

   /***
    * Get thread id (tid) and global thread id (gid).
    * gridSize is the number of element processed by all blocks at the
    * same time.
    */
   IndexType tid = threadIdx. x;
   IndexType gid = blockIdx. x * blockDim. x + threadIdx. x;
   IndexType gridSize = blockDim. x * gridDim.x;

   sdata[ tid ] = operation.initialValue();
   /***
    * Read data into the shared memory. We start with the
    * sequential reduction.
    */
   while( gid + 4 * gridSize < size )
   {
      operation.firstReduction( sdata[ tid ], gid,                deviceInput, deviceInput2 );
      operation.firstReduction( sdata[ tid ], gid + gridSize,     deviceInput, deviceInput2 );
      operation.firstReduction( sdata[ tid ], gid + 2 * gridSize, deviceInput, deviceInput2 );
      operation.firstReduction( sdata[ tid ], gid + 3 * gridSize, deviceInput, deviceInput2 );
      //sdata[ tid ] += deviceInput[ gid ] * deviceInput[ gid ];
      //sdata[ tid ] += deviceInput[ gid + gridSize ] * deviceInput[ gid + gridSize ];
      //sdata[ tid ] += deviceInput[ gid + 2 * gridSize ] * deviceInput[ gid + 2 * gridSize ];
      //sdata[ tid ] += deviceInput[ gid + 3 * gridSize ] * deviceInput[ gid + 3 * gridSize ];
      gid += 4*gridSize;
   }
   while( gid + 2 * gridSize < size )
   {
      operation.firstReduction( sdata[ tid ], gid,                deviceInput, deviceInput2 );
      operation.firstReduction( sdata[ tid ], gid + gridSize,     deviceInput, deviceInput2 );

      //sdata[ tid ] += deviceInput[ gid ] * deviceInput[ gid ];
      //sdata[ tid ] += deviceInput[ gid + gridSize ] * deviceInput[ gid + gridSize ];
      gid += 2*gridSize;
   }
   while( gid < size )
   {
      operation.firstReduction( sdata[ tid ], gid,                deviceInput, deviceInput2 );
      //sdata[ tid ] += deviceInput[ gid ] * deviceInput[ gid ];
      gid += gridSize;
   }
   __syncthreads();
    
   
   //printf( "1: tid %d data %f \n", tid, sdata[ tid ] );
   
   //return;
   /***
    *  Perform the parallel reduction.
    */
   if( blockSize >= 1024 )
   {
      if( tid < 512 )
         //sdata[ tid ] = operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] );
         sdata[ tid ] += sdata[ tid + 512 ];
      __syncthreads();
   }
   if( blockSize >= 512 )
   {
      if( tid < 256 )
         //sdata[ tid ] = operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] );
         sdata[ tid ] += sdata[ tid + 256 ];
      __syncthreads();
   }
   if( blockSize >= 256 )
   {
      if( tid < 128 )
         //sdata[ tid ] = operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] );
         sdata[ tid ] += sdata[ tid + 128 ];
      __syncthreads();
      //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
   }
   
   if( blockSize >= 128 )
   {
      if( tid <  64 )
         //sdata[ tid ] = operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] );
         sdata[ tid ] += sdata[ tid + 64 ];
      __syncthreads();
      //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
   }
   

   /***
    * This runs in one warp so it is synchronized implicitly.
    */
   if( tid < 32 )
   {
      volatile ResultType* vsdata = sdata;
      if( blockSize >= 64 )
      {
         //vsdata[ tid ] = operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] );
         vsdata[ tid ] += vsdata[ tid + 32 ];
         //__syncthreads();
         //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >= 32 )
      {
         //vsdata[ tid ] = operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] );
         vsdata[ tid ] += vsdata[ tid + 16 ];
         //__syncthreads();
         //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >= 16 )
      {
         //vsdata[ tid ] = operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] );
         vsdata[ tid ] += vsdata[ tid + 8 ];
         //__syncthreads();
         //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  8 )
      {
         //vsdata[ tid ] = operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] );
         vsdata[ tid ] += vsdata[ tid + 4 ];
         //__syncthreads();
         //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  4 )
      {
         //vsdata[ tid ] = operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] );
         vsdata[ tid ] += vsdata[ tid + 2 ];
         //__syncthreads();
         //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  2 )
      {
         //vsdata[ tid ] = operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] );
         vsdata[ tid ] += vsdata[ tid + 1 ];
         //__syncthreads();
         //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
      }
   }

   /***
    * Store the result back in the global memory.
    */
   if( tid == 0 )
                                        const typename Operation :: RealType* input1,
                                        const typename Operation :: RealType* input2,
                                        typename Operation :: ResultType* output )
{
      //printf( "Block %d result = %f \n", blockIdx.x, sdata[ 0 ] );
      deviceOutput[ blockIdx. x ] = sdata[ 0 ];
   }
}
   typedef tnlCUDAReduction< Operation, blockSize, isSizePow2 > Reduction;
   Reduction::reduce( operation, size, input1, input2, output );
};

template< typename Operation >
typename Operation::IndexType reduceOnCudaDevice( const Operation& operation,
@@ -430,8 +270,6 @@ bool reductionOnCudaDevice( const Operation& operation,
   timer.reset();
   timer.start();
#endif      
   //for( IndexType i = 0; i < reducedSize; i ++ )
   //   cout << resultArray[ i ] << ", ";
   result = laterReductionOperation. initialValueOnHost( 0, resultArray, ( ResultType* ) 0 );
   for( IndexType i = 1; i < reducedSize; i ++ )
      result = laterReductionOperation. reduceOnHost( i, result, resultArray, ( ResultType*) 0 );
+182 −173

File changed.

Preview size limit exceeded, changes collapsed.

+62 −0
Original line number Diff line number Diff line
/***************************************************************************
                          tnlCudaReduction.h  -  description
                             -------------------
    begin                : Jun 17, 2015
    copyright            : (C) 2015 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/

#ifndef TNLCUDAREDUCTION_H
#define	TNLCUDAREDUCTION_H

#ifdef HAVE_CUDA

template< typename Operation, int blockSize, bool isSizePow2 >
class tnlCUDAReduction
{
   public:

      typedef typename Operation::IndexType IndexType;
      typedef typename Operation::RealType RealType;
      typedef typename Operation::ResultType ResultType;

      
      __device__ static void reduce( const Operation operation,
                                     const IndexType size,
                                     const RealType* input1,
                                     const RealType* input2,
                                     ResultType* output );
};
      
/*template< typename Real, typename Index, int blockSize, bool isSizePow2 >
class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize, isSizePow2 >
{
   public:
      
      typedef tnlParallelReductionScalarProduct< Real, Index > Operation;      
      typedef typename Operation::IndexType IndexType;
      typedef typename Operation::RealType RealType;
      typedef typename Operation::ResultType ResultType;
      
      __device__ static void reduce( const Operation operation,
                                     const IndexType size,
                                     const RealType* input1,
                                     const RealType* input2,
                                     ResultType* output );
};*/

#include <core/cuda/tnlCudaReduction_impl.h>

#endif

#endif	/* TNLCUDAREDUCTION_H */
Loading