/***************************************************************************
                          CudaReductionKernel.h  -  description
                             -------------------
    begin                : Jun 17, 2015
    copyright            : (C) 2015 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/
    
    
    /* See Copyright Notice in tnl/Copyright */
    
    #pragma once
    
    
    #ifdef HAVE_CUDA
    #include <cuda.h>
    #endif
    
    #include <TNL/Assert.h>
    #include <TNL/Math.h>
    #include <TNL/Devices/CudaDeviceInfo.h>
    #include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
    
    
    namespace TNL {
    
    namespace Containers {
    namespace Algorithms {
    
#ifdef HAVE_CUDA

/****
     * The performance of this kernel is very sensitive to register usage.
     * Compile with --ptxas-options=-v and configure these constants for given
     * architecture so that there are no local memory spills.
     */
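// For example (illustrative invocation; pick the -arch flag for the target GPU):
//    nvcc -arch=sm_35 --ptxas-options=-v -c CudaReduction.cu
// prints the per-kernel register count and any local-memory spill loads/stores.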
    static constexpr int Reduction_maxThreadsPerBlock = 256;  // must be a power of 2
#if (__CUDA_ARCH__ >= 300 )
   static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
#else
   static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
#endif
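// Note: __CUDA_ARCH__ is defined only during device-side compilation, so host
// code always takes the #else branch; the launcher below therefore computes
// desGridSize with the conservative value, while the device pass, where
// __launch_bounds__ takes effect, sees the architecture-specific one.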
    
template< typename Operation, int blockSize >
__global__ void
__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
CudaReductionKernel( Operation operation,
                     const typename Operation::IndexType size,
                     const typename Operation::RealType* input1,
                     const typename Operation::RealType* input2,
                     typename Operation::ResultType* output )
{
   typedef typename Operation::IndexType IndexType;
   typedef typename Operation::ResultType ResultType;

   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
   /***
    * Get the thread id (tid) and the global thread id (gid).
    * gridSize is the number of elements processed by all blocks at the
    * same time.
    */
   const IndexType tid = threadIdx.x;
         IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;
   const IndexType gridSize = blockDim.x * gridDim.x;
    
    
       sdata[ tid ] = operation.initialValue();
   /***
    * Read data into the shared memory. We start with the sequential
    * reduction: each thread strides through the input with stride gridSize,
    * accumulating into its shared-memory slot. The loop is manually unrolled
    * by a factor of 4 (then 2, then 1) to reduce loop overhead.
    */
   while( gid + 4 * gridSize < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 );
      gid += 4 * gridSize;
   }
   while( gid + 2 * gridSize < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,            input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + gridSize, input1, input2 );
      gid += 2 * gridSize;
   }
       while( gid < size )
       {
          operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
          gid += gridSize;
       }
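   // For illustration: with the 90-block grid computed in the launcher below
   // and 256 threads per block (gridSize == 23040), reducing size == 10^6
   // elements folds roughly 43 of them into each sdata[ tid ] slot here.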
       __syncthreads();
    
    
       //printf( "1: tid %d data %f \n", tid, sdata[ tid ] );
    
       /***
        *  Perform the parallel reduction.
        */
       if( blockSize >= 1024 )
       {
          if( tid < 512 )
             operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] );
          __syncthreads();
       }
       if( blockSize >= 512 )
       {
          if( tid < 256 )
             operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] );
          __syncthreads();
       }
       if( blockSize >= 256 )
       {
          if( tid < 128 )
             operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] );
          __syncthreads();
          //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
       }
    
       if( blockSize >= 128 )
       {
          if( tid <  64 )
             operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] );
          __syncthreads();
          //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
       }
    
    
   /***
    * The rest runs within a single warp, so it is synchronized implicitly.
    */
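   // Note: this warp-synchronous pattern relies on the pre-Volta guarantee
   // that all 32 threads of a warp execute in lockstep; on Volta (sm_70) and
   // newer, independent thread scheduling breaks this assumption and
   // __syncwarp() or warp-shuffle intrinsics would be required instead.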
    
       if( tid < 32 )
       {
          volatile ResultType* vsdata = sdata;
          if( blockSize >= 64 )
          {
             operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] );
             //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
          }
    
      // TODO: If blockSize == 32, the following does not work:
      // we do not check that tid < 16. Fix it!
      if( blockSize >= 32 )
      {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] );
         //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
      }
          if( blockSize >= 16 )
          {
             operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] );
             //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
          }
          if( blockSize >=  8 )
          {
             operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] );
             //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
          }
          if( blockSize >=  4 )
          {
             operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] );
             //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
          }
          if( blockSize >=  2 )
          {
             operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] );
             //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
          }
       }
    
   /***
    * Store the result back in the global memory.
    */
   if( tid == 0 )
   {
      //printf( "Block %d result = %f \n", blockIdx.x, sdata[ 0 ] );
      output[ blockIdx.x ] = sdata[ 0 ];
   }
}
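/****
 * Launches the reduction kernel above and returns the number of blocks used,
 * i.e. the number of partial results written to 'output' (one per block).
 * The caller still has to reduce those partial results, e.g. on the host,
 * to obtain the final value; this launcher performs only the first pass.
 */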
    
    template< typename Operation >
    typename Operation::IndexType
    CudaReductionKernelLauncher( Operation& operation,
                                 const typename Operation::IndexType size,
                                 const typename Operation::RealType* input1,
                                 const typename Operation::RealType* input2,
                                 typename Operation::ResultType*& output )
    {
       typedef typename Operation::IndexType IndexType;
       typedef typename Operation::RealType RealType;
       typedef typename Operation::ResultType ResultType;
    
    
   // The number of blocks should be a multiple of the number of multiprocessors
   // to ensure optimal load balancing. This is very important, because we run
   // the kernel with a fixed number of blocks, so the amount of work per block
   // grows with the problem size and even a small imbalance can cost us dearly.
       // On Tesla K40c, desGridSize = 4 * 6 * 15 = 360.
    //   const IndexType desGridSize = 4 * Reduction_minBlocksPerMultiprocessor
    //                                   * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
       // On Tesla K40c, desGridSize = 6 * 15 = 90.
       const IndexType desGridSize = Reduction_minBlocksPerMultiprocessor
                                   * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
    
       dim3 blockSize( 256 ), gridSize( 0 );
       gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
    
    
       // create reference to the reduction buffer singleton and set size
       const size_t buf_size = desGridSize * sizeof( ResultType );
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
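   // setSize() signals failure by returning false (presumably a failed device
   // allocation); a bare int is thrown below as a minimal error indicator.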
       if( ! cudaReductionBuffer.setSize( buf_size ) )
          throw 1;
    
       output = cudaReductionBuffer.template getData< ResultType >();
    
    
   // when there is only one warp per block, we need to allocate two warps'
   // worth of shared memory so that we do not index shared memory out of bounds
   const IndexType shmem = (blockSize.x <= 32)
            ? 2 * blockSize.x * sizeof( ResultType )
            : blockSize.x * sizeof( ResultType );
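   // For illustration: with blockSize.x == 32 and ResultType == double this
   // allocates 2 * 32 * 8 = 512 bytes, so the unguarded vsdata[ tid + 16 ]
   // reads in the warp-level code above stay within the allocation.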
    
    
   /***
    * Depending on the blockSize, we generate the appropriate template instance.
    */
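   // Passing blockSize as a template parameter lets the compiler evaluate the
   // 'if( blockSize >= ... )' guards in the tree reduction at compile time,
   // so the untaken branches can be eliminated entirely.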
   switch( blockSize.x )
   {
      case 512:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 512 >, cudaFuncCachePreferShared);
         CudaReductionKernel< Operation, 512 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case 256:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 256 >, cudaFuncCachePreferShared);
         CudaReductionKernel< Operation, 256 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case 128:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 128 >, cudaFuncCachePreferShared);
         CudaReductionKernel< Operation, 128 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  64:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,  64 >, cudaFuncCachePreferShared);
         CudaReductionKernel< Operation,  64 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  32:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,  32 >, cudaFuncCachePreferShared);
         CudaReductionKernel< Operation,  32 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  16:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,  16 >, cudaFuncCachePreferShared);
         CudaReductionKernel< Operation,  16 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   8:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,   8 >, cudaFuncCachePreferShared);
         CudaReductionKernel< Operation,   8 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   4:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,   4 >, cudaFuncCachePreferShared);
         CudaReductionKernel< Operation,   4 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   2:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,   2 >, cudaFuncCachePreferShared);
         CudaReductionKernel< Operation,   2 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   1:
         TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
      default:
         TNL_ASSERT( false, std::cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
   }
    
       checkCudaDevice;
    
       // return the size of the output array on the CUDA device
   return gridSize.x;
}
#endif
    
    } // namespace Algorithms
    } // namespace Containers
    
    } // namespace TNL