Commit 71f6c6d0 authored by Jakub Klinkovský
Browse files

Fixed configuration of reduction kernels

The problem is that the __CUDA_ARCH__ macro is defined only in device
code, so it can't be used for configuring the kernel launches.
parent 50962503
Loading
Loading
Loading
Loading
+11 −6
Original line number Diff line number Diff line
@@ -32,6 +32,9 @@ namespace Algorithms {
 * architecture so that there are no local memory spills.
 */
static constexpr int Multireduction_maxThreadsPerBlock = 256;  // must be a power of 2
static constexpr int Multireduction_registersPerThread = 38;   // empirically determined optimal value

// __CUDA_ARCH__ is defined only in device code!
#if (__CUDA_ARCH__ >= 300 )
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 6;
#else
@@ -187,12 +190,14 @@ CudaMultireductionKernelLauncher( Operation& operation,
   // we run the kernel with a fixed number of blocks, so the amount of work per
   // block increases with enlarging the problem, so even small imbalance can
   // cost us dearly.
   // On Tesla K40c, desGridSizeX = 4 * 6 * 15 = 360.
//   const IndexType desGridSizeX = 4 * Multireduction_minBlocksPerMultiprocessor
//                                    * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
   // On Tesla K40c, desGridSizeX = 6 * 15 = 90.
   const IndexType desGridSizeX = Multireduction_minBlocksPerMultiprocessor
                                * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
   // Therefore, desGridSizeX = blocksPerMultiprocessor * numberOfMultiprocessors,
   // where blocksPerMultiprocessor is determined according to the number of
   // available registers on the multiprocessor.
   // On Tesla K40c, desGridSizeX = 6 * 15 = 90, because 65536 / (256 * 38) = 6 (integer division).
   const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice();
   const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
                                      / ( Multireduction_maxThreadsPerBlock * Multireduction_registersPerThread );
   const int desGridSizeX = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice );
   dim3 blockSize, gridSize;
   
   // version A: max 16 rows of threads
+13 −7
Original line number Diff line number Diff line
@@ -30,6 +30,9 @@ namespace Algorithms {
 * architecture so that there are no local memory spills.
 */
static constexpr int Reduction_maxThreadsPerBlock = 256;  // must be a power of 2
static constexpr int Reduction_registersPerThread = 32;   // empirically determined optimal value

// __CUDA_ARCH__ is defined only in device code!
#if (__CUDA_ARCH__ >= 300 )
   static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
#else
@@ -189,13 +192,16 @@ CudaReductionKernelLauncher( Operation& operation,
   // we run the kernel with a fixed number of blocks, so the amount of work per
   // block increases with enlarging the problem, so even small imbalance can
   // cost us dearly.
   // On Tesla K40c, desGridSize = 4 * 6 * 15 = 360.
//   const IndexType desGridSize = 4 * Reduction_minBlocksPerMultiprocessor
//                                   * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
   // On Tesla K40c, desGridSize = 6 * 15 = 90.
   const IndexType desGridSize = Reduction_minBlocksPerMultiprocessor
                               * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
   dim3 blockSize( 256 ), gridSize( 0 );
   // Therefore,  desGridSize = blocksPerMultiprocessor * numberOfMultiprocessors
   // where blocksPerMultiprocessor is determined according to the number of
   // available registers on the multiprocessor.
   // On Tesla K40c, desGridSize = 8 * 15 = 120.
   const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice();
   const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
                                      / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread );
   const int desGridSize = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice );
   dim3 blockSize, gridSize;
   blockSize.x = Reduction_maxThreadsPerBlock;
   gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

   // create reference to the reduction buffer singleton and set size
+7 −0
Original line number Diff line number Diff line
@@ -107,6 +107,13 @@ getCudaCores( int deviceNum )
   return 0;
}

// Stub implementation: ignores deviceNum and always reports 0 registers per
// multiprocessor.
// NOTE(review): presumably this is the variant compiled when CUDA support is
// not available, mirroring the other stubs in this file that return 0 - confirm.
// Callers that divide the result into a block count will get 0 blocks.
int
CudaDeviceInfo::
getRegistersPerMultiprocessor( int deviceNum )
{
   return 0;
}

void
CudaDeviceInfo::
writeDeviceInfo( Logger& logger )
+15 −0
Original line number Diff line number Diff line
@@ -168,6 +168,21 @@ getCudaCores( int deviceNum )
           CudaDeviceInfo::getCudaCoresPerMultiprocessors( deviceNum );
}

int
CudaDeviceInfo::
getRegistersPerMultiprocessor( int deviceNum )
{
    // results are cached because they are used for configuration of some kernels
    static std::unordered_map< int, int > results;
    if( results.count( deviceNum ) == 0 ) {
        cudaDeviceProp properties;
        cudaGetDeviceProperties( &properties, deviceNum );
        results.emplace( deviceNum, properties.regsPerMultiprocessor );
        return properties.regsPerMultiprocessor;
    }
    return results[ deviceNum ];
}

void
CudaDeviceInfo::
writeDeviceInfo( Logger& logger )
+2 −0
Original line number Diff line number Diff line
@@ -50,6 +50,8 @@ class CudaDeviceInfo

      static int getCudaCores( int deviceNum );

      // Returns the number of registers per multiprocessor of the device with
      // index deviceNum. The stub implementation returns 0.
      static int getRegistersPerMultiprocessor( int deviceNum );

      static void writeDeviceInfo( Logger& logger );
};

Loading