Commit 045eb5f4 authored by Jakub Klinkovský
Browse files

Fixed __launch_bounds__ optimizations for Turing GPUs

parent 1a1a4867
Loading
Loading
Loading
Loading
+4 −3
Original line number Diff line number Diff line
@@ -32,10 +32,11 @@ static constexpr int Multireduction_maxThreadsPerBlock = 256; // must be a powe
static constexpr int Multireduction_registersPerThread = 32;   // empirically determined optimal value

// Minimum number of resident blocks per multiprocessor requested for the
// multireduction kernels.
//
// __CUDA_ARCH__ is defined only in device code! In host code the macro is
// undefined, evaluates to 0 inside #if, and the generic branch is selected.
//
// The Turing check must come first: a preceding `__CUDA_ARCH__ >= 300` test
// would also match 750 and make the Turing-specific value unreachable.
#if (__CUDA_ARCH__ == 750 )
   // Turing has a limit of 1024 threads per multiprocessor, so with
   // Multireduction_maxThreadsPerBlock = 256 at most 4 blocks can be resident.
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 4;
#else
   // NOTE(review): other architectures whose per-multiprocessor thread limit
   // is below 8 * Multireduction_maxThreadsPerBlock may need the same
   // treatment - confirm against the CUDA compute-capability table.
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 8;
#endif

template< int blockSizeX,
+4 −3
Original line number Diff line number Diff line
@@ -33,10 +33,11 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi

#ifdef HAVE_CUDA
// Minimum number of resident blocks per multiprocessor requested for the
// reduction kernels.
//
// __CUDA_ARCH__ is defined only in device code! In host code the macro is
// undefined, evaluates to 0 inside #if, and the generic branch is selected.
//
// The Turing check must come first: a preceding `__CUDA_ARCH__ >= 300` test
// would also match 750 and make the Turing-specific value unreachable.
#if (__CUDA_ARCH__ == 750 )
   // Turing has a limit of 1024 threads per multiprocessor
   // (presumably the reduction kernels use 256 threads per block, which
   // gives at most 4 resident blocks - TODO confirm against the block size
   // constant defined earlier in this file)
   static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
#else
   static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
#endif

template< int blockSize,