Commit 61b6628a authored by Jakub Klinkovský
Browse files

Fixed __launch_bounds__ parameters for Ampere 8.6 architecture

parent 9b9f5a0d
Loading
Loading
Loading
Loading
+3 −0
Original line number Original line Diff line number Diff line
@@ -32,6 +32,9 @@ static constexpr int Multireduction_registersPerThread = 32; // empirically de
// Multireduction_minBlocksPerMultiprocessor is chosen at compile time from
// the compute capability encoded in __CUDA_ARCH__.
// NOTE(review): presumably passed as the min-blocks argument of
// __launch_bounds__ - confirm at the kernel definition.
#if( __CUDA_ARCH__ == 750 )
// Turing has a limit of 1024 threads per multiprocessor
static constexpr int Multireduction_minBlocksPerMultiprocessor = 4;
#elif( __CUDA_ARCH__ == 860 )
// Ampere 8.6 has a limit of 1536 threads per multiprocessor
static constexpr int Multireduction_minBlocksPerMultiprocessor = 6;
#else
// Every other target, including passes where __CUDA_ARCH__ is not defined
// (an undefined identifier evaluates to 0 in the conditions above).
static constexpr int Multireduction_minBlocksPerMultiprocessor = 8;
#endif
+3 −0
Original line number Original line Diff line number Diff line
@@ -332,6 +332,9 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi
// Reduction_minBlocksPerMultiprocessor is chosen at compile time from the
// compute capability encoded in __CUDA_ARCH__.
// NOTE(review): presumably passed as the min-blocks argument of
// __launch_bounds__ - confirm at the kernel definition.
#if( __CUDA_ARCH__ == 750 )
// Turing has a limit of 1024 threads per multiprocessor
static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
#elif( __CUDA_ARCH__ == 860 )
// Ampere 8.6 has a limit of 1536 threads per multiprocessor
static constexpr int Reduction_minBlocksPerMultiprocessor = 6;
#else
// Every other target, including passes where __CUDA_ARCH__ is not defined
// (an undefined identifier evaluates to 0 in the conditions above).
static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
#endif