// Combined view of two diff hunks (each +3 -0) that add an Ampere 8.6 case to the
// compile-time selection of the "minimum blocks per multiprocessor" constants.
//
// Both constants are chosen by the preprocessor from __CUDA_ARCH__. The special-cased
// values equal the per-multiprocessor thread limit quoted in the comments divided by 256
// (1024 -> 4, 1536 -> 6); every other target uses 8.
// NOTE(review): presumably these feed __launch_bounds__ together with a 256-thread block
// size defined elsewhere in the headers -- confirm against the kernel declarations.

// ---- src/TNL/Algorithms/detail/CudaMultireductionKernel.h, hunk @@ -32,6 +32,9 @@
// (follows the definition of Multireduction_registersPerThread = 32)
#if( __CUDA_ARCH__ == 750 )
   // Turing has a limit of 1024 threads per multiprocessor
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 4;
#elif( __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 )
   // Ampere 8.6/8.7 and Ada 8.9 have a limit of 1536 threads per multiprocessor
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 6;
#else
   // Default; also taken when __CUDA_ARCH__ is undefined (it evaluates to 0 inside #if)
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 8;
#endif

// ---- src/TNL/Algorithms/detail/CudaReductionKernel.h, hunk @@ -332,6 +332,9 @@
// (follows the definition of Reduction_registersPerThread = 32)
#if( __CUDA_ARCH__ == 750 )
   // Turing has a limit of 1024 threads per multiprocessor
   static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
#elif( __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 )
   // Ampere 8.6/8.7 and Ada 8.9 have a limit of 1536 threads per multiprocessor
   static constexpr int Reduction_minBlocksPerMultiprocessor = 6;
#else
   // Default; also taken when __CUDA_ARCH__ is undefined (it evaluates to 0 inside #if)
   static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
#endif
// Excerpt of src/TNL/Algorithms/detail/CudaMultireductionKernel.h
// (diff hunk @@ -32,6 +32,9 @@, +3 -0; follows the definition of
// Multireduction_registersPerThread = 32).
//
// Multireduction_minBlocksPerMultiprocessor is selected at compile time from
// __CUDA_ARCH__. The special-cased values equal the per-multiprocessor thread limit
// quoted in the comments divided by 256 (1024 -> 4, 1536 -> 6); every other target uses 8.
// NOTE(review): presumably this feeds __launch_bounds__ together with a 256-thread block
// size defined elsewhere in the header -- confirm against the kernel declarations.
#if( __CUDA_ARCH__ == 750 )
   // Turing has a limit of 1024 threads per multiprocessor
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 4;
#elif( __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 )
   // Ampere 8.6/8.7 and Ada 8.9 have a limit of 1536 threads per multiprocessor
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 6;
#else
   // Default; also taken when __CUDA_ARCH__ is undefined (it evaluates to 0 inside #if)
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 8;
#endif
// Excerpt of src/TNL/Algorithms/detail/CudaReductionKernel.h
// (diff hunk @@ -332,6 +332,9 @@, +3 -0; follows the definition of
// Reduction_registersPerThread = 32).
//
// Reduction_minBlocksPerMultiprocessor is selected at compile time from __CUDA_ARCH__.
// The special-cased values equal the per-multiprocessor thread limit quoted in the
// comments divided by 256 (1024 -> 4, 1536 -> 6); every other target uses 8.
// NOTE(review): presumably this feeds __launch_bounds__ together with a 256-thread block
// size defined elsewhere in the header -- confirm against the kernel declarations.
#if( __CUDA_ARCH__ == 750 )
   // Turing has a limit of 1024 threads per multiprocessor
   static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
#elif( __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 )
   // Ampere 8.6/8.7 and Ada 8.9 have a limit of 1536 threads per multiprocessor
   static constexpr int Reduction_minBlocksPerMultiprocessor = 6;
#else
   // Default; also taken when __CUDA_ARCH__ is undefined (it evaluates to 0 inside #if)
   static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
#endif