From 7a49e4780f22b667d8280ad1ad66d2f0707b165e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 7 May 2019 23:06:20 +0200
Subject: [PATCH] Optimized CUDA reduction by decreasing desired grid size.

---
 src/TNL/Containers/Algorithms/CudaReductionKernel.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
index 3ef43a0553..d7a711cc7f 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -195,11 +195,16 @@ struct CudaReductionKernelLauncher
    // where blocksPerMultiprocessor is determined according to the number of
    // available registers on the multiprocessor.
    // On Tesla K40c, desGridSize = 8 * 15 = 120.
+   //
+   // Update:
+   // It seems to be better to map only one CUDA block per one multiprocessor or maybe
+   // just slightly more. Therefore we omit blocksdPerMultiprocessor in the following.
    CudaReductionKernelLauncher( const Index size )
    : activeDevice( Devices::CudaDeviceInfo::getActiveDevice() ),
      blocksdPerMultiprocessor( Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
                                / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ) ),
-     desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
+     //desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
+     desGridSize( Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
      originalSize( size )
    { }
 
-- 
GitLab
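
For context, the change replaces the register-derived desired grid size with the raw multiprocessor count. Below is a minimal, self-contained sketch of the two computations using the plain CUDA runtime API instead of TNL's Devices::CudaDeviceInfo wrapper. The constants Reduction_maxThreadsPerBlock = 256 and Reduction_registersPerThread = 32 are assumed here purely for illustration (they reproduce the 8 * 15 = 120 figure quoted for the Tesla K40c); the real definitions live in CudaReductionKernel.h.

#include <cuda_runtime.h>
#include <cstdio>

// Assumed illustrative values; the actual constants are defined in CudaReductionKernel.h.
constexpr int Reduction_maxThreadsPerBlock = 256;
constexpr int Reduction_registersPerThread = 32;

int main()
{
   cudaDeviceProp prop;
   const int device = 0;
   cudaGetDeviceProperties( &prop, device );

   // Old strategy: as many blocks per multiprocessor as the register file allows.
   const int blocksPerMultiprocessor = prop.regsPerMultiprocessor
                                     / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread );
   const int oldDesGridSize = blocksPerMultiprocessor * prop.multiProcessorCount;

   // New strategy (this patch): one block per multiprocessor.
   const int newDesGridSize = prop.multiProcessorCount;

   // On a Tesla K40c (15 SMs, 65536 registers per SM) this prints 120 vs. 15.
   std::printf( "old desGridSize = %d, new desGridSize = %d\n", oldDesGridSize, newDesGridSize );
   return 0;
}

The intended effect, per the commit message and the added comment, is a smaller grid: with roughly one block per multiprocessor, fewer per-block partial results remain for the final stage of the reduction.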