From 7a49e4780f22b667d8280ad1ad66d2f0707b165e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 7 May 2019 23:06:20 +0200
Subject: [PATCH] Optimized CUDA reduction by decreasing desired grid size.

---
 src/TNL/Containers/Algorithms/CudaReductionKernel.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
index 3ef43a0553..d7a711cc7f 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -195,11 +195,16 @@ struct CudaReductionKernelLauncher
    // where blocksPerMultiprocessor is determined according to the number of
    // available registers on the multiprocessor.
    // On Tesla K40c, desGridSize = 8 * 15 = 120.
+   //
+   // Update:
+   // It seems better to map just one CUDA block (or only slightly more) to each
+   // multiprocessor. Therefore, blocksdPerMultiprocessor is omitted in the following.
    CudaReductionKernelLauncher( const Index size )
    : activeDevice( Devices::CudaDeviceInfo::getActiveDevice() ),
      blocksdPerMultiprocessor( Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
                                / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ) ),
-     desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
+     //desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
+     desGridSize( Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
      originalSize( size )
    {
    }
-- 
GitLab
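
For illustration, the grid-size heuristic introduced by this patch (one block per
multiprocessor instead of blocksdPerMultiprocessor blocks per multiprocessor) can be
reproduced outside TNL with the plain CUDA runtime API. This is a minimal sketch, not
TNL's actual Devices::CudaDeviceInfo implementation; the helper name
desiredReductionGridSize is hypothetical.

   #include <cuda_runtime.h>

   // Hypothetical helper sketching the heuristic above: use the number of
   // multiprocessors of the active device as the desired reduction grid size.
   inline int desiredReductionGridSize()
   {
      int device = 0;
      cudaGetDevice( &device );
      int multiprocessors = 0;
      // One CUDA block per multiprocessor, as in the updated constructor.
      cudaDeviceGetAttribute( &multiprocessors, cudaDevAttrMultiProcessorCount, device );
      return multiprocessors;
   }

On a Tesla K40c (15 multiprocessors) this yields a desired grid size of 15 rather than
the previous 120 mentioned in the original comment.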