diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
index 3ef43a05532122833c46d71a87a593f7d2e9c6e7..d7a711cc7f554b57bbd152a1e89b8fd54d2db806 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -195,11 +195,16 @@ struct CudaReductionKernelLauncher
    // where blocksPerMultiprocessor is determined according to the number of
    // available registers on the multiprocessor.
    // On Tesla K40c, desGridSize = 8 * 15 = 120.
+   //
+   // Update:
+   // It seems to be better to map only one CUDA block per multiprocessor, or maybe
+   // just slightly more. Therefore blocksdPerMultiprocessor is omitted from desGridSize below.
    CudaReductionKernelLauncher( const Index size )
    : activeDevice( Devices::CudaDeviceInfo::getActiveDevice() ),
      blocksdPerMultiprocessor( Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
                                / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ) ),
-     desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
+     // Former initializer, kept for reference: desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
+     desGridSize( Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),  // value of getCudaMultiprocessors alone; the blocksdPerMultiprocessor factor is no longer applied
      originalSize( size )
    {
    }