From 27a355c65722ac58ecd3322eabe9a93a1d1d8406 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Sat, 9 Feb 2019 21:39:00 +0100
Subject: [PATCH] Fixed block size in ParallelFor3D and improved heuristics

---
 src/TNL/ParallelFor.h | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h
index d0c2d0601b..9989954b56 100644
--- a/src/TNL/ParallelFor.h
+++ b/src/TNL/ParallelFor.h
@@ -288,20 +288,36 @@ struct ParallelFor3D< Devices::Cuda >
          const Index sizeZ = endZ - startZ;
 
          dim3 blockSize;
-         if( sizeX >= sizeY * sizeZ ) {
+         if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) {
             blockSize.x = TNL::min( 256, sizeX );
             blockSize.y = 1;
             blockSize.z = 1;
          }
-         else if( sizeY >= sizeX * sizeZ ) {
+         else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) {
             blockSize.x = 1;
             blockSize.y = TNL::min( 256, sizeY );
             blockSize.z = 1;
          }
-         else if( sizeZ >= sizeX * sizeY ) {
-            blockSize.x = 1;
+         else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) {
+            blockSize.x = TNL::min( 2, sizeX );
+            blockSize.y = TNL::min( 2, sizeY );
+            // CUDA limits the z-dimension of a thread block to at most 64 threads
+            blockSize.z = TNL::min( 64, sizeZ );
+         }
+         else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) {
+            blockSize.x = TNL::min( 32, sizeX );
+            blockSize.y = TNL::min( 8, sizeY );
+            blockSize.z = 1;
+         }
+         else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) {
+            blockSize.x = TNL::min( 32, sizeX );
             blockSize.y = 1;
-            blockSize.z = TNL::min( 256, sizeZ );
+            blockSize.z = TNL::min( 8, sizeZ );
+         }
+         else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) {
+            blockSize.x = 1;
+            blockSize.y = TNL::min( 32, sizeY );
+            blockSize.z = TNL::min( 8, sizeZ );
          }
          else {
             blockSize.x = TNL::min( 16, sizeX );
-- 
GitLab