From c81cd235285bf9267888d12db3ddfaa8d039a1ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Tue, 3 Apr 2018 10:41:03 +0200
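Subject: [PATCH] Disabled CUDA device synchronization before ParallelFor2D and
 3D kernels

Additionally, the upper bound on blockSize.x in the `sizeX >= sizeY * sizeY`
branch of ParallelFor2D< Devices::Cuda > is lowered from 256 to 128.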

---
 src/TNL/ParallelFor.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h
index ec66906a5e..54a3a86412 100644
--- a/src/TNL/ParallelFor.h
+++ b/src/TNL/ParallelFor.h
@@ -193,7 +193,7 @@ struct ParallelFor2D< Devices::Cuda >
 
          dim3 blockSize;
          if( sizeX >= sizeY * sizeY ) {
-            blockSize.x = TNL::min( 256, sizeX );
+            blockSize.x = TNL::min( 128, sizeX );
             blockSize.y = 1;
          }
          else if( sizeY >= sizeX * sizeX ) {
@@ -212,7 +212,7 @@ struct ParallelFor2D< Devices::Cuda >
          gridCount.x = Devices::Cuda::getNumberOfGrids( sizeX );
          gridCount.y = Devices::Cuda::getNumberOfGrids( sizeY );
 
-         Devices::Cuda::synchronizeDevice();
+//         Devices::Cuda::synchronizeDevice();
 
          if( gridCount.x == 1 && gridCount.y == 1 )
             ParallelFor2DKernel< false, false ><<< gridSize, blockSize >>>
@@ -281,7 +281,7 @@ struct ParallelFor3D< Devices::Cuda >
          gridCount.y = Devices::Cuda::getNumberOfGrids( sizeY );
          gridCount.z = Devices::Cuda::getNumberOfGrids( sizeZ );
 
-         Devices::Cuda::synchronizeDevice();
+//         Devices::Cuda::synchronizeDevice();
 
          if( gridCount.x == 1 && gridCount.y == 1 && gridCount.z == 1 )
             ParallelFor3DKernel< false, false, false ><<< gridSize, blockSize >>>
-- 
GitLab