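ParallelFor: reverted the CUDA block size in ParallelFor2D from 128 back to 256 (testing for LBM) and removed the Devices::Cuda::synchronizeDevice() calls (synchronizations of smart pointers, both the active one and the commented-out ones) that preceded the kernel launches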

944ccad1 · Jakub Klinkovský · 52f2995f · 944ccad1
Commit 944ccad1 authored 6 years ago by Jakub Klinkovský
--- a/src/TNL/ParallelFor.h
+++ b/src/TNL/ParallelFor.h
@@ -158,8 +158,6 @@ struct ParallelFor< Devices::Cuda >
         dim3 gridSize;
         gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) );

-         Devices::Cuda::synchronizeDevice();
-
         if( Devices::Cuda::getNumberOfGrids( end - start ) == 1 )
            ParallelForKernel< false ><<< gridSize, blockSize >>>( start, end, f, args... );
         else {
@@ -193,7 +191,7 @@ struct ParallelFor2D< Devices::Cuda >

         dim3 blockSize;
         if( sizeX >= sizeY * sizeY ) {
-            blockSize.x = TNL::min( 128, sizeX );
+            blockSize.x = TNL::min( 256, sizeX );
            blockSize.y = 1;
         }
         else if( sizeY >= sizeX * sizeX ) {
@@ -212,8 +210,6 @@ struct ParallelFor2D< Devices::Cuda >
         gridCount.x = Devices::Cuda::getNumberOfGrids( sizeX );
         gridCount.y = Devices::Cuda::getNumberOfGrids( sizeY );

-//         Devices::Cuda::synchronizeDevice();
-
         if( gridCount.x == 1 && gridCount.y == 1 )
            ParallelFor2DKernel< false, false ><<< gridSize, blockSize >>>
               ( startX, startY, endX, endY, f, args... );
@@ -281,8 +277,6 @@ struct ParallelFor3D< Devices::Cuda >
         gridCount.y = Devices::Cuda::getNumberOfGrids( sizeY );
         gridCount.z = Devices::Cuda::getNumberOfGrids( sizeZ );

-//         Devices::Cuda::synchronizeDevice();
-
         if( gridCount.x == 1 && gridCount.y == 1 && gridCount.z == 1 )
            ParallelFor3DKernel< false, false, false ><<< gridSize, blockSize >>>
               ( startX, startY, startZ, endX, endY, endZ, f, args... );