Optimization of ParallelFor on CPU.

2e26d884 · Tomáš Oberhuber · Tomáš Oberhuber · 075740ec · 2e26d884
Commit 2e26d884, authored and committed by Tomáš Oberhuber 6 years ago.
--- a/src/TNL/ParallelFor.h
+++ b/src/TNL/ParallelFor.h
@@ -37,10 +37,21 @@ struct ParallelFor
   static void exec( Index start, Index end, Function f, FunctionArgs... args )
   {
 #ifdef HAVE_OPENMP
-      #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )
-#endif
+      // Benchmarks show that this is significantly faster compared
+      // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )'
+      if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )
+      {
+#pragma omp parallel for
+         for( Index i = start; i < end; i++ )
+            f( i, args... );
+      }
+      else
+         for( Index i = start; i < end; i++ )
+            f( i, args... );
+#else
      for( Index i = start; i < end; i++ )
         f( i, args... );
+#endif
   }
 };

@@ -53,11 +64,24 @@ struct ParallelFor2D
   static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args )
   {
 #ifdef HAVE_OPENMP
-      #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )
-#endif
+      // Benchmarks show that this is significantly faster compared
+      // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )'
+      if( TNL::Devices::Host::isOMPEnabled() )
+      {
+#pragma omp parallel for
+         for( Index i = startX; i < endX; i++ )
+            for( Index j = startY; j < endY; j++ )
+               f( i, j, args... );
+      }
+      else
+         for( Index i = startX; i < endX; i++ )
+            for( Index j = startY; j < endY; j++ )
+               f( i, j, args... );
+#else
      for( Index i = startX; i < endX; i++ )
-      for( Index j = startY; j < endY; j++ )
-         f( i, j, args... );
+         for( Index j = startY; j < endY; j++ )
+            f( i, j, args... );
+#endif
   }
 };

@@ -70,12 +94,27 @@ struct ParallelFor3D
   static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args )
   {
 #ifdef HAVE_OPENMP
-      #pragma omp parallel for collapse(2) if( TNL::Devices::Host::isOMPEnabled() )
-#endif
+      // Benchmarks show that this is significantly faster compared
+      // to '#pragma omp parallel for collapse(2) if( TNL::Devices::Host::isOMPEnabled() )'
+     if( TNL::Devices::Host::isOMPEnabled() )
+     {
+#pragma omp parallel for collapse(2)
      for( Index i = startX; i < endX; i++ )
-      for( Index j = startY; j < endY; j++ )
-      for( Index k = startZ; k < endZ; k++ )
-         f( i, j, k, args... );
+         for( Index j = startY; j < endY; j++ )
+            for( Index k = startZ; k < endZ; k++ )
+               f( i, j, k, args... );
+     }
+     else
+         for( Index i = startX; i < endX; i++ )
+            for( Index j = startY; j < endY; j++ )
+               for( Index k = startZ; k < endZ; k++ )
+                  f( i, j, k, args... );
+#else
+      for( Index i = startX; i < endX; i++ )
+         for( Index j = startY; j < endY; j++ )
+            for( Index k = startZ; k < endZ; k++ )
+               f( i, j, k, args... );
+#endif
   }
 };