diff --git a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h index 599ed9e8e9d6f1c6d559be2815d0572e248ffb31..ba2602af215e98dea79cb9dc093df71d8a2cd52e 100644 --- a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h +++ b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h @@ -16,8 +16,8 @@ #include namespace TNL { -namespace Meshes { -namespace DistributedMeshes { +namespace Meshes { +namespace DistributedMeshes { template < typename MeshFunctionType, @@ -39,7 +39,7 @@ template < typename MeshFunctionType, class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 1, RealType, Device, Index > { public: - static void BufferEntities( + static void BufferEntities( MeshFunctionType& meshFunction, const MaskPointer& maskPointer, RealType* buffer, @@ -71,15 +71,15 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 1, RealType, Device, } }; ParallelFor< Device >::exec( 0, sizex, kernel ); - }; + }; }; template< typename MeshFunctionType, - typename MaskPointer, + typename MaskPointer, typename RealType, typename Device, - typename Index > + typename Index > class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 2, RealType, Device, Index > { public: @@ -99,7 +99,7 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 2, RealType, Device, Index sizey=size.y(); auto mesh=meshFunction.getMesh(); - RealType* meshFunctionData = meshFunction.getData().getData(); + RealType* meshFunctionData = meshFunction.getData().getData(); const typename MaskPointer::ObjectType* mask( nullptr ); if( maskPointer ) mask = &maskPointer.template getData< Device >(); @@ -107,18 +107,18 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 2, RealType, Device, auto kernel = [ tobuffer, mask, mesh, buffer, isBoundary, meshFunctionData, beginx, sizex, beginy] __cuda_callable__ ( Index i, Index j ) { typename MeshFunctionType::MeshType::Cell entity(mesh); - entity.getCoordinates().x() = beginx + 
j; - entity.getCoordinates().y() = beginy + i; + entity.getCoordinates().x() = beginx + i; + entity.getCoordinates().y() = beginy + j; entity.refresh(); if( ! isBoundary || ! mask || ( *mask )[ entity.getIndex() ] ) { if( tobuffer ) - buffer[ i * sizex + j ] = meshFunctionData[ entity.getIndex() ]; + buffer[ j * sizex + i ] = meshFunctionData[ entity.getIndex() ]; else - meshFunctionData[ entity.getIndex() ] = buffer[ i * sizex + j ]; + meshFunctionData[ entity.getIndex() ] = buffer[ j * sizex + i ]; } }; - ParallelFor2D< Device >::exec( 0, 0, sizey, sizex, kernel ); + ParallelFor2D< Device >::exec( 0, 0, sizex, sizey, kernel ); }; }; @@ -140,7 +140,6 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 3, RealType, Device, const Containers::StaticVector<3,Index>& size, bool tobuffer) { - Index beginx=begin.x(); Index beginy=begin.y(); Index beginz=begin.z(); @@ -152,24 +151,23 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 3, RealType, Device, RealType * meshFunctionData=meshFunction.getData().getData(); const typename MaskPointer::ObjectType* mask( nullptr ); if( maskPointer ) - mask = &maskPointer.template getData< Device >(); - auto kernel = [ tobuffer, mesh, mask, buffer, isBoundary, meshFunctionData, beginx, sizex, beginy, sizey, beginz] __cuda_callable__ ( Index k, Index i, Index j ) + mask = &maskPointer.template getData< Device >(); + auto kernel = [ tobuffer, mesh, mask, buffer, isBoundary, meshFunctionData, beginx, sizex, beginy, sizey, beginz] __cuda_callable__ ( Index i, Index j, Index k ) { typename MeshFunctionType::MeshType::Cell entity(mesh); - entity.getCoordinates().x() = beginx + j; + entity.getCoordinates().x() = beginx + i; + entity.getCoordinates().y() = beginy + j; entity.getCoordinates().z() = beginz + k; - entity.getCoordinates().y() = beginy + i; entity.refresh(); if( ! isBoundary || ! 
mask || ( *mask )[ entity.getIndex() ] ) { if( tobuffer ) - buffer[ k * sizex * sizey + i * sizex + j ] = - meshFunctionData[ entity.getIndex() ]; + buffer[ k * sizex * sizey + j * sizex + i ] = meshFunctionData[ entity.getIndex() ]; else - meshFunctionData[ entity.getIndex() ] = buffer[ k * sizex * sizey + i * sizex + j ]; + meshFunctionData[ entity.getIndex() ] = buffer[ k * sizex * sizey + j * sizex + i ]; } }; - ParallelFor3D< Device >::exec( 0, 0, 0, sizez, sizey, sizex, kernel ); + ParallelFor3D< Device >::exec( 0, 0, 0, sizex, sizey, sizez, kernel ); }; }; diff --git a/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h index fe2f82cffe1a54fb03f51e3aa6b85e8952397718..df36543f360564fce8905d97d198c2b40a2490a3 100644 --- a/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h +++ b/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h @@ -15,8 +15,8 @@ #include namespace TNL { -namespace Meshes { -namespace DistributedMeshes { +namespace Meshes { +namespace DistributedMeshes { template @@ -40,7 +40,7 @@ class CopyEntitiesHelper typedef typename MeshFunctionType::MeshType::GlobalIndexType Index; static void Copy(MeshFunctionType &from, MeshFunctionType &to, CoordinatesType &fromBegin, CoordinatesType &toBegin, CoordinatesType &size) - { + { auto toData=to.getData().getData(); auto fromData=from.getData().getData(); auto fromMesh=from.getMesh(); @@ -49,9 +49,9 @@ class CopyEntitiesHelper { Cell fromEntity(fromMesh); Cell toEntity(toMesh); - toEntity.getCoordinates().x()=toBegin.x()+i; + toEntity.getCoordinates().x()=toBegin.x()+i; toEntity.refresh(); - fromEntity.getCoordinates().x()=fromBegin.x()+i; + fromEntity.getCoordinates().x()=fromBegin.x()+i; fromEntity.refresh(); toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()]; }; @@ -77,20 +77,19 @@ class CopyEntitiesHelper auto fromData=from.getData().getData(); auto fromMesh=from.getMesh(); auto toMesh=to.getMesh(); - auto kernel = [fromData,toData, 
fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index j, Index i ) + auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index i, Index j ) { Cell fromEntity(fromMesh); Cell toEntity(toMesh); toEntity.getCoordinates().x()=toBegin.x()+i; - toEntity.getCoordinates().y()=toBegin.y()+j; + toEntity.getCoordinates().y()=toBegin.y()+j; toEntity.refresh(); fromEntity.getCoordinates().x()=fromBegin.x()+i; - fromEntity.getCoordinates().y()=fromBegin.y()+j; + fromEntity.getCoordinates().y()=fromBegin.y()+j; fromEntity.refresh(); toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()]; }; - ParallelFor2D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)size.y(), (Index)size.x(), kernel ); - + ParallelFor2D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)size.x(), (Index)size.y(), kernel ); } }; @@ -110,27 +109,25 @@ class CopyEntitiesHelper auto fromData=from.getData().getData(); auto fromMesh=from.getMesh(); auto toMesh=to.getMesh(); - auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index k, Index j, Index i ) + auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index i, Index j, Index k ) { Cell fromEntity(fromMesh); Cell toEntity(toMesh); toEntity.getCoordinates().x()=toBegin.x()+i; toEntity.getCoordinates().y()=toBegin.y()+j; - toEntity.getCoordinates().z()=toBegin.z()+k; + toEntity.getCoordinates().z()=toBegin.z()+k; toEntity.refresh(); fromEntity.getCoordinates().x()=fromBegin.x()+i; fromEntity.getCoordinates().y()=fromBegin.y()+j; - fromEntity.getCoordinates().z()=fromBegin.z()+k; + fromEntity.getCoordinates().z()=fromBegin.z()+k; fromEntity.refresh(); toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()]; }; - ParallelFor3D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)0,(Index)size.z() ,(Index)size.y(), (Index)size.x(), kernel 
); + ParallelFor3D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)0,(Index)size.x(),(Index)size.y(), (Index)size.z(), kernel ); } }; - - } // namespace DistributedMeshes } // namespace Meshes } // namespace TNL diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h index 78d4499827ed4f2dc841d47513df008e0493926f..d0c2d0601b7cbf565b3f602127c54e04ab229c80 100644 --- a/src/TNL/ParallelFor.h +++ b/src/TNL/ParallelFor.h @@ -37,10 +37,21 @@ struct ParallelFor static void exec( Index start, Index end, Function f, FunctionArgs... args ) { #ifdef HAVE_OPENMP - #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 ) -#endif + // Benchmarks show that this is significantly faster compared + // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )' + if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 ) + { + #pragma omp parallel for + for( Index i = start; i < end; i++ ) + f( i, args... ); + } + else + for( Index i = start; i < end; i++ ) + f( i, args... ); +#else for( Index i = start; i < end; i++ ) f( i, args... ); +#endif } }; @@ -53,11 +64,25 @@ struct ParallelFor2D static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args ) { #ifdef HAVE_OPENMP - #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() ) -#endif - for( Index i = startX; i < endX; i++ ) + // Benchmarks show that this is significantly faster compared + // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )' + if( TNL::Devices::Host::isOMPEnabled() ) + { + #pragma omp parallel for + for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) + f( i, j, args... ); + } + else { + for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) + f( i, j, args... ); + } +#else for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) f( i, j, args... 
); +#endif } }; @@ -70,12 +95,28 @@ struct ParallelFor3D { static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args ) { #ifdef HAVE_OPENMP - #pragma omp parallel for collapse(2) if( TNL::Devices::Host::isOMPEnabled() ) -#endif - for( Index i = startX; i < endX; i++ ) - for( Index j = startY; j < endY; j++ ) + // Benchmarks show that this is significantly faster compared + // to '#pragma omp parallel for collapse(2) if( TNL::Devices::Host::isOMPEnabled() )' + if( TNL::Devices::Host::isOMPEnabled() ) + { + #pragma omp parallel for collapse(2) + for( Index k = startZ; k < endZ; k++ ) + for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) + f( i, j, k, args... ); + } + else { + for( Index k = startZ; k < endZ; k++ ) + for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) + f( i, j, k, args... ); + } +#else for( Index k = startZ; k < endZ; k++ ) + for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) f( i, j, k, args... ); +#endif } };