diff --git a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h index 599ed9e8e9d6f1c6d559be2815d0572e248ffb31..ba2602af215e98dea79cb9dc093df71d8a2cd52e 100644 --- a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h +++ b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h @@ -16,8 +16,8 @@ #include namespace TNL { -namespace Meshes { -namespace DistributedMeshes { +namespace Meshes { +namespace DistributedMeshes { template < typename MeshFunctionType, @@ -39,7 +39,7 @@ template < typename MeshFunctionType, class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 1, RealType, Device, Index > { public: - static void BufferEntities( + static void BufferEntities( MeshFunctionType& meshFunction, const MaskPointer& maskPointer, RealType* buffer, @@ -71,15 +71,15 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 1, RealType, Device, } }; ParallelFor< Device >::exec( 0, sizex, kernel ); - }; + }; }; template< typename MeshFunctionType, - typename MaskPointer, + typename MaskPointer, typename RealType, typename Device, - typename Index > + typename Index > class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 2, RealType, Device, Index > { public: @@ -99,7 +99,7 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 2, RealType, Device, Index sizey=size.y(); auto mesh=meshFunction.getMesh(); - RealType* meshFunctionData = meshFunction.getData().getData(); + RealType* meshFunctionData = meshFunction.getData().getData(); const typename MaskPointer::ObjectType* mask( nullptr ); if( maskPointer ) mask = &maskPointer.template getData< Device >(); @@ -107,18 +107,18 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 2, RealType, Device, auto kernel = [ tobuffer, mask, mesh, buffer, isBoundary, meshFunctionData, beginx, sizex, beginy] __cuda_callable__ ( Index i, Index j ) { typename MeshFunctionType::MeshType::Cell entity(mesh); - entity.getCoordinates().x() = beginx + 
j; - entity.getCoordinates().y() = beginy + i; + entity.getCoordinates().x() = beginx + i; + entity.getCoordinates().y() = beginy + j; entity.refresh(); if( ! isBoundary || ! mask || ( *mask )[ entity.getIndex() ] ) { if( tobuffer ) - buffer[ i * sizex + j ] = meshFunctionData[ entity.getIndex() ]; + buffer[ j * sizex + i ] = meshFunctionData[ entity.getIndex() ]; else - meshFunctionData[ entity.getIndex() ] = buffer[ i * sizex + j ]; + meshFunctionData[ entity.getIndex() ] = buffer[ j * sizex + i ]; } }; - ParallelFor2D< Device >::exec( 0, 0, sizey, sizex, kernel ); + ParallelFor2D< Device >::exec( 0, 0, sizex, sizey, kernel ); }; }; @@ -140,7 +140,6 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 3, RealType, Device, const Containers::StaticVector<3,Index>& size, bool tobuffer) { - Index beginx=begin.x(); Index beginy=begin.y(); Index beginz=begin.z(); @@ -152,24 +151,23 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 3, RealType, Device, RealType * meshFunctionData=meshFunction.getData().getData(); const typename MaskPointer::ObjectType* mask( nullptr ); if( maskPointer ) - mask = &maskPointer.template getData< Device >(); - auto kernel = [ tobuffer, mesh, mask, buffer, isBoundary, meshFunctionData, beginx, sizex, beginy, sizey, beginz] __cuda_callable__ ( Index k, Index i, Index j ) + mask = &maskPointer.template getData< Device >(); + auto kernel = [ tobuffer, mesh, mask, buffer, isBoundary, meshFunctionData, beginx, sizex, beginy, sizey, beginz] __cuda_callable__ ( Index i, Index j, Index k ) { typename MeshFunctionType::MeshType::Cell entity(mesh); - entity.getCoordinates().x() = beginx + j; + entity.getCoordinates().x() = beginx + i; + entity.getCoordinates().y() = beginy + j; entity.getCoordinates().z() = beginz + k; - entity.getCoordinates().y() = beginy + i; entity.refresh(); if( ! isBoundary || ! 
mask || ( *mask )[ entity.getIndex() ] ) { if( tobuffer ) - buffer[ k * sizex * sizey + i * sizex + j ] = - meshFunctionData[ entity.getIndex() ]; + buffer[ k * sizex * sizey + j * sizex + i ] = meshFunctionData[ entity.getIndex() ]; else - meshFunctionData[ entity.getIndex() ] = buffer[ k * sizex * sizey + i * sizex + j ]; + meshFunctionData[ entity.getIndex() ] = buffer[ k * sizex * sizey + j * sizex + i ]; } }; - ParallelFor3D< Device >::exec( 0, 0, 0, sizez, sizey, sizex, kernel ); + ParallelFor3D< Device >::exec( 0, 0, 0, sizex, sizey, sizez, kernel ); }; }; diff --git a/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h index fe2f82cffe1a54fb03f51e3aa6b85e8952397718..df36543f360564fce8905d97d198c2b40a2490a3 100644 --- a/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h +++ b/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h @@ -15,8 +15,8 @@ #include namespace TNL { -namespace Meshes { -namespace DistributedMeshes { +namespace Meshes { +namespace DistributedMeshes { template @@ -40,7 +40,7 @@ class CopyEntitiesHelper typedef typename MeshFunctionType::MeshType::GlobalIndexType Index; static void Copy(MeshFunctionType &from, MeshFunctionType &to, CoordinatesType &fromBegin, CoordinatesType &toBegin, CoordinatesType &size) - { + { auto toData=to.getData().getData(); auto fromData=from.getData().getData(); auto fromMesh=from.getMesh(); @@ -49,9 +49,9 @@ class CopyEntitiesHelper { Cell fromEntity(fromMesh); Cell toEntity(toMesh); - toEntity.getCoordinates().x()=toBegin.x()+i; + toEntity.getCoordinates().x()=toBegin.x()+i; toEntity.refresh(); - fromEntity.getCoordinates().x()=fromBegin.x()+i; + fromEntity.getCoordinates().x()=fromBegin.x()+i; fromEntity.refresh(); toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()]; }; @@ -77,20 +77,19 @@ class CopyEntitiesHelper auto fromData=from.getData().getData(); auto fromMesh=from.getMesh(); auto toMesh=to.getMesh(); - auto kernel = [fromData,toData, 
fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index j, Index i ) + auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index i, Index j ) { Cell fromEntity(fromMesh); Cell toEntity(toMesh); toEntity.getCoordinates().x()=toBegin.x()+i; - toEntity.getCoordinates().y()=toBegin.y()+j; + toEntity.getCoordinates().y()=toBegin.y()+j; toEntity.refresh(); fromEntity.getCoordinates().x()=fromBegin.x()+i; - fromEntity.getCoordinates().y()=fromBegin.y()+j; + fromEntity.getCoordinates().y()=fromBegin.y()+j; fromEntity.refresh(); toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()]; }; - ParallelFor2D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)size.y(), (Index)size.x(), kernel ); - + ParallelFor2D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)size.x(), (Index)size.y(), kernel ); } }; @@ -110,27 +109,25 @@ class CopyEntitiesHelper auto fromData=from.getData().getData(); auto fromMesh=from.getMesh(); auto toMesh=to.getMesh(); - auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index k, Index j, Index i ) + auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index i, Index j, Index k ) { Cell fromEntity(fromMesh); Cell toEntity(toMesh); toEntity.getCoordinates().x()=toBegin.x()+i; toEntity.getCoordinates().y()=toBegin.y()+j; - toEntity.getCoordinates().z()=toBegin.z()+k; + toEntity.getCoordinates().z()=toBegin.z()+k; toEntity.refresh(); fromEntity.getCoordinates().x()=fromBegin.x()+i; fromEntity.getCoordinates().y()=fromBegin.y()+j; - fromEntity.getCoordinates().z()=fromBegin.z()+k; + fromEntity.getCoordinates().z()=fromBegin.z()+k; fromEntity.refresh(); toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()]; }; - ParallelFor3D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)0,(Index)size.z() ,(Index)size.y(), (Index)size.x(), kernel 
); + ParallelFor3D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)0,(Index)size.x(),(Index)size.y(), (Index)size.z(), kernel ); } }; - - } // namespace DistributedMeshes } // namespace Meshes } // namespace TNL diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h index 78d4499827ed4f2dc841d47513df008e0493926f..d0c2d0601b7cbf565b3f602127c54e04ab229c80 100644 --- a/src/TNL/ParallelFor.h +++ b/src/TNL/ParallelFor.h @@ -37,10 +37,21 @@ struct ParallelFor static void exec( Index start, Index end, Function f, FunctionArgs... args ) { #ifdef HAVE_OPENMP - #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 ) -#endif + // Benchmarks show that this is significantly faster compared + // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )' + if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 ) + { + #pragma omp parallel for + for( Index i = start; i < end; i++ ) + f( i, args... ); + } + else + for( Index i = start; i < end; i++ ) + f( i, args... ); +#else for( Index i = start; i < end; i++ ) f( i, args... ); +#endif } }; @@ -53,11 +64,25 @@ struct ParallelFor2D static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args ) { #ifdef HAVE_OPENMP - #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() ) -#endif - for( Index i = startX; i < endX; i++ ) + // Benchmarks show that this is significantly faster compared + // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )' + if( TNL::Devices::Host::isOMPEnabled() ) + { + #pragma omp parallel for + for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) + f( i, j, args... ); + } + else { + for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) + f( i, j, args... ); + } +#else for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) f( i, j, args... 
); +#endif } }; @@ -70,12 +95,28 @@ struct ParallelFor3D { static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args ) { #ifdef HAVE_OPENMP - #pragma omp parallel for collapse(2) if( TNL::Devices::Host::isOMPEnabled() ) -#endif - for( Index i = startX; i < endX; i++ ) - for( Index j = startY; j < endY; j++ ) + // Benchmarks show that this is significantly faster compared + // to '#pragma omp parallel for collapse(2) if( TNL::Devices::Host::isOMPEnabled() )' + if( TNL::Devices::Host::isOMPEnabled() ) + { + #pragma omp parallel for collapse(2) + for( Index k = startZ; k < endZ; k++ ) + for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) + f( i, j, k, args... ); + } + else { + for( Index k = startZ; k < endZ; k++ ) + for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) + f( i, j, k, args... ); + } +#else for( Index k = startZ; k < endZ; k++ ) + for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) f( i, j, k, args... ); +#endif } };