From 9f44ea1d142af4caa0dd9a4afe405f09da02727d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matou=C5=A1=20Fencl?= Date: Fri, 30 Nov 2018 09:02:24 +0100 Subject: [PATCH 01/14] Change of int to IndexType and preparations for OpenMPI. --- .../hamilton-jacobi/HamiltonJacobiProblem.h | 2 ++ .../HamiltonJacobiProblem_impl.h | 20 +++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h index a41442000..7f1bd4193 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h @@ -25,6 +25,8 @@ #include #include +#include + template< typename Mesh, typename DifferentialOperator, typename BoundaryCondition, diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h index 3cc638849..9244b1833 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h @@ -123,12 +123,24 @@ setInitialCondition( const Config::ParameterContainer& parameters, { this->bindDofs( mesh, dofs ); const String& initialConditionFile = parameters.getParameter< String >( "initial-condition" ); - if( ! this->solution.boundLoad( initialConditionFile ) ) + if(CommunicatorType::isDistributed()) { - std::cerr << "I am not able to load the initial condition from the file " << initialConditionFile << "." 
<getMesh().getDistributedMesh()->printProcessDistr() << std::endl; + if(distributedIOType==Meshes::DistributedMeshes::MpiIO) + Meshes::DistributedMeshes::DistributedGridIO ::load(initialConditionFile, *uPointer ); + if(distributedIOType==Meshes::DistributedMeshes::LocalCopy) + Meshes::DistributedMeshes::DistributedGridIO ::load(initialConditionFile, *uPointer ); + uPointer->template synchronize(); } - return true; + else + { + if( ! this->solution.boundLoad( initialConditionFile ) ) + { + std::cerr << "I am not able to load the initial condition from the file " << initialConditionFile << "." < Date: Fri, 30 Nov 2018 09:06:24 +0100 Subject: [PATCH 02/14] Changed int to IndexType --- .../tnlDirectEikonalMethodsBase_impl.h | 12 ++++++------ .../hamilton-jacobi/tnlFastSweepingMethod2D_impl.h | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h index 26444bcfa..47561768e 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h @@ -96,10 +96,10 @@ template< typename Real, template< int sizeSArray > void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -updateBlocks( const InterfaceMapType& interfaceMap, - MeshFunctionType& aux, - MeshFunctionType& helpFunc, - ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) +updateBlocks( InterfaceMapType interfaceMap, + MeshFunctionType aux, + MeshFunctionType helpFunc, + ArrayContainer BlockIterHost, IndexType numThreadsPerBlock/*, Real **sArray*/ ) { #pragma omp parallel for schedule( dynamic ) for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) @@ -267,13 +267,13 @@ updateBlocks( const InterfaceMapType& 
interfaceMap, template< typename Real, typename Device, typename Index > -template< int sizeSArray > +template< IndexType sizeSArray > void tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: updateBlocks( const InterfaceMapType& interfaceMap, const MeshFunctionType& aux, MeshFunctionType& helpFunc, - ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) + ArrayContainer BlockIterHost, IndexType numThreadsPerBlock/*, Real **sArray*/ ) { //#pragma omp parallel for schedule( dynamic ) for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index 89cb60881..f7be7e7de 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -97,11 +97,11 @@ solve( const MeshPointer& mesh, if( i == 0 ) { printf( "0: mesh x: %d\n", mesh->getDimensions().x() ); printf( "0: mesh y: %d\n", mesh->getDimensions().y() ); - //aux.save("aux_proc0.tnl"); - /*for( int k = 0; k < mesh->getDimensions().x()*mesh->getDimensions().y(); k++ ) + aux.save("aux_proc0.tnl"); + for( int k = 0; k < 16*16; k++ ) aux[ k ] = 10; - for( int k = 0; k < mesh->getDimensions().x(); k++ ){ - for( int l = 0; l < mesh->getDimensions().y(); l++ ) + for( int k = 0; k < 16; k++ ){ + for( int l = 0; l < 16; l++ ) printf("%f.2\t",aux[ k * 16 + l ] ); printf("\n"); }*/ -- GitLab From 933cc22bce2c84cd5c880862e94bf457424b6c17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matou=C5=A1=20Fencl?= Date: Fri, 30 Nov 2018 15:16:19 +0100 Subject: [PATCH 03/14] MPI ready in tnlDirectEikonal* --- .../hamilton-jacobi/HamiltonJacobiProblem.h | 2 -- .../HamiltonJacobiProblem_impl.h | 20 ++++--------------- .../tnlDirectEikonalMethodsBase_impl.h | 6 +++--- 
.../tnlFastSweepingMethod2D_impl.h | 8 ++++---- 4 files changed, 11 insertions(+), 25 deletions(-) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h index 7f1bd4193..a41442000 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h @@ -25,8 +25,6 @@ #include #include -#include - template< typename Mesh, typename DifferentialOperator, typename BoundaryCondition, diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h index 9244b1833..3cc638849 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h @@ -123,24 +123,12 @@ setInitialCondition( const Config::ParameterContainer& parameters, { this->bindDofs( mesh, dofs ); const String& initialConditionFile = parameters.getParameter< String >( "initial-condition" ); - if(CommunicatorType::isDistributed()) + if( ! this->solution.boundLoad( initialConditionFile ) ) { - std::cout<<"Nodes Distribution: " << uPointer->getMesh().getDistributedMesh()->printProcessDistr() << std::endl; - if(distributedIOType==Meshes::DistributedMeshes::MpiIO) - Meshes::DistributedMeshes::DistributedGridIO ::load(initialConditionFile, *uPointer ); - if(distributedIOType==Meshes::DistributedMeshes::LocalCopy) - Meshes::DistributedMeshes::DistributedGridIO ::load(initialConditionFile, *uPointer ); - uPointer->template synchronize(); + std::cerr << "I am not able to load the initial condition from the file " << initialConditionFile << "." 
<solution.boundLoad( initialConditionFile ) ) - { - std::cerr << "I am not able to load the initial condition from the file " << initialConditionFile << "." < >:: updateBlocks( InterfaceMapType interfaceMap, MeshFunctionType aux, MeshFunctionType helpFunc, - ArrayContainer BlockIterHost, IndexType numThreadsPerBlock/*, Real **sArray*/ ) + ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) { #pragma omp parallel for schedule( dynamic ) for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) @@ -267,13 +267,13 @@ updateBlocks( InterfaceMapType interfaceMap, template< typename Real, typename Device, typename Index > -template< IndexType sizeSArray > +template< int sizeSArray > void tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: updateBlocks( const InterfaceMapType& interfaceMap, const MeshFunctionType& aux, MeshFunctionType& helpFunc, - ArrayContainer BlockIterHost, IndexType numThreadsPerBlock/*, Real **sArray*/ ) + ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) { //#pragma omp parallel for schedule( dynamic ) for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index f7be7e7de..89cb60881 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -97,11 +97,11 @@ solve( const MeshPointer& mesh, if( i == 0 ) { printf( "0: mesh x: %d\n", mesh->getDimensions().x() ); printf( "0: mesh y: %d\n", mesh->getDimensions().y() ); - aux.save("aux_proc0.tnl"); - for( int k = 0; k < 16*16; k++ ) + //aux.save("aux_proc0.tnl"); + /*for( int k = 0; k < mesh->getDimensions().x()*mesh->getDimensions().y(); k++ ) aux[ k ] = 10; - for( int k = 0; k < 16; k++ ){ - for( 
int l = 0; l < 16; l++ ) + for( int k = 0; k < mesh->getDimensions().x(); k++ ){ + for( int l = 0; l < mesh->getDimensions().y(); l++ ) printf("%f.2\t",aux[ k * 16 + l ] ); printf("\n"); }*/ -- GitLab From 5c15d04c3ff096fd5b5df51ebf57386b8f218ddc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matou=C5=A1=20Fencl?= Date: Tue, 19 Feb 2019 19:04:27 +0100 Subject: [PATCH 04/14] MPI implemented for CPU and GPU in 2D but meshFunction.template synchronize< Communicator >(); doesn't copy overlaps. --- .../tnl-direct-eikonal-solver.h | 3 + .../tnlDirectEikonalMethodsBase.h | 12 +- .../tnlDirectEikonalMethodsBase_impl.h | 267 +++-- .../hamilton-jacobi/tnlDirectEikonalProblem.h | 2 + .../tnlDirectEikonalProblem_impl.h | 66 +- .../hamilton-jacobi/tnlFastSweepingMethod.h | 20 +- .../tnlFastSweepingMethod1D_impl.h | 13 +- .../tnlFastSweepingMethod2D_impl.h | 941 ++++++++++++------ .../tnlFastSweepingMethod3D_impl.h | 13 +- 9 files changed, 833 insertions(+), 504 deletions(-) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h index 1b46ecb3d..82411c939 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h @@ -36,6 +36,9 @@ class DirectEikonalSolverConfig { config.addDelimiter( "Direct eikonal equation solver settings:" ); config.addRequiredEntry< String >( "input-file", "Input file." 
); + config.addEntry< String >( "distributed-grid-io-type", "Choose Distributed Grid IO Type", "LocalCopy"); + config.addEntryEnum< String >( "LocalCopy" ); + config.addEntryEnum< String >( "MpiIO" ); }; }; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h index 24a388554..3a78d0f54 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h @@ -70,7 +70,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > InterfaceMapPointer& interfaceMap ); template< typename MeshEntity > - __cuda_callable__ void updateCell( MeshFunctionType& u, + __cuda_callable__ bool updateCell( MeshFunctionType& u, const MeshEntity& cell, const RealType velocity = 1.0 ); @@ -147,7 +147,12 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock =0); + TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, + Containers::StaticVector< 2, Index > vLower, Containers::StaticVector< 2, Index > vUpper, int k,int oddEvenBlock =0); + +template< typename Real, typename Device, typename Index > +__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc ); template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, @@ 
-160,7 +165,8 @@ __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, I template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ); + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, + Containers::StaticVector< 2, Index > vLower, Containers::StaticVector< 2, Index > vUpper ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h index 68e9c5f2b..3f5b6eed2 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h @@ -722,6 +722,10 @@ initInterface( const MeshFunctionPointer& _input, { #ifdef HAVE_CUDA const MeshType& mesh = _input->getMesh(); + Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh.getDistributedMesh(); + + Containers::StaticVector< 2, Index > vLower = meshPom->getLowerOverlap(); + Containers::StaticVector< 2, Index > vUpper = meshPom->getUpperOverlap(); const int cudaBlockSize( 16 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); @@ -731,30 +735,31 @@ initInterface( const MeshFunctionPointer& _input, Devices::Cuda::synchronizeDevice(); CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), _output.template modifyData< Device >(), - 
_interfaceMap.template modifyData< Device >() ); + _interfaceMap.template modifyData< Device >(), + vLower, vUpper); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; #endif } if( std::is_same< Device, Devices::Host >::value ) { - MeshFunctionType input = _input.getData(); - - /*double A[320][320]; - std::ifstream fileInit("/home/maty/Downloads/initData.txt"); - - for (int i = 0; i < 320; i++) - for (int j = 0; j < 320; j++) - fileInit >> A[j]; - fileInit.close(); - for (int i = 0; i < 320; i++) - for (int j = 0; j < 320; j++) - input[i*320 + j] = A[j];*/ - - + MeshFunctionType input = _input.getData(); MeshFunctionType& output = _output.modifyData(); InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); const MeshType& mesh = input.getMesh(); +/*#ifdef HAVE_MPI + int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup ); + if( i == 0 ) + { + printf( "0: mesh x: %d\n", mesh.getDimensions().x() ); + printf( "0: mesh y: %d\n", mesh.getDimensions().y() ); + for( int k = 0; k < mesh.getDimensions().y(); k++ ){ + for( int l = 0; l < mesh.getDimensions().x(); l++ ) + printf( "%.2f\t", input[ k * 16 + l ] ); + printf("\n"); + } + } +#endif*/ typedef typename MeshType::Cell Cell; Cell cell( mesh ); for( cell.getCoordinates().y() = 0; @@ -766,8 +771,8 @@ initInterface( const MeshFunctionPointer& _input, { cell.refresh(); output[ cell.getIndex() ] = - input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : - - std::numeric_limits< RealType >::max(); + input( cell ) >= 0 ? 
10://std::numeric_limits< RealType >::max() : + -10;//- std::numeric_limits< RealType >::max(); interfaceMap[ cell.getIndex() ] = false; } @@ -850,6 +855,19 @@ initInterface( const MeshFunctionPointer& _input, } } } +#ifdef HAVE_MPI + //int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup ); + /*if( i == 0 ) + { + printf( "0: mesh x: %d\n", mesh.getDimensions().x() ); + printf( "0: mesh y: %d\n", mesh.getDimensions().y() ); + for( int k = 0; k < mesh.getDimensions().y(); k++ ){ + for( int l = 0; l < mesh.getDimensions().x(); l++ ) + printf("%.2f\t",output[ k * 16 + l ] ); + printf("\n"); + } + }*/ +#endif } } @@ -858,7 +876,7 @@ template< typename Real, typename Index > template< typename MeshEntity > __cuda_callable__ -void +bool tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: updateCell( MeshFunctionType& u, const MeshEntity& cell, @@ -890,47 +908,39 @@ updateCell( MeshFunctionType& u, b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1 >() ], u[ neighborEntities.template getEntityIndex< 0, 1 >() ] ); } - if( fabs( a ) == std::numeric_limits< RealType >::max() && - fabs( b ) == std::numeric_limits< RealType >::max() ) - return; - /*if( fabs( a ) == TypeInfo< Real >::getMaxValue() || - fabs( b ) == TypeInfo< Real >::getMaxValue() || - fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) ) - { - tmp = - fabs( a ) >= fabs( b ) ? b + TNL::sign( value ) * hy : - a + TNL::sign( value ) * hx; - }*/ - /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() && - fabs( b ) != TypeInfo< Real >::getMaxValue() && - fabs( a - b ) < TNL::sqrt( (hx * hx + hy * hy)/v ) ) - { - tmp = ( hx * hx * b + hy * hy * a + - sign( value ) * hx * hy * TNL::sqrt( ( hx * hx + hy * hy )/v - - ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy ); - u[ cell.getIndex() ] = tmp; - } - else - { - tmp = - fabs( a ) > fabs( b ) ? 
b + TNL::sign( value ) * hy/v : - a + TNL::sign( value ) * hx/v; - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - //tmp = TypeInfo< RealType >::getMaxValue(); - }*/ + if( fabs( a ) == 10&&//std::numeric_limits< RealType >::max() && + fabs( b ) == 10)//std::numeric_limits< RealType >::max() ) + return false; + RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; sortMinims( pom ); tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; - if( fabs( tmp ) < fabs( pom[ 1 ] ) ) - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - else { + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + tmp = value - u[ cell.getIndex() ]; + if ( fabs( tmp ) > 0.001*hx ){ + //printf( "Vracime true!\n"); + return true; + }else{ + //printf( "Vracime false2!\n"); + return false; + } + } + else { tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); u[ cell.getIndex() ] = argAbsMin( value, tmp ); + tmp = value - u[ cell.getIndex() ]; + if ( fabs( tmp ) > 0.001*hx ){ + //printf( "Vracime true3!\n"); + return true; + }else{ + //printf( "Vracime false!\n"); + return false; + } } } @@ -984,8 +994,8 @@ initInterface( const MeshFunctionPointer& _input, { cell.refresh(); output[ cell.getIndex() ] = - input( cell ) > 0 ? 10://std::numeric_limits< RealType >::max() : - -10;//- std::numeric_limits< RealType >::max(); + input( cell ) > 0 ? 
std::numeric_limits< RealType >::max() : + - std::numeric_limits< RealType >::max(); interfaceMap[ cell.getIndex() ] = false; } @@ -1011,31 +1021,7 @@ initInterface( const MeshFunctionPointer& _input, const IndexType e = neighbors.template getEntityIndex< 1, 0, 0 >(); const IndexType n = neighbors.template getEntityIndex< 0, 1, 0 >(); const IndexType t = neighbors.template getEntityIndex< 0, 0, 1 >(); - //Try exact initiation - /*const IndexType w = neighbors.template getEntityIndex< -1, 0, 0 >(); - const IndexType s = neighbors.template getEntityIndex< 0, -1, 0 >(); - const IndexType b = neighbors.template getEntityIndex< 0, 0, -1 >(); - if( c * input[ e ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ e ] = input[ e ]; - interfaceMap[ e ] = true; - interfaceMap[ cell.getIndex() ] = true; - } - else if( c * input[ n ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ n ] = input[ n ]; - interfaceMap[ n ] = true; - interfaceMap[ cell.getIndex() ] = true; - } - else if( c * input[ t ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ t ] = input[ t ]; - interfaceMap[ t ] = true; - interfaceMap[ cell.getIndex() ] = true; - }*/ + if( c * input[ n ] <= 0 ) { if( c >= 0 ) @@ -1172,31 +1158,6 @@ updateCell( MeshFunctionType& u, fabs( c ) == std::numeric_limits< RealType >::max() ) return; - - /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() && - fabs( b ) != TypeInfo< Real >::getMaxValue() && - fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) ) - { - tmp = ( hx * hx * a + hy * hy * b + - sign( value ) * hx * hy * sqrt( ( hx * hx + hy * hy )/v - - ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy ); - } - if( fabs( a ) != TypeInfo< Real >::getMaxValue() && - fabs( c ) != TypeInfo< Real >::getMaxValue() && - fabs( a - c ) >= TNL::sqrt( (hx * hx + hz * hz)/v ) ) - { - tmp = ( hx * hx * a + hz * hz * c + - sign( value ) * hx * hz * sqrt( ( hx * hx + hz * hz )/v - - ( a - c ) * ( a - c ) ) )/( hx * hx + hz * hz ); - } - if( fabs( b ) != TypeInfo< Real 
>::getMaxValue() && - fabs( c ) != TypeInfo< Real >::getMaxValue() && - fabs( b - c ) >= TNL::sqrt( (hy * hy + hz * hz)/v ) ) - { - tmp = ( hy * hy * b + hz * hz * c + - sign( value ) * hy * hz * sqrt( ( hy * hy + hz * hz )/v - - ( b - c ) * ( b - c ) ) )/( hy * hy + hz * hz ); - }*/ RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; sortMinims( pom ); tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; @@ -1279,8 +1240,8 @@ updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ], sArray[ thrj * sizeSArray + thri-1 ] ); - if( fabs( a ) == std::numeric_limits< RealType >::max() && - fabs( b ) == std::numeric_limits< RealType >::max() ) + if( fabs( a ) == 10&&//std::numeric_limits< RealType >::max() && + fabs( b ) == 10)//std::numeric_limits< RealType >::max() ) return false; RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; @@ -1338,9 +1299,9 @@ updateCell3D( volatile Real *sArray, int thri, int thrj, int thrk, /*if( thrk == 8 ) printf("Calculating a = %f, b = %f, c = %f\n" , a, b, c );*/ - if( fabs( a ) == 10&& //std::numeric_limits< RealType >::max() && - fabs( b ) == 10&&//std::numeric_limits< RealType >::max() && - fabs( c ) == 10)//std::numeric_limits< RealType >::max() ) + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() && + fabs( c ) == std::numeric_limits< RealType >::max() ) return false; RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; @@ -1393,7 +1354,7 @@ updateCell3D( volatile Real *sArray, int thri, int thrj, int thrk, template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 1, Real, Device, 
Index >, 1, bool >& interfaceMap ) + Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap ) { int i = threadIdx.x + blockDim.x*blockIdx.x; const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); @@ -1444,7 +1405,8 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ) + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, + Containers::StaticVector< 2, Index > vLower, Containers::StaticVector< 2, Index > vUpper ) { int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; @@ -1460,54 +1422,57 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, output[ cind ] = - input( cell ) >= 0 ? std::numeric_limits< Real >::max() : - - std::numeric_limits< Real >::max(); + input( cell ) >= 0 ? 10://std::numeric_limits< Real >::max() : + - 10;//- std::numeric_limits< Real >::max(); interfaceMap[ cind ] = false; - const Real& hx = mesh.getSpaceSteps().x(); - const Real& hy = mesh.getSpaceSteps().y(); - cell.refresh(); - const Real& c = input( cell ); - if( ! 
cell.isBoundaryEntity() ) + if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] && i>vLower[0] && j> vLower[0] ) { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const Index e = neighbors.template getEntityIndex< 1, 0 >(); - const Index w = neighbors.template getEntityIndex< -1, 0 >(); - const Index n = neighbors.template getEntityIndex< 0, 1 >(); - const Index s = neighbors.template getEntityIndex< 0, -1 >(); - - if( c * input[ n ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cell.getIndex() ] = true; - } - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ w ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ s ] <= 0 ) + const Real& hx = mesh.getSpaceSteps().x(); + const Real& hy = mesh.getSpaceSteps().y(); + cell.refresh(); + const Real& c = input( cell ); + if( ! 
cell.isBoundaryEntity() ) { - pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const Index e = neighbors.template getEntityIndex< 1, 0 >(); + const Index w = neighbors.template getEntityIndex< -1, 0 >(); + const Index n = neighbors.template getEntityIndex< 0, 1 >(); + const Index s = neighbors.template getEntityIndex< 0, -1 >(); - interfaceMap[ cind ] = true; + if( c * input[ n ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cell.getIndex() ] = true; + } + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ w ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ s ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } } } } diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h index 61465bee9..41aea10a0 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h @@ -71,6 +71,8 @@ class tnlDirectEikonalProblem bool setInitialCondition( const Config::ParameterContainer& parameters, DofVectorPointer& dofs ); + + bool makeSnapshot( ); bool solve( DofVectorPointer& dosf ); diff --git 
a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h index 7437803e2..0aecde5db 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h @@ -12,6 +12,9 @@ */ #pragma once +#include + +#include "tnlDirectEikonalProblem.h" template< typename Mesh, typename Communicator, @@ -76,6 +79,11 @@ tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >:: setup( const Config::ParameterContainer& parameters, const String& prefix ) { + String param=parameters.getParameter< String >( "distributed-grid-io-type" ); + if(param=="MpiIO") + distributedIOType=Meshes::DistributedMeshes::MpiIO; + if(param=="LocalCopy") + distributedIOType=Meshes::DistributedMeshes::LocalCopy; return true; } @@ -117,14 +125,14 @@ setInitialCondition( const Config::ParameterContainer& parameters, String inputFile = parameters.getParameter< String >( "input-file" ); this->initialData->setMesh( this->getMesh() ); std::cout<<"setInitialCondition" <getMesh().getDistributedMesh()->printProcessDistr() << std::endl; + std::cout<<"Nodes Distribution: " << initialData->getMesh().getDistributedMesh()->printProcessDistr() << std::endl; if(distributedIOType==Meshes::DistributedMeshes::MpiIO) - Meshes::DistributedMeshes::DistributedGridIO ::load(inputFile, *u ); + Meshes::DistributedMeshes::DistributedGridIO ::load(inputFile, *initialData ); if(distributedIOType==Meshes::DistributedMeshes::LocalCopy) - Meshes::DistributedMeshes::DistributedGridIO ::load(inputFile, *u ); - u->template synchronize(); + Meshes::DistributedMeshes::DistributedGridIO ::load(inputFile, *initialData ); + initialData->template synchronize(); } else { @@ -141,6 +149,38 @@ setInitialCondition( const Config::ParameterContainer& parameters, return true; } 
+template< typename Mesh, + typename Communicator, + typename Anisotropy, + typename Real, + typename Index > +bool +tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >:: +makeSnapshot( ) +{ + std::cout << std::endl << "Writing output." << std::endl; + + //this->bindDofs( dofs ); + + FileName fileName; + fileName.setFileNameBase( "u-" ); + fileName.setExtension( "tnl" ); + + if(CommunicatorType::isDistributed()) + { + if(distributedIOType==Meshes::DistributedMeshes::MpiIO) + Meshes::DistributedMeshes::DistributedGridIO ::save(fileName.getFileName(), *u ); + if(distributedIOType==Meshes::DistributedMeshes::LocalCopy) + Meshes::DistributedMeshes::DistributedGridIO ::save(fileName.getFileName(), *u ); + } + else + { + if( ! this->u->save( fileName.getFileName() ) ) + return false; + } + return true; +} + template< typename Mesh, typename Communicator, @@ -151,7 +191,19 @@ bool tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >:: solve( DofVectorPointer& dofs ) { - FastSweepingMethod< MeshType, AnisotropyType > fsm; - fsm.solve( this->getMesh(), anisotropy, initialData ); + FastSweepingMethod< MeshType, Communicator,AnisotropyType > fsm; + fsm.solve( this->getMesh(), u, anisotropy, initialData ); + + /*int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup ); + const MeshPointer msh = this->getMesh(); + if( i == 0 && msh->getMeshDimension() == 2 ) + { + for( int k = 0; k < 9; k++ ){ + for( int l = 0; l < msh->getDimensions().x(); l++ ) + printf("%.2f\t",(*initialData)[ k * msh->getDimensions().x() + l ] ); + printf("\n"); + } + }*/ + makeSnapshot(); return true; } diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h index 57b1886e8..51b3faceb 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h +++ 
b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h @@ -17,6 +17,7 @@ template< typename Mesh, + typename Communicator, typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > > class FastSweepingMethod { @@ -25,8 +26,9 @@ class FastSweepingMethod template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > -class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy > +class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy > : public tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > { //static_assert( std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." ); @@ -47,7 +49,7 @@ class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy > using typename BaseType::MeshFunctionType; using typename BaseType::InterfaceMapPointer; using typename BaseType::MeshFunctionPointer; - + FastSweepingMethod(); @@ -56,6 +58,7 @@ class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy > void setMaxIterations( const IndexType& maxIterations ); void solve( const MeshPointer& mesh, + MeshFunctionPointer& Aux, const AnisotropyPointer& anisotropy, MeshFunctionPointer& u ); @@ -68,8 +71,9 @@ class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy > template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > -class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy > +class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy > : public tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > { //static_assert( std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." 
); @@ -84,13 +88,14 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy > typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType; using MeshPointer = Pointers::SharedPointer< MeshType >; using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >; + using MPI = Communicators::MpiCommunicator; using typename BaseType::InterfaceMapType; using typename BaseType::MeshFunctionType; using typename BaseType::InterfaceMapPointer; using typename BaseType::MeshFunctionPointer; using typename BaseType::ArrayContainer; - + FastSweepingMethod(); const IndexType& getMaxIterations() const; @@ -98,8 +103,9 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy > void setMaxIterations( const IndexType& maxIterations ); void solve( const MeshPointer& mesh, + MeshFunctionPointer& Aux, const AnisotropyPointer& anisotropy, - MeshFunctionPointer& u ); + const MeshFunctionPointer& u ); protected: @@ -109,8 +115,9 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy > template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > -class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy > +class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy > : public tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > { //static_assert( std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." 
); @@ -140,6 +147,7 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy > void setMaxIterations( const IndexType& maxIterations ); void solve( const MeshPointer& mesh, + MeshFunctionPointer& Aux, const AnisotropyPointer& anisotropy, MeshFunctionPointer& u ); diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h index 890c6cb4c..662a5b79c 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h @@ -18,8 +18,9 @@ template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > -FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >:: FastSweepingMethod() : maxIterations( 1 ) { @@ -29,9 +30,10 @@ FastSweepingMethod() template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > const Index& -FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >:: getMaxIterations() const { @@ -40,9 +42,10 @@ getMaxIterations() const template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > void -FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >:: setMaxIterations( const IndexType& maxIterations ) { @@ -51,10 +54,12 @@ setMaxIterations( const IndexType& maxIterations ) template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > void -FastSweepingMethod< Meshes::Grid< 1, Real, 
Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >:: solve( const MeshPointer& mesh, + MeshFunctionPointer& Aux, const AnisotropyPointer& anisotropy, MeshFunctionPointer& u ) { diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index 89cb60881..f28202a18 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -14,8 +14,10 @@ #pragma once #include "tnlFastSweepingMethod.h" +#include "tnlDirectEikonalProblem.h" #include #include +#include "tnlDirectEikonalProblem.h" @@ -27,8 +29,9 @@ template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > -FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: FastSweepingMethod() : maxIterations( 1 ) { @@ -38,9 +41,10 @@ FastSweepingMethod() template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > const Index& -FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: getMaxIterations() const { @@ -49,9 +53,10 @@ getMaxIterations() const template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > void -FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: setMaxIterations( const IndexType& maxIterations ) { @@ -60,12 +65,14 @@ setMaxIterations( const IndexType& maxIterations ) template< typename Real, typename 
Device, typename Index, - typename Anisotropy > + typename Communicator, + typename Anisotropy > void -FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: solve( const MeshPointer& mesh, + MeshFunctionPointer& Aux, const AnisotropyPointer& anisotropy, - MeshFunctionPointer& u ) + const MeshFunctionPointer& u ) { MeshFunctionPointer auxPtr; InterfaceMapPointer interfaceMapPtr; @@ -81,6 +88,7 @@ solve( const MeshPointer& mesh, IndexType iteration( 0 ); InterfaceMapType interfaceMap = *interfaceMapPtr; MeshFunctionType aux = *auxPtr; + aux.template synchronize< Communicator >(); #ifdef HAVE_MPI @@ -116,263 +124,260 @@ solve( const MeshPointer& mesh, while( iteration < this->maxIterations ) { - if( std::is_same< DeviceType, Devices::Host >::value ) +#ifdef HAVE_MPI + int i = MPI::GetRank( MPI::AllGroup ); + + /*if( i == 0 ) { - int numThreadsPerBlock = -1; - - numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 
1:0)); - //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); - if( numThreadsPerBlock <= 16 ) - numThreadsPerBlock = 16; - else if(numThreadsPerBlock <= 32 ) - numThreadsPerBlock = 32; - else if(numThreadsPerBlock <= 64 ) - numThreadsPerBlock = 64; - else if(numThreadsPerBlock <= 128 ) - numThreadsPerBlock = 128; - else if(numThreadsPerBlock <= 256 ) - numThreadsPerBlock = 256; - else if(numThreadsPerBlock <= 512 ) - numThreadsPerBlock = 512; - else - numThreadsPerBlock = 1024; - //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); - - if( numThreadsPerBlock == -1 ){ - printf("Fail in setting numThreadsPerBlock.\n"); - break; + for( int k = 0; k < mesh->getDimensions().y(); k++ ){ + for( int l = 0; l < mesh->getDimensions().x(); l++ ) + printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] ); + printf("\n"); } + }*/ + aux.template synchronize< Communicator >(); + Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh(); + + const int *neigh = meshPom->getNeighbors(); + MPI::Request *req; + req = new MPI::Request[meshPom->getNeighborsCount()]; + int WhileCount = 0; +#endif + + Containers::StaticVector< 2, IndexType > vLower = meshPom->getLowerOverlap(); + Containers::StaticVector< 2, IndexType > vUpper = meshPom->getUpperOverlap(); + printf( "%d: meshDimensions are (x,y) = (%d,%d).\n",i, mesh->getDimensions().x(), mesh->getDimensions().y() ); + printf( "%d: owerlaps are ([x1,x2],[y1,y2]) = ([%d,%d],[%d,%d]).\n",i, vLower[0], vUpper[0], vLower[1], vUpper[1] ); + int calculated = 1; + int calculate = 1; + + while( calculated ) + { + calculated = 0; + WhileCount++; - - - int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); - int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 
1:0); - - //std::cout << "numBlocksX = " << numBlocksX << std::endl; - - /*Real **sArray = new Real*[numBlocksX*numBlocksY]; - for( int i = 0; i < numBlocksX * numBlocksY; i++ ) - sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];*/ - - ArrayContainer BlockIterHost; - BlockIterHost.setSize( numBlocksX * numBlocksY ); - BlockIterHost.setValue( 1 ); - int IsCalculationDone = 1; - - MeshFunctionPointer helpFunc( mesh ); - MeshFunctionPointer helpFunc1( mesh ); - helpFunc1 = auxPtr; - auxPtr = helpFunc; - helpFunc = helpFunc1; - //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl; - /*for( int k = numBlocksX-1; k >-1; k-- ){ - for( int l = 0; l < numBlocksY; l++ ){ - std::cout<< BlockIterHost[ l*numBlocksX + k ]; - } - std::cout<template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); - case 32: - this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); - case 64: - this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); - case 128: - this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); - case 256: - this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); - case 512: - this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); - default: - this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); + if( std::is_same< DeviceType, Devices::Host >::value && calculate ) + { + calculate = 0; + + /**--HERE-IS-PARALLEL-OMP-CODE--!!!WITHOUT MPI!!!--------------------**/ + /* + int numThreadsPerBlock = -1; + + numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 
1:0)); + //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); + if( numThreadsPerBlock <= 16 ) + numThreadsPerBlock = 16; + else if(numThreadsPerBlock <= 32 ) + numThreadsPerBlock = 32; + else if(numThreadsPerBlock <= 64 ) + numThreadsPerBlock = 64; + else if(numThreadsPerBlock <= 128 ) + numThreadsPerBlock = 128; + else if(numThreadsPerBlock <= 256 ) + numThreadsPerBlock = 256; + else if(numThreadsPerBlock <= 512 ) + numThreadsPerBlock = 512; + else + numThreadsPerBlock = 1024; + //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); + + if( numThreadsPerBlock == -1 ){ + printf("Fail in setting numThreadsPerBlock.\n"); + break; + } + + + + int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); + int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0); + + //std::cout << "numBlocksX = " << numBlocksX << std::endl; + + //Real **sArray = new Real*[numBlocksX*numBlocksY]; + //for( int i = 0; i < numBlocksX * numBlocksY; i++ ) + // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)]; + + ArrayContainer BlockIterHost; + BlockIterHost.setSize( numBlocksX * numBlocksY ); + BlockIterHost.setValue( 1 ); + int IsCalculationDone = 1; + + MeshFunctionPointer helpFunc( mesh ); + MeshFunctionPointer helpFunc1( mesh ); + helpFunc1 = auxPtr; + auxPtr = helpFunc; + helpFunc = helpFunc1; + //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl; + //for( int k = numBlocksX-1; k >-1; k-- ){ + // for( int l = 0; l < numBlocksY; l++ ){ + // std::cout<< BlockIterHost[ l*numBlocksX + k ]; + // } + // std::cout<template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 32: + this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 64: + this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, 
BlockIterHost, numThreadsPerBlock ); + case 128: + this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 256: + this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 512: + this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + default: + this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + } + + + //Reduction + for( int i = 0; i < BlockIterHost.getSize(); i++ ){ + if( IsCalculationDone == 0 ){ + IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; + //break; + } + } + numWhile++; + //std::cout <<"numWhile = "<< numWhile <-1; j-- ){ + // for( int i = 0; i < numBlocksX; i++ ) + // std::cout << BlockIterHost[ j * numBlocksX + i ]; + // std::cout << std::endl; + // } + // std::cout << std::endl; + + this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY ); + + //std::cout<getDimensions().y(); k++ ){ + for( int l = 0; l < mesh->getDimensions().x(); l++ ) + printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] ); + printf("\n"); + } + }*/ + + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( ! 
interfaceMap( cell ) ) + calculated = this->updateCell( aux, cell ) || calculated; + } } + //if( i == 0 ) + //{ + // for( int k = 0; k < mesh->getDimensions().y(); k++ ){ + // for( int l = 0; l < mesh->getDimensions().x(); l++ ) + // printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] ); + // printf("\n"); + // } + //} + //aux.save( "aux-1.tnl" ); - //Reduction - for( int i = 0; i < BlockIterHost.getSize(); i++ ){ - if( IsCalculationDone == 0 ){ - IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; - //break; + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh->getDimensions().y()-vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0]; + cell.getCoordinates().x() >= 0 + vLower[0]; + cell.getCoordinates().x()-- ) + { + //std::cerr << "2 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); } } - numWhile++; - /*std::cout <<"numWhile = "<< numWhile <-1; j-- ){ - for( int i = 0; i < numBlocksX; i++ ) - std::cout << BlockIterHost[ j * numBlocksX + i ]; - std::cout << std::endl; + //aux.save( "aux-2.tnl" ); + + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 -vUpper[1]; + cell.getCoordinates().y() >= 0 + vLower[1] ; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + //std::cerr << "3 -> "; + cell.refresh(); + if( ! 
interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } } - std::cout << std::endl;*/ - this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY ); + //aux.save( "aux-3.tnl" ); - /*for( int j = numBlocksY-1; j>-1; j-- ){ - for( int i = 0; i < numBlocksX; i++ ) - std::cout << "BlockIterHost = "<< j*numBlocksX + i<< " ," << BlockIterHost[ j * numBlocksX + i ]; - std::cout << std::endl; - } - std::cout << std::endl;*/ + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1]; + cell.getCoordinates().y() >= 0 + vLower[1]; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0]; + cell.getCoordinates().x() >= 0 + vLower[0]; + cell.getCoordinates().x()-- ) + { + //std::cerr << "4 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } - //std::cout<getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - //aux.save( "aux-1.tnl" ); - - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "2 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - //aux.save( "aux-2.tnl" ); - - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0 ; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - //std::cerr << "3 -> "; - cell.refresh(); - if( ! 
interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - //aux.save( "aux-3.tnl" ); - - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "4 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - for( int j = 0; - j < mesh->getDimensions().y(); - j++ ) - { - for( int i = 0; - i < mesh->getDimensions().x(); - i++ ) - { - std::cout << aux[ i * mesh->getDimensions().y() + j ] << " "; - } - std::cout << std::endl; - }*/ - - } - if( std::is_same< DeviceType, Devices::Cuda >::value ) - { - // TODO: CUDA code -#ifdef HAVE_CUDA - TNL_CHECK_CUDA_DEVICE; - // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel. - const int cudaBlockSize( 16 ); - - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); - dim3 blockSize( cudaBlockSize, cudaBlockSize ); - dim3 gridSize( numBlocksX, numBlocksY ); - - tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; - - int BlockIterD = 1; - - TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; - BlockIterDevice.setSize( numBlocksX * numBlocksY ); - BlockIterDevice.setValue( 1 ); - TNL_CHECK_CUDA_DEVICE; - - - TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; - BlockIterPom.setSize( numBlocksX * numBlocksY ); - BlockIterPom.setValue( 0 ); - /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1; - BlockIterPom1.setSize( numBlocksX * numBlocksY ); - BlockIterPom1.setValue( 0 );*/ - /*int *BlockIterDevice; - cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/ - int nBlocksNeigh = ( 
numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); - //std::cout << "nBlocksNeigh = " << nBlocksNeigh << std::endl; - //free( BlockIter ); - /*int *BlockIterPom; - cudaMalloc((void**) &BlockIterPom, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/ - - int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); - - TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; - dBlock.setSize( nBlocks ); - TNL_CHECK_CUDA_DEVICE; - /*int *dBlock; - cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/ - - - MeshFunctionPointer helpFunc1( mesh ); - MeshFunctionPointer helpFunc( mesh ); - - helpFunc1 = auxPtr; - auxPtr = helpFunc; - helpFunc = helpFunc1; - - int numIter = 0; - - //int oddEvenBlock = 0; - while( BlockIterD ) + if( std::is_same< DeviceType, Devices::Cuda >::value && calculate ) { - /** HERE IS CHESS METHOD **/ + // TODO: CUDA code + + calculate = 0; + //if( i == 0 ) + // printf("%d: We are in Cuda code start.\n", i); +#ifdef HAVE_CUDA + + TNL_CHECK_CUDA_DEVICE; + // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel. 
+ const int cudaBlockSize( 16 ); + + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vLower[0] - vUpper[0], cudaBlockSize ); + int numBlocksXbez = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vLower[1] - vUpper[1], cudaBlockSize ); + int numBlocksYbez = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); + dim3 blockSize( cudaBlockSize, cudaBlockSize ); + dim3 gridSizeBez( numBlocksXbez, numBlocksYbez ); + dim3 gridSize( numBlocksX, numBlocksY ); + tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; + + int BlockIterD = 1; /*auxPtr = helpFunc; CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, @@ -408,15 +413,44 @@ solve( const MeshPointer& mesh, BlockIterD = dBlock.getElement( 0 );*/ - /**------------------------------------------------------------------------------------------------*/ + TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; + BlockIterDevice.setSize( numBlocksX * numBlocksY ); + BlockIterDevice.setValue( 1 ); + TNL_CHECK_CUDA_DEVICE; + + TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; + BlockIterPom.setSize( numBlocksX * numBlocksY ); + BlockIterPom.setValue( 0 ); + TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1; + BlockIterPom1.setSize( numBlocksX * numBlocksY ); + BlockIterPom1.setValue( 0 ); + /*int *BlockIterDevice; + cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/ + int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 
1:0); + //std::cout << "nBlocksNeigh = " << nBlocksNeigh << std::endl; + //free( BlockIter ); + /*int *BlockIterPom; + cudaMalloc((void**) &BlockIterPom, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/ - /** HERE IS FIM **/ + int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); + + TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; + dBlock.setSize( nBlocks ); + TNL_CHECK_CUDA_DEVICE; + /*int *dBlock; + cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/ + + + MeshFunctionPointer helpFunc1( mesh ); + MeshFunctionPointer helpFunc( mesh ); + //helpFunc->bind( auxPtr->getData() ); + DeepCopy<<< gridSizeBez, blockSize >>>( auxPtr.template getData< Device>(), + helpFunc.template modifyData< Device>() ); helpFunc1 = auxPtr; auxPtr = helpFunc; helpFunc = helpFunc1; - TNL_CHECK_CUDA_DEVICE; //int pocBloku = 0; Devices::Cuda::synchronizeDevice(); @@ -428,67 +462,231 @@ solve( const MeshPointer& mesh, cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl; - //BlockIterPom1 = BlockIterDevice; - ///for( int i =0; i< numBlocksX; i++ ){ - // for( int j = 0; j < numBlocksY; j++ ) - // { - // std::cout << BlockIterPom1[j*numBlocksX + i]; - // } - // std::cout << std::endl; - //} - //std::cout << std::endl; + //int oddEvenBlock = 0; + //int numberWhile = 0; + while( BlockIterD /*numberWhile < 10*/) + { + //numberWhile++; + /** HERE IS CHESS METHOD **/ + + /*auxPtr = helpFunc; + + CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + auxPtr.template getData< Device>(), + helpFunc.template modifyData< Device>(), + BlockIterDevice, + oddEvenBlock ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + auxPtr = helpFunc; + + oddEvenBlock= (oddEvenBlock == 0) ? 
1: 0; + + CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + auxPtr.template getData< Device>(), + helpFunc.template modifyData< Device>(), + BlockIterDevice, vLower, vUpper, + oddEvenBlock ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + auxPtr = helpFunc; + + oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; + + CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + BlockIterD = dBlock.getElement( 0 );*/ + + /**------------------------------------------------------------------------------------------------*/ + + + /** HERE IS FIM **/ + + helpFunc1 = auxPtr; + auxPtr = helpFunc; + helpFunc = helpFunc1; + TNL_CHECK_CUDA_DEVICE; + + //int pocBloku = 0; + Devices::Cuda::synchronizeDevice(); + CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + auxPtr.template modifyData< Device>(), + helpFunc.template modifyData< Device>(), + BlockIterDevice, vLower, vUpper, i ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; - GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - BlockIterDevice = BlockIterPom; - //std::cout<< "Probehlo" << std::endl; - //TNL::swap( auxPtr, helpFunc ); + aux = *auxPtr; + interfaceMap = *interfaceMapPtr; +#endif + } + + +/**----------------------MPI-TO-DO---------------------------------------------**/ +#ifdef HAVE_MPI + //int i = MPI::GetRank( MPI::AllGroup ); + //TNL::Meshes::DistributedMeshes::DistributedMesh< MeshType > Mesh; + int neighCount = 0; // should this thread calculate again? 
+ int calculpom[4] = {0,0,0,0}; - CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) ); - TNL_CHECK_CUDA_DEVICE; + if( i == 0 ){ + BlockIterPom1 = BlockIterDevice; + for( int i =0; i< numBlocksX; i++ ){ + for( int j = 0; j < numBlocksY; j++ ) + { + std::cout << BlockIterPom1[j*numBlocksX + i]; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + BlockIterDevice = BlockIterPom; + + if( i == 0 ){ + BlockIterPom1 = BlockIterDevice; + for( int i =0; i< numBlocksX; i++ ){ + for( int j = 0; j < numBlocksY; j++ ) + { + std::cout << BlockIterPom1[j*numBlocksX + i]; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + //std::cout<< "Probehlo" << std::endl; + + //TNL::swap( auxPtr, helpFunc ); + + CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); + TNL_CHECK_CUDA_DEVICE; + + CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); + TNL_CHECK_CUDA_DEVICE; + + + //if( i == 0 ) + // printf("%d: We did parallel reduction.\n", i); + BlockIterD = dBlock.getElement( 0 ); + + //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + //if( i == 0 ) + // printf("%d: BlockIterD = %d.\n", i, BlockIterD); + +#ifdef HAVE_MPI + if( BlockIterD ){ + calculated = 1; + //printf( "calculated = %d\n",calculated ); + } +#endif + /**-----------------------------------------------------------------------------------------------------------*/ + /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) + BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ + numIter ++; + } + if( numIter%2 == 1 ){ + auxPtr = helpFunc; + } + /*cudaFree( BlockIterDevice ); + cudaFree( dBlock ); + delete BlockIter;*/ - CudaParallelReduc<<< 1, nBlocks >>>( 
dBlock.getView(), dBlock.getView(), nBlocks ); - TNL_CHECK_CUDA_DEVICE; + if( neigh[1] != -1 ) + { + req[neighCount] = MPI::ISend( &calculated, 1, neigh[1], 0, MPI::AllGroup ); + neighCount++; + + + req[neighCount] = MPI::IRecv( &calculpom[1], 1, neigh[1], 0, MPI::AllGroup ); + neighCount++; + } + if( neigh[2] != -1 ) + { + req[neighCount] = MPI::ISend( &calculated, 1, neigh[2], 0, MPI::AllGroup ); + neighCount++; + + req[neighCount] = MPI::IRecv( &calculpom[2], 1, neigh[2], 0, MPI::AllGroup ); + neighCount++; + } - BlockIterD = dBlock.getElement( 0 ); - //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + if( neigh[5] != -1 ) + { + req[neighCount] = MPI::ISend( &calculated, 1, neigh[5], 0, MPI::AllGroup ); + neighCount++; + + req[neighCount] = MPI::IRecv( &calculpom[3], 1, neigh[5], 0, MPI::AllGroup ); + neighCount++; + } + MPI::WaitAll(req,neighCount); + MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR, MPI::AllGroup ); + aux.template synchronize< Communicator >(); + calculate = calculpom[0] || calculpom[1] || calculpom[2] || calculpom[3]; + aux.template synchronize< Communicator >(); - /**-----------------------------------------------------------------------------------------------------------*/ - /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) - BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ - numIter ++; - } - if( numIter == 1 ){ - auxPtr = helpFunc; + //printf( "%d: Receved reduced info about Calculated = %d.\n", i,calculated); + + + if( i == 0 ) + printf("WhileCount = %d\n",WhileCount); + //calculated = 0; /// DEBUG; } - /*cudaFree( BlockIterDevice ); - cudaFree( dBlock ); - delete BlockIter;*/ - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - aux = *auxPtr; - interfaceMap = *interfaceMapPtr; + String s( "aux-" + std::to_string( i ) + ".tnl" ); + aux.save( s ); + Aux=auxPtr; + + /*if( i == 0 ) + { + for( int k = 0; k < mesh->getDimensions().y(); k++ ){ + for( 
int l = 0; l < mesh->getDimensions().x(); l++ ) + printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] ); + printf("\n"); + } + printf("\n"); + for( int k = 0; k < mesh->getDimensions().y(); k++ ){ + for( int l = 0; l < mesh->getDimensions().x(); l++ ) + printf("%.2f\t",(*Aux)[ k * mesh->getDimensions().x() + l ] ); + printf("\n"); + } + }*/ #endif - } - iteration++; + iteration++; } - //#endif aux.save("aux-final.tnl"); } #ifdef HAVE_CUDA - +// DeepCopy nebo pracne kopirovat kraje v zavislosti na vLower,vUpper z sArray do helpFunc. +template< typename Real, typename Device, typename Index > +__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc ) +{ + int i = threadIdx.x + blockDim.x*blockIdx.x; + int j = blockDim.y*blockIdx.y + threadIdx.y; + const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ]; +} template < typename Index > __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, @@ -590,11 +788,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock ) + CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) ); + TNL_CHECK_CUDA_DEVICE; + + CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks ); + TNL_CHECK_CUDA_DEVICE; { int thri = 
threadIdx.x; int thrj = threadIdx.y; - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = blockDim.y*blockIdx.y + threadIdx.y; + int i = threadIdx.x + blockDim.x*blockIdx.x + vLower[0]; + int j = blockDim.y*blockIdx.y + threadIdx.y + vLower[1]; + const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); /** FOR CHESS METHOD */ //if( (blockIdx.y%2 + blockIdx.x) % 2 == oddEvenBlock ) //{ @@ -606,8 +809,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] ) { __syncthreads(); + /**-----------------------------------------*/ - const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); __shared__ int dimX; __shared__ int dimY; __shared__ Real hx; @@ -628,16 +831,22 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< xkolik = blockDim.x + 1; ykolik = blockDim.y + 1; - numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); - numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); + numOfBlocky = (dimY-vUpper[1]-vLower[1])/blockDim.y + (((dimY-vUpper[1]-vLower[1])%blockDim.y != 0) ? 1:0); + numOfBlockx = (dimX-vUpper[0]-vLower[0])/blockDim.x + (((dimX-vUpper[0]-vLower[0])%blockDim.x != 0) ? 
1:0); if( numOfBlockx - 1 == blockIdx.x ) - xkolik = dimX - (blockIdx.x)*blockDim.x+1; + xkolik = (dimX-vUpper[0]-vLower[0]) - (blockIdx.x)*blockDim.x+1; if( numOfBlocky -1 == blockIdx.y ) - ykolik = dimY - (blockIdx.y)*blockDim.y+1; + ykolik = (dimY-vUpper[1]-vLower[1]) - (blockIdx.y)*blockDim.y+1; __syncthreads(); + /*if( thri==0 && thrj == 0 ) + { + printf("%d: DimX = %d, DimY = %d, xKolik = %d, yKolik = %d, numOfBlockX = %d, numOfBlockY = %d, blockIdx.x = %d, blockIdx.y = %d.\n", + k, dimX, dimY, xkolik, ykolik, numOfBlockx, numOfBlocky, blockIdx.x, blockIdx.y); + }*/ + int currentIndex = thrj * blockDim.x + thri; //__shared__ volatile bool changed[ blockDim.x*blockDim.y ]; __shared__ volatile bool changed[ (sizeSArray-2)*(sizeSArray-2)]; @@ -648,47 +857,98 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ]; __shared__ volatile Real sArray[ sizeSArray * sizeSArray ]; - sArray[ thrj * sizeSArray + thri ] = std::numeric_limits< Real >::max(); + sArray[ (thrj+1) * sizeSArray + thri +1 ] = 10;//std::numeric_limits< Real >::max(); - //filling sArray edges + /*if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) + { + printf( "Kraje: \n"); + for( int k = sizeSArray-1; k>-1; k-- ){ + for( int l = 0; l < sizeSArray; l++ ) + printf( "%.4f ", sArray[k * sizeSArray + l]); + printf( "\n"); + } + printf( "\n"); + } + __syncthreads();*/ + + //filling sArray edges if( thri == 0 ) { - if( dimX > (blockIdx.x+1) * blockDim.x && thrj+1 < ykolik ) - sArray[(thrj+1)*sizeSArray + xkolik] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + xkolik ]; + if( dimX - vLower[ 0 ] > (blockIdx.x+1) * blockDim.x && thrj+1 < ykolik ) + sArray[(thrj+1)*sizeSArray + xkolik] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + xkolik + vLower[0] ]; else - sArray[(thrj+1)*sizeSArray + xkolik] = 
std::numeric_limits< Real >::max(); + sArray[(thrj+1)*sizeSArray + xkolik] = 10;//std::numeric_limits< Real >::max(); } - + if( thri == 1 ) - { - if( blockIdx.x != 0 && thrj+1 < ykolik ) - sArray[(thrj+1)*sizeSArray + 0] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX ]; + { + if( ( blockIdx.x != 0 || vLower[0] != 0 ) && thrj+1 < ykolik ) + sArray[(thrj+1)*sizeSArray + 0] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + vLower[0] ]; else - sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max(); + sArray[(thrj+1)*sizeSArray + 0] = 10;//std::numeric_limits< Real >::max(); } if( thri == 2 ) { - if( dimY > (blockIdx.y+1) * blockDim.y && thrj+1 < xkolik ) - sArray[ ykolik*sizeSArray + thrj+1 ] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + ykolik*dimX + thrj+1 ]; + if( dimY - vLower[ 1 ] > (blockIdx.y+1) * blockDim.y && thrj+1 < xkolik ) + sArray[ ykolik*sizeSArray + thrj+1 ] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + ykolik*dimX + thrj+1 + vLower[0] ]; else - sArray[ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); + sArray[ ykolik*sizeSArray + thrj+1 ] = 10;//std::numeric_limits< Real >::max(); } - + if( thri == 3 ) { - if( blockIdx.y != 0 && thrj+1 < xkolik ) - sArray[0*sizeSArray + thrj+1] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + thrj+1 ]; + if( ( blockIdx.y != 0 || vLower[1] != 0 ) && thrj+1 < xkolik ) + sArray[0*sizeSArray + thrj+1] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + thrj+1 + vLower[0] ]; else - sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); + sArray[0*sizeSArray + thrj+1] = 10;//std::numeric_limits< Real >::max(); } + /*__syncthreads(); + if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 1 ) + { + printf( "Kraje: \n"); + for( int k = sizeSArray-1; k>-1; k-- ){ + for( 
int l = 0; l < sizeSArray; l++ ) + printf( "%.4f ", sArray[k * sizeSArray + l]); + printf( "\n"); + } + printf( "\n"); + } + __syncthreads();*/ + - if( i < dimX && j < dimY ) - { + if( i < dimX && j < dimY && thri+1 < xkolik && thrj+1 < ykolik ) + { + /*if( k == 3 && blockIdx.x == 0 && blockIdx.y == 0 ) + printf("at index = %d\n", j*dimX + i);*/ sArray[(thrj+1)*sizeSArray + thri+1] = aux[ j*dimX + i ]; } __syncthreads(); + if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) + { + printf( "všechno před výpočtem: \n"); + for( int k = sizeSArray-1; k>-1; k-- ){ + for( int l = 0; l < sizeSArray; l++ ) + printf( "%.4f ", sArray[k * sizeSArray + l]); + printf( "\n"); + } + printf( "\n"); + } + + if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) + { + for( int k = mesh.getDimensions().y()-1; k>-1; k-- ){ + for( int l = 0; l < 17; l++ ) + printf( "%.2f ", aux[ k * mesh.getDimensions().x() + l ]); + printf( "\n"); + } + printf( "\n"); + } + + //main while cycle + //if( i == 0 && j == 0 ) + // printf("Overlaps [x1,y1],[x2,y2] = [%d,%d],[%d,%d]",vLower[0], vLower[1], vUpper[0], vUpper[1] ); while( changed[ 0 ] ) { @@ -697,10 +957,12 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< changed[ currentIndex] = false; //calculation of update cell - if( i < dimX && j < dimY ) + if( i < dimX - vUpper[0] && j < dimY - vUpper[1] /*&& i > vLower[0]-1 && j > vLower[1]-1*/ ) { if( ! 
interfaceMap[ j * dimX + i ] ) { + /*if( k == 1 && blockIdx.x == 1 && blockIdx.y == 0 ) + printf( "thri = %d, thrj = %d \n", thri, thrj );*/ changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy); } } @@ -751,20 +1013,41 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( thri == 0 && thrj == 0 && changed[ 0 ] ){ BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 1; } - /*if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 ) - { - for( int k = 15; k>-1; k-- ){ - for( int l = 0; l < 16; l++ ) - printf( "%f\t", sArray[k * 16 + l]); - printf( "\n"); - } - printf( "\n"); - }*/ __syncthreads(); } - if( i < dimX && j < dimY ) + + + + if( i < dimX && j < dimY && thri+1 < xkolik && thrj+1 < ykolik ) helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ]; + __syncthreads(); + /*if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 1 ) + { + printf( "všechno po výpočtu: \n"); + for( int k = sizeSArray-1; k>-1; k-- ){ + for( int l = 0; l < sizeSArray; l++ ) + printf( "%.4f ", sArray[k * sizeSArray + l]); + printf( "\n"); + } + printf( "\n"); + }*/ - } + /*if( thri==0 && thrj == 0 && blockIdx.x == 1 && blockIdx.y == 1 && k == 1 ) + { + printf( "8: \n"); + for( int k = mesh.getDimensions().y()-1; k>-1; k-- ){ + for( int l = 0; l < mesh.getDimensions().x(); l++ ) + printf( "%.2f\t", helpFunc[ k * mesh.getDimensions().x() + l ]); + printf("\n"); + } + printf( "\n"); + }*/ + } + else + { + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ]; + } } #endif + TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock ) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h index 2a1183bc2..9c5471beb 
100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h @@ -18,8 +18,9 @@ template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > -FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: FastSweepingMethod() : maxIterations( 1 ) { @@ -29,9 +30,10 @@ FastSweepingMethod() template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > const Index& -FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: getMaxIterations() const { @@ -40,9 +42,10 @@ getMaxIterations() const template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > void -FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: setMaxIterations( const IndexType& maxIterations ) { @@ -51,10 +54,12 @@ setMaxIterations( const IndexType& maxIterations ) template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > void -FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: solve( const MeshPointer& mesh, + MeshFunctionPointer& Aux, const AnisotropyPointer& anisotropy, MeshFunctionPointer& u ) { -- GitLab From be284ee4ae11e0a53e2fc3c559bf2528cea91ebb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matou=C5=A1=20Fencl?= Date: Sun, 3 Mar 2019 08:26:21 +0100 Subject: [PATCH 05/14] 2D MPI cuda repaired --- .../tnl-direct-eikonal-solver.h | 2 +- 
.../tnlDirectEikonalMethodsBase.h | 2 +- .../tnlDirectEikonalMethodsBase_impl.h | 103 ++--- .../tnlDirectEikonalProblem_impl.h | 1 + .../hamilton-jacobi/tnlFastSweepingMethod.h | 2 + .../tnlFastSweepingMethod2D_impl.h | 357 ++++++++++-------- 6 files changed, 232 insertions(+), 235 deletions(-) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h index 82411c939..b2cfc65dc 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h @@ -36,7 +36,7 @@ class DirectEikonalSolverConfig { config.addDelimiter( "Direct eikonal equation solver settings:" ); config.addRequiredEntry< String >( "input-file", "Input file." ); - config.addEntry< String >( "distributed-grid-io-type", "Choose Distributed Grid IO Type", "LocalCopy"); + config.addEntry< String >( "distributed-grid-io-type", "Choose Distributed Grid IO Type", "MpiIO"); config.addEntryEnum< String >( "LocalCopy" ); config.addEntryEnum< String >( "MpiIO" ); }; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h index 3a78d0f54..d933f1df3 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h @@ -152,7 +152,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< template< typename Real, typename Device, typename Index > __global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc ); + Functions::MeshFunction< 
Meshes::Grid< 2, Real, Device, Index > >& helpFunc, int copy, int k ); template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h index 3f5b6eed2..d7da1117e 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h @@ -771,8 +771,8 @@ initInterface( const MeshFunctionPointer& _input, { cell.refresh(); output[ cell.getIndex() ] = - input( cell ) >= 0 ? 10://std::numeric_limits< RealType >::max() : - -10;//- std::numeric_limits< RealType >::max(); + input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : + - std::numeric_limits< RealType >::max(); interfaceMap[ cell.getIndex() ] = false; } @@ -908,8 +908,8 @@ updateCell( MeshFunctionType& u, b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1 >() ], u[ neighborEntities.template getEntityIndex< 0, 1 >() ] ); } - if( fabs( a ) == 10&&//std::numeric_limits< RealType >::max() && - fabs( b ) == 10)//std::numeric_limits< RealType >::max() ) + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() ) return false; RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; @@ -1021,81 +1021,46 @@ initInterface( const MeshFunctionPointer& _input, const IndexType e = neighbors.template getEntityIndex< 1, 0, 0 >(); const IndexType n = neighbors.template getEntityIndex< 0, 1, 0 >(); const IndexType t = neighbors.template getEntityIndex< 0, 0, 1 >(); - + + if( c * input[ n ] <= 0 ) { - if( c >= 0 ) - { - pom = ( hy * c )/( c - input[ n ]); - if( output[ cell.getIndex() ] > pom ) 
- output[ cell.getIndex() ] = pom; - - if ( output[ n ] < pom - hy) - output[ n ] = pom - hy; // ( hy * c )/( c - input[ n ]) - hy; - - }else - { - pom = - ( hy * c )/( c - input[ n ]); - if( output[ cell.getIndex() ] < pom ) - output[ cell.getIndex() ] = pom; - if( output[ n ] > hy + pom ) - output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]); - - } + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + pom = pom - TNL::sign( c )*hy; + if( TNL::abs( output[ n ] ) > TNL::abs( pom ) ) + output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy; + interfaceMap[ cell.getIndex() ] = true; interfaceMap[ n ] = true; } + if( c * input[ e ] <= 0 ) { - if( c >= 0 ) - { - pom = ( hx * c )/( c - input[ e ]); - if( output[ cell.getIndex() ] > pom ) - output[ cell.getIndex() ] = pom; - - pom = pom - hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; - if( output[ e ] < pom ) - output[ e ] = pom; - - }else - { - pom = - (hx * c)/( c - input[ e ]); - if( output[ cell.getIndex() ] < pom ) - output[ cell.getIndex() ] = pom; - - pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]); - if( output[ e ] > pom ) - output[ e ] = pom; - } + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + pom = pom - TNL::sign( c )*hx; + if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) + output[ e ] = pom; //( hy * c )/( c - input[ n ]) - hy; + interfaceMap[ cell.getIndex() ] = true; interfaceMap[ e ] = true; } + if( c * input[ t ] <= 0 ) { - if( c >= 0 ) - { - pom = ( hz * c )/( c - input[ t ]); - if( output[ cell.getIndex() ] > pom ) - output[ cell.getIndex() ] = pom; - - pom = pom - hz; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; - if( output[ t ] < pom ) - output[ t ] = pom; - - }else - { - pom = - (hz * c)/( c - input[ t ]); - if( output[ cell.getIndex() ] < pom ) - output[ 
cell.getIndex() ] = pom; - - pom = pom + hz; //output[ e ] = hx - (hx * c)/( c - input[ e ]); - if( output[ t ] > pom ) - output[ t ] = pom; - - } + pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + pom = pom - TNL::sign( c )*hz; + if( TNL::abs( output[ t ] ) > TNL::abs( pom ) ) + output[ t ] = pom; //( hy * c )/( c - input[ n ]) - hy; + interfaceMap[ cell.getIndex() ] = true; interfaceMap[ t ] = true; - } + } } /*output[ cell.getIndex() ] = c > 0 ? TypeInfo< RealType >::getMaxValue() : @@ -1240,8 +1205,8 @@ updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ], sArray[ thrj * sizeSArray + thri-1 ] ); - if( fabs( a ) == 10&&//std::numeric_limits< RealType >::max() && - fabs( b ) == 10)//std::numeric_limits< RealType >::max() ) + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() ) return false; RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; @@ -1422,8 +1387,8 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, output[ cind ] = - input( cell ) >= 0 ? 10://std::numeric_limits< Real >::max() : - - 10;//- std::numeric_limits< Real >::max(); + input( cell ) >= 0 ? 
std::numeric_limits< Real >::max() : + - std::numeric_limits< Real >::max(); interfaceMap[ cind ] = false; if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] && i>vLower[0] && j> vLower[0] ) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h index 0aecde5db..56fa9496f 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h @@ -191,6 +191,7 @@ bool tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >:: solve( DofVectorPointer& dofs ) { + std::cout << "We are in solve()." << std::endl; FastSweepingMethod< MeshType, Communicator,AnisotropyType > fsm; fsm.solve( this->getMesh(), u, anisotropy, initialData ); diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h index 51b3faceb..8e1e6a72b 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h @@ -86,6 +86,8 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, typedef Index IndexType; typedef Anisotropy AnisotropyType; typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType; + typedef Communicator CommunicatorType; + using MeshPointer = Pointers::SharedPointer< MeshType >; using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >; using MPI = Communicators::MpiCommunicator; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h 
b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index f28202a18..bc1a97b43 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -19,6 +19,7 @@ #include #include "tnlDirectEikonalProblem.h" +#define ForDebug false // false <=> off @@ -124,39 +125,61 @@ solve( const MeshPointer& mesh, while( iteration < this->maxIterations ) { -#ifdef HAVE_MPI - int i = MPI::GetRank( MPI::AllGroup ); - - /*if( i == 0 ) - { - for( int k = 0; k < mesh->getDimensions().y(); k++ ){ - for( int l = 0; l < mesh->getDimensions().x(); l++ ) - printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] ); - printf("\n"); - } - }*/ - aux.template synchronize< Communicator >(); Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh(); - - const int *neigh = meshPom->getNeighbors(); - MPI::Request *req; - req = new MPI::Request[meshPom->getNeighborsCount()]; - int WhileCount = 0; -#endif + + + int i = MPI::GetRank( MPI::AllGroup ); // number that identifies rank - Containers::StaticVector< 2, IndexType > vLower = meshPom->getLowerOverlap(); + // getting overlaps ( WITHOUT MPI SHOULD BE 0 ) + Containers::StaticVector< 2, IndexType > vLower = meshPom->getLowerOverlap(); Containers::StaticVector< 2, IndexType > vUpper = meshPom->getUpperOverlap(); + +#if ForDebug + int WhileCount = 0; // number of passages of while cycle with condition calculated printf( "%d: meshDimensions are (x,y) = (%d,%d).\n",i, mesh->getDimensions().x(), mesh->getDimensions().y() ); printf( "%d: owerlaps are ([x1,x2],[y1,y2]) = ([%d,%d],[%d,%d]).\n",i, vLower[0], vUpper[0], vLower[1], vUpper[1] ); - int calculated = 1; - int calculate = 1; + if( std::is_same< DeviceType, Devices::Host >::value && i == 0 ) + { + for( int j = mesh->getDimensions().y()-1; j>-1; j-- ){ + for( int i = 0; i < 
mesh->getDimensions().x(); i++ ) + std::cout << aux[ j * mesh->getDimensions().x() + i ] << " "; + std::cout << std::endl; + } + std::cout << std::endl; + } + + // TO SEE CUDA OVERLAPS + /*const int cudaBlockSize( 16 ); + int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); + int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); + dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps ); + dim3 blockSize( cudaBlockSize, cudaBlockSize ); + MeshFunctionPointer helpFunc( mesh ); + DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( helpFunc.template getData< Device>(), + auxPtr.template modifyData< Device>(), 1, i ); */ + +#endif + + int calculated = 1; // indicates whether we calculated in the last passage of the while cycle + // calculated is same for all ranks + // without MPI should be FALSE at the end of while cycle body + int calculate = 1; // indicates if the thread should calculate again in upcoming passage of cycle + // calculate is a value that can differ in every rank + // without MPI should be FALSE at the end of while cycle body while( calculated ) { calculated = 0; +#if ForDebug WhileCount++; + /*if( std::is_same< DeviceType, Devices::Cuda >::value ) + { + DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(), + helpFunc.template modifyData< Device>(), 0, i ); + }*/ +#endif - if( std::is_same< DeviceType, Devices::Host >::value && calculate ) + if( std::is_same< DeviceType, Devices::Host >::value && calculate ) // should we calculate in Host? 
{ calculate = 0; @@ -280,6 +303,7 @@ solve( const MeshPointer& mesh, } }*/ + // FSM FOR MPI and WITHOUT MPI for( cell.getCoordinates().y() = 0 + vLower[1]; cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; cell.getCoordinates().y()++ ) @@ -293,16 +317,6 @@ solve( const MeshPointer& mesh, calculated = this->updateCell( aux, cell ) || calculated; } } - //if( i == 0 ) - //{ - // for( int k = 0; k < mesh->getDimensions().y(); k++ ){ - // for( int l = 0; l < mesh->getDimensions().x(); l++ ) - // printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] ); - // printf("\n"); - // } - //} - - //aux.save( "aux-1.tnl" ); for( cell.getCoordinates().y() = 0 + vLower[1]; cell.getCoordinates().y() < mesh->getDimensions().y()-vUpper[1]; @@ -352,30 +366,29 @@ solve( const MeshPointer& mesh, this->updateCell( aux, cell ); } } - } - if( std::is_same< DeviceType, Devices::Cuda >::value && calculate ) + + if( std::is_same< DeviceType, Devices::Cuda >::value && calculate ) // should we calculate on CUDA? { - // TODO: CUDA code - calculate = 0; - //if( i == 0 ) - // printf("%d: We are in Cuda code start.\n", i); -#ifdef HAVE_CUDA +#if ForDebug + printf("%d: We are in Cuda code start.\n", i); +#endif + +#ifdef HAVE_CUDA TNL_CHECK_CUDA_DEVICE; // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel. 
const int cudaBlockSize( 16 ); + // Setting number of threads and blocks for kernel int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vLower[0] - vUpper[0], cudaBlockSize ); - int numBlocksXbez = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vLower[1] - vUpper[1], cudaBlockSize ); - int numBlocksYbez = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); dim3 blockSize( cudaBlockSize, cudaBlockSize ); - dim3 gridSizeBez( numBlocksXbez, numBlocksYbez ); dim3 gridSize( numBlocksX, numBlocksY ); - tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; + // Need for calling functions from kernel + BaseType ptr; int BlockIterD = 1; /*auxPtr = helpFunc; @@ -413,44 +426,50 @@ solve( const MeshPointer& mesh, BlockIterD = dBlock.getElement( 0 );*/ + // Array that identifies which blocks should be calculated. TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; BlockIterDevice.setSize( numBlocksX * numBlocksY ); BlockIterDevice.setValue( 1 ); TNL_CHECK_CUDA_DEVICE; - + // Array into which we identify the neighbours and then copy it into BlockIterDevice TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; BlockIterPom.setSize( numBlocksX * numBlocksY ); BlockIterPom.setValue( 0 ); + +#if ForDebug // For printf of BlockIterDevice + TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1; + BlockIterPom1.setSize( numBlocksX * numBlocksY ); + BlockIterPom1.setValue( 0 ); +#endif TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1; BlockIterPom1.setSize( numBlocksX * numBlocksY ); BlockIterPom1.setValue( 0 ); - /*int *BlockIterDevice; - cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/ int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 
1:0); - //std::cout << "nBlocksNeigh = " << nBlocksNeigh << std::endl; - //free( BlockIter ); - /*int *BlockIterPom; - cudaMalloc((void**) &BlockIterPom, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/ - int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; dBlock.setSize( nBlocks ); + TNL::Containers::Array< int, Devices::Host, IndexType > dBlock1; + dBlock1.setSize( nBlocks ); TNL_CHECK_CUDA_DEVICE; - /*int *dBlock; - cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/ - - MeshFunctionPointer helpFunc1( mesh ); + // Helping meshFunction that switches with AuxPtr in every calculation of CudaUpdateCellCaller<<<>>>() MeshFunctionPointer helpFunc( mesh ); - //helpFunc->bind( auxPtr->getData() ); - DeepCopy<<< gridSizeBez, blockSize >>>( auxPtr.template getData< Device>(), - helpFunc.template modifyData< Device>() ); + MeshFunctionPointer helpFunc1( mesh ); - helpFunc1 = auxPtr; - auxPtr = helpFunc; - helpFunc = helpFunc1; + // Setting number of threads and blocks in grid for DeepCopy of meshFunction + int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); + int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); + dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps ); + + DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(), + helpFunc.template modifyData< Device>(), 1, i ); + +#if ForDebug + DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(), + helpFunc.template modifyData< Device>(), 0, i ); +#endif //int pocBloku = 0; Devices::Cuda::synchronizeDevice(); @@ -464,13 +483,12 @@ solve( const MeshPointer& mesh, //int oddEvenBlock = 0; //int numberWhile = 0; - while( BlockIterD /*numberWhile < 10*/) + while( BlockIterD ) { //numberWhile++; /** HERE IS 
CHESS METHOD **/ - /*auxPtr = helpFunc; - + /* CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template getData< Device>(), @@ -479,19 +497,17 @@ solve( const MeshPointer& mesh, oddEvenBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - auxPtr = helpFunc; oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), - auxPtr.template getData< Device>(), - helpFunc.template modifyData< Device>(), + helpFunc.template getData< Device>(), + auxPtr.template modifyData< Device>(), BlockIterDevice, vLower, vUpper, oddEvenBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - auxPtr = helpFunc; oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; @@ -507,14 +523,7 @@ solve( const MeshPointer& mesh, /**------------------------------------------------------------------------------------------------*/ - /** HERE IS FIM **/ - - helpFunc1 = auxPtr; - auxPtr = helpFunc; - helpFunc = helpFunc1; - TNL_CHECK_CUDA_DEVICE; - - //int pocBloku = 0; + /** HERE IS FIM **/ Devices::Cuda::synchronizeDevice(); CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), @@ -553,12 +562,17 @@ solve( const MeshPointer& mesh, } std::cout << std::endl; } +#endif + + // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now. 
GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; BlockIterDevice = BlockIterPom; - - if( i == 0 ){ + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; +#if ForDebug + if( i == 1 ){ BlockIterPom1 = BlockIterDevice; for( int i =0; i< numBlocksX; i++ ){ for( int j = 0; j < numBlocksY; j++ ) @@ -569,36 +583,39 @@ solve( const MeshPointer& mesh, } std::cout << std::endl; } - //std::cout<< "Probehlo" << std::endl; - - //TNL::swap( auxPtr, helpFunc ); - +#endif + // Parallel reduction to see if we should calculate again BlockIterD CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); + cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); + // Parallel reduction on dBlock because of too large number of blocks (more than maximum number of threads) + CudaParallelReduc<<< 1, 1024 >>>( dBlock, dBlock, nBlocks ); + cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - - - //if( i == 0 ) - // printf("%d: We did parallel reduction.\n", i); +#if ForDebug + if( i == 0 ){ + dBlock1 = dBlock; + printf("nBlocks = %d\n",nBlocks); + for( int m =0; m< nBlocks; m++ ){ + std::cout << dBlock1[m] << " "; + } + std::cout << std::endl; + } +#endif + // Copy of the first element which is result of parallel reduction BlockIterD = dBlock.getElement( 0 ); - - //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - //if( i == 0 ) - // printf("%d: BlockIterD = %d.\n", i, BlockIterD); -#ifdef HAVE_MPI + // When we change something then we should caclucate again in the next passage of MPI ( calculated = true ) + if( BlockIterD ){ calculated = 1; - //printf( "calculated = %d\n",calculated ); } -#endif + /**-----------------------------------------------------------------------------------------------------------*/ - /*for( int i = 1; i < 
numBlocksX * numBlocksY; i++ ) - BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ + numIter ++; } if( numIter%2 == 1 ){ @@ -637,37 +654,28 @@ solve( const MeshPointer& mesh, } MPI::WaitAll(req,neighCount); +#if ForDebug + printf( "%d: Sending Calculated = %d.\n", i, calculated ); +#endif MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR, MPI::AllGroup ); aux.template synchronize< Communicator >(); calculate = calculpom[0] || calculpom[1] || calculpom[2] || calculpom[3]; +#if ForDebug + printf( "%d: Receved Calculated = %d.\n%d: Calculate = %d\n", i, calculated, i, calculate); +#endif aux.template synchronize< Communicator >(); + } - //printf( "%d: Receved reduced info about Calculated = %d.\n", i,calculated); - - - if( i == 0 ) +#if ForDebug + if( i == 1 ) printf("WhileCount = %d\n",WhileCount); - //calculated = 0; /// DEBUG; + //calculated = 0; // DEBUG; +#endif } String s( "aux-" + std::to_string( i ) + ".tnl" ); aux.save( s ); Aux=auxPtr; - - /*if( i == 0 ) - { - for( int k = 0; k < mesh->getDimensions().y(); k++ ){ - for( int l = 0; l < mesh->getDimensions().x(); l++ ) - printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] ); - printf("\n"); - } - printf("\n"); - for( int k = 0; k < mesh->getDimensions().y(); k++ ){ - for( int l = 0; l < mesh->getDimensions().x(); l++ ) - printf("%.2f\t",(*Aux)[ k * mesh->getDimensions().x() + l ] ); - printf("\n"); - } - }*/ #endif iteration++; } @@ -679,13 +687,28 @@ solve( const MeshPointer& mesh, // DeepCopy nebo pracne kopirovat kraje v zavislosti na vLower,vUpper z sArray do helpFunc. 
template< typename Real, typename Device, typename Index > __global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc ) + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, int copy, int k ) { int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) - helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ]; + if( copy ){ + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ]; + } + else + { + if( i==0 && j == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) + { + for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){ + for( int l = 0; l < 17; l++ ){ + printf( "%.2f ", aux[ m * mesh.getDimensions().x() + l ]); + } + printf( "\n"); + } + printf( "\n"); + } + } } template < typename Index > @@ -722,16 +745,22 @@ __global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cud int blId = blockIdx.x; int blockSize = blockDim.x; /*if ( i == 0 && blId == 0 ){ - printf( "nBlocks = %d \n", nBlocks ); - for( int j = nBlocks-1; j > -1 ; j--){ - printf( "cislo = %d \n", BlockIterDevice[ j ] ); - } - }*/ + printf( "nBlocks = %d\n", nBlocks ); + for( int j = nBlocks-1; j > -1 ; j--){ + printf( "%d: cislo = %d \n", j, BlockIterDevice[ j ] ); + } + }*/ __shared__ int sArray[ 1024 ]; sArray[ i ] = 0; if( blId * 1024 + i < nBlocks ) sArray[ i ] = BlockIterDevice[ blId * 1024 + i ]; __syncthreads(); + /*if ( i == 0 && blId == 0 ){ + printf( "nBlocks = %d\n", nBlocks ); + for( int j = 4; j > -1 ; j--){ + printf( "%d: cislo = %d \n", j, sArray[ j ] ); + } + }*/ /*extern __shared__ volatile int 
sArray[]; unsigned int i = threadIdx.x; unsigned int gid = blockIdx.x * blockSize * 2 + threadIdx.x; @@ -769,13 +798,19 @@ __global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cud __syncthreads(); if (i < 32 ) { - if( blockSize >= 64 ) sArray[ i ] += sArray[ i + 32 ]; - if( blockSize >= 32 ) sArray[ i ] += sArray[ i + 16 ]; - if( blockSize >= 16 ) sArray[ i ] += sArray[ i + 8 ]; - if( blockSize >= 8 ) sArray[ i ] += sArray[ i + 4 ]; - if( blockSize >= 4 ) sArray[ i ] += sArray[ i + 2 ]; - if( blockSize >= 2 ) sArray[ i ] += sArray[ i + 1 ]; + if( blockSize >= 64 ){ sArray[ i ] += sArray[ i + 32 ];} + __syncthreads(); + if( blockSize >= 32 ){ sArray[ i ] += sArray[ i + 16 ];} + __syncthreads(); + if( blockSize >= 16 ){ sArray[ i ] += sArray[ i + 8 ];} + if( blockSize >= 8 ){ sArray[ i ] += sArray[ i + 4 ];} + __syncthreads(); + if( blockSize >= 4 ){ sArray[ i ] += sArray[ i + 2 ];} + __syncthreads(); + if( blockSize >= 2 ){ sArray[ i ] += sArray[ i + 1 ];} + __syncthreads(); } + __syncthreads(); if( i == 0 ) dBlock[ blId ] = sArray[ 0 ]; @@ -841,11 +876,13 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< ykolik = (dimY-vUpper[1]-vLower[1]) - (blockIdx.y)*blockDim.y+1; __syncthreads(); +#if ForDebug /*if( thri==0 && thrj == 0 ) { printf("%d: DimX = %d, DimY = %d, xKolik = %d, yKolik = %d, numOfBlockX = %d, numOfBlockY = %d, blockIdx.x = %d, blockIdx.y = %d.\n", k, dimX, dimY, xkolik, ykolik, numOfBlockx, numOfBlocky, blockIdx.x, blockIdx.y); }*/ +#endif int currentIndex = thrj * blockDim.x + thri; //__shared__ volatile bool changed[ blockDim.x*blockDim.y ]; @@ -857,27 +894,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ]; __shared__ volatile Real sArray[ sizeSArray * sizeSArray ]; - sArray[ (thrj+1) * sizeSArray + thri +1 ] = 10;//std::numeric_limits< Real >::max(); - - /*if( thri==0 && thrj == 0 && 
blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) - { - printf( "Kraje: \n"); - for( int k = sizeSArray-1; k>-1; k-- ){ - for( int l = 0; l < sizeSArray; l++ ) - printf( "%.4f ", sArray[k * sizeSArray + l]); - printf( "\n"); - } - printf( "\n"); - } - __syncthreads();*/ + sArray[ (thrj+1) * sizeSArray + thri +1 ] = std::numeric_limits< Real >::max(); + //filling sArray edges if( thri == 0 ) { if( dimX - vLower[ 0 ] > (blockIdx.x+1) * blockDim.x && thrj+1 < ykolik ) sArray[(thrj+1)*sizeSArray + xkolik] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + xkolik + vLower[0] ]; else - sArray[(thrj+1)*sizeSArray + xkolik] = 10;//std::numeric_limits< Real >::max(); + sArray[(thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max(); } if( thri == 1 ) @@ -885,7 +911,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( ( blockIdx.x != 0 || vLower[0] != 0 ) && thrj+1 < ykolik ) sArray[(thrj+1)*sizeSArray + 0] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + vLower[0] ]; else - sArray[(thrj+1)*sizeSArray + 0] = 10;//std::numeric_limits< Real >::max(); + sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max(); } if( thri == 2 ) @@ -893,7 +919,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( dimY - vLower[ 1 ] > (blockIdx.y+1) * blockDim.y && thrj+1 < xkolik ) sArray[ ykolik*sizeSArray + thrj+1 ] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + ykolik*dimX + thrj+1 + vLower[0] ]; else - sArray[ ykolik*sizeSArray + thrj+1 ] = 10;//std::numeric_limits< Real >::max(); + sArray[ ykolik*sizeSArray + thrj+1 ] = std::numeric_limits< Real >::max(); } @@ -902,7 +928,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( ( blockIdx.y != 0 || vLower[1] != 0 ) && thrj+1 < xkolik ) sArray[0*sizeSArray + thrj+1] = aux[ 
(blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + thrj+1 + vLower[0] ]; else - sArray[0*sizeSArray + thrj+1] = 10;//std::numeric_limits< Real >::max(); + sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); } /*__syncthreads(); if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 1 ) @@ -918,19 +944,20 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< __syncthreads();*/ - if( i < dimX && j < dimY && thri+1 < xkolik && thrj+1 < ykolik ) + if( i-vLower[0] < dimX && j-vLower[1] < dimY && thri+1 < xkolik + vUpper[0] && thrj+1 < ykolik + vUpper[1] ) { - /*if( k == 3 && blockIdx.x == 0 && blockIdx.y == 0 ) + /*if( k == 1 && blockIdx.x == 0 && blockIdx.y == 0 ) printf("at index = %d\n", j*dimX + i);*/ - sArray[(thrj+1)*sizeSArray + thri+1] = aux[ j*dimX + i ]; + sArray[(thrj+1)*sizeSArray + thri+1] = aux[ (j)*dimX + i ]; } __syncthreads(); +#if ForDebug if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) { printf( "všechno před výpočtem: \n"); - for( int k = sizeSArray-1; k>-1; k-- ){ + for( int m = sizeSArray-1; m>-1; m-- ){ for( int l = 0; l < sizeSArray; l++ ) - printf( "%.4f ", sArray[k * sizeSArray + l]); + printf( "%.2f ", sArray[m * sizeSArray + l]); printf( "\n"); } printf( "\n"); @@ -938,14 +965,14 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) { - for( int k = mesh.getDimensions().y()-1; k>-1; k-- ){ + for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){ for( int l = 0; l < 17; l++ ) - printf( "%.2f ", aux[ k * mesh.getDimensions().x() + l ]); + printf( "%.2f ", aux[ m * mesh.getDimensions().x() + l ]); printf( "\n"); } printf( "\n"); } - +#endif //main while cycle //if( i == 0 && j == 0 ) // printf("Overlaps [x1,y1],[x2,y2] = [%d,%d],[%d,%d]",vLower[0], vLower[1], vUpper[0], vUpper[1] ); @@ -1021,31 +1048,33 @@ __global__ void 
CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( i < dimX && j < dimY && thri+1 < xkolik && thrj+1 < ykolik ) helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ]; __syncthreads(); - /*if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 1 ) +#if ForDebug + if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) { printf( "všechno po výpočtu: \n"); - for( int k = sizeSArray-1; k>-1; k-- ){ + for( int m = sizeSArray-1; m>-1; m-- ){ for( int l = 0; l < sizeSArray; l++ ) - printf( "%.4f ", sArray[k * sizeSArray + l]); + printf( "%.2f ", sArray[m * sizeSArray + l]); printf( "\n"); } printf( "\n"); - }*/ + } - /*if( thri==0 && thrj == 0 && blockIdx.x == 1 && blockIdx.y == 1 && k == 1 ) + if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) { printf( "8: \n"); - for( int k = mesh.getDimensions().y()-1; k>-1; k-- ){ + for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){ for( int l = 0; l < mesh.getDimensions().x(); l++ ) - printf( "%.2f\t", helpFunc[ k * mesh.getDimensions().x() + l ]); + printf( "%.2f ", helpFunc[ m * mesh.getDimensions().x() + l ]); printf("\n"); } printf( "\n"); - }*/ + } +#endif } else { - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] ) helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ]; } } -- GitLab From df7abcac9456f6c8410d3e28f791fd11c5c8c382 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matou=C5=A1=20Fencl?= Date: Sun, 3 Mar 2019 09:54:03 +0100 Subject: [PATCH 06/14] First try to repair the installation error in 2D --- .../Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h 
b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index bc1a97b43..586d37ba5 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -664,19 +664,19 @@ solve( const MeshPointer& mesh, printf( "%d: Receved Calculated = %d.\n%d: Calculate = %d\n", i, calculated, i, calculate); #endif aux.template synchronize< Communicator >(); - } #if ForDebug if( i == 1 ) printf("WhileCount = %d\n",WhileCount); //calculated = 0; // DEBUG; +#endif + } #endif } - String s( "aux-" + std::to_string( i ) + ".tnl" ); aux.save( s ); Aux=auxPtr; -#endif + iteration++; } aux.save("aux-final.tnl"); -- GitLab From 204f7f1da254bd7ee89e9d3e896cfae0f2298559 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matou=C5=A1=20Fencl?= Date: Mon, 11 Mar 2019 19:22:07 +0100 Subject: [PATCH 07/14] 2D and 3D solvers extended with MPI (3D has issue on biggest mesh) --- .../hamilton-jacobi/tnl-run-fsm-eoc-test | 127 +- .../tnlDirectEikonalMethodsBase.h | 18 +- .../tnlDirectEikonalMethodsBase_impl.h | 204 +-- .../tnlDirectEikonalProblem_impl.h | 14 +- .../hamilton-jacobi/tnlFastSweepingMethod.h | 4 + .../tnlFastSweepingMethod1D_impl.h | 2 +- .../tnlFastSweepingMethod2D_impl.h | 130 +- .../tnlFastSweepingMethod3D_impl.h | 1175 +++++++++++------ 8 files changed, 1088 insertions(+), 586 deletions(-) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test index 0dc50246f..24b782a82 100755 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test @@ -1,19 +1,33 @@ #!/bin/bash device="host" -dimensions="2D 3D" -#dimensions="3D" +#dimensions="2D 3D" +dimensions="2D" sizes1D="16 32 64 128 256 512 
1024 2048 4096" #sizes1D="256" -sizes2D="16 32 64 128 256 512 1024" -#sizes2D="8" -sizes3D="16 32 64 128 256" +sizes2D="16 32 64 128 256 512 1024 2048 4096" +#sizes2D="16" +sizes3D="8 16 32 64 128 256" +#sizes3D="128 256" testFunctions="paraboloid" +#testFunctions="sin-wave-sdf" +#testFunctions="sin-bumps-sdf" snapshotPeriod=0.1 finalTime=1.5 -solverName="tnl-direct-eikonal-solver" -#solverName="gdb --args tnl-direct-eikonal-solver-dbg --catch-exceptions no" -# +realType="double" +#mpiRun="mpirun -np 4 -oversubscribe " +mpiRun="" + +## CAREFULL: If you set LocalCopy of MPI, you have to start with mpiRun even tnl-init +## This isnt problem with MpiIO. +## CAREFULL: For smoothly calculated error, you have to choose the right output function which +## is for both MpiIO, LocalCopy different. + +solverName=${mpiRun}"tnl-direct-eikonal-solver" +#solverName="gdb --args "${mpiRun}"tnl-direct-eikonal-solver-dbg --catch-exceptions no --mpi-gdb-debug false" +#scale=2.0 +#finalSdf="aux-0.tnl aux-1.tnl" +finalSdf="aux-final.tnl" setupTestFunction() { @@ -22,16 +36,17 @@ setupTestFunction() # then origin=-1.0 proportions=2.0 +# origin=-1.0 +# proportions=2.0 amplitude=1.0 - waveLength=0.2 - waveLengthX=0.2 - waveLengthY=0.2 - waveLengthZ=0.2 - wavesNumber=3.0 - wavesNumberX=0.5 - wavesNumberY=2.0 + waveLength=0.4 + waveLengthX=0.5 + waveLengthY=0.5 + waveLengthZ=0.5 + wavesNumber=1.25 + wavesNumberX=0.5 wavesNumberY=2.0 wavesNumberZ=3.0 - phase=0.1 + phase=-1.5 phaseX=0.0 phaseY=0.0 phaseZ=0.0 @@ -44,6 +59,7 @@ setupGrid() { dimensions=$1 gridSize=$2 + #scale=$3 tnl-grid-setup --dimensions ${dimensions} \ --origin-x ${origin} \ --origin-y ${origin} \ @@ -53,47 +69,51 @@ setupGrid() --proportions-z ${proportions} \ --size-x ${gridSize} \ --size-y ${gridSize} \ - --size-z ${gridSize} + --real-type ${realType} \ + --size-z ${gridSize} +#$((2*${gridSize})) } setInitialCondition() { testFunction=$1 tnl-init --test-function ${testFunction} \ - --output-file initial-u.tnl \ - 
--amplitude ${amplitude} \ - --wave-length ${waveLength} \ - --wave-length-x ${waveLengthX} \ - --wave-length-y ${waveLengthY} \ - --wave-length-z ${waveLengthZ} \ - --waves-number ${wavesNumber} \ - --waves-number-x ${wavesNumberX} \ - --waves-number-y ${wavesNumberY} \ - --waves-number-z ${wavesNumberZ} \ - --phase ${phase} \ - --phase-x ${phaseX} \ - --phase-y ${phaseY} \ - --phase-z ${phaseZ} \ - --sigma ${sigma} \ - --radius ${radius} + --real-type ${realType} \ + --output-file initial-u.tnl \ + --amplitude ${amplitude} \ + --wave-length ${waveLength} \ + --wave-length-x ${waveLengthX} \ + --wave-length-y ${waveLengthY} \ + --wave-length-z ${waveLengthZ} \ + --waves-number ${wavesNumber} \ + --waves-number-x ${wavesNumberX} \ + --waves-number-y ${wavesNumberY} \ + --waves-number-z ${wavesNumberZ} \ + --phase ${phase} \ + --phase-x ${phaseX} \ + --phase-y ${phaseY} \ + --phase-z ${phaseZ} \ + --sigma ${sigma} \ + --radius ${radius} - tnl-init --test-function ${testFunction}-sdf \ - --output-file exact-u.tnl \ - --amplitude ${amplitude} \ - --wave-length ${waveLength} \ - --wave-length-x ${waveLengthX} \ - --wave-length-y ${waveLengthY} \ - --wave-length-z ${waveLengthZ} \ - --waves-number ${wavesNumber} \ - --waves-number-x ${wavesNumberX} \ - --waves-number-y ${wavesNumberY} \ - --waves-number-z ${wavesNumberZ} \ - --phase ${phase} \ - --phase-x ${phaseX} \ - --phase-y ${phaseY} \ - --phase-z ${phaseZ} \ - --sigma ${sigma} \ - --radius ${radius} + tnl-init --test-function ${testFunction}-sdf \ + --real-type ${realType} \ + --output-file exact-u.tnl \ + --amplitude ${amplitude} \ + --wave-length ${waveLength} \ + --wave-length-x ${waveLengthX} \ + --wave-length-y ${waveLengthY} \ + --wave-length-z ${waveLengthZ} \ + --waves-number ${wavesNumber} \ + --waves-number-x ${wavesNumberX} \ + --waves-number-y ${wavesNumberY} \ + --waves-number-z ${wavesNumberZ} \ + --phase ${phase} \ + --phase-x ${phaseZ} \ + --phase-y ${phaseZ} \ + --phase-z ${phaseZ} \ + --sigma 
${sigma} \ + --radius ${radius} \ } @@ -111,17 +131,22 @@ solve() --min-iterations 20 \ --convergence-residue 1.0e-12 \ --snapshot-period ${snapshotPeriod} \ + --real-type ${realType} \ --final-time ${finalTime} } computeError() { +for sweep in ${finalSdf} +do tnl-diff --mesh mesh.tnl \ - --input-files aux-final.tnl exact-u.tnl \ + --input-files exact-u.tnl u-00000.tnl \ --mode sequence \ --snapshot-period ${snapshotPeriod} \ --output-file errors.txt \ --write-difference yes +#aux-final.tnl \ +done } runTest() diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h index d933f1df3..c6a522d8f 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h @@ -62,12 +62,15 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > typedef Functions::MeshFunction< MeshType > MeshFunctionType; typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType; typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer; + typedef Containers::StaticVector< 2, Index > StaticVector; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >; using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >; + void initInterface( const MeshFunctionPointer& input, MeshFunctionPointer& output, - InterfaceMapPointer& interfaceMap ); + InterfaceMapPointer& interfaceMap, + StaticVector vLower, StaticVector vUpper ); template< typename MeshEntity > __cuda_callable__ bool updateCell( MeshFunctionType& u, @@ -101,15 +104,17 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > typedef Functions::MeshFunction< MeshType > MeshFunctionType; typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType; typedef 
TNL::Containers::Array< int, Device, IndexType > ArrayContainer; + typedef Containers::StaticVector< 3, Index > StaticVector; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >; using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >; void initInterface( const MeshFunctionPointer& input, MeshFunctionPointer& output, - InterfaceMapPointer& interfaceMap ); + InterfaceMapPointer& interfaceMap, + StaticVector vLower, StaticVector vUpper ); template< typename MeshEntity > - __cuda_callable__ void updateCell( MeshFunctionType& u, + __cuda_callable__ bool updateCell( MeshFunctionType& u, const MeshEntity& cell, const RealType velocity = 1.0); @@ -154,6 +159,10 @@ template< typename Real, typename Device, typename Index > __global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, int copy, int k ); +template< typename Real, typename Device, typename Index > +__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, int copy, int k ); + template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks ); @@ -171,7 +180,8 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ); + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, + 
Containers::StaticVector< 3, Index > vLower, Containers::StaticVector< 3, Index > vUpper ); template < int sizeSArray, typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h index d7da1117e..a5d3d81df 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h @@ -715,17 +715,14 @@ void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: initInterface( const MeshFunctionPointer& _input, MeshFunctionPointer& _output, - InterfaceMapPointer& _interfaceMap ) + InterfaceMapPointer& _interfaceMap, + StaticVector vLower, StaticVector vUpper ) { if( std::is_same< Device, Devices::Cuda >::value ) { #ifdef HAVE_CUDA const MeshType& mesh = _input->getMesh(); - Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh.getDistributedMesh(); - - Containers::StaticVector< 2, Index > vLower = meshPom->getLowerOverlap(); - Containers::StaticVector< 2, Index > vUpper = meshPom->getUpperOverlap(); const int cudaBlockSize( 16 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); @@ -748,7 +745,7 @@ initInterface( const MeshFunctionPointer& _input, InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); const MeshType& mesh = input.getMesh(); /*#ifdef HAVE_MPI - int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup ); + int i>s::>::GetRan>s::>::AllGroup ); if( i == 0 ) { printf( "0: mesh x: %d\n", mesh.getDimensions().x() ); @@ -778,11 +775,11 @@ initInterface( const MeshFunctionPointer& _input, const RealType& hx 
= mesh.getSpaceSteps().x(); const RealType& hy = mesh.getSpaceSteps().y(); - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh.getDimensions().y(); + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh.getDimensions().y() - vUpper[1]; cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh.getDimensions().x() - vUpper[0]; cell.getCoordinates().x() ++ ) { cell.refresh(); @@ -856,7 +853,7 @@ initInterface( const MeshFunctionPointer& _input, } } #ifdef HAVE_MPI - //int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup ); + //int i>s::>::GetRan>s::>::AllGroup ); /*if( i == 0 ) { printf( "0: mesh x: %d\n", mesh.getDimensions().x() ); @@ -951,13 +948,14 @@ void tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: initInterface( const MeshFunctionPointer& _input, MeshFunctionPointer& _output, - InterfaceMapPointer& _interfaceMap ) + InterfaceMapPointer& _interfaceMap, + StaticVector vLower, StaticVector vUpper ) { if( std::is_same< Device, Devices::Cuda >::value ) { #ifdef HAVE_CUDA const MeshType& mesh = _input->getMesh(); - + const int cudaBlockSize( 8 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); @@ -969,7 +967,7 @@ initInterface( const MeshFunctionPointer& _input, Devices::Cuda::synchronizeDevice(); CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(), _output.template modifyData< Device >(), - _interfaceMap.template modifyData< Device >() ); + _interfaceMap.template modifyData< Device >(), vLower, vUpper ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; #endif @@ -979,8 +977,10 @@ initInterface( const MeshFunctionPointer& _input, const 
MeshFunctionType& input = _input.getData(); MeshFunctionType& output = _output.modifyData(); InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); + const MeshType& mesh = input.getMesh(); typedef typename MeshType::Cell Cell; + Cell cell( mesh ); for( cell.getCoordinates().z() = 0; cell.getCoordinates().z() < mesh.getDimensions().z(); @@ -1002,14 +1002,14 @@ initInterface( const MeshFunctionPointer& _input, const RealType& hx = mesh.getSpaceSteps().x(); const RealType& hy = mesh.getSpaceSteps().y(); const RealType& hz = mesh.getSpaceSteps().z(); - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh.getDimensions().z(); + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh.getDimensions().z() - vUpper[2]; cell.getCoordinates().z() ++ ) - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh.getDimensions().y(); + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh.getDimensions().y() - vUpper[1]; cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh.getDimensions().x() - vUpper[0]; cell.getCoordinates().x() ++ ) { cell.refresh(); @@ -1062,10 +1062,6 @@ initInterface( const MeshFunctionPointer& _input, interfaceMap[ t ] = true; } } - /*output[ cell.getIndex() ] = - c > 0 ? 
TypeInfo< RealType >::getMaxValue() : - -TypeInfo< RealType >::getMaxValue(); - interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245 } } } @@ -1075,7 +1071,7 @@ template< typename Real, typename Index > template< typename MeshEntity > __cuda_callable__ -void +bool tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: updateCell( MeshFunctionType& u, const MeshEntity& cell, @@ -1101,6 +1097,7 @@ updateCell( MeshFunctionType& u, a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ], u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] ); } + if( cell.getCoordinates().y() == 0 ) b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ]; else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) @@ -1109,7 +1106,9 @@ updateCell( MeshFunctionType& u, { b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ], u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] ); - }if( cell.getCoordinates().z() == 0 ) + } + + if( cell.getCoordinates().z() == 0 ) c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ]; else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 ) c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ]; @@ -1118,17 +1117,25 @@ updateCell( MeshFunctionType& u, c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ], u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] ); } + if( fabs( a ) == std::numeric_limits< RealType >::max() && fabs( b ) == std::numeric_limits< RealType >::max() && fabs( c ) == std::numeric_limits< RealType >::max() ) - return; + return false; RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; sortMinims( pom ); tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; + if( fabs( tmp ) < fabs( pom[ 1 ] ) ) { - u[ cell.getIndex() ] = argAbsMin( value, tmp ); + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + tmp = value - u[ cell.getIndex() ]; + if ( fabs( tmp ) > 
0.001*hx ){ + return true; + }else{ + return false; + } } else { @@ -1138,6 +1145,12 @@ updateCell( MeshFunctionType& u, if( fabs( tmp ) < fabs( pom[ 2 ]) ) { u[ cell.getIndex() ] = argAbsMin( value, tmp ); + tmp = value - u[ cell.getIndex() ]; + if ( fabs( tmp ) > 0.001*hx ){ + return true; + }else{ + return false; + } } else { @@ -1146,6 +1159,12 @@ updateCell( MeshFunctionType& u, hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); u[ cell.getIndex() ] = argAbsMin( value, tmp ); + tmp = value - u[ cell.getIndex() ]; + if ( fabs( tmp ) > 0.001*hx ){ + return true; + }else{ + return false; + } } } } @@ -1391,7 +1410,7 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, - std::numeric_limits< Real >::max(); interfaceMap[ cind ] = false; - if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] && i>vLower[0] && j> vLower[0] ) + if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] && i>vLower[0] -1 && j> vLower[0]-1 ) { const Real& hx = mesh.getSpaceSteps().x(); const Real& hy = mesh.getSpaceSteps().y(); @@ -1446,7 +1465,8 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ) + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, + Containers::StaticVector< 3, Index > vLower, Containers::StaticVector< 3, Index > vUpper ) { int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; @@ -1468,76 +1488,78 @@ __global__ void 
CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3 interfaceMap[ cind ] = false; cell.refresh(); - const Real& hx = mesh.getSpaceSteps().x(); - const Real& hy = mesh.getSpaceSteps().y(); - const Real& hz = mesh.getSpaceSteps().z(); - const Real& c = input( cell ); - if( ! cell.isBoundaryEntity() ) + if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] && + k < mesh.getDimensions().y() - vUpper[2] && i>vLower[0]-1 && j> vLower[1]-1 && k>vLower[2]-1 ) { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const Index e = neighbors.template getEntityIndex< 1, 0, 0 >(); - const Index w = neighbors.template getEntityIndex< -1, 0, 0 >(); - const Index n = neighbors.template getEntityIndex< 0, 1, 0 >(); - const Index s = neighbors.template getEntityIndex< 0, -1, 0 >(); - const Index t = neighbors.template getEntityIndex< 0, 0, 1 >(); - const Index b = neighbors.template getEntityIndex< 0, 0, -1 >(); - - if( c * input[ n ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ w ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ s ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ b ] <= 0 ) - { - pom = TNL::sign( c )*( hz * c )/( c - input[ b ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ t ] <= 0 ) + 
const Real& hx = mesh.getSpaceSteps().x(); + const Real& hy = mesh.getSpaceSteps().y(); + const Real& hz = mesh.getSpaceSteps().z(); + const Real& c = input( cell ); + if( ! cell.isBoundaryEntity() ) { - pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const Index e = neighbors.template getEntityIndex< 1, 0, 0 >(); + const Index w = neighbors.template getEntityIndex< -1, 0, 0 >(); + const Index n = neighbors.template getEntityIndex< 0, 1, 0 >(); + const Index s = neighbors.template getEntityIndex< 0, -1, 0 >(); + const Index t = neighbors.template getEntityIndex< 0, 0, 1 >(); + const Index b = neighbors.template getEntityIndex< 0, 0, -1 >(); - interfaceMap[ cind ] = true; + if( c * input[ n ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ w ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ s ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ b ] <= 0 ) + { + pom = TNL::sign( c )*( hz * c )/( c - input[ b ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ t ] <= 0 ) + { + pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ 
cind ] = true; + } } } } } - - template< typename Real, typename Device, typename Index > diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h index 56fa9496f..105a068d3 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h @@ -123,8 +123,7 @@ setInitialCondition( const Config::ParameterContainer& parameters, { this->bindDofs( dofs ); String inputFile = parameters.getParameter< String >( "input-file" ); - this->initialData->setMesh( this->getMesh() ); - std::cout<<"setInitialCondition" <initialData->setMesh( this->getMesh() ); if( CommunicatorType::isDistributed() ) { std::cout<<"Nodes Distribution: " << initialData->getMesh().getDistributedMesh()->printProcessDistr() << std::endl; @@ -191,20 +190,9 @@ bool tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >:: solve( DofVectorPointer& dofs ) { - std::cout << "We are in solve()." 
<< std::endl; FastSweepingMethod< MeshType, Communicator,AnisotropyType > fsm; fsm.solve( this->getMesh(), u, anisotropy, initialData ); - /*int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup ); - const MeshPointer msh = this->getMesh(); - if( i == 0 && msh->getMeshDimension() == 2 ) - { - for( int k = 0; k < 9; k++ ){ - for( int l = 0; l < msh->getDimensions().x(); l++ ) - printf("%.2f\t",(*initialData)[ k * msh->getDimensions().x() + l ] ); - printf("\n"); - } - }*/ makeSnapshot(); return true; } diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h index 8e1e6a72b..a57ef1491 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h @@ -14,6 +14,7 @@ #include #include #include "tnlDirectEikonalMethodsBase.h" +#define ForDebug false // false <=> off template< typename Mesh, @@ -132,8 +133,11 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, typedef Index IndexType; typedef Anisotropy AnisotropyType; typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > BaseType; + typedef Communicator CommunicatorType; + using MeshPointer = Pointers::SharedPointer< MeshType >; using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >; + using MPI = Communicators::MpiCommunicator; using typename BaseType::InterfaceMapType; using typename BaseType::MeshFunctionType; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h index 662a5b79c..f2f033ccb 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h +++ 
b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h @@ -109,7 +109,7 @@ solve( const MeshPointer& mesh, dim3 blockSize( cudaBlockSize ); dim3 gridSize( numBlocksX ); - tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > ptr; + BaseType ptr; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index 586d37ba5..5eac5232b 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -19,7 +19,6 @@ #include #include "tnlDirectEikonalProblem.h" -#define ForDebug false // false <=> off @@ -79,8 +78,27 @@ solve( const MeshPointer& mesh, InterfaceMapPointer interfaceMapPtr; auxPtr->setMesh( mesh ); interfaceMapPtr->setMesh( mesh ); + + //Distributed mesh for MPI overlaps (without MPI null pointer) + Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh(); + + int i = MPI::GetRank( MPI::AllGroup ); // number that identifies rank + + // getting overlaps ( WITHOUT MPI SHOULD BE 0 ) + Containers::StaticVector< 2, IndexType > vLower; + vLower[0] = 0; vLower[1] = 0; + Containers::StaticVector< 2, IndexType > vUpper; + vUpper[0] = 0; vUpper[1] = 0; +#ifdef HAVE_MPI + if( CommunicatorType::isDistributed() ) //If we started solver with MPI + { + vLower = meshPom->getLowerOverlap(); + vUpper = meshPom->getUpperOverlap(); + } +#endif + std::cout << "Initiating the interface cells ..." 
<< std::endl; - BaseType::initInterface( u, auxPtr, interfaceMapPtr ); + BaseType::initInterface( u, auxPtr, interfaceMapPtr, vLower, vUpper ); auxPtr->save( "aux-ini.tnl" ); @@ -124,29 +142,20 @@ solve( const MeshPointer& mesh, #endif while( iteration < this->maxIterations ) - { - Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh(); - - - int i = MPI::GetRank( MPI::AllGroup ); // number that identifies rank - - // getting overlaps ( WITHOUT MPI SHOULD BE 0 ) - Containers::StaticVector< 2, IndexType > vLower = meshPom->getLowerOverlap(); - Containers::StaticVector< 2, IndexType > vUpper = meshPom->getUpperOverlap(); - + { #if ForDebug int WhileCount = 0; // number of passages of while cycle with condition calculated printf( "%d: meshDimensions are (x,y) = (%d,%d).\n",i, mesh->getDimensions().x(), mesh->getDimensions().y() ); printf( "%d: owerlaps are ([x1,x2],[y1,y2]) = ([%d,%d],[%d,%d]).\n",i, vLower[0], vUpper[0], vLower[1], vUpper[1] ); - if( std::is_same< DeviceType, Devices::Host >::value && i == 0 ) + /*if( std::is_same< DeviceType, Devices::Host >::value && i == 0 ) { for( int j = mesh->getDimensions().y()-1; j>-1; j-- ){ - for( int i = 0; i < mesh->getDimensions().x(); i++ ) - std::cout << aux[ j * mesh->getDimensions().x() + i ] << " "; + for( int m = 0; m < mesh->getDimensions().x(); m++ ) + std::cout << aux[ j * mesh->getDimensions().x() + m ] << " "; std::cout << std::endl; } std::cout << std::endl; - } + }*/ // TO SEE CUDA OVERLAPS /*const int cudaBlockSize( 16 ); @@ -314,7 +323,9 @@ solve( const MeshPointer& mesh, { cell.refresh(); if( ! interfaceMap( cell ) ) + { calculated = this->updateCell( aux, cell ) || calculated; + } } } @@ -379,6 +390,7 @@ solve( const MeshPointer& mesh, #ifdef HAVE_CUDA TNL_CHECK_CUDA_DEVICE; // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel. 
+ // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2) const int cudaBlockSize( 16 ); // Setting number of threads and blocks for kernel @@ -442,27 +454,24 @@ solve( const MeshPointer& mesh, BlockIterPom1.setSize( numBlocksX * numBlocksY ); BlockIterPom1.setValue( 0 ); #endif - TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1; - BlockIterPom1.setSize( numBlocksX * numBlocksY ); - BlockIterPom1.setValue( 0 ); int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); - int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); - - TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; - dBlock.setSize( nBlocks ); - TNL::Containers::Array< int, Devices::Host, IndexType > dBlock1; - dBlock1.setSize( nBlocks ); - TNL_CHECK_CUDA_DEVICE; + // for CudaPrallelReduc (replaced with .containsValue(1)) + //int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 
1:0); + //TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; + //dBlock.setSize( nBlocks ); + //TNL::Containers::Array< int, Devices::Host, IndexType > dBlock1; + //dBlock1.setSize( nBlocks ); + //TNL_CHECK_CUDA_DEVICE; // Helping meshFunction that switches with AuxPtr in every calculation of CudaUpdateCellCaller<<<>>>() MeshFunctionPointer helpFunc( mesh ); - MeshFunctionPointer helpFunc1( mesh ); + //MeshFunctionPointer helpFunc1( mesh ); // Setting number of threads and blocks in grid for DeepCopy of meshFunction int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps ); - + DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(), 1, i ); @@ -486,7 +495,7 @@ solve( const MeshPointer& mesh, while( BlockIterD ) { //numberWhile++; - /** HERE IS CHESS METHOD **/ + /** HERE IS CHESS METHOD (NO MPI) **/ /* CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, @@ -523,7 +532,7 @@ solve( const MeshPointer& mesh, /**------------------------------------------------------------------------------------------------*/ - /** HERE IS FIM **/ + /** HERE IS FIM FOR MPI AND WITHOUT MPI **/ Devices::Cuda::synchronizeDevice(); CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), @@ -564,7 +573,8 @@ solve( const MeshPointer& mesh, } #endif - // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now. + // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now. 
+ Devices::Cuda::synchronizeDevice(); GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; @@ -584,31 +594,27 @@ solve( const MeshPointer& mesh, std::cout << std::endl; } #endif - // Parallel reduction to see if we should calculate again BlockIterD + // "Parallel reduction" to see if we should calculate again BlockIterD + BlockIterD = BlockIterDevice.containsValue(1); + /*Devices::Cuda::synchronizeDevice(); CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; // Parallel reduction on dBlock because of too large number of blocks (more than maximum number of threads) + Devices::Cuda::synchronizeDevice(); CudaParallelReduc<<< 1, 1024 >>>( dBlock, dBlock, nBlocks ); cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; -#if ForDebug - if( i == 0 ){ - dBlock1 = dBlock; - printf("nBlocks = %d\n",nBlocks); - for( int m =0; m< nBlocks; m++ ){ - std::cout << dBlock1[m] << " "; - } - std::cout << std::endl; - } -#endif + TNL_CHECK_CUDA_DEVICE;*/ + // Copy of the first element which is result of parallel reduction + /*Devices::Cuda::synchronizeDevice(); BlockIterD = dBlock.getElement( 0 ); cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + TNL_CHECK_CUDA_DEVICE;*/ // When we change something then we should caclucate again in the next passage of MPI ( calculated = true ) + if( BlockIterD ){ calculated = 1; @@ -663,22 +669,23 @@ solve( const MeshPointer& mesh, #if ForDebug printf( "%d: Receved Calculated = %d.\n%d: Calculate = %d\n", i, calculated, i, calculate); #endif - aux.template synchronize< Communicator >(); #if ForDebug if( i == 1 ) printf("WhileCount = %d\n",WhileCount); //calculated = 0; // DEBUG; -#endif - } #endif } - String s( "aux-" + std::to_string( i ) + ".tnl" ); - aux.save( s ); - Aux=auxPtr; - - iteration++; +#endif + if( !CommunicatorType::isDistributed() ) // If we start the 
solver without MPI, we need calculated 0! + calculated = 0; + } + iteration++; } + //String s( "aux-" + std::to_string( i ) + ".tnl" ); + //aux.save( s ); + Aux=auxPtr; // copy it for MakeSnapshot + aux.save("aux-final.tnl"); } @@ -708,6 +715,16 @@ __global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, } printf( "\n"); } + if( i==0 && j == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) + { + for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){ + for( int l = 0; l < 17; l++ ){ + printf( "%.2f ", helpFunc[ m * mesh.getDimensions().x() + l ]); + } + printf( "\n"); + } + printf( "\n"); + } } } @@ -733,7 +750,10 @@ __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, I pom = 1;//BlockIterPom[ i ] = 1; } - BlockIterPom[ i ] = pom;//BlockIterPom[ i ]; + if( BlockIterDevice[ i ] != 1 ) + BlockIterPom[ i ] = pom;//BlockIterPom[ i ]; + else + BlockIterPom[ i ] = 1; } } @@ -866,8 +886,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< xkolik = blockDim.x + 1; ykolik = blockDim.y + 1; - numOfBlocky = (dimY-vUpper[1]-vLower[1])/blockDim.y + (((dimY-vUpper[1]-vLower[1])%blockDim.y != 0) ? 1:0); - numOfBlockx = (dimX-vUpper[0]-vLower[0])/blockDim.x + (((dimX-vUpper[0]-vLower[0])%blockDim.x != 0) ? 1:0); + numOfBlocky = gridDim.y;//(dimY-vUpper[1]-vLower[1])/blockDim.y + (((dimY-vUpper[1]-vLower[1])%blockDim.y != 0) ? 1:0); + numOfBlockx = gridDim.x;//(dimX-vUpper[0]-vLower[0])/blockDim.x + (((dimX-vUpper[0]-vLower[0])%blockDim.x != 0) ? 
1:0); if( numOfBlockx - 1 == blockIdx.x ) xkolik = (dimX-vUpper[0]-vLower[0]) - (blockIdx.x)*blockDim.x+1; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h index 9c5471beb..40a1efeba 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h @@ -15,6 +15,7 @@ #include "tnlFastSweepingMethod.h" + template< typename Real, typename Device, typename Index, @@ -67,8 +68,25 @@ solve( const MeshPointer& mesh, InterfaceMapPointer interfaceMapPtr; auxPtr->setMesh( mesh ); interfaceMapPtr->setMesh( mesh ); + + //Distributed mesh for overlaps (without MPI is null pointer) + Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh(); + + // getting overlaps ( WITHOUT MPI SHOULD BE 0 ) + Containers::StaticVector< 3, IndexType > vLower; + vLower[0] = 0; vLower[1] = 0; vLower[2] = 0; + Containers::StaticVector< 3, IndexType > vUpper; + vUpper[0] = 0; vUpper[1] = 0; vUpper[2] = 0; +#ifdef HAVE_MPI + if( CommunicatorType::isDistributed() ) + { + vLower = meshPom->getLowerOverlap(); + vUpper = meshPom->getUpperOverlap(); + } +#endif + std::cout << "Initiating the interface cells ..." 
<< std::endl; - BaseType::initInterface( u, auxPtr, interfaceMapPtr ); + BaseType::initInterface( u, auxPtr, interfaceMapPtr, vLower, vUpper ); auxPtr->save( "aux-ini.tnl" ); typename MeshType::Cell cell( *mesh ); @@ -76,313 +94,525 @@ solve( const MeshPointer& mesh, IndexType iteration( 0 ); MeshFunctionType aux = *auxPtr; InterfaceMapType interfaceMap = * interfaceMapPtr; + aux.template synchronize< Communicator >(); //synchronization of intial conditions + int i = MPI::GetRank( MPI::AllGroup ); //getting identification of MPI thread +#if ForDebug + if( i == 2 ){ + aux.save("aux-init2.tnl"); + mesh->save("mesh-2.tnl"); + } + if( i == 1 ){ + aux.save("aux-init1.tnl"); + mesh->save("mesh-1.tnl"); + } + if( i == 3 ){ + aux.save("aux-init3.tnl"); + mesh->save("mesh-3.tnl"); + } + if( i == 0 ){ + aux.save("aux-init0.tnl"); + mesh->save("mesh-0.tnl"); + } +#endif + while( iteration < this->maxIterations ) - { - if( std::is_same< DeviceType, Devices::Host >::value ) + { +#if ForDebug + int WhileCount = 0; // number of passages of while cycle with condition calculated + printf( "%d: meshDimensions are (x,y,z) = (%d,%d,%d).\n",i, mesh->getDimensions().x(), mesh->getDimensions().y(), mesh->getDimensions().z() ); + printf( "%d: owerlaps are ([x1,x2],[y1,y2],[z1,z2]) = ([%d,%d],[%d,%d],[%d,%d]).\n",i, vLower[0], vUpper[0], vLower[1], vUpper[1], vUpper[2], vLower[2] ); + /*if( std::is_same< DeviceType, Devices::Host >::value && i == 2 ) { - int numThreadsPerBlock = 64; - - - int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); - int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0); - int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 
1:0); - //std::cout << "numBlocksX = " << numBlocksX << std::endl; - - /*Real **sArray = new Real*[numBlocksX*numBlocksY]; - for( int i = 0; i < numBlocksX * numBlocksY; i++ ) - sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];*/ - - ArrayContainer BlockIterHost; - BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ ); - BlockIterHost.setValue( 1 ); - int IsCalculationDone = 1; - - MeshFunctionPointer helpFunc( mesh ); - MeshFunctionPointer helpFunc1( mesh ); - helpFunc1 = auxPtr; - auxPtr = helpFunc; - helpFunc = helpFunc1; - //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl; - /*for( int k = numBlocksX-1; k >-1; k-- ){ - for( int l = 0; l < numBlocksY; l++ ){ - std::cout<< BlockIterHost[ l*numBlocksX + k ]; - } - std::cout<template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); + for( int j = mesh->getDimensions().y()-1; j>-1; j-- ){ + for( int m = 0; m < mesh->getDimensions().x(); m++ ) + printf( "%.2f " , aux[ j*mesh->getDimensions().x() + m ]); + printf("\n"); + } + printf("\n"); + }*/ +#endif + + int calculated = 1; // indicates weather we calculated in the last passage of the while cycle + // calculated is same for all ranks + // without MPI should be FALSE at the end of while cycle body + int calculate = 1; // indicates if the thread should calculate again in upcoming passage of cycle + // calculate is a value that can differ in every rank + // without MPI should be FALSE at the end of while cycle body + + while( calculated ) + { + calculated = 0; +#if ForDebug + WhileCount++; +#endif + if( std::is_same< DeviceType, Devices::Host >::value && calculate ) // should we calculate in Host? + { + calculate = 0; + +/** HERE IS FSM FOR OPENMP (NO MPI) - isnt worthy */ + /*int numThreadsPerBlock = 64; + + + int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 
1:0); + int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0); + int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 1:0); + //std::cout << "numBlocksX = " << numBlocksX << std::endl; + + //Real **sArray = new Real*[numBlocksX*numBlocksY]; + // for( int i = 0; i < numBlocksX * numBlocksY; i++ ) + // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)]; + + ArrayContainer BlockIterHost; + BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ ); + BlockIterHost.setValue( 1 ); + int IsCalculationDone = 1; + + MeshFunctionPointer helpFunc( mesh ); + MeshFunctionPointer helpFunc1( mesh ); + helpFunc1 = auxPtr; + auxPtr = helpFunc; + helpFunc = helpFunc1; + //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl; + //for( int k = numBlocksX-1; k >-1; k-- ){ + //for( int l = 0; l < numBlocksY; l++ ){ + // std::cout<< BlockIterHost[ l*numBlocksX + k ]; + // } + // std::cout<template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + + //Reduction + for( int i = 0; i < BlockIterHost.getSize(); i++ ){ + if( IsCalculationDone == 0 ){ + IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; + //break; + } + } + numWhile++; + + + this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ ); + + //string s( "aux-"+ std::to_string(numWhile) + ".tnl"); + //aux.save( s ); + } + if( numWhile == 1 ){ + auxPtr = helpFunc; + } + aux = *auxPtr;*/ +/**------------------------------------------------------------------------------*/ + + +/** HERE IS FSM WITH MPI AND WITHOUT MPI */ + +#if ForDebug + if( i == 1 ){ + aux.save("aux-final10.tnl"); + } +#endif + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = 0 + vLower[1]; 
+ cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( ! interfaceMap( cell ) ) + { + //getting information weather we calculated in this passage + calculated = this->updateCell( aux, cell ) || calculated; + } + } + } + } +#if ForDebug + if( i == 1 ){ + aux.save("aux-final11.tnl"); + } + int pocNull = 0; + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( fabs( aux(cell) ) < 0.002 ) + pocNull++; + } + } + } + printf("%d: 1. pocNull = %d\n", i , pocNull); +#endif + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0]; + cell.getCoordinates().x() >= 0 + vLower[0]; + cell.getCoordinates().x()-- ) + { + //std::cerr << "2 -> "; + cell.refresh(); + if( ! 
interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } +#if ForDebug + if( i == 1 ){ + aux.save("aux-final12.tnl"); + } + pocNull = 0; + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( fabs( aux(cell) ) < 0.002 ) + pocNull++; + } + } + } + printf("%d: 2. pocNull = %d\n", i , pocNull); +#endif + //aux.save( "aux-2.tnl" ); + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1]; + cell.getCoordinates().y() >= 0 + vLower[1]; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + //std::cerr << "3 -> "; + cell.refresh(); + if( ! 
interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } +#if ForDebug + if( i == 1 ){ + aux.save("aux-final13.tnl"); + } + pocNull = 0; + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( fabs( aux(cell) ) < 0.002 ) + pocNull++; + } + } + } + printf("%d: 3. pocNull = %d\n", i , pocNull); +#endif + //aux.save( "aux-3.tnl" ); - //Reduction - for( int i = 0; i < BlockIterHost.getSize(); i++ ){ - if( IsCalculationDone == 0 ){ - IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; - //break; + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1]; + cell.getCoordinates().y() >= 0 + vLower[1]; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0]; + cell.getCoordinates().x() >= 0 + vLower[0]; + cell.getCoordinates().x()-- ) + { + //std::cerr << "4 -> "; + cell.refresh(); + if( ! 
interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } +#if ForDebug + if( i == 1 ){ + aux.save("aux-final14.tnl"); + } + pocNull = 0; + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( fabs( aux(cell) ) < 0.002 ) + pocNull++; + } } } - numWhile++; - std::cout <<"numWhile = "<< numWhile <-1; j-- ){ - for( int i = 0; i < numBlocksX; i++ ){ - //std::cout << (*auxPtr)[ k * numBlocksX * numBlocksY + j * numBlocksX + i ] << " "; - std::cout << BlockIterHost[ k * numBlocksX * numBlocksY + j * numBlocksX + i ]; + printf("%d: 4. pocNull = %d\n", i , pocNull); +#endif + //aux.save( "aux-4.tnl" ); + + for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2]; + cell.getCoordinates().z() >= 0 + vLower[2]; + cell.getCoordinates().z()-- ) + { + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + //std::cerr << "5 -> "; + cell.refresh(); + if( ! 
interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } +#if ForDebug + if( i == 1 ){ + aux.save("aux-final15.tnl"); + } + pocNull = 0; + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( fabs( aux(cell) ) < 0.002 ) + pocNull++; } - std::cout << std::endl; } - std::cout << std::endl; } - std::cout << std::endl;*/ + printf("%d: 5. pocNull = %d\n", i , pocNull); + #endif + //aux.save( "aux-5.tnl" ); - this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ ); + for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2]; + cell.getCoordinates().z() >= 0 + vLower[2]; + cell.getCoordinates().z()-- ) + { + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0]; + cell.getCoordinates().x() >= 0 + vLower[0]; + cell.getCoordinates().x()-- ) + { + //std::cerr << "6 -> "; + cell.refresh(); + if( ! 
interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } +#if ForDebug + if( i == 1 ){ + aux.save("aux-final16.tnl"); + } + pocNull = 0; + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( fabs( aux(cell) ) < 0.002 ) + pocNull++; + } + } + } + printf("%d: 6. pocNull = %d\n", i , pocNull); +#endif + //aux.save( "aux-6.tnl" ); - /*for( int k = 0; k < numBlocksZ; k++ ){ - for( int j = numBlocksY-1; j>-1; j-- ){ - for( int i = 0; i < numBlocksX; i++ ){ - //std::cout << (*auxPtr)[ k * numBlocksX * numBlocksY + j * numBlocksX + i ] << " "; - std::cout << BlockIterHost[ k * numBlocksX * numBlocksY + j * numBlocksX + i ]; + for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2]; + cell.getCoordinates().z() >= 0 + vLower[2]; + cell.getCoordinates().z()-- ) + { + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1]; + cell.getCoordinates().y() >= 0 + vLower[1]; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + //std::cerr << "7 -> "; + cell.refresh(); + if( ! 
interfaceMap( cell ) ) + this->updateCell( aux, cell ); } - std::cout << std::endl; } - std::cout << std::endl; - }*/ + } - /*for( int j = numBlocksY-1; j>-1; j-- ){ - for( int i = 0; i < numBlocksX; i++ ) - std::cout << "BlockIterHost = "<< j*numBlocksX + i<< " ," << BlockIterHost[ j * numBlocksX + i ]; - std::cout << std::endl; - } - std::cout << std::endl;*/ +#if ForDebug + if( i == 1 ){ + aux.save("aux-final17.tnl"); + } + pocNull = 0; + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( fabs( aux(cell) ) < 0.002 ) + pocNull++; + } + } + } + printf("%d: 7. pocNull = %d\n", i , pocNull); +#endif + //aux.save( "aux-7.tnl" ); - //std::cout<getDimensions().z() - 1 - vUpper[2]; + cell.getCoordinates().z() >= 0 + vLower[2]; + cell.getCoordinates().z()-- ) + { + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1]; + cell.getCoordinates().y() >= 0 + vLower[1]; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0]; + cell.getCoordinates().x() >= 0 + vLower[0]; + cell.getCoordinates().x()-- ) + { + //std::cerr << "8 -> "; + cell.refresh(); + if( ! 
interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } +#if ForDebug + if( i == 1 ){ + aux.save("aux-final18.tnl"); + } + pocNull = 0; + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( fabs( aux(cell) ) < 0.002 ) + pocNull++; + } + } + } + printf("%d: 8. pocNull = %d\n", i , pocNull); + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; + cell.getCoordinates().x()++ ) + { + cell.refresh(); + printf("%.2f ", aux(cell)); + } + printf("\n"); + } + printf("\n"); + } +#endif + + /**----------------------------------------------------------------------------------*/ } - aux = *auxPtr; - - /*for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh->getDimensions().z(); - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( ! 
interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-1.tnl" ); - - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh->getDimensions().z(); - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "2 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-2.tnl" ); - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh->getDimensions().z(); - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0 ; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - //std::cerr << "3 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-3.tnl" ); - - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh->getDimensions().z(); - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "4 -> "; - cell.refresh(); - if( ! 
interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-4.tnl" ); - - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; - cell.getCoordinates().z() >= 0; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - //std::cerr << "5 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-5.tnl" ); - - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; - cell.getCoordinates().z() >= 0; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "6 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-6.tnl" ); - - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; - cell.getCoordinates().z() >= 0; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0 ; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - //std::cerr << "7 -> "; - cell.refresh(); - if( ! 
interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-7.tnl" ); - - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; - cell.getCoordinates().z() >= 0; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "8 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - }*/ - } - if( std::is_same< DeviceType, Devices::Cuda >::value ) - { - // TODO: CUDA code -#ifdef HAVE_CUDA - const int cudaBlockSize( 8 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); - int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); - if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) - std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" 
<< std::endl; - dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); - dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); - - tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; - - - int BlockIterD = 1; - - TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; - BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ ); - BlockIterDevice.setValue( 1 ); - TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; - BlockIterPom.setSize( numBlocksX * numBlocksY * numBlocksZ ); - BlockIterPom.setValue( 0 ); - /*int *BlockIterDevice; - cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/ - int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0); - - TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; - dBlock.setSize( nBlocks ); - dBlock.setValue( 0 ); - - int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0); - /*int *dBlock; - cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ - MeshFunctionPointer helpFunc1( mesh ); - MeshFunctionPointer helpFunc( mesh ); - - helpFunc1 = auxPtr; - auxPtr = helpFunc; - helpFunc = helpFunc1; - int numIter = 0; - - while( BlockIterD ) + if( std::is_same< DeviceType, Devices::Cuda >::value && calculate ) { - helpFunc1 = auxPtr; - auxPtr = helpFunc; - helpFunc = helpFunc1; - TNL_CHECK_CUDA_DEVICE; +#ifdef HAVE_CUDA + // cudaBlockSize is a size of blocks. It's the number raised to the 3 power. 
+ // the number should be less than 10^3 (the number of threads in one block is at most 1024) + // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2) + const int cudaBlockSize( 8 ); CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), @@ -404,32 +634,192 @@ solve( const MeshPointer& mesh, CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); - numIter++; - /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) - BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ - + aux = *auxPtr; + interfaceMap = *interfaceMapPtr; +#endif } - if( numIter == 1 ){ - auxPtr = helpFunc; + +#ifdef HAVE_MPI + if( CommunicatorType::isDistributed() ){ + + const int *neigh = meshPom->getNeighbors(); // Getting neighbours of distributed mesh + MPI::Request *req; + req = new MPI::Request[meshPom->getNeighborsCount()]; + + int neighCount = 0; // we know the number only at runtime and it can differ for every MPI thread + // Getting information whether some of six neighbours (top, bottom, right, left, ahead, behind) calculated + int calculpom[6] = {0,0,0,0,0,0}; + + + if( neigh[0] != -1 ) // if you have west neighbour + { + // if we have this neighbour, we send calculated, one number, to him, ... 
+ req[neighCount] = MPI::ISend( &calculated, 1, neigh[0], 0, MPI::AllGroup ); + neighCount++; + // and we recive the same information from him + req[neighCount] = MPI::IRecv( &calculpom[0], 1, neigh[0], 0, MPI::AllGroup ); + neighCount++; + } + + if( neigh[1] != -1 ) // east + { + req[neighCount] = MPI::ISend( &calculated, 1, neigh[1], 0, MPI::AllGroup ); + neighCount++; + + + req[neighCount] = MPI::IRecv( &calculpom[1], 1, neigh[1], 0, MPI::AllGroup ); + neighCount++; + } + + if( neigh[2] != -1 ) // north + { + req[neighCount] = MPI::ISend( &calculated, 1, neigh[2], 0, MPI::AllGroup ); + neighCount++; + + req[neighCount] = MPI::IRecv( &calculpom[2], 1, neigh[2], 0, MPI::AllGroup ); + neighCount++; + } + + if( neigh[5] != -1 ) //south + { + req[neighCount] = MPI::ISend( &calculated, 1, neigh[5], 0, MPI::AllGroup ); + neighCount++; + + req[neighCount] = MPI::IRecv( &calculpom[3], 1, neigh[5], 0, MPI::AllGroup ); + neighCount++; + } + + if( neigh[8] != -1 ) // top + { + req[neighCount] = MPI::ISend( &calculated, 1, neigh[8], 0, MPI::AllGroup ); + neighCount++; + + req[neighCount] = MPI::IRecv( &calculpom[4], 1, neigh[8], 0, MPI::AllGroup ); + neighCount++; + } + + if( neigh[17] != -1 ) //bottom + { + req[neighCount] = MPI::ISend( &calculated, 1, neigh[17], 0, MPI::AllGroup ); + neighCount++; + + req[neighCount] = MPI::IRecv( &calculpom[5], 1, neigh[17], 0, MPI::AllGroup ); + neighCount++; + } + + MPI::WaitAll(req,neighCount); //waiting for all to have all the information +#if ForDebug + printf( "%d: Sending Calculated = %d.\n", i, calculated ); + printf( "%d: calculpom[0] = %d, calculpom[1] = %d, calculpom[2] = %d, calculpom[3] = %d, calculpom[4] = %d," + "calculpom[5] = %d", i ,calculpom[0],calculpom[1],calculpom[2],calculpom[3],calculpom[4],calculpom[5]); +#endif + // if one of the MPI thread had calculated = 1, then all get 1. 
Otherwise all get 0 + MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR, MPI::AllGroup ); + // synchronize the overlaps + aux.template synchronize< Communicator >(); + // if any of my neighbours had calculated = 1, then I should calculate again (but all of us have to go through while(calculated)) + calculate = calculpom[0] || calculpom[1] || calculpom[2] || + calculpom[3] || calculpom[4] || calculpom[5]; +#if ForDebug + printf( "%d: Receved Calculated = %d.\n%d: Calculate = %d\n", i, calculated, i, calculate); +#endif + +#if ForDebug + if( i == 1 ) + printf("WhileCount = %d\n",WhileCount); + if( i == 2 ){ + aux.save("aux-final2.tnl"); + mesh->save("mesh-2.tnl"); + } + if( i == 1 ){ + aux.save("aux-final1.tnl"); + mesh->save("mesh-1.tnl"); + } + if( i == 3 ){ + aux.save("aux-final3.tnl"); + mesh->save("mesh-3.tnl"); + } + if( i == 0 ){ + aux.save("aux-final0.tnl"); + mesh->save("mesh-0.tnl"); + } + //calculated = 0; // DEBUG; +#endif } - //cudaFree( BlockIterDevice ); - //cudaFree( dBlock ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - aux = *auxPtr; - interfaceMap = *interfaceMapPtr; #endif + if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculated 0! + calculated = 0; //otherwise we would go through the FSM code and CUDA FSM code again uselessly } - //aux.save( "aux-8.tnl" ); iteration++; } + // Saving the results into Aux for MakeSnapshot function. + Aux = auxPtr; aux.save("aux-final.tnl"); } #ifdef HAVE_CUDA +// DeepCopy, or laboriously copy the edges (depending on vLower, vUpper) from sArray to helpFunc. 
+template< typename Real, typename Device, typename Index > +__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, int copy, int k ) +{ + int thri = threadIdx.x + blockDim.x*blockIdx.x; + int thrj = blockDim.y*blockIdx.y + threadIdx.y; + int thrk = blockDim.z*blockIdx.z + threadIdx.z; + + const Meshes::Grid< 3, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); + if( copy ){ + if( thri < mesh.getDimensions().x() && thrj < mesh.getDimensions().y() && thrk < mesh.getDimensions().z() ) + { + helpFunc[ thrk * mesh.getDimensions().x() * mesh.getDimensions().y() + thrj * mesh.getDimensions().x() + thri ] = + aux[ thrk * mesh.getDimensions().x() * mesh.getDimensions().y() + thrj * mesh.getDimensions().x() + thri ]; + } + } + else // for debug, values can be printed only from cuda kernel + { + if( thrk == 0 && thri==0 && thrj == 0 && blockIdx.z == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 0 ) + { + printf("%d: DimX = %d, DimY = %d, DimZ = %d\n", k,mesh.getDimensions().x(),mesh.getDimensions().y(),mesh.getDimensions().z() ); + for( int z = mesh.getDimensions().z()-1; z > mesh.getDimensions().z()-2; z-- ) + { + for( int y = 0; y < mesh.getDimensions().y(); y++ ) + { + for( int x = 0; x < mesh.getDimensions().x(); x++ ) + { + printf("%.2f ", helpFunc[ z *mesh.getDimensions().y()*mesh.getDimensions().x() + y*mesh.getDimensions().x() + x ]); + } + printf("\n"); + } + printf("\n"); + } + printf("\n"); + } + if( thrk == 0 && thri==0 && thrj == 0 && blockIdx.z == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 1 ) + { + printf("%d: DimX = %d, DimY = %d, DimZ = %d\n", k,mesh.getDimensions().x(),mesh.getDimensions().y(),mesh.getDimensions().z() ); + + if( k == 1 ) + { + for( int z = 1; z < 2; z++ ) + { + for( int y = 0; y < mesh.getDimensions().y(); y++ ) + { + for( int x = 0; x < mesh.getDimensions().x(); x++ ) + { + printf("%.2f 
", aux[ z *mesh.getDimensions().y()*mesh.getDimensions().x() + y*mesh.getDimensions().x() + x ]); + } + printf("\n"); + } + printf("\n"); + } + printf("\n"); + } + } + } +} + template < typename Index > __global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, @@ -444,21 +834,24 @@ __global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, l = i/( numBlockX * numBlockY ); k = (i-l*numBlockX * numBlockY )/(numBlockX ); m = (i-l*numBlockX * numBlockY )%( numBlockX ); - if( m > 0 && BlockIterDevice[ i - 1 ] ){ + if( m > 0 && BlockIterDevice[ i - 1 ] ){ // left neighbour pom = 1;//BlockIterPom[ i ] = 1; - }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ + }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ // right neighbour pom = 1;//BlockIterPom[ i ] = 1; - }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ + }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ // bottom neighbour pom = 1;// BlockIterPom[ i ] = 1; - }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ + }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ // top neighbour pom = 1;//BlockIterPom[ i ] = 1; - }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){ + }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){ // neighbour behind pom = 1; - }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){ + }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){ // neighbour in front pom = 1; } - BlockIterPom[ i ] = pom;//BlockIterPom[ i ]; + if( !BlockIterDevice[ i ] ) // only in CudaUpdateCellCaller can BlockIterDevice gain 0 + BlockIterPom[ i ] = pom; + else + BlockIterPom[ i ] = 1; } } @@ -471,23 +864,25 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< { int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; 
int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z; - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = blockDim.y*blockIdx.y + threadIdx.y; - int k = blockDim.z*blockIdx.z + threadIdx.z; + int i = threadIdx.x + blockDim.x*blockIdx.x + vLower[0]; // WITH OVERLAPS!!! i,j,k aren't coordinates of all values + int j = blockDim.y*blockIdx.y + threadIdx.y + vLower[1]; + int k = blockDim.z*blockIdx.z + threadIdx.z + vLower[2]; int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri; + const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); - if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] ) + // should this block calculate? + if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] ) { __syncthreads(); - __shared__ volatile bool changed[ 8*8*8/*(sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)*/]; - + // Array indicates whether some threads calculated (for parallel reduction) + __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2) ]; changed[ currentIndex ] = false; + if( thrj == 0 && thri == 0 && thrk == 0 ) - changed[ 0 ] = true; + changed[ 0 ] = true; // first indicates whether we should calculate again (principle of parallel reduction) - const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); - __shared__ Real hx; __shared__ int dimX; + __shared__ Real hx; __shared__ int dimX; //getting steps and size of mesh __shared__ Real hy; __shared__ int dimY; __shared__ Real hz; __shared__ int dimZ; @@ -500,16 +895,19 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< dimX = mesh.getDimensions().x(); dimY = mesh.getDimensions().y(); dimZ = mesh.getDimensions().z(); + // we don't know if we will calculate in here, more info down in code BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + 
blockIdx.y * gridDim.x + blockIdx.x ] = 0; } - __shared__ volatile Real sArray[ 10*10*10/*sizeSArray * sizeSArray * sizeSArray*/ ]; + + // sArray contains values of one block (coppied from aux) and edges (not MPI) of those blocks + __shared__ volatile Real sArray[ sizeSArray * sizeSArray * sizeSArray ]; sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = std::numeric_limits< Real >::max(); - //filling sArray edges + // getting some usefull information int numOfBlockx; int numOfBlocky; int numOfBlockz; - int xkolik; + int xkolik; // maximum of threads in x direction (for all blocks different) int ykolik; int zkolik; xkolik = blockDim.x + 1; @@ -521,65 +919,104 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< __syncthreads(); if( numOfBlockx - 1 == blIdx ) - xkolik = dimX - (blIdx)*blockDim.x+1; + xkolik = (dimX-vUpper[0]-vLower[0]) - (blIdx)*blockDim.x+1; if( numOfBlocky -1 == blIdy ) - ykolik = dimY - (blIdy)*blockDim.y+1; + ykolik = (dimY-vUpper[1]-vLower[1]) - (blIdy)*blockDim.y+1; if( numOfBlockz-1 == blIdz ) - zkolik = dimZ - (blIdz)*blockDim.z+1; + zkolik = (dimZ-vUpper[2]-vLower[2]) - (blIdz)*blockDim.z+1; __syncthreads(); - if( thri == 0 ) + //filling sArray edges + if( thri == 0 ) //x bottom { - if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik ) - sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ]; - else + if( (blIdx != 0 || vLower[0] !=0) && thrj+1 < ykolik && thrk+1 < zkolik ) + sArray[ (thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0 ] = + aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX + + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY + vLower[0] ]; + else sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max(); } - if( thri == 1 ) + if( thri == 1 ) 
//xtop { - if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ]; - else + if( dimX - vLower[ 0 ] > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = + aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX + + blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY + vLower[0] ]; + else sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max(); } - if( thri == 2 ) + if( thri == 2 ) //y bottom { - if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ]; + if( (blIdy != 0 || vLower[1] !=0) && thrj+1 < xkolik && thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = + aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX + + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY + vLower[0] ]; else sArray[ (thrk+1) * sizeSArray * sizeSArray + 0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); } - if( thri == 3 ) + if( thri == 3 ) //y top { - if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ]; - else + if( dimY - vLower[ 1 ] > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = + aux[ (blIdz*blockDim.z + vLower[2]) * 
dimX * dimY + ((blIdy+1) * blockDim.y+vLower[1])*dimX + + blIdx*blockDim.x + thrj + thrk*dimX*dimY + vLower[0] ]; + else sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); } - if( thri == 4 ) + if( thri == 4 ) //z bottom { - if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik ) - sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ]; - else + if( (blIdz != 0 || vLower[2] !=0) && thrj+1 < ykolik && thrk+1 < xkolik ) + sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = + aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX + + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk + vLower[0] ]; + else sArray[0 * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thrk+1] = std::numeric_limits< Real >::max(); } - if( thri == 5 ) + if( thri == 5 ) //z top { - if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik ) - sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ]; - else + if( dimZ - vLower[ 2 ] > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik ) + sArray[ zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = + aux[ ((blIdz+1)*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX + + blIdx*blockDim.x + thrj * dimX + thrk + vLower[0] ]; + else sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = std::numeric_limits< Real >::max(); } - if( i < dimX && j < dimY && k < dimZ ) + // Copy all other values that aren't edges + if( i - vLower[0] < dimX && j - vLower[1] < dimY && k - vLower[2] < dimZ && + thri+1 < xkolik + vUpper[0] && thrj+1 < ykolik + vUpper[1] && thrk+1 < zkolik + vUpper[2] ) { sArray[(thrk+1) * 
sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = aux[ k*dimX*dimY + j*dimX + i ]; } __syncthreads(); +#if ForDebug + /*if( thri==0 && thrj == 0 && thrk == 0 && blockIdx.z == 0 && blockIdx.x == 2 && blockIdx.y == 2 && MPIthread == 1 ) + { + printf( "všechno před výpočtem: \n"); + for( int m = sizeSArray-1; m>-1; m-- ){ + for( int l = 0; l < sizeSArray; l++ ) + printf( "%.2f ", sArray[4*sizeSArray * sizeSArray + m * sizeSArray + l]); + printf( "\n"); + } + printf( "\n"); + } + + if(thri==0 && thrj == 0 && thrk == 0 && blockIdx.z == 0 && blockIdx.x == 2 && blockIdx.y == 2 && MPIthread == 1 ) + { + for( int m = 24; m>14; m-- ){ + for( int l = 15; l < 25; l++ ) + printf("%.2f ", aux[ 4 *mesh.getDimensions().y()*mesh.getDimensions().x() + m*mesh.getDimensions().x() + l ]); + printf( "\n"); + } + printf( "\n"); + }*/ +#endif + + //main while cycle. each value can get information only from neighbour but that information has to spread there while( changed[ 0 ] ) { __syncthreads(); @@ -587,16 +1024,17 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< changed[ currentIndex ] = false; //calculation of update cell - if( i < dimX && j < dimY && k < dimZ ) + if( i < dimX - vUpper[0] && j < dimY - vUpper[1] && k < dimZ - vUpper[2] ) { if( ! 
interfaceMap[ k*dimX*dimY + j * dimX + i ] ) { - changed[ currentIndex ] = ptr.updateCell3D< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); + // calculate new value depending on neighbours in sArray on (thri+1, thrj+1) coordinates + changed[ currentIndex ] = ptr.updateCell3D< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); } } __syncthreads(); - //pyramid reduction + //pyramid reduction (parallel reduction) if( blockDim.x*blockDim.y*blockDim.z == 1024 ) { if( currentIndex < 512 ) @@ -640,30 +1078,25 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< } __syncthreads(); - /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0) - { - //for(int m = 0; m < 8; m++){ - int m = 4; - for(int n = 0; n<8; n++){ - for(int b=0; b<8; b++) - printf(" %i ", changed[m*64 + n*8 + b]); - printf("\n"); - } - printf("\n \n"); - } - //}*/ - + // if we calculated, then the BlockIterDevice should contain the info about this whole block! (only one number for one block) if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 ) { - //printf( "Setting block calculation. Block = %d.\n",blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ); BlockIterDevice[ blIdz * gridDim.x * gridDim.y + blIdy * gridDim.x + blIdx ] = 1; } __syncthreads(); } - if( i < dimX && j < dimY && k < dimZ ) + // copy results into helpFunc (not into aux bcs of conflicts) + if( i < dimX && j < dimY && k < dimZ && thri+1 < xkolik && thrj+1 < ykolik && thrk+1 < zkolik ) helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thri+1 ]; - } + } + else // if not, then it should at least copy the values from aux to helpFunc. 
+ { + if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] + && k < mesh.getDimensions().z() - vUpper[2]) + helpFunc[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ] = + aux[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ]; + } } #endif -- GitLab From 7af752a5290d881a36bbd287235cc5da862b63e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matou=C5=A1=20Fencl?= Date: Mon, 11 Mar 2019 20:01:43 +0100 Subject: [PATCH 08/14] DeepCopy removed from CUDA --- .../tnlFastSweepingMethod2D_impl.h | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index 5eac5232b..f9bef30c3 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -465,17 +465,31 @@ solve( const MeshPointer& mesh, // Helping meshFunction that switches with AuxPtr in every calculation of CudaUpdateCellCaller<<<>>>() MeshFunctionPointer helpFunc( mesh ); + helpFunc.template modifyData() = auxPtr.template getData(); + Devices::Cuda::synchronizeDevice(); //MeshFunctionPointer helpFunc1( mesh ); // Setting number of threads and blocks in grid for DeepCopy of meshFunction - int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); + /*int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps ); - + + + Devices::Cuda::synchronizeDevice(); 
DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(), 1, i ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + Devices::Cuda::synchronizeDevice(); + DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(), + helpFunc.template modifyData< Device>(), 0, i ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE;*/ #if ForDebug + /*int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); + int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); + dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps );*/ DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(), 0, i ); #endif @@ -536,7 +550,7 @@ solve( const MeshPointer& mesh, Devices::Cuda::synchronizeDevice(); CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), - auxPtr.template modifyData< Device>(), + auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(), BlockIterDevice, vLower, vUpper, i ); cudaDeviceSynchronize(); @@ -701,7 +715,7 @@ __global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); if( copy ){ if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) - helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ]; + helpFunc[ j * mesh.getDimensions().x() + i ] = 1;//aux[ j * mesh.getDimensions().x() + i ]; } else { -- GitLab From d4412d3188dfd01f6d38d4a9095f5eaf3f847e6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matou=C5=A1=20Fencl?= Date: Sat, 16 Mar 2019 11:09:27 +0100 Subject: [PATCH 09/14] Refactoring --- .../tnlDirectEikonalMethodBase1D_impl.h | 204 +++ 
.../tnlDirectEikonalMethodBase2D_impl.h | 803 ++++++++++++ .../tnlDirectEikonalMethodBase3D_impl.h | 1091 +++++++++++++++++ .../tnlDirectEikonalMethodsBase.h | 79 +- .../hamilton-jacobi/tnlFastSweepingMethod.h | 27 +- .../tnlFastSweepingMethod2D_impl.h | 707 +++-------- .../tnlFastSweepingMethod3D_impl.h | 1032 +++------------- 7 files changed, 2493 insertions(+), 1450 deletions(-) create mode 100644 src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h create mode 100644 src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h create mode 100644 src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h new file mode 100644 index 000000000..55129c4e1 --- /dev/null +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h @@ -0,0 +1,204 @@ +/* + * File: tnlDirectEikonalMethodBase1D_impl.h + * Author: Fencl + * + * Created on March 15, 2019 + */ + +#pragma once + +template< typename Real, + typename Device, + typename Index > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: +initInterface( const MeshFunctionPointer& _input, + MeshFunctionPointer& _output, + InterfaceMapPointer& _interfaceMap ) +{ + if( std::is_same< Device, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + const MeshType& mesh = _input->getMesh(); + + const int cudaBlockSize( 16 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + dim3 blockSize( cudaBlockSize ); + dim3 gridSize( numBlocksX ); + Devices::Cuda::synchronizeDevice(); + CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), + _output.template modifyData< Device >(), + 
_interfaceMap.template modifyData< Device >() ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; +#endif + } + if( std::is_same< Device, Devices::Host >::value ) + { + const MeshType& mesh = _input->getMesh(); + typedef typename MeshType::Cell Cell; + const MeshFunctionType& input = _input.getData(); + MeshFunctionType& output = _output.modifyData(); + InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); + Cell cell( mesh ); + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + output[ cell.getIndex() ] = + input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : + -std::numeric_limits< RealType >::max(); + interfaceMap[ cell.getIndex() ] = false; + } + + + const RealType& h = mesh.getSpaceSteps().x(); + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x() - 1; + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + const RealType& c = input( cell ); + if( ! 
cell.isBoundaryEntity() ) + { + const auto& neighbors = cell.getNeighborEntities(); + Real pom = 0; + //const IndexType& c = cell.getIndex(); + const IndexType e = neighbors.template getEntityIndex< 1 >(); + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( h * c )/( c - input[ e ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + + pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; + if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) + output[ e ] = pom; + + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ e ] = true; + } + } + } + } +} + +template< typename Real, + typename Device, + typename Index > +template< typename MeshEntity > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: +updateCell( MeshFunctionType& u, + const MeshEntity& cell, + const RealType v ) +{ + const auto& neighborEntities = cell.template getNeighborEntities< 1 >(); + const MeshType& mesh = cell.getMesh(); + const RealType& h = mesh.getSpaceSteps().x(); + const RealType value = u( cell ); + RealType a, tmp = std::numeric_limits< RealType >::max(); + + if( cell.getCoordinates().x() == 0 ) + a = u[ neighborEntities.template getEntityIndex< 1 >() ]; + else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) + a = u[ neighborEntities.template getEntityIndex< -1 >() ]; + else + { + a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ], + u[ neighborEntities.template getEntityIndex< 1 >() ] ); + } + + if( fabs( a ) == std::numeric_limits< RealType >::max() ) + return; + + tmp = a + TNL::sign( value ) * h/v; + + u[ cell.getIndex() ] = argAbsMin( value, tmp ); +} + +template< typename Real, + typename Device, + typename Index > +__cuda_callable__ +bool +tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: +updateCell( volatile Real sArray[18], int thri, const Real h, const Real v ) +{ + const RealType value = sArray[ thri ]; + 
RealType a, tmp = std::numeric_limits< RealType >::max(); + + a = TNL::argAbsMin( sArray[ thri+1 ], + sArray[ thri-1 ] ); + + if( fabs( a ) == std::numeric_limits< RealType >::max() ) + return false; + + tmp = a + TNL::sign( value ) * h/v; + + + sArray[ thri ] = argAbsMin( value, tmp ); + + tmp = value - sArray[ thri ]; + if ( fabs( tmp ) > 0.001*h ) + return true; + else + return false; +} + +#ifdef HAVE_CUDA +template < typename Real, typename Device, typename Index > +__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, + Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap ) +{ + int i = threadIdx.x + blockDim.x*blockIdx.x; + const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + + if( i < mesh.getDimensions().x() ) + { + typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell; + Cell cell( mesh ); + cell.getCoordinates().x() = i; + cell.refresh(); + const Index cind = cell.getIndex(); + + + output[ cind ] = + input( cell ) >= 0 ? std::numeric_limits< Real >::max() : + - std::numeric_limits< Real >::max(); + interfaceMap[ cind ] = false; + + const Real& h = mesh.getSpaceSteps().x(); + cell.refresh(); + const Real& c = input( cell ); + if( ! 
cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const Index e = neighbors.template getEntityIndex< 1 >(); + const Index w = neighbors.template getEntityIndex< -1 >(); + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( h * c )/( c - input[ e ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ w ] <= 0 ) + { + pom = TNL::sign( c )*( h * c )/( c - input[ w ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + } + } + +} +#endif + + diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h new file mode 100644 index 000000000..583e22478 --- /dev/null +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h @@ -0,0 +1,803 @@ +/* + * File: tnlDirectEikonalMethodBase2D_impl.h + * Author: Fencl + * + * Created on March 15, 2019 + */ + +#pragma once + +template< typename Real, + typename Device, + typename Index > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +initInterface( const MeshFunctionPointer& _input, + MeshFunctionPointer& _output, + InterfaceMapPointer& _interfaceMap, + const StaticVector vecLowerOverlaps, + const StaticVector vecUpperOverlaps ) +{ + + if( std::is_same< Device, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + const MeshType& mesh = _input->getMesh(); + + const int cudaBlockSize( 16 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); + dim3 blockSize( cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY ); + Devices::Cuda::synchronizeDevice(); + CudaInitCaller<<< 
gridSize, blockSize >>>( _input.template getData< Device >(), + _output.template modifyData< Device >(), + _interfaceMap.template modifyData< Device >(), + vecLowerOverlaps, vecUpperOverlaps); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; +#endif + } + if( std::is_same< Device, Devices::Host >::value ) + { + MeshFunctionType input = _input.getData(); + MeshFunctionType& output = _output.modifyData(); + InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); + const MeshType& mesh = input.getMesh(); + typedef typename MeshType::Cell Cell; + Cell cell( mesh ); + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh.getDimensions().y(); + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + output[ cell.getIndex() ] = + input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : + - std::numeric_limits< RealType >::max(); + interfaceMap[ cell.getIndex() ] = false; + } + + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + for( cell.getCoordinates().y() = 0 + vecLowerOverlaps[1]; + cell.getCoordinates().y() < mesh.getDimensions().y() - vecUpperOverlaps[1]; + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0 + vecLowerOverlaps[0]; + cell.getCoordinates().x() < mesh.getDimensions().x() - vecUpperOverlaps[0]; + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + const RealType& c = input( cell ); + if( ! 
cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const IndexType e = neighbors.template getEntityIndex< 1, 0 >(); + const IndexType n = neighbors.template getEntityIndex< 0, 1 >(); + if( c * input[ n ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + pom = pom - TNL::sign( c )*hy; + if( TNL::abs( output[ n ] ) > TNL::abs( pom ) ) + output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy; + + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ n ] = true; + } + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + + pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; + if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) + output[ e ] = pom; + + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ e ] = true; + } + } + } + } +} + +template< typename Real, + typename Device, + typename Index > +template< typename MeshEntity > +__cuda_callable__ +bool +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +updateCell( MeshFunctionType& u, + const MeshEntity& cell, + const RealType v) +{ + const auto& neighborEntities = cell.template getNeighborEntities< 2 >(); + const MeshType& mesh = cell.getMesh(); + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + const RealType value = u( cell ); + RealType a, b, tmp = std::numeric_limits< RealType >::max(); + + if( cell.getCoordinates().x() == 0 ) + a = u[ neighborEntities.template getEntityIndex< 1, 0 >() ]; + else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) + a = u[ neighborEntities.template getEntityIndex< -1, 0 >() ]; + else + { + a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0 >() ], + u[ 
neighborEntities.template getEntityIndex< 1, 0 >() ] ); + } + + if( cell.getCoordinates().y() == 0 ) + b = u[ neighborEntities.template getEntityIndex< 0, 1 >()]; + else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) + b = u[ neighborEntities.template getEntityIndex< 0, -1 >() ]; + else + { + b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1 >() ], + u[ neighborEntities.template getEntityIndex< 0, 1 >() ] ); + } + + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() ) + return false; + + RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), hx, hy, 0.0 }; + + tmp = getNewValue( pom , value, v ); + + u[ cell.getIndex() ] = tmp; + + + tmp = value - u[ cell.getIndex() ]; + + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; + +} + +template< typename Real, + typename Device, + typename Index > +template< int sizeSArray > +__cuda_callable__ +bool +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy, + const Real v ) +{ + const RealType value = sArray[ thrj * sizeSArray + thri ]; + RealType a, b, tmp = std::numeric_limits< RealType >::max(); + + b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ], + sArray[ (thrj-1) * sizeSArray + thri ] ); + + a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ], + sArray[ thrj * sizeSArray + thri-1 ] ); + + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() ) + return false; + + RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; + + tmp = getNewValue( pom , value, v ); + + sArray[ thrj * sizeSArray + thri ] = tmp; + tmp = value - sArray[ thrj * sizeSArray + thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; +} + +template< typename Real, + typename Device, 
+ typename Index > +__cuda_callable__ +Real +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +getNewValue( RealType valuesAndSteps[], const RealType originalValue, const RealType v ) +{ + RealType newValue = std::numeric_limits< RealType >::max(); + sortMinims( valuesAndSteps ); + + // calculation of real value taken from ZHAO + newValue = valuesAndSteps[ 0 ] + TNL::sign( originalValue ) * valuesAndSteps[ 3 ]/v; + if( fabs( newValue ) < fabs( valuesAndSteps[ 1 ] ) ) + { + newValue = argAbsMin( originalValue, newValue ); + } + else + { + newValue = ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] * valuesAndSteps[ 1 ] + + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] * valuesAndSteps[ 0 ] + + TNL::sign( originalValue ) * valuesAndSteps[ 3 ] * valuesAndSteps[ 4 ] * + TNL::sqrt( ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] )/( v * v ) - + ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) * + ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) ) )/ + ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] ); + newValue = argAbsMin( originalValue, newValue ); + } + + return newValue; +} + + +template < typename T1 > +__cuda_callable__ void sortMinims( T1 pom[] ) +{ + T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; + if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){ + tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2]; + tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5]; + + } + else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){ + tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1]; + tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4]; + } + else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){ + tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2]; + tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5]; + } + else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){ + tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0]; + tmp[3] = pom[4]; 
tmp[4] = pom[5]; tmp[5] = pom[3]; + } + else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){ + tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1]; + tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4]; + } + else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){ + tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0]; + tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3]; + } + + for( unsigned int i = 0; i < 6; i++ ) + { + pom[ i ] = tmp[ i ]; + } +} + +#ifdef HAVE_CUDA +template < typename Real, typename Device, typename Index > +__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, + const Containers::StaticVector< 2, Index > vecLowerOverlaps, + const Containers::StaticVector< 2, Index > vecUpperOverlaps ) +{ + int i = threadIdx.x + blockDim.x*blockIdx.x; + int j = blockDim.y*blockIdx.y + threadIdx.y; + const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + { + typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell; + Cell cell( mesh ); + cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; + cell.refresh(); + const Index cind = cell.getIndex(); + + + output[ cind ] = + input( cell ) >= 0 ? std::numeric_limits< Real >::max() : + - std::numeric_limits< Real >::max(); + interfaceMap[ cind ] = false; + + if( i < mesh.getDimensions().x() - vecUpperOverlaps[ 0 ] && + j < mesh.getDimensions().y() - vecUpperOverlaps[ 1 ] && + i>vecLowerOverlaps[ 0 ] -1 && j> vecLowerOverlaps[ 0 ]-1 ) + { + const Real& hx = mesh.getSpaceSteps().x(); + const Real& hy = mesh.getSpaceSteps().y(); + cell.refresh(); + const Real& c = input( cell ); + if( ! 
cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real tmp = 0; + const Index e = neighbors.template getEntityIndex< 1, 0 >(); + const Index w = neighbors.template getEntityIndex< -1, 0 >(); + const Index n = neighbors.template getEntityIndex< 0, 1 >(); + const Index s = neighbors.template getEntityIndex< 0, -1 >(); + + if( c * input[ n ] <= 0 ) + { + tmp = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) ) + output[ cind ] = tmp; + + interfaceMap[ cell.getIndex() ] = true; + } + + + if( c * input[ e ] <= 0 ) + { + tmp = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) ) + output[ cind ] = tmp; + + interfaceMap[ cind ] = true; + } + if( c * input[ w ] <= 0 ) + { + tmp = TNL::sign( c )*( hx * c )/( c - input[ w ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) ) + output[ cind ] = tmp; + + interfaceMap[ cind ] = true; + } + if( c * input[ s ] <= 0 ) + { + tmp = TNL::sign( c )*( hy * c )/( c - input[ s ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) ) + output[ cind ] = tmp; + + interfaceMap[ cind ] = true; + } + } + } + } +} + + +template < typename Index > +__global__ void GetNeighbours( const TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator, + TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY ) +{ + int i = blockIdx.x * 1024 + threadIdx.x; + + if( i < numBlockX * numBlockY ) + { + int pom = 0;//BlockIterPom[ i ] = 0; + int m=0, k=0; + m = i%numBlockX; + k = i/numBlockX; + if( m > 0 && blockCalculationIndicator[ i - 1 ] ){ + pom = 1;//BlockIterPom[ i ] = 1; + }else if( m < numBlockX -1 && blockCalculationIndicator[ i + 1 ] ){ + pom = 1;//BlockIterPom[ i ] = 1; + }else if( k > 0 && blockCalculationIndicatorHelp[ i - numBlockX ] ){ + pom = 1;// BlockIterPom[ i ] = 1; + }else if( k < numBlockY -1 && blockCalculationIndicator[ i + numBlockX ] ){ + 
pom = 1;//BlockIterPom[ i ] = 1; + } + + if( blockCalculationIndicator[ i ] != 1 ) + blockCalculationIndicatorHelp[ i ] = pom;//BlockIterPom[ i ]; + else + blockCalculationIndicatorHelp[ i ] = 1; + } +} + + + + +template < int sizeSArray, typename Real, typename Device, typename Index > +__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, + const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, + const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, + TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator, + const Containers::StaticVector< 2, Index > vecLowerOverlaps, + const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock ) +{ + // Setting up threads + int thri = threadIdx.x; int thrj = threadIdx.y; + int i = threadIdx.x + blockDim.x*blockIdx.x + vecLowerOverlaps[0]; + int j = blockDim.y*blockIdx.y + threadIdx.y + vecLowerOverlaps[1]; + const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); + +/** FOR CHESS METHOD */ + //if( (blockIdx.y%2 + blockIdx.x) % 2 == oddEvenBlock ) + //{ +/**------------------------------------------*/ + +/** FOR FIM METHOD */ + if( blockCalculationIndicator[ blockIdx.y * gridDim.x + blockIdx.x ] ) + { + __syncthreads(); +/**-----------------------------------------*/ + + const int dimX = mesh.getDimensions().x(); const int dimY = mesh.getDimensions().y(); + const Real hx = mesh.getSpaceSteps().x(); const Real hy = mesh.getSpaceSteps().y(); + if( thri==0 && thrj == 0) + { + blockCalculationIndicator[ blockIdx.y * gridDim.x + blockIdx.x ] = 0; + } + __syncthreads(); + int maxThreadsInXDirection; + int maxThreadsInYDirection; + + // Maximum threads in each direction can differ + // e.g. 
cudaBlockSize = 16, dimX = 50, then: + // blockIdx maxThreadsInXDirection calculation [from, to] sArray [from, to] + // 0 16 [ 0,15] [ 0,16] //"-1" set to inf + // 1 16 [16,31] [15,32] + // 2 16 [32,47] [31,48] + // 3 2 [48,50] [47,50] // rest set to inf + // same for YDirection because blocks are squared + maxThreadsInXDirection = blockDim.x + 1; + maxThreadsInYDirection = blockDim.y + 1; + + if( gridDim.x - 1 == blockIdx.x ) // care about number of values if we are in last block + maxThreadsInXDirection = (dimX-vecUpperOverlaps[0]-vecLowerOverlaps[0]) - (blockIdx.x)*blockDim.x+1; + + if( gridDim.y - 1 == blockIdx.y ) // care about number of values if we are in last block + maxThreadsInYDirection = (dimY-vecUpperOverlaps[1]-vecLowerOverlaps[1]) - (blockIdx.y)*blockDim.y+1; + __syncthreads(); + + // Setting changed array that contains info: "Did the value of this thread changed in last passage?" + // Will be used in parallel reduction ( inside block level ) + int currentIndex = thrj * blockDim.x + thri; + __shared__ volatile bool changed[ ( sizeSArray - 2 ) * ( sizeSArray - 2 ) ]; + changed[ currentIndex ] = false; + if( thrj == 0 && thri == 0 ) + changed[ 0 ] = true; // fist must be true to start while cycle + + + //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ]; + __shared__ volatile Real sArray[ sizeSArray * sizeSArray ]; + sArray[ (thrj+1) * sizeSArray + thri +1 ] = std::numeric_limits< Real >::max(); + + + //filling sArray edges + if( thri == 0 ) // + { + if( dimX - vecLowerOverlaps[ 0 ] > (blockIdx.x+1) * blockDim.x && thrj+1 < maxThreadsInYDirection ) + sArray[ (thrj+1)*sizeSArray + maxThreadsInXDirection ] = + aux[ (blockIdx.y*blockDim.y+vecLowerOverlaps[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 // this to get to right possition + + (thrj+1)*dimX + maxThreadsInXDirection + vecLowerOverlaps[0] ]; // rest to get the right sArray overlap + else + sArray[ (thrj+1)*sizeSArray + maxThreadsInXDirection ] = std::numeric_limits< Real >::max(); + } 
+ + if( thri == 1 ) + { + if( ( blockIdx.x != 0 || vecLowerOverlaps[0] != 0 ) && thrj+1 < maxThreadsInYDirection ) + sArray[(thrj+1)*sizeSArray + 0] = + aux[ (blockIdx.y*blockDim.y+vecLowerOverlaps[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + + (thrj+1)*dimX + vecLowerOverlaps[0] ]; + else + sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max(); + } + + if( thri == 2 ) + { + if( dimY - vecLowerOverlaps[ 1 ] > (blockIdx.y+1) * blockDim.y && thrj+1 < maxThreadsInXDirection ) + sArray[ maxThreadsInYDirection * sizeSArray + thrj+1 ] = + aux[ ( blockIdx.y * blockDim.y + vecLowerOverlaps[ 1 ] ) * dimX - dimX + blockIdx.x * blockDim.x - 1 + + maxThreadsInYDirection * dimX + thrj + 1 + vecLowerOverlaps[0] ]; + else + sArray[ maxThreadsInYDirection*sizeSArray + thrj+1 ] = std::numeric_limits< Real >::max(); + + } + + if( thri == 3 ) + { + if( ( blockIdx.y != 0 || vecLowerOverlaps[1] != 0 ) && thrj+1 < maxThreadsInXDirection ) + sArray[0*sizeSArray + thrj+1] = + aux[ ( blockIdx.y * blockDim.y + vecLowerOverlaps[ 1 ] ) * dimX - dimX + blockIdx.x * blockDim.x - 1 + + thrj + 1 + vecLowerOverlaps[ 0 ] ]; + else + sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); + } + + // Filling sArray inside + if( i - vecLowerOverlaps[ 0 ] < dimX && j - vecLowerOverlaps[ 1 ] < dimY && + thri + 1 < maxThreadsInXDirection + vecUpperOverlaps[ 0 ] && + thrj + 1 < maxThreadsInYDirection + vecUpperOverlaps[ 1 ] ) + { + sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ] = aux[ j * dimX + i ]; + } + __syncthreads(); + + //main while cycle ( CALCULATES TILL VALUES ARE CHANGING ) + while( changed[ 0 ] ) + { + __syncthreads(); + + changed[ currentIndex] = false; + + //calculation of update cell + if( i < dimX - vecUpperOverlaps[ 0 ] && j < dimY - vecUpperOverlaps[ 1 ] ) + { + if( ! 
interfaceMap[ j * dimX + i ] ) + { + changed[ currentIndex ] = ptr.updateCell( sArray, thri + 1, thrj + 1, hx, hy ); + } + } + __syncthreads(); + + //pyramid reduction + if( blockDim.x * blockDim.y == 1024 ) + { + if( currentIndex < 512 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; + } + } + __syncthreads(); + if( blockDim.x * blockDim.y >= 512 ) + { + if( currentIndex < 256 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; + } + } + __syncthreads(); + if( blockDim.x * blockDim.y >= 256 ) + { + if( currentIndex < 128 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; + } + } + __syncthreads(); + if( blockDim.x * blockDim.y >= 128 ) + { + if( currentIndex < 64 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; + } + } + __syncthreads(); + if( currentIndex < 32 ) + { + if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; + if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; + if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; + if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; + if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; + if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; + } + // result of reduction is in changed[ 0 ] + + // If we calculated in passage, then the blockCalculationIndicator for this block has to be 1 + // means that we calculated in this block + if( thri == 0 && thrj == 0 && changed[ 0 ] ){ + blockCalculationIndicator[ blockIdx.y * gridDim.x + blockIdx.x ] = 1; + } + __syncthreads(); + } + + + + if( i < dimX && j < dimY && thri+1 < maxThreadsInXDirection && thrj+1 < 
maxThreadsInYDirection ) + helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ]; + __syncthreads(); + } + else + { + if( i < mesh.getDimensions().x() - vecUpperOverlaps[0] && j < mesh.getDimensions().y() - vecUpperOverlaps[1] ) + helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ]; + } +} +#endif + + + +/// ====================OPEN=MP============================================ +template< typename Real, + typename Device, + typename Index > +template< int sizeSArray > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +updateBlocks( InterfaceMapType interfaceMap, + MeshFunctionType aux, + MeshFunctionType helpFunc, + ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) +{ +#pragma omp parallel for schedule( dynamic ) + for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) + { + if( BlockIterHost[ i ] ) + { + MeshType mesh = interfaceMap.template getMesh< Devices::Host >(); + + int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); + //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl; + int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0); + int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 
1:0); + //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl; + int xkolik = numThreadsPerBlock + 1; + int ykolik = numThreadsPerBlock + 1; + + int blIdx = i%numOfBlockx; + int blIdy = i/numOfBlockx; + //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl; + + if( numOfBlockx - 1 == blIdx ) + xkolik = dimX - (blIdx)*numThreadsPerBlock+1; + + if( numOfBlocky -1 == blIdy ) + ykolik = dimY - (blIdy)*numThreadsPerBlock+1; + //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl; + + + /*bool changed[numThreadsPerBlock*numThreadsPerBlock]; + changed[ 0 ] = 1;*/ + Real hx = mesh.getSpaceSteps().x(); + Real hy = mesh.getSpaceSteps().y(); + + bool changed = false; + BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0; + + + Real *sArray; + sArray = new Real[ sizeSArray * sizeSArray ]; + if( sArray == nullptr ) + std::cout << "Error while allocating memory for sArray." << std::endl; + + for( IndexType thri = 0; thri < sizeSArray; thri++ ){ + for( IndexType thrj = 0; thrj < sizeSArray; thrj++ ) + sArray[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max(); + } + + + //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); + for( IndexType thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ ) + { + if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik ) + sArray[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ]; + + + if( blIdx != 0 && thrj+1 < ykolik ) + sArray[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ]; + + if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik ) + sArray[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ]; + + if( blIdy != 0 && thrj+1 < xkolik ) + sArray[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + 
blIdx*numThreadsPerBlock - 1 + thrj+1 ]; + } + + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = 0; l < numThreadsPerBlock; l++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) + sArray[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]; + } + + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = 0; l < numThreadsPerBlock; l++ ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){ + //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl; + if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) + { + changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy) || changed; + + } + } + } + } + /*aux.save( "aux-1pruch.tnl" ); + for( int k = 0; k < sizeSArray; k++ ){ + for( int l = 0; l < sizeSArray; l++ ) { + std::cout << sArray[ k * sizeSArray + l] << " "; + } + std::cout << std::endl; + }*/ + + for( IndexType k = 0; k < numThreadsPerBlock; k++ ) + for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) + { + if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) + { + this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy); + } + } + } + /*aux.save( "aux-2pruch.tnl" ); + for( int k = 0; k < sizeSArray; k++ ){ + for( int l = 0; l < sizeSArray; l++ ) { + std::cout << sArray[ k * sizeSArray + l] << " "; + } + std::cout << std::endl; + }*/ + + for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ) + for( IndexType l = 0; l < numThreadsPerBlock; l++ ) { + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) + { + if( ! 
interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) + { + this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy); + } + } + } + /*aux.save( "aux-3pruch.tnl" ); + for( int k = 0; k < sizeSArray; k++ ){ + for( int l = 0; l < sizeSArray; l++ ) { + std::cout << sArray[ k * sizeSArray + l] << " "; + } + std::cout << std::endl; + }*/ + + for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ + for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) + { + if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) + { + this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx, hy, 1.0); + } + } + } + } + /*aux.save( "aux-4pruch.tnl" ); + for( int k = 0; k < sizeSArray; k++ ){ + for( int l = 0; l < sizeSArray; l++ ) { + std::cout << sArray[ k * sizeSArray + l] << " "; + } + std::cout << std::endl; + }*/ + + + if( changed ){ + BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1; + } + + + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = 0; l < numThreadsPerBlock; l++ ) { + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) + helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray[ (k + 1)* sizeSArray + l + 1 ]; + //std::cout<< sArray[k+1][l+1]; + } + //std::cout< +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY ) +{ + int* BlockIterPom; + BlockIterPom = new int [numBlockX * numBlockY]; + + for(int i = 0; i < numBlockX * numBlockY; i++) + { + BlockIterPom[ i ] = 0;//BlockIterPom[ i ] = 0; + int m=0, k=0; + m = i%numBlockX; + k = i/numBlockX; + if( m > 0 && BlockIterHost[ i - 1 ] ){ + BlockIterPom[ i ] = 1; + }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){ + BlockIterPom[ i ] = 1; + 
}else if( k > 0 && BlockIterHost[ i - numBlockX ] ){ + BlockIterPom[ i ] = 1; + }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){ + BlockIterPom[ i ] = 1; + } + } + + for(int i = 0; i < numBlockX * numBlockY; i++) + { + if( !BlockIterHost[ i ] ) + BlockIterHost[ i ] = BlockIterPom[ i ]; + } + delete[] BlockIterPom; +} + + + diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h new file mode 100644 index 000000000..91f9a0efe --- /dev/null +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h @@ -0,0 +1,1091 @@ +/* + * File: tnlDirectEikonalMethodBase3D_impl.h + * Author: Fencl + * + * Created on March 15, 2019 + */ + +#pragma once + +template< typename Real, + typename Device, + typename Index > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: +initInterface( const MeshFunctionPointer& _input, + MeshFunctionPointer& _output, + InterfaceMapPointer& _interfaceMap, + StaticVector vLower, StaticVector vUpper ) +{ + if( std::is_same< Device, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + const MeshType& mesh = _input->getMesh(); + + const int cudaBlockSize( 8 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); + int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize ); + if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) + std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" 
<< std::endl; + dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); + Devices::Cuda::synchronizeDevice(); + CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(), + _output.template modifyData< Device >(), + _interfaceMap.template modifyData< Device >(), vLower, vUpper ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; +#endif + } + if( std::is_same< Device, Devices::Host >::value ) + { + const MeshFunctionType& input = _input.getData(); + MeshFunctionType& output = _output.modifyData(); + InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); + + const MeshType& mesh = input.getMesh(); + typedef typename MeshType::Cell Cell; + + Cell cell( mesh ); + for( cell.getCoordinates().z() = 0; + cell.getCoordinates().z() < mesh.getDimensions().z(); + cell.getCoordinates().z() ++ ) + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh.getDimensions().y(); + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + output[ cell.getIndex() ] = + input( cell ) > 0 ? 
std::numeric_limits< RealType >::max() : + - std::numeric_limits< RealType >::max(); + interfaceMap[ cell.getIndex() ] = false; + } + + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + const RealType& hz = mesh.getSpaceSteps().z(); + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh.getDimensions().z() - vUpper[2]; + cell.getCoordinates().z() ++ ) + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh.getDimensions().y() - vUpper[1]; + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh.getDimensions().x() - vUpper[0]; + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + const RealType& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const IndexType e = neighbors.template getEntityIndex< 1, 0, 0 >(); + const IndexType n = neighbors.template getEntityIndex< 0, 1, 0 >(); + const IndexType t = neighbors.template getEntityIndex< 0, 0, 1 >(); + + + if( c * input[ n ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + pom = pom - TNL::sign( c )*hy; + if( TNL::abs( output[ n ] ) > TNL::abs( pom ) ) + output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy; + + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ n ] = true; + } + + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + pom = pom - TNL::sign( c )*hx; + if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) + output[ e ] = pom; //( hy * c )/( c - input[ n ]) - hy; + + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ e ] = true; + } + + if( c * input[ t ] <= 0 ) + { + pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); 
+ if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + pom = pom - TNL::sign( c )*hz; + if( TNL::abs( output[ t ] ) > TNL::abs( pom ) ) + output[ t ] = pom; //( hy * c )/( c - input[ n ]) - hy; + + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ t ] = true; + } + } + } + } +} + +template< typename Real, + typename Device, + typename Index > +template< typename MeshEntity > +__cuda_callable__ +bool +tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: +updateCell( MeshFunctionType& u, + const MeshEntity& cell, + const RealType v ) +{ + const auto& neighborEntities = cell.template getNeighborEntities< 3 >(); + const MeshType& mesh = cell.getMesh(); + + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + const RealType& hz = mesh.getSpaceSteps().z(); + const RealType value = u( cell ); + //std::cout << value << std::endl; + RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); + + + if( cell.getCoordinates().x() == 0 ) + a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ]; + else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) + a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ]; + else + { + a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ], + u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] ); + } + + if( cell.getCoordinates().y() == 0 ) + b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ]; + else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) + b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ]; + else + { + b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ], + u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] ); + } + + if( cell.getCoordinates().z() == 0 ) + c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ]; + else if( cell.getCoordinates().z() == 
mesh.getDimensions().z() - 1 ) + c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ]; + else + { + c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ], + u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] ); + } + + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() && + fabs( c ) == std::numeric_limits< RealType >::max() ) + return false; + + RealType pom[6] = { a, b, c, hx, hy, hz}; + tmp = getNewValue( pom , value, v ); + + u[ cell.getIndex() ] = tmp; + tmp = value - u[ cell.getIndex() ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; + /*sortMinims( pom ); + tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; + + if( fabs( tmp ) < fabs( pom[ 1 ] ) ) + { + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + tmp = value - u[ cell.getIndex() ]; + if ( fabs( tmp ) > 0.001*hx ){ + return true; + }else{ + return false; + } + } + else + { + tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - + ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); + if( fabs( tmp ) < fabs( pom[ 2 ]) ) + { + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + tmp = value - u[ cell.getIndex() ]; + if ( fabs( tmp ) > 0.001*hx ){ + return true; + }else{ + return false; + } + } + else + { + tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + + TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - + hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - + hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + tmp = value - u[ cell.getIndex() ]; + if ( fabs( tmp ) > 0.001*hx ){ + return true; 
+ }else{ + return false; + } + } + }*/ +} + +template< typename Real, + typename Device, + typename Index > +template< int sizeSArray > +__cuda_callable__ +bool +tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: +updateCell( volatile Real *sArray, int thri, int thrj, int thrk, + const Real hx, const Real hy, const Real hz, const Real v ) +{ + const RealType value = sArray[thrk *sizeSArray * sizeSArray + thrj * sizeSArray + thri]; + + RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); + + c = TNL::argAbsMin( sArray[ (thrk+1)* sizeSArray*sizeSArray + thrj * sizeSArray + thri ], + sArray[ (thrk-1) * sizeSArray *sizeSArray + thrj* sizeSArray + thri ] ); + + b = TNL::argAbsMin( sArray[ thrk* sizeSArray*sizeSArray + (thrj+1) * sizeSArray + thri ], + sArray[ thrk* sizeSArray * sizeSArray + (thrj-1)* sizeSArray +thri ] ); + + a = TNL::argAbsMin( sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri+1 ], + sArray[ thrk* sizeSArray * sizeSArray + thrj* sizeSArray +thri-1 ] ); + + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() && + fabs( c ) == std::numeric_limits< RealType >::max() ) + return false; + + RealType pom[6] = { a, b, c, hx, hy, hz}; + + tmp = getNewValue( pom , value, v ); + + sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = tmp; + tmp = value - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; + /*sortMinims( pom ); + + // calculation of real value taken from ZHAO + tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; + if( fabs( tmp ) < fabs( pom[ 1 ] ) ) + { + sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; + } + else + { + tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 
] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - + ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); + if( fabs( tmp ) < fabs( pom[ 2 ]) ) + { + sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; + } + else + { + tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + + TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - + hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - + hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); + sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; + } + }*/ + +} + + +template< typename Real, + typename Device, + typename Index > +__cuda_callable__ +Real +tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: +getNewValue( RealType valuesAndSteps[], const RealType originalValue, const RealType v ) +{ + RealType newValue = std::numeric_limits< RealType >::max(); + sortMinims( valuesAndSteps ); + + // calculation of real value taken from ZHAO + newValue = valuesAndSteps[ 0 ] + TNL::sign( originalValue ) * valuesAndSteps[ 3 ]/v; + if( fabs( newValue ) < fabs( valuesAndSteps[ 1 ] ) ) + { + newValue = argAbsMin( originalValue, newValue ); + } + else + { + newValue = ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] * valuesAndSteps[ 1 ] + + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] * valuesAndSteps[ 0 ] + + TNL::sign( originalValue ) * 
valuesAndSteps[ 3 ] * valuesAndSteps[ 4 ] * + TNL::sqrt( ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] )/( v * v ) - + ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) * + ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) ) )/ + ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] ); + if( fabs( newValue ) < fabs( valuesAndSteps[ 2 ]) ) + { + newValue = argAbsMin( originalValue, newValue ); + } + else + { + // Value from algorithm by Zhao + newValue = ( valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[5] * valuesAndSteps[5] * valuesAndSteps[0] + + valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[5] * valuesAndSteps[5] * valuesAndSteps[1] + + valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[2] + + TNL::sign(originalValue) * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[5] * TNL::sqrt( + (valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[5] * valuesAndSteps[5] + + valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[5] * valuesAndSteps[5] + + valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[4]) / (v * v) - + valuesAndSteps[5] * valuesAndSteps[5] * (valuesAndSteps[0] - valuesAndSteps[1]) * (valuesAndSteps[0] - valuesAndSteps[1]) - + valuesAndSteps[4] * valuesAndSteps[4] * (valuesAndSteps[0] - valuesAndSteps[2]) * (valuesAndSteps[0] - valuesAndSteps[2]) - + valuesAndSteps[3] * valuesAndSteps[3] * (valuesAndSteps[1] - valuesAndSteps[2]) * (valuesAndSteps[1] - valuesAndSteps[2]))) / ( + valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[4] + + valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[5] * valuesAndSteps[5] + + valuesAndSteps[5] * valuesAndSteps[5] * valuesAndSteps[3] * valuesAndSteps[3]); + newValue = argAbsMin( originalValue, newValue ); + } + } + + return newValue; +} + + +#ifdef HAVE_CUDA +template < typename Real, typename Device, typename Index > +__global__ 
void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, + Containers::StaticVector< 3, Index > vLower, Containers::StaticVector< 3, Index > vUpper ) +{ + int i = threadIdx.x + blockDim.x*blockIdx.x; + int j = blockDim.y*blockIdx.y + threadIdx.y; + int k = blockDim.z*blockIdx.z + threadIdx.z; + const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) + { + typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell; + Cell cell( mesh ); + cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k; + cell.refresh(); + const Index cind = cell.getIndex(); + + + output[ cind ] = + input( cell ) >= 0 ? std::numeric_limits< Real >::max() : + - std::numeric_limits< Real >::max(); + interfaceMap[ cind ] = false; + cell.refresh(); + + if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] && + k < mesh.getDimensions().y() - vUpper[2] && i>vLower[0]-1 && j> vLower[1]-1 && k>vLower[2]-1 ) + { + const Real& hx = mesh.getSpaceSteps().x(); + const Real& hy = mesh.getSpaceSteps().y(); + const Real& hz = mesh.getSpaceSteps().z(); + const Real& c = input( cell ); + if( ! 
cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const Index e = neighbors.template getEntityIndex< 1, 0, 0 >(); + const Index w = neighbors.template getEntityIndex< -1, 0, 0 >(); + const Index n = neighbors.template getEntityIndex< 0, 1, 0 >(); + const Index s = neighbors.template getEntityIndex< 0, -1, 0 >(); + const Index t = neighbors.template getEntityIndex< 0, 0, 1 >(); + const Index b = neighbors.template getEntityIndex< 0, 0, -1 >(); + + if( c * input[ n ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ w ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ s ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ b ] <= 0 ) + { + pom = TNL::sign( c )*( hz * c )/( c - input[ b ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ t ] <= 0 ) + { + pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + } + } + } +} + + +template < typename Index > +__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, + TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, + int numBlockX, int numBlockY, int numBlockZ ) +{ + int i = blockIdx.x * 1024 + threadIdx.x; 
+ + if( i < numBlockX * numBlockY * numBlockZ ) + { + int pom = 0;//BlockIterPom[ i ] = 0; + int m=0, l=0, k=0; + l = i/( numBlockX * numBlockY ); + k = (i-l*numBlockX * numBlockY )/(numBlockX ); + m = (i-l*numBlockX * numBlockY )%( numBlockX ); + if( m > 0 && BlockIterDevice[ i - 1 ] ){ // left neighbour + pom = 1;//BlockIterPom[ i ] = 1; + }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ // right neighbour + pom = 1;//BlockIterPom[ i ] = 1; + }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ // bottom neighbour + pom = 1;// BlockIterPom[ i ] = 1; + }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ // top neighbour + pom = 1;//BlockIterPom[ i ] = 1; + }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){ // neighbour behind + pom = 1; + }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){ // neighbour in front + pom = 1; + } + + if( !BlockIterDevice[ i ] ) // only in CudaUpdateCellCaller can BlockIterDevice gain 0 + BlockIterPom[ i ] = pom; + else + BlockIterPom[ i ] = 1; + } +} + + +template < int sizeSArray, typename Real, typename Device, typename Index > +__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, + const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, + const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, + TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, + Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps ) +{ + int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; + int i = threadIdx.x + blockDim.x*blockIdx.x + vecLowerOverlaps[0]; // WITH OVERLAPS!!! 
i,j,k aren't coordinates of all values + int j = blockDim.y*blockIdx.y + threadIdx.y + vecLowerOverlaps[1]; + int k = blockDim.z*blockIdx.z + threadIdx.z + vecLowerOverlaps[2]; + int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri; + const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); + + // should this block calculate? + if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] ) + { + __syncthreads(); + + // Array indicates weather some threads calculated (for parallel reduction) + __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2) ]; + changed[ currentIndex ] = false; + + if( thrj == 0 && thri == 0 && thrk == 0 ) + changed[ 0 ] = true; // first indicates weather we should calculate again (princip of parallel reduction) + + //getting stepps and size of mesh + const Real hx = mesh.getSpaceSteps().x(); const int dimX = mesh.getDimensions().x(); + const Real hy = mesh.getSpaceSteps().y(); const int dimY = mesh.getDimensions().y(); + const Real hz = mesh.getSpaceSteps().z(); const int dimZ = mesh.getDimensions().z(); + + if( thrj == 1 && thri == 1 && thrk == 1 ) + { + // we dont know if we will calculate in here, more info down in code + BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0; + } + + // sArray contains values of one block (coppied from aux) and edges (not MPI) of those blocks + __shared__ volatile Real sArray[ sizeSArray * sizeSArray * sizeSArray ]; + sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = std::numeric_limits< Real >::max(); + + + int xkolik = blockDim.x + 1;// maximum of threads in x direction (for all blocks different) + int ykolik = blockDim.y + 1; + int zkolik = blockDim.z + 1; + __syncthreads(); + + if( gridDim.x - 1 == blockIdx.x ) + xkolik = (dimX-vecUpperOverlaps[0]-vecLowerOverlaps[0]) - blockIdx.x*blockDim.x+1; + if( 
gridDim.y -1 == blockIdx.y ) + ykolik = (dimY-vecUpperOverlaps[1]-vecLowerOverlaps[1]) - (blockIdx.y)*blockDim.y+1; + if( gridDim.z-1 == blockIdx.z ) + zkolik = (dimZ-vecUpperOverlaps[2]-vecLowerOverlaps[2]) - (blockIdx.z)*blockDim.z+1; + __syncthreads(); + + //filling sArray edges + if( thri == 0 ) //x bottom + { + if( (blockIdx.x != 0 || vecLowerOverlaps[0] !=0) && thrj+1 < ykolik && thrk+1 < zkolik ) + sArray[ (thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0 ] = + aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX + + blockIdx.x*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY + vecLowerOverlaps[0] ]; + else + sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max(); + } + + if( thri == 1 ) //xtop + { + if( dimX - vecLowerOverlaps[ 0 ] > (blockIdx.x+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = + aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX + + blockIdx.x*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY + vecLowerOverlaps[0] ]; + else + sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max(); + } + if( thri == 2 ) //y bottom + { + if( (blockIdx.y != 0 || vecLowerOverlaps[1] !=0) && thrj+1 < xkolik && thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = + aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX + + blockIdx.x*blockDim.x - dimX + thrj + thrk*dimX*dimY + vecLowerOverlaps[0] ]; + else + sArray[ (thrk+1) * sizeSArray * sizeSArray + 0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); + } + + if( thri == 3 ) //y top + { + if( dimY - vecLowerOverlaps[ 1 ] > (blockIdx.y+1) * blockDim.y && thrj+1 < xkolik && 
thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = + aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + ((blockIdx.y+1) * blockDim.y+vecLowerOverlaps[1])*dimX + + blockIdx.x*blockDim.x + thrj + thrk*dimX*dimY + vecLowerOverlaps[0] ]; + else + sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); + } + if( thri == 4 ) //z bottom + { + if( (blockIdx.z != 0 || vecLowerOverlaps[2] !=0) && thrj+1 < ykolik && thrk+1 < xkolik ) + sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = + aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX + + blockIdx.x*blockDim.x - dimX * dimY + thrj * dimX + thrk + vecLowerOverlaps[0] ]; + else + sArray[0 * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thrk+1] = std::numeric_limits< Real >::max(); + } + + if( thri == 5 ) //z top + { + if( dimZ - vecLowerOverlaps[ 2 ] > (blockIdx.z+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik ) + sArray[ zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = + aux[ ((blockIdx.z+1)*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX + + blockIdx.x*blockDim.x + thrj * dimX + thrk + vecLowerOverlaps[0] ]; + else + sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = std::numeric_limits< Real >::max(); + } + + // Copy all other values that aren't edges + if( i - vecLowerOverlaps[0] < dimX && j - vecLowerOverlaps[1] < dimY && k - vecLowerOverlaps[2] < dimZ && + thri+1 < xkolik + vecUpperOverlaps[0] && thrj+1 < ykolik + vecUpperOverlaps[1] && thrk+1 < zkolik + vecUpperOverlaps[2] ) + { + sArray[(thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = aux[ k*dimX*dimY + j*dimX + i ]; + } + __syncthreads(); + + //main while cycle. 
each value can get information only from neighbour but that information has to spread there + while( changed[ 0 ] ) + { + __syncthreads(); + + changed[ currentIndex ] = false; + + //calculation of update cell + if( i < dimX - vecUpperOverlaps[0] && j < dimY - vecUpperOverlaps[1] && k < dimZ - vecUpperOverlaps[2] ) + { + if( ! interfaceMap[ k*dimX*dimY + j * dimX + i ] ) + { + // calculate new value depending on neighbours in sArray on (thri+1, thrj+1) coordinates + changed[ currentIndex ] = ptr.updateCell< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); + } + } + __syncthreads(); + + //pyramid reduction (parallel reduction) + if( blockDim.x*blockDim.y*blockDim.z == 1024 ) + { + if( currentIndex < 512 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; + } + } + __syncthreads(); + if( blockDim.x*blockDim.y*blockDim.z >= 512 ) + { + if( currentIndex < 256 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; + } + } + __syncthreads(); + if( blockDim.x*blockDim.y*blockDim.z >= 256 ) + { + if( currentIndex < 128 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; + } + } + __syncthreads(); + if( blockDim.x*blockDim.y*blockDim.z >= 128 ) + { + if( currentIndex < 64 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; + } + } + __syncthreads(); + if( currentIndex < 32 ) + { + if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; + if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; + if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; + if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; + if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; + if( currentIndex 
< 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; + } + __syncthreads(); + + // if we calculated, then the BlockIterDevice should contain the info about this whole block! (only one number for one block) + if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 ) + { + BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 1; + } + __syncthreads(); + } + + // copy results into helpFunc (not into aux bcs of conflicts) + if( i < dimX && j < dimY && k < dimZ && thri+1 < xkolik && thrj+1 < ykolik && thrk+1 < zkolik ) + helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thri+1 ]; + + } + else // if not, then it should at least copy the values from aux to helpFunc. + { + if( i < mesh.getDimensions().x() - vecUpperOverlaps[0] && j < mesh.getDimensions().y() - vecUpperOverlaps[1] + && k < mesh.getDimensions().z() - vecUpperOverlaps[2]) + helpFunc[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ] = + aux[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ]; + } +} +#endif + + +/// ==========================OPEN=MP================================= +template< typename Real, + typename Device, + typename Index > +template< int sizeSArray > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: +updateBlocks( const InterfaceMapType interfaceMap, + const MeshFunctionType aux, + MeshFunctionType& helpFunc, + ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) +{ + //#pragma omp parallel for schedule( dynamic ) + for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) + { + if( BlockIterHost[ i ] ) + { + MeshType mesh = interfaceMap.template getMesh< Devices::Host >(); + + int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); + int dimZ = mesh.getDimensions().z(); + //std::cout << "dimX = " << dimX 
<< " ,dimY = " << dimY << std::endl; + int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0); + int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0); + int numOfBlockz = dimZ/numThreadsPerBlock + ((dimZ%numThreadsPerBlock != 0) ? 1:0); + //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl; + int xkolik = numThreadsPerBlock + 1; + int ykolik = numThreadsPerBlock + 1; + int zkolik = numThreadsPerBlock + 1; + + + int blIdz = i/( numOfBlockx * numOfBlocky ); + int blIdy = (i-blIdz*numOfBlockx * numOfBlocky )/(numOfBlockx ); + int blIdx = (i-blIdz*numOfBlockx * numOfBlocky )%( numOfBlockx ); + //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl; + + if( numOfBlockx - 1 == blIdx ) + xkolik = dimX - (blIdx)*numThreadsPerBlock+1; + if( numOfBlocky -1 == blIdy ) + ykolik = dimY - (blIdy)*numThreadsPerBlock+1; + if( numOfBlockz-1 == blIdz ) + zkolik = dimZ - (blIdz)*numThreadsPerBlock+1; + //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl; + + + /*bool changed[numThreadsPerBlock*numThreadsPerBlock]; + changed[ 0 ] = 1;*/ + Real hx = mesh.getSpaceSteps().x(); + Real hy = mesh.getSpaceSteps().y(); + Real hz = mesh.getSpaceSteps().z(); + + bool changed = false; + BlockIterHost[ i ] = 0; + + + Real *sArray; + sArray = new Real[ sizeSArray * sizeSArray * sizeSArray ]; + if( sArray == nullptr ) + std::cout << "Error while allocating memory for sArray." 
<< std::endl; + + for( IndexType k = 0; k < sizeSArray; k++ ) + for( IndexType l = 0; l < sizeSArray; l++ ) + for( IndexType m = 0; m < sizeSArray; m++ ){ + sArray[ m * sizeSArray * sizeSArray + k * sizeSArray + l ] = std::numeric_limits< Real >::max(); + } + + + for( IndexType thrk = 0; thrk < numThreadsPerBlock; thrk++ ) + for( IndexType thrj = 0; thrj < numThreadsPerBlock; thrj++ ) + { + if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik ) + sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = + aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX -1 + thrk*dimX*dimY ]; + + if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = + aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy *numThreadsPerBlock*dimX+ blIdx*numThreadsPerBlock + numThreadsPerBlock + thrj * dimX + thrk*dimX*dimY ]; + + if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = + aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX + thrj + thrk*dimX*dimY ]; + + if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik && thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = + aux[ blIdz*numThreadsPerBlock * dimX * dimY + (blIdy+1) * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj + thrk*dimX*dimY ]; + + if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik ) + sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = + aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX * dimY + thrj * dimX + thrk ]; + + if( dimZ > (blIdz+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < xkolik ) + sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + 
thrk+1] = + aux[ (blIdz+1)*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX + thrk ]; + } + + for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = 0; l < numThreadsPerBlock; l++ ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + sArray[(m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1] = + aux[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ]; + } + } + } + /*string s; + int numWhile = 0; + for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = 0; l < numThreadsPerBlock; l++ ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){ + //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl; + if( ! 
interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) + { + //printf("In with point m = %d, k = %d, l = %d\n", m, k, l); + changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz) || changed; + + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = 0; l template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + { + if( ! 
interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) + { + this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s ); + */ + for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + { + if( ! 
interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) + { + this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ + for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ + for( IndexType l = 0; l template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ + for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ + for( IndexType l = 0; l template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ 
(m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ + for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ + for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + { + if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) + { + this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + + for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ + for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ + for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + { + if( ! 
interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) + { + this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + if( changed ){ + BlockIterHost[ i ] = 1; + } + + + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = 0; l < numThreadsPerBlock; l++ ) { + for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){ + helpFunc[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] = + sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + //std::cout << helpFunc[ m*dimX*dimY + k*dimX + l ] << " "; + } + } + //std::cout << std::endl; + } + //std::cout << std::endl; + } + //helpFunc.save( "helpF.tnl"); + delete []sArray; + } + } +} + +template< typename Real, + typename Device, + typename Index > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: +getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ ) +{ + int* BlockIterPom; + BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ]; + + for( int i = 0; i< BlockIterHost.getSize(); i++) + { + BlockIterPom[ i ] = 0; + + int m=0, l=0, k=0; + l = i/( numBlockX * numBlockY ); + k = (i-l*numBlockX * numBlockY 
)/(numBlockX ); + m = (i-l*numBlockX * numBlockY )%( numBlockX ); + + if( m > 0 && BlockIterHost[ i - 1 ] ){ + BlockIterPom[ i ] = 1; + }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){ + BlockIterPom[ i ] = 1; + }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){ + BlockIterPom[ i ] = 1; + }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){ + BlockIterPom[ i ] = 1; + }else if( l > 0 && BlockIterHost[ i - numBlockX*numBlockY ] ){ + BlockIterPom[ i ] = 1; + }else if( l < numBlockZ-1 && BlockIterHost[ i + numBlockX*numBlockY ] ){ + BlockIterPom[ i ] = 1; + } + } + for( int i = 0; i< BlockIterHost.getSize(); i++) + { + BlockIterHost[ i ] = BlockIterPom[ i ]; + } +} diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h index c6a522d8f..e0ece04bf 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h @@ -7,9 +7,9 @@ #pragma once -#include -#include -#include +//#include +//#include +//#include using namespace TNL; @@ -63,25 +63,32 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType; typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer; typedef Containers::StaticVector< 2, Index > StaticVector; + + using MeshPointer = Pointers::SharedPointer< MeshType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >; using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >; - + // CALLER FOR HOST AND CUDA void initInterface( const MeshFunctionPointer& input, MeshFunctionPointer& output, InterfaceMapPointer& interfaceMap, StaticVector vLower, StaticVector vUpper ); - + + // FOR HOST template< typename MeshEntity > __cuda_callable__ 
bool updateCell( MeshFunctionType& u, const MeshEntity& cell, const RealType velocity = 1.0 ); + // FOR CUDA template< int sizeSArray > - __cuda_callable__ bool updateCell( volatile Real *sArray, - int thri, int thrj, const Real hx, const Real hy, - const Real velocity = 1.0 ); - + __cuda_callable__ bool updateCell( volatile RealType *sArray, + int thri, int thrj, const RealType hx, const RealType hy, + const RealType velocity = 1.0 ); + +// FOR OPENMP WILL BE REMOVED + void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY ); + template< int sizeSArray > void updateBlocks( const InterfaceMapType& interfaceMap, MeshFunctionType& aux, @@ -108,16 +115,27 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >; using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >; + // CALLER FOR HOST AND CUDA void initInterface( const MeshFunctionPointer& input, MeshFunctionPointer& output, InterfaceMapPointer& interfaceMap, StaticVector vLower, StaticVector vUpper ); + // FOR HOST template< typename MeshEntity > __cuda_callable__ bool updateCell( MeshFunctionType& u, const MeshEntity& cell, const RealType velocity = 1.0); + // FOR CUDA + template< int sizeSArray > + __cuda_callable__ bool updateCell( volatile Real *sArray, + int thri, int thrj, int thrk, const RealType hx, const RealType hy, const RealType hz, + const RealType velocity = 1.0 ); + + // OPENMP WILL BE REMOVED + void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ ); + template< int sizeSArray > void updateBlocks( const InterfaceMapType& interfaceMap, const MeshFunctionType& aux, @@ -126,16 +144,15 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY, int numBlockZ ); - template< int sizeSArray > - __cuda_callable__ bool updateCell3D( 
volatile Real *sArray, - int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz, - const Real velocity = 1.0 ); + __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[], + const RealType originalValue, const RealType v ); }; template < typename T1 > __cuda_callable__ void sortMinims( T1 pom[] ); #ifdef HAVE_CUDA +// 1D template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output, @@ -147,21 +164,25 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux, bool *BlockIterDevice ); + + + +// 2D +template < typename Real, typename Device, typename Index > +__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, + const Containers::StaticVector< 2, Index > vecLowerOverlas, + const Containers::StaticVector< 2, Index > vecUpperOerlaps ); + template < int sizeSArray, typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, - TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, - Containers::StaticVector< 2, Index > vLower, Containers::StaticVector< 2, Index > vUpper, int k,int oddEvenBlock =0); - -template< typename Real, typename Device, typename Index > -__global__ void 
DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, int copy, int k ); - -template< typename Real, typename Device, typename Index > -__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, int copy, int k ); + TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator, + const Containers::StaticVector< 2, Index > vecLowerOverlaps, + const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock =0); template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, @@ -171,17 +192,13 @@ template < typename Index > __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY ); -template < typename Real, typename Device, typename Index > -__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, - Containers::StaticVector< 2, Index > vLower, Containers::StaticVector< 2, Index > vUpper ); +// 3D template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, - Containers::StaticVector< 3, Index > vLower, Containers::StaticVector< 3, Index > vUpper ); + Containers::StaticVector< 3, Index > 
vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps ); template < int sizeSArray, typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, @@ -196,4 +213,6 @@ __global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, int numBlockX, int numBlockY, int numBlockZ ); #endif -#include "tnlDirectEikonalMethodsBase_impl.h" +#include "tnlDirectEikonalMethodBase1D_impl.h" +#include "tnlDirectEikonalMethodBase2D_impl.h" +#include "tnlDirectEikonalMethodBase3D_impl.h" diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h index a57ef1491..8c58ee610 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h @@ -10,11 +10,10 @@ #pragma once -#include -#include -#include +//#include +//#include +//#include #include "tnlDirectEikonalMethodsBase.h" -#define ForDebug false // false <=> off template< typename Mesh, @@ -88,6 +87,7 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, typedef Anisotropy AnisotropyType; typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType; typedef Communicator CommunicatorType; + typedef Containers::StaticVector< 2, Index > StaticVector; using MeshPointer = Pointers::SharedPointer< MeshType >; using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >; @@ -113,6 +113,15 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, protected: const IndexType maxIterations; + + void setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, + const MeshPointer& mesh); + + bool goThroughSweep( const StaticVector boundsFrom, const 
StaticVector boundsTo, + MeshFunctionType& aux, const InterfaceMapType& interfaceMap, + const AnisotropyPointer& anisotropy ); + + void getInfoFromNeighbours( int& calculated, int& calculateAgain, const MeshPointer& mesh ); }; template< typename Real, @@ -134,6 +143,7 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, typedef Anisotropy AnisotropyType; typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > BaseType; typedef Communicator CommunicatorType; + typedef Containers::StaticVector< 3, Index > StaticVector; using MeshPointer = Pointers::SharedPointer< MeshType >; using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >; @@ -161,6 +171,15 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, protected: const IndexType maxIterations; + + void setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, + const MeshPointer& mesh); + + bool goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, + MeshFunctionType& aux, const InterfaceMapType& interfaceMap, + const AnisotropyPointer& anisotropy ); + + void getInfoFromNeighbours( int& calculated, int& calculateAgain, const MeshPointer& mesh ); }; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index f9bef30c3..66f9e6cdf 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -13,19 +13,6 @@ #pragma once -#include "tnlFastSweepingMethod.h" -#include "tnlDirectEikonalProblem.h" -#include -#include -#include "tnlDirectEikonalProblem.h" - - - - -#include -#include -#include - template< typename Real, typename Device, typename Index, @@ -79,28 +66,14 @@ solve( const MeshPointer& 
mesh, auxPtr->setMesh( mesh ); interfaceMapPtr->setMesh( mesh ); - //Distributed mesh for MPI overlaps (without MPI null pointer) - Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh(); - - int i = MPI::GetRank( MPI::AllGroup ); // number that identifies rank - - // getting overlaps ( WITHOUT MPI SHOULD BE 0 ) - Containers::StaticVector< 2, IndexType > vLower; - vLower[0] = 0; vLower[1] = 0; - Containers::StaticVector< 2, IndexType > vUpper; - vUpper[0] = 0; vUpper[1] = 0; -#ifdef HAVE_MPI - if( CommunicatorType::isDistributed() ) //If we started solver with MPI - { - vLower = meshPom->getLowerOverlap(); - vUpper = meshPom->getUpperOverlap(); - } -#endif + // Setting overlaps ( WITHOUT MPI SHOULD BE 0 ) + StaticVector vecLowerOverlaps, vecUpperOverlaps; + setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh ); std::cout << "Initiating the interface cells ..." << std::endl; - BaseType::initInterface( u, auxPtr, interfaceMapPtr, vLower, vUpper ); + BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps ); - auxPtr->save( "aux-ini.tnl" ); + //auxPtr->save( "aux-ini.tnl" ); typename MeshType::Cell cell( *mesh ); @@ -142,57 +115,26 @@ solve( const MeshPointer& mesh, #endif while( iteration < this->maxIterations ) - { -#if ForDebug - int WhileCount = 0; // number of passages of while cycle with condition calculated - printf( "%d: meshDimensions are (x,y) = (%d,%d).\n",i, mesh->getDimensions().x(), mesh->getDimensions().y() ); - printf( "%d: owerlaps are ([x1,x2],[y1,y2]) = ([%d,%d],[%d,%d]).\n",i, vLower[0], vUpper[0], vLower[1], vUpper[1] ); - /*if( std::is_same< DeviceType, Devices::Host >::value && i == 0 ) - { - for( int j = mesh->getDimensions().y()-1; j>-1; j-- ){ - for( int m = 0; m < mesh->getDimensions().x(); m++ ) - std::cout << aux[ j * mesh->getDimensions().x() + m ] << " "; - std::cout << std::endl; - } - std::cout << std::endl; - }*/ - - // TO SEE CUDA OVERLAPS - /*const int 
cudaBlockSize( 16 ); - int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); - int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); - dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps ); - dim3 blockSize( cudaBlockSize, cudaBlockSize ); - MeshFunctionPointer helpFunc( mesh ); - DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( helpFunc.template getData< Device>(), - auxPtr.template modifyData< Device>(), 1, i ); */ - -#endif - - int calculated = 1; // indicates weather we calculated in the last passage of the while cycle - // calculated is same for all ranks + { + // calculatedBefore indicates weather we calculated in the last passage of the while cycle + // calculatedBefore is same for all ranks // without MPI should be FALSE at the end of while cycle body - int calculate = 1; // indicates if the thread should calculate again in upcoming passage of cycle - // calculate is a value that can differ in every rank + int calculatedBefore = 1; + + // calculateMPIAgain indicates if the thread should calculate again in upcoming passage of while cycle + // calculateMPIAgain is a value that can differ in every rank // without MPI should be FALSE at the end of while cycle body + int calculateMPIAgain = 1; - while( calculated ) + while( calculatedBefore ) { - calculated = 0; -#if ForDebug - WhileCount++; - /*if( std::is_same< DeviceType, Devices::Cuda >::value ) - { - DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(), - helpFunc.template modifyData< Device>(), 0, i ); - }*/ -#endif + calculatedBefore = 0; - if( std::is_same< DeviceType, Devices::Host >::value && calculate ) // should we calculate in Host? + if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host? 
{ - calculate = 0; + calculateMPIAgain = 0; - /**--HERE-IS-PARALLEL-OMP-CODE--!!!WITHOUT MPI!!!--------------------**/ + /**--HERE-IS-PARALLEL-OMP-CODE--!!!WITHOUT MPI!!!--------------------**/ /* int numThreadsPerBlock = -1; @@ -300,92 +242,38 @@ solve( const MeshPointer& mesh, auxPtr = helpFunc; } */ - /**-END-OF-OMP-PARALLEL------------------------------------------------**/ + /**-END-OF-OMP-PARALLEL------------------------------------------------**/ - /*if( i == 1 ) - { - for( int k = 0; k < mesh->getDimensions().y(); k++ ){ - for( int l = 0; l < mesh->getDimensions().x(); l++ ) - printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] ); - printf("\n"); - } - }*/ - // FSM FOR MPI and WITHOUT MPI - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( ! interfaceMap( cell ) ) - { - calculated = this->updateCell( aux, cell ) || calculated; - } - } - } - - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y()-vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0]; - cell.getCoordinates().x() >= 0 + vLower[0]; - cell.getCoordinates().x()-- ) - { - //std::cerr << "2 -> "; - cell.refresh(); - if( ! 
interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - + StaticVector boundsFrom; StaticVector boundsTo; + // UP and RIGHT + boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; + boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; + calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + //aux.save("aux-1.tnl"); + + // UP and LEFL + boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = -1 + vecLowerOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); //aux.save( "aux-2.tnl" ); - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 -vUpper[1]; - cell.getCoordinates().y() >= 0 + vLower[1] ; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - //std::cerr << "3 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - + // DOWN and RIGHT + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; + boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); //aux.save( "aux-3.tnl" ); - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1]; - cell.getCoordinates().y() >= 0 + vLower[1]; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0]; - cell.getCoordinates().x() >= 0 + vLower[0]; - cell.getCoordinates().x()-- ) - { - //std::cerr << "4 -> "; - cell.refresh(); - if( ! 
interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } + // DOWN and LEFT + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + } - - if( std::is_same< DeviceType, Devices::Cuda >::value && calculate ) // should we calculate on CUDA? + if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain ) // should we calculate on CUDA? { - calculate = 0; - -#if ForDebug - printf("%d: We are in Cuda code start.\n", i); -#endif + calculateMPIAgain = 0; #ifdef HAVE_CUDA TNL_CHECK_CUDA_DEVICE; @@ -394,8 +282,8 @@ solve( const MeshPointer& mesh, const int cudaBlockSize( 16 ); // Setting number of threads and blocks for kernel - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vLower[0] - vUpper[0], cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vLower[1] - vUpper[1], cudaBlockSize ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize ); dim3 blockSize( cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY ); @@ -439,60 +327,21 @@ solve( const MeshPointer& mesh, BlockIterD = dBlock.getElement( 0 );*/ // Array that identifies which blocks should be calculated. 
- TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; - BlockIterDevice.setSize( numBlocksX * numBlocksY ); - BlockIterDevice.setValue( 1 ); + // All blocks should calculate in first passage ( setValue(1) ) + TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicator( numBlocksX * numBlocksY ); + blockCalculationIndicator.setValue( 1 ); TNL_CHECK_CUDA_DEVICE; - // Array into which we identify the neighbours and then copy it into BlockIterDevice - TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; - BlockIterPom.setSize( numBlocksX * numBlocksY ); - BlockIterPom.setValue( 0 ); + // Array into which we identify the neighbours and then copy it into blockCalculationIndicator + TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicatorHelp(numBlocksX * numBlocksY ); + blockCalculationIndicatorHelp.setValue( 0 ); -#if ForDebug // For printf of BlockIterDevice - TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1; - BlockIterPom1.setSize( numBlocksX * numBlocksY ); - BlockIterPom1.setValue( 0 ); -#endif + // number of Blocks for kernel that calculates neighbours. int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); - // for CudaPrallelReduc (replaced with .containsValue(1)) - //int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 
1:0); - //TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; - //dBlock.setSize( nBlocks ); - //TNL::Containers::Array< int, Devices::Host, IndexType > dBlock1; - //dBlock1.setSize( nBlocks ); - //TNL_CHECK_CUDA_DEVICE; // Helping meshFunction that switches with AuxPtr in every calculation of CudaUpdateCellCaller<<<>>>() MeshFunctionPointer helpFunc( mesh ); - helpFunc.template modifyData() = auxPtr.template getData(); - Devices::Cuda::synchronizeDevice(); - //MeshFunctionPointer helpFunc1( mesh ); - - // Setting number of threads and blocks in grid for DeepCopy of meshFunction - /*int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); - int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); - dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps ); - - - Devices::Cuda::synchronizeDevice(); - DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(), - helpFunc.template modifyData< Device>(), 1, i ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - Devices::Cuda::synchronizeDevice(); - DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(), - helpFunc.template modifyData< Device>(), 0, i ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE;*/ - -#if ForDebug - /*int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); - int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); - dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps );*/ - DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(), - helpFunc.template modifyData< Device>(), 0, i ); -#endif + helpFunc.template modifyData() = auxPtr.template getData(); //int pocBloku = 0; Devices::Cuda::synchronizeDevice(); @@ -505,18 +354,16 
@@ solve( const MeshPointer& mesh, TNL_CHECK_CUDA_DEVICE; //int oddEvenBlock = 0; - //int numberWhile = 0; - while( BlockIterD ) + while( calculateCudaBlocksAgain ) { - //numberWhile++; - /** HERE IS CHESS METHOD (NO MPI) **/ + /** HERE IS CHESS METHOD (NO MPI) **/ /* CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(), - BlockIterDevice, + blockCalculationIndicator, oddEvenBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; @@ -527,14 +374,14 @@ solve( const MeshPointer& mesh, interfaceMapPtr.template getData< Device >(), helpFunc.template getData< Device>(), auxPtr.template modifyData< Device>(), - BlockIterDevice, vLower, vUpper, + blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps, oddEvenBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; - CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); + CudaParallelReduc<<< nBlocks , 1024 >>>( blockCalculationIndicator, dBlock, ( numBlocksX * numBlocksY ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); @@ -543,16 +390,14 @@ solve( const MeshPointer& mesh, BlockIterD = dBlock.getElement( 0 );*/ - /**------------------------------------------------------------------------------------------------*/ + /**------------------------------------------------------------------------------------------------*/ - /** HERE IS FIM FOR MPI AND WITHOUT MPI **/ + /** HERE IS FIM FOR MPI AND WITHOUT MPI **/ Devices::Cuda::synchronizeDevice(); - CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, - interfaceMapPtr.template getData< Device >(), - auxPtr.template getData< Device>(), - helpFunc.template modifyData< Device>(), - BlockIterDevice, vLower, vUpper, i ); + CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, 
interfaceMapPtr.template getData< Device >(), + auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(), + blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; @@ -589,53 +434,22 @@ solve( const MeshPointer& mesh, // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now. Devices::Cuda::synchronizeDevice(); - GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - BlockIterDevice = BlockIterPom; + GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator, blockCalculationIndicatorHelp, numBlocksX, numBlocksY ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; -#if ForDebug - if( i == 1 ){ - BlockIterPom1 = BlockIterDevice; - for( int i =0; i< numBlocksX; i++ ){ - for( int j = 0; j < numBlocksY; j++ ) - { - std::cout << BlockIterPom1[j*numBlocksX + i]; - } - std::cout << std::endl; - } - std::cout << std::endl; - } -#endif - // "Parallel reduction" to see if we should calculate again BlockIterD - BlockIterD = BlockIterDevice.containsValue(1); - /*Devices::Cuda::synchronizeDevice(); - CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); + blockCalculationIndicator = blockCalculationIndicatorHelp; cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - // Parallel reduction on dBlock because of too large number of blocks (more than maximum number of threads) - Devices::Cuda::synchronizeDevice(); - CudaParallelReduc<<< 1, 1024 >>>( dBlock, dBlock, nBlocks ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE;*/ - - // Copy of the first element which is result of parallel reduction - /*Devices::Cuda::synchronizeDevice(); - BlockIterD = dBlock.getElement( 0 ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE;*/ + // "Parallel reduction" to see if we should calculate again BlockIterD + 
calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1); // When we change something then we should caclucate again in the next passage of MPI ( calculated = true ) - - - if( BlockIterD ){ - calculated = 1; + if( calculateCudaBlocksAgain ){ + calculatedBefore = 1; } - /**-----------------------------------------------------------------------------------------------------------*/ - +/**-----------------------------------------------------------------------------------------------------------*/ numIter ++; } if( numIter%2 == 1 ){ @@ -679,96 +493,66 @@ solve( const MeshPointer& mesh, #endif MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR, MPI::AllGroup ); aux.template synchronize< Communicator >(); - calculate = calculpom[0] || calculpom[1] || calculpom[2] || calculpom[3]; -#if ForDebug - printf( "%d: Receved Calculated = %d.\n%d: Calculate = %d\n", i, calculated, i, calculate); -#endif - -#if ForDebug - if( i == 1 ) - printf("WhileCount = %d\n",WhileCount); - //calculated = 0; // DEBUG; -#endif } #endif if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculated 0! - calculated = 0; + calculatedBefore = 0; } iteration++; } - //String s( "aux-" + std::to_string( i ) + ".tnl" ); - //aux.save( s ); Aux=auxPtr; // copy it for MakeSnapshot - - aux.save("aux-final.tnl"); } -#ifdef HAVE_CUDA -// DeepCopy nebo pracne kopirovat kraje v zavislosti na vLower,vUpper z sArray do helpFunc. 
-template< typename Real, typename Device, typename Index > -__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, int copy, int k ) +// PROTECTED FUNCTIONS: + +template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > +void +FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: +setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, + const MeshPointer& mesh) { - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = blockDim.y*blockIdx.y + threadIdx.y; - const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); - if( copy ){ - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) - helpFunc[ j * mesh.getDimensions().x() + i ] = 1;//aux[ j * mesh.getDimensions().x() + i ]; - } - else + vecLowerOverlaps[0] = 0; vecLowerOverlaps[1] = 0; vecUpperOverlaps[0] = 0; vecUpperOverlaps[1] = 0; +#ifdef HAVE_MPI + if( CommunicatorType::isDistributed() ) //If we started solver with MPI { - if( i==0 && j == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) - { - for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){ - for( int l = 0; l < 17; l++ ){ - printf( "%.2f ", aux[ m * mesh.getDimensions().x() + l ]); - } - printf( "\n"); - } - printf( "\n"); - } - if( i==0 && j == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) - { - for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){ - for( int l = 0; l < 17; l++ ){ - printf( "%.2f ", helpFunc[ m * mesh.getDimensions().x() + l ]); - } - printf( "\n"); - } - printf( "\n"); - } + //Distributed mesh for MPI overlaps (without MPI null pointer) + const Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh(); + vecLowerOverlaps = meshPom->getLowerOverlap(); + vecUpperOverlaps = meshPom->getUpperOverlap(); } +#endif } 
template < typename Index > __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY ) { - int i = blockIdx.x * 1024 + threadIdx.x; + bool calculated = false; + const MeshType& mesh = aux.getMesh(); + const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1; + const IndexType stepY = boundsFrom[1] < boundsTo[1]? 1 : -1; + + typename MeshType::Cell cell( mesh ); + cell.refresh(); - if( i < numBlockX * numBlockY ) + for( cell.getCoordinates().y() = boundsFrom[1]; + TNL::abs( cell.getCoordinates().y() - boundsTo[1] ) > 0; + cell.getCoordinates().y() += stepY ) { - int pom = 0;//BlockIterPom[ i ] = 0; - int m=0, k=0; - m = i%numBlockX; - k = i/numBlockX; - if( m > 0 && BlockIterDevice[ i - 1 ] ){ - pom = 1;//BlockIterPom[ i ] = 1; - }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ - pom = 1;//BlockIterPom[ i ] = 1; - }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ - pom = 1;// BlockIterPom[ i ] = 1; - }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ - pom = 1;//BlockIterPom[ i ] = 1; + for( cell.getCoordinates().x() = boundsFrom[0]; + TNL::abs( cell.getCoordinates().x() - boundsTo[0] ) > 0; + cell.getCoordinates().x() += stepX ) + { + cell.refresh(); + if( ! 
interfaceMap( cell ) ) + { + calculated = this->updateCell( aux, cell ) || calculated; + } } - - if( BlockIterDevice[ i ] != 1 ) - BlockIterPom[ i ] = pom;//BlockIterPom[ i ]; - else - BlockIterPom[ i ] = 1; } + return calculated; } template < typename Index > @@ -863,254 +647,51 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks ); TNL_CHECK_CUDA_DEVICE; { - int thri = threadIdx.x; int thrj = threadIdx.y; - int i = threadIdx.x + blockDim.x*blockIdx.x + vLower[0]; - int j = blockDim.y*blockIdx.y + threadIdx.y + vLower[1]; - const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); - /** FOR CHESS METHOD */ - //if( (blockIdx.y%2 + blockIdx.x) % 2 == oddEvenBlock ) - //{ - /**------------------------------------------*/ + Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh(); + int calculateFromNeighbours[4] = {0,0,0,0}; + const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh + MPI::Request *requestsInformation; + requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ]; - /** FOR FIM METHOD */ + int neighCount = 0; // should this thread calculate again? 
- if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] ) - { - __syncthreads(); - - /**-----------------------------------------*/ - __shared__ int dimX; - __shared__ int dimY; - __shared__ Real hx; - __shared__ Real hy; - if( thri==0 && thrj == 0) - { - dimX = mesh.getDimensions().x(); - dimY = mesh.getDimensions().y(); - hx = mesh.getSpaceSteps().x(); - hy = mesh.getSpaceSteps().y(); - BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 0; - } - __syncthreads(); - int numOfBlockx; - int numOfBlocky; - int xkolik; - int ykolik; - - xkolik = blockDim.x + 1; - ykolik = blockDim.y + 1; - numOfBlocky = gridDim.y;//(dimY-vUpper[1]-vLower[1])/blockDim.y + (((dimY-vUpper[1]-vLower[1])%blockDim.y != 0) ? 1:0); - numOfBlockx = gridDim.x;//(dimX-vUpper[0]-vLower[0])/blockDim.x + (((dimX-vUpper[0]-vLower[0])%blockDim.x != 0) ? 1:0); - - if( numOfBlockx - 1 == blockIdx.x ) - xkolik = (dimX-vUpper[0]-vLower[0]) - (blockIdx.x)*blockDim.x+1; - - if( numOfBlocky -1 == blockIdx.y ) - ykolik = (dimY-vUpper[1]-vLower[1]) - (blockIdx.y)*blockDim.y+1; - __syncthreads(); - -#if ForDebug - /*if( thri==0 && thrj == 0 ) - { - printf("%d: DimX = %d, DimY = %d, xKolik = %d, yKolik = %d, numOfBlockX = %d, numOfBlockY = %d, blockIdx.x = %d, blockIdx.y = %d.\n", - k, dimX, dimY, xkolik, ykolik, numOfBlockx, numOfBlocky, blockIdx.x, blockIdx.y); - }*/ -#endif - - int currentIndex = thrj * blockDim.x + thri; - //__shared__ volatile bool changed[ blockDim.x*blockDim.y ]; - __shared__ volatile bool changed[ (sizeSArray-2)*(sizeSArray-2)]; - changed[ currentIndex ] = false; - if( thrj == 0 && thri == 0 ) - changed[ 0 ] = true; - - - //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ]; - __shared__ volatile Real sArray[ sizeSArray * sizeSArray ]; - sArray[ (thrj+1) * sizeSArray + thri +1 ] = std::numeric_limits< Real >::max(); - - - //filling sArray edges - if( thri == 0 ) - { - if( dimX - vLower[ 0 ] > (blockIdx.x+1) * blockDim.x && thrj+1 < ykolik ) - 
sArray[(thrj+1)*sizeSArray + xkolik] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + xkolik + vLower[0] ]; - else - sArray[(thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max(); - } - - if( thri == 1 ) - { - if( ( blockIdx.x != 0 || vLower[0] != 0 ) && thrj+1 < ykolik ) - sArray[(thrj+1)*sizeSArray + 0] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + vLower[0] ]; - else - sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max(); - } - - if( thri == 2 ) - { - if( dimY - vLower[ 1 ] > (blockIdx.y+1) * blockDim.y && thrj+1 < xkolik ) - sArray[ ykolik*sizeSArray + thrj+1 ] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + ykolik*dimX + thrj+1 + vLower[0] ]; - else - sArray[ ykolik*sizeSArray + thrj+1 ] = std::numeric_limits< Real >::max(); - - } - - if( thri == 3 ) - { - if( ( blockIdx.y != 0 || vLower[1] != 0 ) && thrj+1 < xkolik ) - sArray[0*sizeSArray + thrj+1] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + thrj+1 + vLower[0] ]; - else - sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); - } - /*__syncthreads(); - if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 1 ) - { - printf( "Kraje: \n"); - for( int k = sizeSArray-1; k>-1; k-- ){ - for( int l = 0; l < sizeSArray; l++ ) - printf( "%.4f ", sArray[k * sizeSArray + l]); - printf( "\n"); - } - printf( "\n"); - } - __syncthreads();*/ - - - if( i-vLower[0] < dimX && j-vLower[1] < dimY && thri+1 < xkolik + vUpper[0] && thrj+1 < ykolik + vUpper[1] ) - { - /*if( k == 1 && blockIdx.x == 0 && blockIdx.y == 0 ) - printf("at index = %d\n", j*dimX + i);*/ - sArray[(thrj+1)*sizeSArray + thri+1] = aux[ (j)*dimX + i ]; - } - __syncthreads(); -#if ForDebug - if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) - { - printf( "všechno před výpočtem: \n"); - for( int m = 
sizeSArray-1; m>-1; m-- ){ - for( int l = 0; l < sizeSArray; l++ ) - printf( "%.2f ", sArray[m * sizeSArray + l]); - printf( "\n"); - } - printf( "\n"); - } - - if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) - { - for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){ - for( int l = 0; l < 17; l++ ) - printf( "%.2f ", aux[ m * mesh.getDimensions().x() + l ]); - printf( "\n"); - } - printf( "\n"); - } -#endif - //main while cycle - //if( i == 0 && j == 0 ) - // printf("Overlaps [x1,y1],[x2,y2] = [%d,%d],[%d,%d]",vLower[0], vLower[1], vUpper[0], vUpper[1] ); - - while( changed[ 0 ] ) - { - __syncthreads(); - - changed[ currentIndex] = false; - - //calculation of update cell - if( i < dimX - vUpper[0] && j < dimY - vUpper[1] /*&& i > vLower[0]-1 && j > vLower[1]-1*/ ) - { - if( ! interfaceMap[ j * dimX + i ] ) - { - /*if( k == 1 && blockIdx.x == 1 && blockIdx.y == 0 ) - printf( "thri = %d, thrj = %d \n", thri, thrj );*/ - changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy); - } - } - __syncthreads(); - - //pyramid reduction - if( blockDim.x*blockDim.y == 1024 ) - { - if( currentIndex < 512 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y >= 512 ) - { - if( currentIndex < 256 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y >= 256 ) - { - if( currentIndex < 128 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y >= 128 ) - { - if( currentIndex < 64 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; - } - } - __syncthreads(); - if( currentIndex < 32 ) - { - if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; - if( currentIndex < 16 ) changed[ currentIndex ] 
= changed[ currentIndex ] || changed[ currentIndex + 16 ]; - if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; - if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; - if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; - if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; - } - if( thri == 0 && thrj == 0 && changed[ 0 ] ){ - BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 1; - } - __syncthreads(); - } - - - - if( i < dimX && j < dimY && thri+1 < xkolik && thrj+1 < ykolik ) - helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ]; - __syncthreads(); -#if ForDebug - if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) - { - printf( "všechno po výpočtu: \n"); - for( int m = sizeSArray-1; m>-1; m-- ){ - for( int l = 0; l < sizeSArray; l++ ) - printf( "%.2f ", sArray[m * sizeSArray + l]); - printf( "\n"); - } - printf( "\n"); - } - - if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 ) - { - printf( "8: \n"); - for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){ - for( int l = 0; l < mesh.getDimensions().x(); l++ ) - printf( "%.2f ", helpFunc[ m * mesh.getDimensions().x() + l ]); - printf("\n"); - } - printf( "\n"); - } -#endif + if( neighbours[0] != -1 ) // LEFT + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup ); } - else + + if( neighbours[1] != -1 ) // RIGHT + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup ); + } + + if( 
neighbours[2] != -1 ) //UP + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup ); + } + + if( neighbours[5] != -1 ) //DOWN { - if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] ) - helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ]; + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup ); } + MPI::WaitAll( requestsInformation, neighCount ); + + MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR, MPI::AllGroup ); + calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] || + calculateFromNeighbours[2] || calculateFromNeighbours[3]; } #endif TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock ) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h index 40a1efeba..2d73b174e 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h @@ -13,8 +13,6 @@ #pragma once -#include "tnlFastSweepingMethod.h" - template< typename Real, typename Device, @@ -69,24 +67,12 @@ solve( const MeshPointer& mesh, auxPtr->setMesh( mesh ); interfaceMapPtr->setMesh( mesh ); - //Distributed mesh for overlaps (without MPI is null pointer) - Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh(); - // getting overlaps ( WITHOUT MPI SHOULD BE 0 ) - Containers::StaticVector< 3, IndexType > 
vLower; - vLower[0] = 0; vLower[1] = 0; vLower[2] = 0; - Containers::StaticVector< 3, IndexType > vUpper; - vUpper[0] = 0; vUpper[1] = 0; vUpper[2] = 0; -#ifdef HAVE_MPI - if( CommunicatorType::isDistributed() ) - { - vLower = meshPom->getLowerOverlap(); - vUpper = meshPom->getUpperOverlap(); - } -#endif + Containers::StaticVector< 3, IndexType > vecLowerOverlaps, vecUpperOverlaps; + setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh ); std::cout << "Initiating the interface cells ..." << std::endl; - BaseType::initInterface( u, auxPtr, interfaceMapPtr, vLower, vUpper ); + BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps ); auxPtr->save( "aux-ini.tnl" ); typename MeshType::Cell cell( *mesh ); @@ -95,59 +81,26 @@ solve( const MeshPointer& mesh, MeshFunctionType aux = *auxPtr; InterfaceMapType interfaceMap = * interfaceMapPtr; aux.template synchronize< Communicator >(); //synchronization of intial conditions - int i = MPI::GetRank( MPI::AllGroup ); //getting identification of MPI thread -#if ForDebug - if( i == 2 ){ - aux.save("aux-init2.tnl"); - mesh->save("mesh-2.tnl"); - } - if( i == 1 ){ - aux.save("aux-init1.tnl"); - mesh->save("mesh-1.tnl"); - } - if( i == 3 ){ - aux.save("aux-init3.tnl"); - mesh->save("mesh-3.tnl"); - } - if( i == 0 ){ - aux.save("aux-init0.tnl"); - mesh->save("mesh-0.tnl"); - } -#endif while( iteration < this->maxIterations ) - { -#if ForDebug - int WhileCount = 0; // number of passages of while cycle with condition calculated - printf( "%d: meshDimensions are (x,y,z) = (%d,%d,%d).\n",i, mesh->getDimensions().x(), mesh->getDimensions().y(), mesh->getDimensions().z() ); - printf( "%d: owerlaps are ([x1,x2],[y1,y2],[z1,z2]) = ([%d,%d],[%d,%d],[%d,%d]).\n",i, vLower[0], vUpper[0], vLower[1], vUpper[1], vUpper[2], vLower[2] ); - /*if( std::is_same< DeviceType, Devices::Host >::value && i == 2 ) - { - for( int j = mesh->getDimensions().y()-1; j>-1; j-- ){ - for( int m = 0; m < mesh->getDimensions().x(); 
m++ ) - printf( "%.2f " , aux[ j*mesh->getDimensions().x() + m ]); - printf("\n"); - } - printf("\n"); - }*/ -#endif - - int calculated = 1; // indicates weather we calculated in the last passage of the while cycle - // calculated is same for all ranks + { + // indicates whether we calculated in the last passage of the while cycle + // calculatedBefore is same for all ranks // without MPI should be FALSE at the end of while cycle body - int calculate = 1; // indicates if the thread should calculate again in upcoming passage of cycle - // calculate is a value that can differ in every rank + int calculatedBefore = 1; + + // indicates if the MPI process should calculate again in upcoming passage of cycle + // calculateMPIAgain is a value that can differ in every rank // without MPI should be FALSE at the end of while cycle body + int calculateMPIAgain = 1; - while( calculated ) + while( calculatedBefore ) { - calculated = 0; -#if ForDebug - WhileCount++; -#endif - if( std::is_same< DeviceType, Devices::Host >::value && calculate ) // should we calculate in Host? + calculatedBefore = 0; + + if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host? 
{ - calculate = 0; + calculateMPIAgain = 0; /** HERE IS FSM FOR OPENMP (NO MPI) - isnt worthy */ /*int numThreadsPerBlock = 64; @@ -212,401 +165,60 @@ solve( const MeshPointer& mesh, /** HERE IS FSM WITH MPI AND WITHOUT MPI */ + StaticVector boundsFrom; StaticVector boundsTo; -#if ForDebug - if( i == 1 ){ - aux.save("aux-final10.tnl"); - } -#endif - for( cell.getCoordinates().z() = 0 + vLower[2]; - cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( ! interfaceMap( cell ) ) - { - //getting information weather we calculated in this passage - calculated = this->updateCell( aux, cell ) || calculated; - } - } - } - } -#if ForDebug - if( i == 1 ){ - aux.save("aux-final11.tnl"); - } - int pocNull = 0; - for( cell.getCoordinates().z() = 0 + vLower[2]; - cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( fabs( aux(cell) ) < 0.002 ) - pocNull++; - } - } - } - printf("%d: 1. 
pocNull = %d\n", i , pocNull); -#endif - for( cell.getCoordinates().z() = 0 + vLower[2]; - cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0]; - cell.getCoordinates().x() >= 0 + vLower[0]; - cell.getCoordinates().x()-- ) - { - //std::cerr << "2 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } -#if ForDebug - if( i == 1 ){ - aux.save("aux-final12.tnl"); - } - pocNull = 0; - for( cell.getCoordinates().z() = 0 + vLower[2]; - cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( fabs( aux(cell) ) < 0.002 ) - pocNull++; - } - } - } - printf("%d: 2. pocNull = %d\n", i , pocNull); -#endif - //aux.save( "aux-2.tnl" ); - for( cell.getCoordinates().z() = 0 + vLower[2]; - cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1]; - cell.getCoordinates().y() >= 0 + vLower[1]; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - //std::cerr << "3 -> "; - cell.refresh(); - if( ! 
interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } -#if ForDebug - if( i == 1 ){ - aux.save("aux-final13.tnl"); - } - pocNull = 0; - for( cell.getCoordinates().z() = 0 + vLower[2]; - cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( fabs( aux(cell) ) < 0.002 ) - pocNull++; - } - } - } - printf("%d: 3. pocNull = %d\n", i , pocNull); -#endif - //aux.save( "aux-3.tnl" ); + // TOP, NORTH and EAST + boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; + boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; + boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; + calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - for( cell.getCoordinates().z() = 0 + vLower[2]; - cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1]; - cell.getCoordinates().y() >= 0 + vLower[1]; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0]; - cell.getCoordinates().x() >= 0 + vLower[0]; - cell.getCoordinates().x()-- ) - { - //std::cerr << "4 -> "; - cell.refresh(); - if( ! 
interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } -#if ForDebug - if( i == 1 ){ - aux.save("aux-final14.tnl"); - } - pocNull = 0; - for( cell.getCoordinates().z() = 0 + vLower[2]; - cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( fabs( aux(cell) ) < 0.002 ) - pocNull++; - } - } - } - printf("%d: 4. pocNull = %d\n", i , pocNull); -#endif - //aux.save( "aux-4.tnl" ); + // TOP, NORTH and WEST + boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; + boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2]; - cell.getCoordinates().z() >= 0 + vLower[2]; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - //std::cerr << "5 -> "; - cell.refresh(); - if( ! 
interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } -#if ForDebug - if( i == 1 ){ - aux.save("aux-final15.tnl"); - } - pocNull = 0; - for( cell.getCoordinates().z() = 0 + vLower[2]; - cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( fabs( aux(cell) ) < 0.002 ) - pocNull++; - } - } - } - printf("%d: 5. pocNull = %d\n", i , pocNull); - #endif - //aux.save( "aux-5.tnl" ); + // TOP, SOUTH and EAST + boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1]; + boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2]; - cell.getCoordinates().z() >= 0 + vLower[2]; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0]; - cell.getCoordinates().x() >= 0 + vLower[0]; - cell.getCoordinates().x()-- ) - { - //std::cerr << "6 -> "; - cell.refresh(); - if( ! 
interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } -#if ForDebug - if( i == 1 ){ - aux.save("aux-final16.tnl"); - } - pocNull = 0; - for( cell.getCoordinates().z() = 0 + vLower[2]; - cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( fabs( aux(cell) ) < 0.002 ) - pocNull++; - } - } - } - printf("%d: 6. pocNull = %d\n", i , pocNull); -#endif - //aux.save( "aux-6.tnl" ); + // TOP, SOUTH and WEST + boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + // BOTTOM, NORTH and EAST + boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2]; + boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; + boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2]; - cell.getCoordinates().z() >= 0 + vLower[2]; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1]; - cell.getCoordinates().y() >= 0 + vLower[1]; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < 
mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - //std::cerr << "7 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } + // BOTTOM, NOTH and WEST + boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2]; + boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); -#if ForDebug - if( i == 1 ){ - aux.save("aux-final17.tnl"); - } - pocNull = 0; - for( cell.getCoordinates().z() = 0 + vLower[2]; - cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( fabs( aux(cell) ) < 0.002 ) - pocNull++; - } - } - } - printf("%d: 7. 
pocNull = %d\n", i , pocNull); -#endif - //aux.save( "aux-7.tnl" ); + // BOTTOM, SOUTH and EAST + boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2]; + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1]; + boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + // BOTTOM, SOUTH and WEST + boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2]; + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2]; - cell.getCoordinates().z() >= 0 + vLower[2]; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1]; - cell.getCoordinates().y() >= 0 + vLower[1]; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0]; - cell.getCoordinates().x() >= 0 + vLower[0]; - cell.getCoordinates().x()-- ) - { - //std::cerr << "8 -> "; - cell.refresh(); - if( ! 
interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } -#if ForDebug - if( i == 1 ){ - aux.save("aux-final18.tnl"); - } - pocNull = 0; - for( cell.getCoordinates().z() = 0 + vLower[2]; - cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( fabs( aux(cell) ) < 0.002 ) - pocNull++; - } - } - } - printf("%d: 8. pocNull = %d\n", i , pocNull); - for( cell.getCoordinates().z() = 0 + vLower[2]; - cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2]; - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1]; - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0]; - cell.getCoordinates().x()++ ) - { - cell.refresh(); - printf("%.2f ", aux(cell)); - } - printf("\n"); - } - printf("\n"); - } -#endif - /**----------------------------------------------------------------------------------*/ + /**----------------------------------------------------------------------------------*/ } - if( std::is_same< DeviceType, Devices::Cuda >::value && calculate ) + if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain ) { #ifdef HAVE_CUDA // cudaBlockSize is a size of blocks. It's the number raised to the 3 power. 
@@ -640,115 +252,18 @@ solve( const MeshPointer& mesh, } #ifdef HAVE_MPI - if( CommunicatorType::isDistributed() ){ - - const int *neigh = meshPom->getNeighbors(); // Getting nembers of distributed mesh - MPI::Request *req; - req = new MPI::Request[meshPom->getNeighborsCount()]; - - int neighCount = 0; // we know the number in runtime and it can differ for every MPI thread - // Getting information weather some of six neghbours (top, bottom, right, left, ahead, behind) calculated - int calculpom[6] = {0,0,0,0,0,0}; - - - if( neigh[0] != -1 ) // if you have west neighbour - { - // if we have this neighbour, we send calculated, one number, to him, ... - req[neighCount] = MPI::ISend( &calculated, 1, neigh[0], 0, MPI::AllGroup ); - neighCount++; - // and we recive the same information from him - req[neighCount] = MPI::IRecv( &calculpom[0], 1, neigh[0], 0, MPI::AllGroup ); - neighCount++; - } - - if( neigh[1] != -1 ) // east - { - req[neighCount] = MPI::ISend( &calculated, 1, neigh[1], 0, MPI::AllGroup ); - neighCount++; - - - req[neighCount] = MPI::IRecv( &calculpom[1], 1, neigh[1], 0, MPI::AllGroup ); - neighCount++; - } - - if( neigh[2] != -1 ) // north - { - req[neighCount] = MPI::ISend( &calculated, 1, neigh[2], 0, MPI::AllGroup ); - neighCount++; - - req[neighCount] = MPI::IRecv( &calculpom[2], 1, neigh[2], 0, MPI::AllGroup ); - neighCount++; - } - - if( neigh[5] != -1 ) //south - { - req[neighCount] = MPI::ISend( &calculated, 1, neigh[5], 0, MPI::AllGroup ); - neighCount++; - - req[neighCount] = MPI::IRecv( &calculpom[3], 1, neigh[5], 0, MPI::AllGroup ); - neighCount++; - } - - if( neigh[8] != -1 ) // top - { - req[neighCount] = MPI::ISend( &calculated, 1, neigh[8], 0, MPI::AllGroup ); - neighCount++; - - req[neighCount] = MPI::IRecv( &calculpom[4], 1, neigh[8], 0, MPI::AllGroup ); - neighCount++; - } - - if( neigh[17] != -1 ) //bottom - { - req[neighCount] = MPI::ISend( &calculated, 1, neigh[17], 0, MPI::AllGroup ); - neighCount++; - - req[neighCount] = 
MPI::IRecv( &calculpom[5], 1, neigh[17], 0, MPI::AllGroup ); - neighCount++; - } - - MPI::WaitAll(req,neighCount); //waiting for all to have all the information -#if ForDebug - printf( "%d: Sending Calculated = %d.\n", i, calculated ); - printf( "%d: calculpom[0] = %d, calculpom[1] = %d, calculpom[2] = %d, calculpom[3] = %d, calculpom[4] = %d," - "calculpom[5] = %d", i ,calculpom[0],calculpom[1],calculpom[2],calculpom[3],calculpom[4],calculpom[5]); -#endif - // if one of the MPI thread had calculated = 1, then all get 1. Otherwise all get 0 - MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR, MPI::AllGroup ); + if( CommunicatorType::isDistributed() ) + { + getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh ); + // synchronizate the overlaps aux.template synchronize< Communicator >(); - // if any of my neighbours had calculated = 1, than I should calculate again (but all of us has to go throw while(calculated)) - calculate = calculpom[0] || calculpom[1] || calculpom[2] || - calculpom[3] || calculpom[4] || calculpom[5]; -#if ForDebug - printf( "%d: Receved Calculated = %d.\n%d: Calculate = %d\n", i, calculated, i, calculate); -#endif - -#if ForDebug - if( i == 1 ) - printf("WhileCount = %d\n",WhileCount); - if( i == 2 ){ - aux.save("aux-final2.tnl"); - mesh->save("mesh-2.tnl"); - } - if( i == 1 ){ - aux.save("aux-final1.tnl"); - mesh->save("mesh-1.tnl"); - } - if( i == 3 ){ - aux.save("aux-final3.tnl"); - mesh->save("mesh-3.tnl"); - } - if( i == 0 ){ - aux.save("aux-final0.tnl"); - mesh->save("mesh-0.tnl"); - } - //calculated = 0; // DEBUG; -#endif + } #endif - if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculated 0! - calculated = 0; //otherwise we would go throw the FSM code and CUDA FSM code again uselessly + + if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculatedBefore 0! 
+ calculatedBefore = 0; //otherwise we would go throw the FSM code and CUDA FSM code again uselessly } //aux.save( "aux-8.tnl" ); iteration++; @@ -759,65 +274,69 @@ solve( const MeshPointer& mesh, aux.save("aux-final.tnl"); } -#ifdef HAVE_CUDA -// DeepCopy nebo pracne kopirovat kraje v zavislosti na vLower,vUpper z sArray do helpFunc. -template< typename Real, typename Device, typename Index > -__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, int copy, int k ) +// PROTECTED FUNCTIONS: + +template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > +void +FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: +setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, + const MeshPointer& mesh) { - int thri = threadIdx.x + blockDim.x*blockIdx.x; - int thrj = blockDim.y*blockIdx.y + threadIdx.y; - int thrk = blockDim.z*blockIdx.z + threadIdx.z; - - const Meshes::Grid< 3, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); - if( copy ){ - if( thri < mesh.getDimensions().x() && thrj < mesh.getDimensions().y() && thrk < mesh.getDimensions().z() ) - { - helpFunc[ thrk * mesh.getDimensions().x() * mesh.getDimensions().y() + thrj * mesh.getDimensions().x() + thri ] = - aux[ thrk * mesh.getDimensions().x() * mesh.getDimensions().y() + thrj * mesh.getDimensions().x() + thri ]; - } + vecLowerOverlaps[0] = 0; vecLowerOverlaps[1] = 0; vecLowerOverlaps[2] = 0; + vecUpperOverlaps[0] = 0; vecUpperOverlaps[1] = 0; vecUpperOverlaps[2] = 0; +#ifdef HAVE_MPI + if( CommunicatorType::isDistributed() ) //If we started solver with MPI + { + //Distributed mesh for MPI overlaps (without MPI null pointer) + const Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh(); + vecLowerOverlaps = 
meshPom->getLowerOverlap(); + vecUpperOverlaps = meshPom->getUpperOverlap(); } - else // for debug, values can be printed only from cuda kernel +#endif +} + + + + +template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > +bool +FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: +goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, + MeshFunctionType& aux, const InterfaceMapType& interfaceMap, + const AnisotropyPointer& anisotropy ) +{ + bool calculated = false; + const MeshType& mesh = aux.getMesh(); + const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1; + const IndexType stepY = boundsFrom[1] < boundsTo[1]? 1 : -1; + const IndexType stepZ = boundsFrom[2] < boundsTo[2]? 1 : -1; + + typename MeshType::Cell cell( mesh ); + cell.refresh(); + + for( cell.getCoordinates().z() = boundsFrom[2]; + TNL::abs( cell.getCoordinates().z() - boundsTo[2] ) > 0; + cell.getCoordinates().z() += stepZ ) { - if( thrk == 0 && thri==0 && thrj == 0 && blockIdx.z == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 0 ) - { - printf("%d: DimX = %d, DimY = %d, DimZ = %d\n", k,mesh.getDimensions().x(),mesh.getDimensions().y(),mesh.getDimensions().z() ); - for( int z = mesh.getDimensions().z()-1; z > mesh.getDimensions().z()-2; z-- ) - { - for( int y = 0; y < mesh.getDimensions().y(); y++ ) - { - for( int x = 0; x < mesh.getDimensions().x(); x++ ) - { - printf("%.2f ", helpFunc[ z *mesh.getDimensions().y()*mesh.getDimensions().x() + y*mesh.getDimensions().x() + x ]); - } - printf("\n"); - } - printf("\n"); - } - printf("\n"); - } - if( thrk == 0 && thri==0 && thrj == 0 && blockIdx.z == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 1 ) + for( cell.getCoordinates().y() = boundsFrom[1]; + TNL::abs( cell.getCoordinates().y() - boundsTo[1] ) > 0; + cell.getCoordinates().y() += stepY ) { - printf("%d: DimX = %d, DimY = %d, DimZ = %d\n", 
k,mesh.getDimensions().x(),mesh.getDimensions().y(),mesh.getDimensions().z() ); - - if( k == 1 ) + for( cell.getCoordinates().x() = boundsFrom[0]; + TNL::abs( cell.getCoordinates().x() - boundsTo[0] ) > 0; + cell.getCoordinates().x() += stepX ) { - for( int z = 1; z < 2; z++ ) + cell.refresh(); + if( ! interfaceMap( cell ) ) { - for( int y = 0; y < mesh.getDimensions().y(); y++ ) - { - for( int x = 0; x < mesh.getDimensions().x(); x++ ) - { - printf("%.2f ", aux[ z *mesh.getDimensions().y()*mesh.getDimensions().x() + y*mesh.getDimensions().x() + x ]); - } - printf("\n"); - } - printf("\n"); + calculated = this->updateCell( aux, cell ) || calculated; } - printf("\n"); } } } + return calculated; } template < typename Index > @@ -825,33 +344,22 @@ __global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY, int numBlockZ ) { - int i = blockIdx.x * 1024 + threadIdx.x; + Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh(); - if( i < numBlockX * numBlockY * numBlockZ ) + int calculateFromNeighbours[6] = {0,0,0,0,0,0}; + + const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh + MPI::Request *requestsInformation; + requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ]; + + int neighCount = 0; // should this thread calculate again? 
+ + if( neighbours[0] != -1 ) // WEST { - int pom = 0;//BlockIterPom[ i ] = 0; - int m=0, l=0, k=0; - l = i/( numBlockX * numBlockY ); - k = (i-l*numBlockX * numBlockY )/(numBlockX ); - m = (i-l*numBlockX * numBlockY )%( numBlockX ); - if( m > 0 && BlockIterDevice[ i - 1 ] ){ // left neighbour - pom = 1;//BlockIterPom[ i ] = 1; - }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ // right neighbour - pom = 1;//BlockIterPom[ i ] = 1; - }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ // bottom neighbour - pom = 1;// BlockIterPom[ i ] = 1; - }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ // top neighbour - pom = 1;//BlockIterPom[ i ] = 1; - }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){ // neighbour behind - pom = 1; - }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){ // neighbour in front - pom = 1; - } - - if( !BlockIterDevice[ i ] ) // only in CudaUpdateCellCaller can BlockIterDevice gain 0 - BlockIterPom[ i ] = pom; - else - BlockIterPom[ i ] = 1; + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup ); } } @@ -870,233 +378,51 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri; const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); - // should this block calculate? 
- if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] ) + if( neighbours[1] != -1 ) // EAST { - __syncthreads(); - - // Array indicates weather some threads calculated (for parallel reduction) - __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2) ]; - changed[ currentIndex ] = false; - - if( thrj == 0 && thri == 0 && thrk == 0 ) - changed[ 0 ] = true; // first indicates weather we should calculate again (princip of parallel reduction) - - __shared__ Real hx; __shared__ int dimX; //getting stepps and size of mesh - __shared__ Real hy; __shared__ int dimY; - __shared__ Real hz; __shared__ int dimZ; - - if( thrj == 1 && thri == 1 && thrk == 1 ) - { - //printf( "We are in the calculation. Block = %d.\n",blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ); - hx = mesh.getSpaceSteps().x(); - hy = mesh.getSpaceSteps().y(); - hz = mesh.getSpaceSteps().z(); - dimX = mesh.getDimensions().x(); - dimY = mesh.getDimensions().y(); - dimZ = mesh.getDimensions().z(); - // we dont know if we will calculate in here, more info down in code - BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0; - } - - // sArray contains values of one block (coppied from aux) and edges (not MPI) of those blocks - __shared__ volatile Real sArray[ sizeSArray * sizeSArray * sizeSArray ]; - sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = std::numeric_limits< Real >::max(); - - // getting some usefull information - int numOfBlockx; - int numOfBlocky; - int numOfBlockz; - int xkolik; // maximum of threads in x direction (for all blocks different) - int ykolik; - int zkolik; - xkolik = blockDim.x + 1; - ykolik = blockDim.y + 1; - zkolik = blockDim.z + 1; - numOfBlockx = gridDim.x; - numOfBlocky = gridDim.y; - numOfBlockz = gridDim.z; - __syncthreads(); - - if( numOfBlockx - 1 == blIdx ) - xkolik = (dimX-vUpper[0]-vLower[0]) - 
(blIdx)*blockDim.x+1; - if( numOfBlocky -1 == blIdy ) - ykolik = (dimY-vUpper[1]-vLower[1]) - (blIdy)*blockDim.y+1; - if( numOfBlockz-1 == blIdz ) - zkolik = (dimZ-vUpper[2]-vLower[2]) - (blIdz)*blockDim.z+1; - __syncthreads(); - - //filling sArray edges - if( thri == 0 ) //x bottom - { - if( (blIdx != 0 || vLower[0] !=0) && thrj+1 < ykolik && thrk+1 < zkolik ) - sArray[ (thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0 ] = - aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX - + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY + vLower[0] ]; - else - sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max(); - } - - if( thri == 1 ) //xtop - { - if( dimX - vLower[ 0 ] > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = - aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX - + blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY + vLower[0] ]; - else - sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max(); - } - if( thri == 2 ) //y bottom - { - if( (blIdy != 0 || vLower[1] !=0) && thrj+1 < xkolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = - aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX - + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY + vLower[0] ]; - else - sArray[ (thrk+1) * sizeSArray * sizeSArray + 0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); - } - - if( thri == 3 ) //y top - { - if( dimY - vLower[ 1 ] > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = - aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + ((blIdy+1) * blockDim.y+vLower[1])*dimX - + blIdx*blockDim.x + thrj + 
thrk*dimX*dimY + vLower[0] ]; - else - sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); - } - if( thri == 4 ) //z bottom - { - if( (blIdz != 0 || vLower[2] !=0) && thrj+1 < ykolik && thrk+1 < xkolik ) - sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = - aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX - + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk + vLower[0] ]; - else - sArray[0 * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thrk+1] = std::numeric_limits< Real >::max(); - } - - if( thri == 5 ) //z top - { - if( dimZ - vLower[ 2 ] > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik ) - sArray[ zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = - aux[ ((blIdz+1)*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX - + blIdx*blockDim.x + thrj * dimX + thrk + vLower[0] ]; - else - sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = std::numeric_limits< Real >::max(); - } - - // Copy all other values that aren't edges - if( i - vLower[0] < dimX && j - vLower[1] < dimY && k - vLower[2] < dimZ && - thri+1 < xkolik + vUpper[0] && thrj+1 < ykolik + vUpper[1] && thrk+1 < zkolik + vUpper[2] ) - { - sArray[(thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = aux[ k*dimX*dimY + j*dimX + i ]; - } - __syncthreads(); - -#if ForDebug - /*if( thri==0 && thrj == 0 && thrk == 0 && blockIdx.z == 0 && blockIdx.x == 2 && blockIdx.y == 2 && MPIthread == 1 ) - { - printf( "všechno před výpočtem: \n"); - for( int m = sizeSArray-1; m>-1; m-- ){ - for( int l = 0; l < sizeSArray; l++ ) - printf( "%.2f ", sArray[4*sizeSArray * sizeSArray + m * sizeSArray + l]); - printf( "\n"); - } - printf( "\n"); - } - - if(thri==0 && thrj == 0 && thrk == 0 && blockIdx.z == 0 && blockIdx.x == 2 && blockIdx.y == 2 && MPIthread == 1 ) - { - for( int m = 24; m>14; m-- ){ - for( int l = 
15; l < 25; l++ ) - printf("%.2f ", aux[ 4 *mesh.getDimensions().y()*mesh.getDimensions().x() + m*mesh.getDimensions().x() + l ]); - printf( "\n"); - } - printf( "\n"); - }*/ -#endif - - //main while cycle. each value can get information only from neighbour but that information has to spread there - while( changed[ 0 ] ) - { - __syncthreads(); - - changed[ currentIndex ] = false; - - //calculation of update cell - if( i < dimX - vUpper[0] && j < dimY - vUpper[1] && k < dimZ - vUpper[2] ) - { - if( ! interfaceMap[ k*dimX*dimY + j * dimX + i ] ) - { - // calculate new value depending on neighbours in sArray on (thri+1, thrj+1) coordinates - changed[ currentIndex ] = ptr.updateCell3D< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); - } - } - __syncthreads(); - - //pyramid reduction (parallel reduction) - if( blockDim.x*blockDim.y*blockDim.z == 1024 ) - { - if( currentIndex < 512 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y*blockDim.z >= 512 ) - { - if( currentIndex < 256 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y*blockDim.z >= 256 ) - { - if( currentIndex < 128 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y*blockDim.z >= 128 ) - { - if( currentIndex < 64 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; - } - } - __syncthreads(); - if( currentIndex < 32 ) - { - if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; - if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; - if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; - if( currentIndex < 4 ) changed[ currentIndex ] = 
changed[ currentIndex ] || changed[ currentIndex + 4 ]; - if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; - if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; - } - __syncthreads(); - - // if we calculated, then the BlockIterDevice should contain the info about this whole block! (only one number for one block) - if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 ) - { - BlockIterDevice[ blIdz * gridDim.x * gridDim.y + blIdy * gridDim.x + blIdx ] = 1; - } - __syncthreads(); - } - - // copy results into helpFunc (not into aux bcs of conflicts) - if( i < dimX && j < dimY && k < dimZ && thri+1 < xkolik && thrj+1 < ykolik && thrk+1 < zkolik ) - helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thri+1 ]; - + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup ); } - else // if not, then it should at least copy the values from aux to helpFunc. 
+ + if( neighbours[2] != -1 ) //NORTH { - if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] - && k < mesh.getDimensions().z() - vUpper[2]) - helpFunc[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ] = - aux[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ]; + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup ); } -} + + if( neighbours[5] != -1 ) //SOUTH + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup ); + } + + if( neighbours[8] != -1 ) // TOP + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[8], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[4], 1, neighbours[8], 0, MPI::AllGroup ); + } + + if( neighbours[17] != -1 ) //BOTTOM + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[17], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[5], 1, neighbours[17], 0, MPI::AllGroup ); + } + + MPI::WaitAll( requestsInformation, neighCount ); + + MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR, MPI::AllGroup ); + calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] || + calculateFromNeighbours[2] || calculateFromNeighbours[3] || + calculateFromNeighbours[4] || calculateFromNeighbours[5]; +} #endif -- GitLab From b42fa59a58ec27b6266d772620df32926c0117db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matou=C5=A1=20Fencl?= Date: Sat, 16 Mar 2019 11:10:46 +0100 Subject: [PATCH 10/14] deleting 
tnlDirectEikonalMethodsBase_impl.h --- .../tnlDirectEikonalMethodsBase_impl.h | 1591 ----------------- 1 file changed, 1591 deletions(-) delete mode 100644 src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h deleted file mode 100644 index a5d3d81df..000000000 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +++ /dev/null @@ -1,1591 +0,0 @@ -/* - * File: tnlDirectEikonalMethodsBase_impl.h - * Author: oberhuber - * - * Created on July 14, 2016, 3:22 PM - */ - -#pragma once - -#include - -#include -#include "tnlFastSweepingMethod.h" -#include "tnlDirectEikonalMethodsBase.h" - -template< typename Real, - typename Device, - typename Index > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: -initInterface( const MeshFunctionPointer& _input, - MeshFunctionPointer& _output, - InterfaceMapPointer& _interfaceMap ) -{ - if( std::is_same< Device, Devices::Cuda >::value ) - { -#ifdef HAVE_CUDA - const MeshType& mesh = _input->getMesh(); - - const int cudaBlockSize( 16 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); - dim3 blockSize( cudaBlockSize ); - dim3 gridSize( numBlocksX ); - Devices::Cuda::synchronizeDevice(); - CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), - _output.template modifyData< Device >(), - _interfaceMap.template modifyData< Device >() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; -#endif - } - if( std::is_same< Device, Devices::Host >::value ) - { - const MeshType& mesh = _input->getMesh(); - typedef typename MeshType::Cell Cell; - const MeshFunctionType& input = _input.getData(); - MeshFunctionType& output = _output.modifyData(); - 
InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); - Cell cell( mesh ); - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - output[ cell.getIndex() ] = - input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : - -std::numeric_limits< RealType >::max(); - interfaceMap[ cell.getIndex() ] = false; - } - - - const RealType& h = mesh.getSpaceSteps().x(); - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x() - 1; - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - const RealType& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - const auto& neighbors = cell.getNeighborEntities(); - Real pom = 0; - //const IndexType& c = cell.getIndex(); - const IndexType e = neighbors.template getEntityIndex< 1 >(); - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( h * c )/( c - input[ e ]); - if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) - output[ cell.getIndex() ] = pom; - - pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; - if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) - output[ e ] = pom; - - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ e ] = true; - } - } - } - } -} - -template< typename Real, - typename Device, - typename Index > -template< int sizeSArray > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -updateBlocks( InterfaceMapType interfaceMap, - MeshFunctionType aux, - MeshFunctionType helpFunc, - ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) -{ -#pragma omp parallel for schedule( dynamic ) - for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) - { - if( BlockIterHost[ i ] ) - { - MeshType mesh = interfaceMap.template getMesh< Devices::Host >(); - - int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); - //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << 
std::endl; - int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0); - int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0); - //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl; - int xkolik = numThreadsPerBlock + 1; - int ykolik = numThreadsPerBlock + 1; - - int blIdx = i%numOfBlockx; - int blIdy = i/numOfBlockx; - //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl; - - if( numOfBlockx - 1 == blIdx ) - xkolik = dimX - (blIdx)*numThreadsPerBlock+1; - - if( numOfBlocky -1 == blIdy ) - ykolik = dimY - (blIdy)*numThreadsPerBlock+1; - //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl; - - - /*bool changed[numThreadsPerBlock*numThreadsPerBlock]; - changed[ 0 ] = 1;*/ - Real hx = mesh.getSpaceSteps().x(); - Real hy = mesh.getSpaceSteps().y(); - - bool changed = false; - BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0; - - - Real *sArray; - sArray = new Real[ sizeSArray * sizeSArray ]; - if( sArray == nullptr ) - std::cout << "Error while allocating memory for sArray." 
<< std::endl; - - for( IndexType thri = 0; thri < sizeSArray; thri++ ){ - for( IndexType thrj = 0; thrj < sizeSArray; thrj++ ) - sArray[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max(); - } - - - //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); - for( IndexType thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ ) - { - if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik ) - sArray[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ]; - - - if( blIdx != 0 && thrj+1 < ykolik ) - sArray[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ]; - - if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik ) - sArray[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ]; - - if( blIdy != 0 && thrj+1 < xkolik ) - sArray[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ]; - } - - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = 0; l < numThreadsPerBlock; l++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) - sArray[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]; - } - - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = 0; l < numThreadsPerBlock; l++ ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){ - //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl; - if( ! 
interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) - { - changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy) || changed; - - } - } - } - } - /*aux.save( "aux-1pruch.tnl" ); - for( int k = 0; k < sizeSArray; k++ ){ - for( int l = 0; l < sizeSArray; l++ ) { - std::cout << sArray[ k * sizeSArray + l] << " "; - } - std::cout << std::endl; - }*/ - - for( IndexType k = 0; k < numThreadsPerBlock; k++ ) - for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) - { - if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) - { - this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy); - } - } - } - /*aux.save( "aux-2pruch.tnl" ); - for( int k = 0; k < sizeSArray; k++ ){ - for( int l = 0; l < sizeSArray; l++ ) { - std::cout << sArray[ k * sizeSArray + l] << " "; - } - std::cout << std::endl; - }*/ - - for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ) - for( IndexType l = 0; l < numThreadsPerBlock; l++ ) { - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) - { - if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) - { - this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy); - } - } - } - /*aux.save( "aux-3pruch.tnl" ); - for( int k = 0; k < sizeSArray; k++ ){ - for( int l = 0; l < sizeSArray; l++ ) { - std::cout << sArray[ k * sizeSArray + l] << " "; - } - std::cout << std::endl; - }*/ - - for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ - for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) - { - if( ! 
interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) - { - this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx, hy, 1.0); - } - } - } - } - /*aux.save( "aux-4pruch.tnl" ); - for( int k = 0; k < sizeSArray; k++ ){ - for( int l = 0; l < sizeSArray; l++ ) { - std::cout << sArray[ k * sizeSArray + l] << " "; - } - std::cout << std::endl; - }*/ - - - if( changed ){ - BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1; - } - - - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = 0; l < numThreadsPerBlock; l++ ) { - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) - helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray[ (k + 1)* sizeSArray + l + 1 ]; - //std::cout<< sArray[k+1][l+1]; - } - //std::cout< -template< int sizeSArray > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: -updateBlocks( const InterfaceMapType& interfaceMap, - const MeshFunctionType& aux, - MeshFunctionType& helpFunc, - ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) -{ -//#pragma omp parallel for schedule( dynamic ) - for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) - { - if( BlockIterHost[ i ] ) - { - MeshType mesh = interfaceMap.template getMesh< Devices::Host >(); - - int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); - int dimZ = mesh.getDimensions().z(); - //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl; - int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0); - int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0); - int numOfBlockz = dimZ/numThreadsPerBlock + ((dimZ%numThreadsPerBlock != 0) ? 
1:0); - //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl; - int xkolik = numThreadsPerBlock + 1; - int ykolik = numThreadsPerBlock + 1; - int zkolik = numThreadsPerBlock + 1; - - - int blIdz = i/( numOfBlockx * numOfBlocky ); - int blIdy = (i-blIdz*numOfBlockx * numOfBlocky )/(numOfBlockx ); - int blIdx = (i-blIdz*numOfBlockx * numOfBlocky )%( numOfBlockx ); - //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl; - - if( numOfBlockx - 1 == blIdx ) - xkolik = dimX - (blIdx)*numThreadsPerBlock+1; - if( numOfBlocky -1 == blIdy ) - ykolik = dimY - (blIdy)*numThreadsPerBlock+1; - if( numOfBlockz-1 == blIdz ) - zkolik = dimZ - (blIdz)*numThreadsPerBlock+1; - //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl; - - - /*bool changed[numThreadsPerBlock*numThreadsPerBlock]; - changed[ 0 ] = 1;*/ - Real hx = mesh.getSpaceSteps().x(); - Real hy = mesh.getSpaceSteps().y(); - Real hz = mesh.getSpaceSteps().z(); - - bool changed = false; - BlockIterHost[ i ] = 0; - - - Real *sArray; - sArray = new Real[ sizeSArray * sizeSArray * sizeSArray ]; - if( sArray == nullptr ) - std::cout << "Error while allocating memory for sArray." 
<< std::endl; - - for( IndexType k = 0; k < sizeSArray; k++ ) - for( IndexType l = 0; l < sizeSArray; l++ ) - for( IndexType m = 0; m < sizeSArray; m++ ){ - sArray[ m * sizeSArray * sizeSArray + k * sizeSArray + l ] = std::numeric_limits< Real >::max(); - } - - - for( IndexType thrk = 0; thrk < numThreadsPerBlock; thrk++ ) - for( IndexType thrj = 0; thrj < numThreadsPerBlock; thrj++ ) - { - if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik ) - sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = - aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX -1 + thrk*dimX*dimY ]; - - if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = - aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy *numThreadsPerBlock*dimX+ blIdx*numThreadsPerBlock + numThreadsPerBlock + thrj * dimX + thrk*dimX*dimY ]; - - if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = - aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX + thrj + thrk*dimX*dimY ]; - - if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = - aux[ blIdz*numThreadsPerBlock * dimX * dimY + (blIdy+1) * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj + thrk*dimX*dimY ]; - - if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik ) - sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = - aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX * dimY + thrj * dimX + thrk ]; - - if( dimZ > (blIdz+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < xkolik ) - sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + 
thrk+1] = - aux[ (blIdz+1)*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX + thrk ]; - } - - for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = 0; l < numThreadsPerBlock; l++ ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - sArray[(m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1] = - aux[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ]; - } - } - } - /*string s; - int numWhile = 0; - for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = 0; l < numThreadsPerBlock; l++ ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){ - //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl; - if( ! 
interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) - { - //printf("In with point m = %d, k = %d, l = %d\n", m, k, l); - changed = this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz) || changed; - - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = 0; l template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - { - if( ! 
interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) - { - this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s ); - */ - for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - { - if( ! 
interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) - { - this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ - for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ - for( IndexType l = 0; l template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ - for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ - for( IndexType l = 0; l template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = 
sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ - for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ - for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - { - if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) - { - this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - - for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ - for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ - for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - { - if( ! 
interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) - { - this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - if( changed ){ - BlockIterHost[ i ] = 1; - } - - - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = 0; l < numThreadsPerBlock; l++ ) { - for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){ - helpFunc[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] = - sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - //std::cout << helpFunc[ m*dimX*dimY + k*dimX + l ] << " "; - } - } - //std::cout << std::endl; - } - //std::cout << std::endl; - } - //helpFunc.save( "helpF.tnl"); - delete []sArray; - } - } -} -template< typename Real, - typename Device, - typename Index > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: -getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY, int numBlockZ ) -{ - int* BlockIterPom; - BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ]; - - for( int i = 0; i< BlockIterHost.getSize(); i++) - { - BlockIterPom[ i ] = 0; - - int m=0, l=0, k=0; - l = i/( numBlockX * numBlockY ); - k = (i-l*numBlockX * numBlockY 
)/(numBlockX ); - m = (i-l*numBlockX * numBlockY )%( numBlockX ); - - if( m > 0 && BlockIterHost[ i - 1 ] ){ - BlockIterPom[ i ] = 1; - }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){ - BlockIterPom[ i ] = 1; - }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){ - BlockIterPom[ i ] = 1; - }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){ - BlockIterPom[ i ] = 1; - }else if( l > 0 && BlockIterHost[ i - numBlockX*numBlockY ] ){ - BlockIterPom[ i ] = 1; - }else if( l < numBlockZ-1 && BlockIterHost[ i + numBlockX*numBlockY ] ){ - BlockIterPom[ i ] = 1; - } - } - for( int i = 0; i< BlockIterHost.getSize(); i++) - { - BlockIterHost[ i ] = BlockIterPom[ i ]; - } -} - - -template< typename Real, - typename Device, - typename Index > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY ) -{ - int* BlockIterPom; - BlockIterPom = new int [numBlockX * numBlockY]; - - for(int i = 0; i < numBlockX * numBlockY; i++) - { - BlockIterPom[ i ] = 0;//BlockIterPom[ i ] = 0; - int m=0, k=0; - m = i%numBlockX; - k = i/numBlockX; - if( m > 0 && BlockIterHost[ i - 1 ] ){ - BlockIterPom[ i ] = 1; - }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){ - BlockIterPom[ i ] = 1; - }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){ - BlockIterPom[ i ] = 1; - }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){ - BlockIterPom[ i ] = 1; - } - //BlockIterPom[ i ]; - } - - for(int i = 0; i < numBlockX * numBlockY; i++) - { - if( !BlockIterHost[ i ] ) - BlockIterHost[ i ] = BlockIterPom[ i ]; - } - /*else - BlockIter[ i ] = 0;*/ - /*for( int i = numBlockX-1; i > -1; i-- ) - { - for( int j = 0; j< numBlockY; j++ ) - std::cout << BlockIterHost[ i*numBlockY + j ]; - std::cout << std::endl; - } - std::cout << std::endl;*/ - delete[] BlockIterPom; -} - -template< typename Real, - typename Device, - typename Index > -template< typename MeshEntity > -void 
-tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: -updateCell( MeshFunctionType& u, - const MeshEntity& cell, - const RealType v ) -{ - const auto& neighborEntities = cell.template getNeighborEntities< 1 >(); - const MeshType& mesh = cell.getMesh(); - const RealType& h = mesh.getSpaceSteps().x(); - const RealType value = u( cell ); - RealType a, tmp = std::numeric_limits< RealType >::max(); - - if( cell.getCoordinates().x() == 0 ) - a = u[ neighborEntities.template getEntityIndex< 1 >() ]; - else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) - a = u[ neighborEntities.template getEntityIndex< -1 >() ]; - else - { - a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ], - u[ neighborEntities.template getEntityIndex< 1 >() ] ); - } - - if( fabs( a ) == std::numeric_limits< RealType >::max() ) - return; - - tmp = a + TNL::sign( value ) * h/v; - - u[ cell.getIndex() ] = argAbsMin( value, tmp ); -} - -template< typename Real, - typename Device, - typename Index > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -initInterface( const MeshFunctionPointer& _input, - MeshFunctionPointer& _output, - InterfaceMapPointer& _interfaceMap, - StaticVector vLower, StaticVector vUpper ) -{ - - if( std::is_same< Device, Devices::Cuda >::value ) - { -#ifdef HAVE_CUDA - const MeshType& mesh = _input->getMesh(); - - const int cudaBlockSize( 16 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); - dim3 blockSize( cudaBlockSize, cudaBlockSize ); - dim3 gridSize( numBlocksX, numBlocksY ); - Devices::Cuda::synchronizeDevice(); - CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), - _output.template modifyData< Device >(), - _interfaceMap.template modifyData< Device >(), - vLower, vUpper); - cudaDeviceSynchronize(); - 
TNL_CHECK_CUDA_DEVICE; -#endif - } - if( std::is_same< Device, Devices::Host >::value ) - { - MeshFunctionType input = _input.getData(); - MeshFunctionType& output = _output.modifyData(); - InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); - const MeshType& mesh = input.getMesh(); -/*#ifdef HAVE_MPI - int i>s::>::GetRan>s::>::AllGroup ); - if( i == 0 ) - { - printf( "0: mesh x: %d\n", mesh.getDimensions().x() ); - printf( "0: mesh y: %d\n", mesh.getDimensions().y() ); - for( int k = 0; k < mesh.getDimensions().y(); k++ ){ - for( int l = 0; l < mesh.getDimensions().x(); l++ ) - printf( "%.2f\t", input[ k * 16 + l ] ); - printf("\n"); - } - } -#endif*/ - typedef typename MeshType::Cell Cell; - Cell cell( mesh ); - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh.getDimensions().y(); - cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - output[ cell.getIndex() ] = - input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : - - std::numeric_limits< RealType >::max(); - interfaceMap[ cell.getIndex() ] = false; - } - - const RealType& hx = mesh.getSpaceSteps().x(); - const RealType& hy = mesh.getSpaceSteps().y(); - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh.getDimensions().y() - vUpper[1]; - cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh.getDimensions().x() - vUpper[0]; - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - const RealType& c = input( cell ); - if( ! 
cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const IndexType e = neighbors.template getEntityIndex< 1, 0 >(); - const IndexType n = neighbors.template getEntityIndex< 0, 1 >(); - //Try init with exact data: - /*if( c * input[ n ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ n ] = input[ n ]; - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ n ] = true; - } - if( c * input[ e ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ e ] = input[ e ]; - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ e ] = true; - }*/ - if( c * input[ n ] <= 0 ) - { - /*if( c >= 0 ) - {*/ - pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); - if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) - output[ cell.getIndex() ] = pom; - pom = pom - TNL::sign( c )*hy; - if( TNL::abs( output[ n ] ) > TNL::abs( pom ) ) - output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy; - /*}else - { - pom = - ( hy * c )/( c - input[ n ]); - if( output[ cell.getIndex() ] < pom ) - output[ cell.getIndex() ] = pom; - if( output[ n ] > hy + pom ) - output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]); - }*/ - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ n ] = true; - } - if( c * input[ e ] <= 0 ) - { - /*if( c >= 0 ) - {*/ - pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); - if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) - output[ cell.getIndex() ] = pom; - - pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; - if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) - output[ e ] = pom; - /*}else - { - pom = - (hx * c)/( c - input[ e ]); - if( output[ cell.getIndex() ] < pom ) - output[ cell.getIndex() ] = pom; - - pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]); - if( output[ e ] > pom ) - output[ e ] = pom; - }*/ - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ e ] = true; - } - } - } -#ifdef HAVE_MPI - //int i>s::>::GetRan>s::>::AllGroup ); - 
/*if( i == 0 ) - { - printf( "0: mesh x: %d\n", mesh.getDimensions().x() ); - printf( "0: mesh y: %d\n", mesh.getDimensions().y() ); - for( int k = 0; k < mesh.getDimensions().y(); k++ ){ - for( int l = 0; l < mesh.getDimensions().x(); l++ ) - printf("%.2f\t",output[ k * 16 + l ] ); - printf("\n"); - } - }*/ -#endif - } -} - -template< typename Real, - typename Device, - typename Index > -template< typename MeshEntity > -__cuda_callable__ -bool -tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -updateCell( MeshFunctionType& u, - const MeshEntity& cell, - const RealType v) -{ - const auto& neighborEntities = cell.template getNeighborEntities< 2 >(); - const MeshType& mesh = cell.getMesh(); - const RealType& hx = mesh.getSpaceSteps().x(); - const RealType& hy = mesh.getSpaceSteps().y(); - const RealType value = u( cell ); - RealType a, b, tmp = std::numeric_limits< RealType >::max(); - - if( cell.getCoordinates().x() == 0 ) - a = u[ neighborEntities.template getEntityIndex< 1, 0 >() ]; - else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) - a = u[ neighborEntities.template getEntityIndex< -1, 0 >() ]; - else - { - a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0 >() ], - u[ neighborEntities.template getEntityIndex< 1, 0 >() ] ); - } - - if( cell.getCoordinates().y() == 0 ) - b = u[ neighborEntities.template getEntityIndex< 0, 1 >()]; - else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) - b = u[ neighborEntities.template getEntityIndex< 0, -1 >() ]; - else - { - b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1 >() ], - u[ neighborEntities.template getEntityIndex< 0, 1 >() ] ); - } - if( fabs( a ) == std::numeric_limits< RealType >::max() && - fabs( b ) == std::numeric_limits< RealType >::max() ) - return false; - - RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; - sortMinims( pom ); - tmp = pom[ 0 ] + TNL::sign( 
value ) * pom[ 3 ]/v; - - if( fabs( tmp ) < fabs( pom[ 1 ] ) ) - { - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - tmp = value - u[ cell.getIndex() ]; - if ( fabs( tmp ) > 0.001*hx ){ - //printf( "Vracime true!\n"); - return true; - }else{ - //printf( "Vracime false2!\n"); - return false; - } - } - else { - tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + - TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - tmp = value - u[ cell.getIndex() ]; - if ( fabs( tmp ) > 0.001*hx ){ - //printf( "Vracime true3!\n"); - return true; - }else{ - //printf( "Vracime false!\n"); - return false; - } - } -} - -template< typename Real, - typename Device, - typename Index > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: -initInterface( const MeshFunctionPointer& _input, - MeshFunctionPointer& _output, - InterfaceMapPointer& _interfaceMap, - StaticVector vLower, StaticVector vUpper ) -{ - if( std::is_same< Device, Devices::Cuda >::value ) - { -#ifdef HAVE_CUDA - const MeshType& mesh = _input->getMesh(); - - const int cudaBlockSize( 8 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); - int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize ); - if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) - std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" 
<< std::endl; - dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); - dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); - Devices::Cuda::synchronizeDevice(); - CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(), - _output.template modifyData< Device >(), - _interfaceMap.template modifyData< Device >(), vLower, vUpper ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; -#endif - } - if( std::is_same< Device, Devices::Host >::value ) - { - const MeshFunctionType& input = _input.getData(); - MeshFunctionType& output = _output.modifyData(); - InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); - - const MeshType& mesh = input.getMesh(); - typedef typename MeshType::Cell Cell; - - Cell cell( mesh ); - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh.getDimensions().z(); - cell.getCoordinates().z() ++ ) - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh.getDimensions().y(); - cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - output[ cell.getIndex() ] = - input( cell ) > 0 ? 
std::numeric_limits< RealType >::max() : - - std::numeric_limits< RealType >::max(); - interfaceMap[ cell.getIndex() ] = false; - } - - const RealType& hx = mesh.getSpaceSteps().x(); - const RealType& hy = mesh.getSpaceSteps().y(); - const RealType& hz = mesh.getSpaceSteps().z(); - for( cell.getCoordinates().z() = 0 + vLower[2]; - cell.getCoordinates().z() < mesh.getDimensions().z() - vUpper[2]; - cell.getCoordinates().z() ++ ) - for( cell.getCoordinates().y() = 0 + vLower[1]; - cell.getCoordinates().y() < mesh.getDimensions().y() - vUpper[1]; - cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0 + vLower[0]; - cell.getCoordinates().x() < mesh.getDimensions().x() - vUpper[0]; - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - const RealType& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const IndexType e = neighbors.template getEntityIndex< 1, 0, 0 >(); - const IndexType n = neighbors.template getEntityIndex< 0, 1, 0 >(); - const IndexType t = neighbors.template getEntityIndex< 0, 0, 1 >(); - - - if( c * input[ n ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); - if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) - output[ cell.getIndex() ] = pom; - pom = pom - TNL::sign( c )*hy; - if( TNL::abs( output[ n ] ) > TNL::abs( pom ) ) - output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy; - - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ n ] = true; - } - - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); - if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) - output[ cell.getIndex() ] = pom; - pom = pom - TNL::sign( c )*hx; - if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) - output[ e ] = pom; //( hy * c )/( c - input[ n ]) - hy; - - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ e ] = true; - } - - if( c * input[ t ] <= 0 ) - { - pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); 
- if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) - output[ cell.getIndex() ] = pom; - pom = pom - TNL::sign( c )*hz; - if( TNL::abs( output[ t ] ) > TNL::abs( pom ) ) - output[ t ] = pom; //( hy * c )/( c - input[ n ]) - hy; - - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ t ] = true; - } - } - } - } -} - -template< typename Real, - typename Device, - typename Index > -template< typename MeshEntity > -__cuda_callable__ -bool -tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: -updateCell( MeshFunctionType& u, - const MeshEntity& cell, - const RealType v ) -{ - const auto& neighborEntities = cell.template getNeighborEntities< 3 >(); - const MeshType& mesh = cell.getMesh(); - - const RealType& hx = mesh.getSpaceSteps().x(); - const RealType& hy = mesh.getSpaceSteps().y(); - const RealType& hz = mesh.getSpaceSteps().z(); - const RealType value = u( cell ); - //std::cout << value << std::endl; - RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); - - - if( cell.getCoordinates().x() == 0 ) - a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ]; - else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) - a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ]; - else - { - a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ], - u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] ); - } - - if( cell.getCoordinates().y() == 0 ) - b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ]; - else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) - b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ]; - else - { - b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ], - u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] ); - } - - if( cell.getCoordinates().z() == 0 ) - c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ]; - else if( cell.getCoordinates().z() == 
mesh.getDimensions().z() - 1 ) - c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ]; - else - { - c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ], - u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] ); - } - - if( fabs( a ) == std::numeric_limits< RealType >::max() && - fabs( b ) == std::numeric_limits< RealType >::max() && - fabs( c ) == std::numeric_limits< RealType >::max() ) - return false; - - RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; - sortMinims( pom ); - tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; - - if( fabs( tmp ) < fabs( pom[ 1 ] ) ) - { - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - tmp = value - u[ cell.getIndex() ]; - if ( fabs( tmp ) > 0.001*hx ){ - return true; - }else{ - return false; - } - } - else - { - tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + - TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); - if( fabs( tmp ) < fabs( pom[ 2 ]) ) - { - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - tmp = value - u[ cell.getIndex() ]; - if ( fabs( tmp ) > 0.001*hx ){ - return true; - }else{ - return false; - } - } - else - { - tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + - TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - - hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - - hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - tmp = value - u[ cell.getIndex() ]; - if ( fabs( tmp ) > 0.001*hx ){ - return true; - }else{ - return false; - } - } - } -} - -template < typename T1 > -__cuda_callable__ void sortMinims( T1 pom[] ) -{ - T1 tmp[6] = 
{0.0,0.0,0.0,0.0,0.0,0.0}; - if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){ - tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2]; - tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5]; - - } - else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){ - tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1]; - tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4]; - } - else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){ - tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2]; - tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5]; - } - else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){ - tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0]; - tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3]; - } - else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){ - tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1]; - tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4]; - } - else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){ - tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0]; - tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3]; - } - - for( unsigned int i = 0; i < 6; i++ ) - { - pom[ i ] = tmp[ i ]; - } -} - -template< typename Real, - typename Device, - typename Index > -template< int sizeSArray > -__cuda_callable__ -bool -tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy, - const Real v ) -{ - const RealType value = sArray[ thrj * sizeSArray + thri ]; - RealType a, b, tmp = std::numeric_limits< RealType >::max(); - - b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ], - sArray[ (thrj-1) * sizeSArray + thri ] ); - - a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ], - sArray[ thrj * sizeSArray + thri-1 ] ); - - if( fabs( a ) == std::numeric_limits< RealType >::max() && - fabs( b ) == std::numeric_limits< RealType >::max() ) - return false; - - RealType pom[6] = 
{ a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; - sortMinims( pom ); - tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; - - - if( fabs( tmp ) < fabs( pom[ 1 ] ) ) - { - sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrj * sizeSArray + thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; - } - else - { - tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + - TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); - sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrj * sizeSArray + thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; - } - - return false; -} -template< typename Real, - typename Device, - typename Index > -template< int sizeSArray > -__cuda_callable__ -bool -tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: -updateCell3D( volatile Real *sArray, int thri, int thrj, int thrk, - const Real hx, const Real hy, const Real hz, const Real v ) -{ - const RealType value = sArray[thrk *sizeSArray * sizeSArray + thrj * sizeSArray + thri]; - - RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); - - c = TNL::argAbsMin( sArray[ (thrk+1)* sizeSArray*sizeSArray + thrj * sizeSArray + thri ], - sArray[ (thrk-1) * sizeSArray *sizeSArray + thrj* sizeSArray + thri ] ); - - b = TNL::argAbsMin( sArray[ thrk* sizeSArray*sizeSArray + (thrj+1) * sizeSArray + thri ], - sArray[ thrk* sizeSArray * sizeSArray + (thrj-1)* sizeSArray +thri ] ); - - a = TNL::argAbsMin( sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri+1 ], - sArray[ thrk* sizeSArray * sizeSArray + thrj* sizeSArray +thri-1 ] ); - - /*if( thrk == 8 ) - printf("Calculating a = %f, b = %f, c = %f\n" , a, b, c );*/ - - if( fabs( a ) == 
std::numeric_limits< RealType >::max() && - fabs( b ) == std::numeric_limits< RealType >::max() && - fabs( c ) == std::numeric_limits< RealType >::max() ) - return false; - - RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; - - sortMinims( pom ); - - tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; - if( fabs( tmp ) < fabs( pom[ 1 ] ) ) - { - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; - } - else - { - tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + - TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); - if( fabs( tmp ) < fabs( pom[ 2 ]) ) - { - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; - } - else - { - tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + - TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - - hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - - hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; - } - } - - return false; -} - -#ifdef HAVE_CUDA -template < typename Real, typename Device, typename Index > -__global__ void CudaInitCaller( const 
Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, - Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap ) -{ - int i = threadIdx.x + blockDim.x*blockIdx.x; - const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); - - if( i < mesh.getDimensions().x() ) - { - typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell; - Cell cell( mesh ); - cell.getCoordinates().x() = i; - cell.refresh(); - const Index cind = cell.getIndex(); - - - output[ cind ] = - input( cell ) >= 0 ? std::numeric_limits< Real >::max() : - - std::numeric_limits< Real >::max(); - interfaceMap[ cind ] = false; - - const Real& h = mesh.getSpaceSteps().x(); - cell.refresh(); - const Real& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const Index e = neighbors.template getEntityIndex< 1 >(); - const Index w = neighbors.template getEntityIndex< -1 >(); - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( h * c )/( c - input[ e ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ w ] <= 0 ) - { - pom = TNL::sign( c )*( h * c )/( c - input[ w ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - } - } - -} -template < typename Real, typename Device, typename Index > -__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, - Containers::StaticVector< 2, Index > vLower, Containers::StaticVector< 2, Index > vUpper ) -{ - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = 
blockDim.y*blockIdx.y + threadIdx.y; - const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); - - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) - { - typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell; - Cell cell( mesh ); - cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; - cell.refresh(); - const Index cind = cell.getIndex(); - - - output[ cind ] = - input( cell ) >= 0 ? std::numeric_limits< Real >::max() : - - std::numeric_limits< Real >::max(); - interfaceMap[ cind ] = false; - - if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] && i>vLower[0] -1 && j> vLower[0]-1 ) - { - const Real& hx = mesh.getSpaceSteps().x(); - const Real& hy = mesh.getSpaceSteps().y(); - cell.refresh(); - const Real& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const Index e = neighbors.template getEntityIndex< 1, 0 >(); - const Index w = neighbors.template getEntityIndex< -1, 0 >(); - const Index n = neighbors.template getEntityIndex< 0, 1 >(); - const Index s = neighbors.template getEntityIndex< 0, -1 >(); - - if( c * input[ n ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cell.getIndex() ] = true; - } - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ w ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ s ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ 
cind ] = true; - } - } - } - } -} - -template < typename Real, typename Device, typename Index > -__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, - Containers::StaticVector< 3, Index > vLower, Containers::StaticVector< 3, Index > vUpper ) -{ - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = blockDim.y*blockIdx.y + threadIdx.y; - int k = blockDim.z*blockIdx.z + threadIdx.z; - const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); - - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) - { - typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell; - Cell cell( mesh ); - cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k; - cell.refresh(); - const Index cind = cell.getIndex(); - - - output[ cind ] = - input( cell ) >= 0 ? std::numeric_limits< Real >::max() : - - std::numeric_limits< Real >::max(); - interfaceMap[ cind ] = false; - cell.refresh(); - - if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] && - k < mesh.getDimensions().y() - vUpper[2] && i>vLower[0]-1 && j> vLower[1]-1 && k>vLower[2]-1 ) - { - const Real& hx = mesh.getSpaceSteps().x(); - const Real& hy = mesh.getSpaceSteps().y(); - const Real& hz = mesh.getSpaceSteps().z(); - const Real& c = input( cell ); - if( ! 
cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const Index e = neighbors.template getEntityIndex< 1, 0, 0 >(); - const Index w = neighbors.template getEntityIndex< -1, 0, 0 >(); - const Index n = neighbors.template getEntityIndex< 0, 1, 0 >(); - const Index s = neighbors.template getEntityIndex< 0, -1, 0 >(); - const Index t = neighbors.template getEntityIndex< 0, 0, 1 >(); - const Index b = neighbors.template getEntityIndex< 0, 0, -1 >(); - - if( c * input[ n ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ w ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ s ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ b ] <= 0 ) - { - pom = TNL::sign( c )*( hz * c )/( c - input[ b ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ t ] <= 0 ) - { - pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - } - } - } -} - - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool -tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: -updateCell( volatile Real sArray[18], int thri, const Real h, const Real v ) -{ - const RealType value = sArray[ thri ]; - 
RealType a, tmp = std::numeric_limits< RealType >::max(); - - a = TNL::argAbsMin( sArray[ thri+1 ], - sArray[ thri-1 ] ); - - if( fabs( a ) == std::numeric_limits< RealType >::max() ) - return false; - - tmp = a + TNL::sign( value ) * h/v; - - - sArray[ thri ] = argAbsMin( value, tmp ); - - tmp = value - sArray[ thri ]; - if ( fabs( tmp ) > 0.001*h ) - return true; - else - return false; -} -#endif -- GitLab From f2dc45179eed5e340509370dfe9b252817173007 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 11 Apr 2019 14:07:15 +0200 Subject: [PATCH 11/14] Fixed saving with expcetions. --- .../Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h index 105a068d3..c36c4dca9 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h @@ -173,10 +173,7 @@ makeSnapshot( ) Meshes::DistributedMeshes::DistributedGridIO ::save(fileName.getFileName(), *u ); } else - { - if( ! this->u->save( fileName.getFileName() ) ) - return false; - } + this->u->save( fileName.getFileName() ); return true; } -- GitLab From 008601adc763ccbbcfdf8db6901a9af11d75b723 Mon Sep 17 00:00:00 2001 From: fencl Date: Mon, 23 Sep 2019 22:09:53 +0200 Subject: [PATCH 12/14] Fix 2D GPU neighbours. Version with Chess method and OpenMP FSM methods. 
--- .../tnlDirectEikonalMethodBase2D_impl.h | 10 +-- .../tnlDirectEikonalMethodBase3D_impl.h | 6 +- .../tnlFastSweepingMethod2D_impl.h | 15 ++--- .../tnlFastSweepingMethod3D_impl.h | 64 +++++++++++++++---- 4 files changed, 62 insertions(+), 33 deletions(-) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h index 583e22478..50ea7bde8 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h @@ -365,13 +365,13 @@ __global__ void GetNeighbours( const TNL::Containers::Array< int, Devices::Cuda, m = i%numBlockX; k = i/numBlockX; if( m > 0 && blockCalculationIndicator[ i - 1 ] ){ - pom = 1;//BlockIterPom[ i ] = 1; + pom = 1;//blockCalculationIndicatorHelp[ i ] = 1; }else if( m < numBlockX -1 && blockCalculationIndicator[ i + 1 ] ){ - pom = 1;//BlockIterPom[ i ] = 1; - }else if( k > 0 && blockCalculationIndicatorHelp[ i - numBlockX ] ){ - pom = 1;// BlockIterPom[ i ] = 1; + pom = 1;//blockCalculationIndicatorHelp[ i ] = 1; + }else if( k > 0 && blockCalculationIndicator[ i - numBlockX ] ){ + pom = 1;// blockCalculationIndicatorHelp[ i ] = 1; }else if( k < numBlockY -1 && blockCalculationIndicator[ i + numBlockX ] ){ - pom = 1;//BlockIterPom[ i ] = 1; + pom = 1;//blockCalculationIndicatorHelp[ i ] = 1; } if( blockCalculationIndicator[ i ] != 1 ) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h index 91f9a0efe..5b2a4b685 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h +++ 
b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h @@ -109,7 +109,7 @@ initInterface( const MeshFunctionPointer& _input, output[ cell.getIndex() ] = pom; pom = pom - TNL::sign( c )*hx; if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) - output[ e ] = pom; //( hy * c )/( c - input[ n ]) - hy; + output[ e ] = pom; interfaceMap[ cell.getIndex() ] = true; interfaceMap[ e ] = true; @@ -122,7 +122,7 @@ initInterface( const MeshFunctionPointer& _input, output[ cell.getIndex() ] = pom; pom = pom - TNL::sign( c )*hz; if( TNL::abs( output[ t ] ) > TNL::abs( pom ) ) - output[ t ] = pom; //( hy * c )/( c - input[ n ]) - hy; + output[ t ] = pom; interfaceMap[ cell.getIndex() ] = true; interfaceMap[ t ] = true; @@ -736,7 +736,7 @@ updateBlocks( const InterfaceMapType interfaceMap, MeshFunctionType& helpFunc, ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) { - //#pragma omp parallel for schedule( dynamic ) + #pragma omp parallel for schedule( dynamic ) for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) { if( BlockIterHost[ i ] ) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index 66f9e6cdf..31d3f8b32 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -363,7 +363,7 @@ solve( const MeshPointer& mesh, interfaceMapPtr.template getData< Device >(), auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(), - blockCalculationIndicator, + blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps, oddEvenBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; @@ -381,15 +381,8 @@ solve( const MeshPointer& mesh, oddEvenBlock= (oddEvenBlock == 0) ? 
1: 0; - CudaParallelReduc<<< nBlocks , 1024 >>>( blockCalculationIndicator, dBlock, ( numBlocksX * numBlocksY ) ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - - BlockIterD = dBlock.getElement( 0 );*/ - + calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1); + */ /**------------------------------------------------------------------------------------------------*/ @@ -441,7 +434,7 @@ solve( const MeshPointer& mesh, cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - // "Parallel reduction" to see if we should calculate again BlockIterD + // "Parallel reduction" to see if we should calculate again calculateCudaBlocksAgain calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1); // When we change something then we should caclucate again in the next passage of MPI ( calculated = true ) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h index 2d73b174e..4895c7693 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h @@ -103,8 +103,30 @@ solve( const MeshPointer& mesh, calculateMPIAgain = 0; /** HERE IS FSM FOR OPENMP (NO MPI) - isnt worthy */ - /*int numThreadsPerBlock = 64; + /*int numThreadsPerBlock = -1; + numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 
1:0)); + //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); + if( numThreadsPerBlock <= 16 ) + numThreadsPerBlock = 16; + else if(numThreadsPerBlock <= 32 ) + numThreadsPerBlock = 32; + else if(numThreadsPerBlock <= 64 ) + numThreadsPerBlock = 64; + else if(numThreadsPerBlock <= 128 ) + numThreadsPerBlock = 128; + else if(numThreadsPerBlock <= 256 ) + numThreadsPerBlock = 256; + else if(numThreadsPerBlock <= 512 ) + numThreadsPerBlock = 512; + else + numThreadsPerBlock = 1024; + //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); + + if( numThreadsPerBlock == -1 ){ + printf("Fail in setting numThreadsPerBlock.\n"); + break; + } int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0); @@ -140,8 +162,22 @@ solve( const MeshPointer& mesh, helpFunc1 = auxPtr; auxPtr = helpFunc; helpFunc = helpFunc1; + switch ( numThreadsPerBlock ){ + case 16: + this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 32: + this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 64: this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); - + case 128: + this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 256: + this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 512: + this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + default: + this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + } //Reduction for( int i = 0; i < BlockIterHost.getSize(); i++ ){ if( IsCalculationDone == 0 ){ @@ 
-176,43 +212,43 @@ solve( const MeshPointer& mesh, // TOP, NORTH and WEST boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; - boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); // TOP, SOUTH and EAST boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; - boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1]; + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); // TOP, SOUTH and WEST boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; - boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1]; - boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0]; + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); // BOTTOM, NOTH and EAST - boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2]; + boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = 
mesh->getDimensions().y() - vecUpperOverlaps[1]; boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); // BOTTOM, NOTH and WEST - boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2]; + boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; - boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); // BOTTOM, SOUTH and EAST - boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2]; - boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1]; + boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); // BOTTOM, SOUTH and WEST - boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2]; - boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1]; - boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0]; + boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; + boundsFrom[1] = mesh->getDimensions().y() - 1 - 
vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); -- GitLab From ac30546025477c6f06c55c30670808b86df82b34 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Sat, 28 Sep 2019 21:49:31 +0200 Subject: [PATCH 13/14] Fixed passing of Arrays by ArrayView. --- .../tnlDirectEikonalMethodBase2D_impl.h | 10 +- .../tnlDirectEikonalMethodBase3D_impl.h | 8 +- .../tnlDirectEikonalMethodsBase.h | 41 +-- .../tnlFastSweepingMethod2D_impl.h | 285 +++--------------- .../tnlFastSweepingMethod3D_impl.h | 121 +++++--- 5 files changed, 161 insertions(+), 304 deletions(-) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h index 50ea7bde8..c470a77ef 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h @@ -353,8 +353,8 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, template < typename Index > -__global__ void GetNeighbours( const TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator, - TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY ) +__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator, + TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY ) { int i = blockIdx.x * 1024 + threadIdx.x; @@ -389,7 +389,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< const Functions::MeshFunction< Meshes::Grid< 2, 
Real, Device, Index >, 2, bool >& interfaceMap, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, - TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator, + TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator, const Containers::StaticVector< 2, Index > vecLowerOverlaps, const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock ) { @@ -598,7 +598,7 @@ tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: updateBlocks( InterfaceMapType interfaceMap, MeshFunctionType aux, MeshFunctionType helpFunc, - ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) + ArrayContainerView BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) { #pragma omp parallel for schedule( dynamic ) for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) @@ -769,7 +769,7 @@ template< typename Real, typename Index > void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY ) +getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY ) { int* BlockIterPom; BlockIterPom = new int [numBlockX * numBlockY]; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h index 5b2a4b685..32548abcf 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h @@ -480,8 +480,8 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3 template < typename Index > -__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > 
BlockIterDevice, - TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, +__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, + TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY, int numBlockZ ) { int i = blockIdx.x * 1024 + threadIdx.x; @@ -520,7 +520,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, - TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, + TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps ) { int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; @@ -1056,7 +1056,7 @@ template< typename Real, typename Index > void tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: -getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ ) +getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ ) { int* BlockIterPom; BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ]; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h index e0ece04bf..7cba99f65 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h @@ -62,6 +62,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > typedef Functions::MeshFunction< 
MeshType > MeshFunctionType; typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType; typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer; + using ArrayContainerView = typename ArrayContainer::ViewType; typedef Containers::StaticVector< 2, Index > StaticVector; using MeshPointer = Pointers::SharedPointer< MeshType >; @@ -87,15 +88,18 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > const RealType velocity = 1.0 ); // FOR OPENMP WILL BE REMOVED - void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY ); + void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY ); template< int sizeSArray > - void updateBlocks( const InterfaceMapType& interfaceMap, - MeshFunctionType& aux, - MeshFunctionType& helpFunc, - ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ); + void updateBlocks( InterfaceMapType interfaceMap, + MeshFunctionType aux, + MeshFunctionType helpFunc, + ArrayContainerView BlockIterHost, int numThreadsPerBlock ); + + protected: - void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY ); + __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[], + const RealType originalValue, const RealType v ); }; template< typename Real, @@ -111,6 +115,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > typedef Functions::MeshFunction< MeshType > MeshFunctionType; typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType; typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer; + using ArrayContainerView = typename ArrayContainer::ViewType; typedef Containers::StaticVector< 3, Index > StaticVector; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >; using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >; @@ -134,15 +139,15 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > const 
RealType velocity = 1.0 ); // OPENMP WILL BE REMOVED - void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ ); + void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ ); template< int sizeSArray > - void updateBlocks( const InterfaceMapType& interfaceMap, - const MeshFunctionType& aux, + void updateBlocks( const InterfaceMapType interfaceMap, + const MeshFunctionType aux, MeshFunctionType& helpFunc, - ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ); + ArrayContainer BlockIterHost, int numThreadsPerBlock ); - void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY, int numBlockZ ); + protected: __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[], const RealType originalValue, const RealType v ); @@ -180,17 +185,14 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, - TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator, + TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator, const Containers::StaticVector< 2, Index > vecLowerOverlaps, const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock =0); template < typename Index > -__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks ); +__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator, + TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY ); -template < typename 
Index > -__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY ); // 3D @@ -205,10 +207,11 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice ); + TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, + Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps ); template < typename Index > -__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, +__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY, int numBlockZ ); #endif diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index 31d3f8b32..e5638c11d 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -80,40 +80,9 @@ solve( const MeshPointer& mesh, IndexType iteration( 0 ); InterfaceMapType interfaceMap = *interfaceMapPtr; MeshFunctionType aux = *auxPtr; - aux.template synchronize< Communicator >(); - - -#ifdef HAVE_MPI - int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup ); - //printf( "Hello world from rank: %d ", 
i ); - //Communicators::MpiCommunicator::Request r = Communicators::MpiCommunicator::ISend( auxPtr, 0, 0, Communicators::MpiCommunicator::AllGroup ); - if( i == 1 ) { - /*for( int k = 0; k < 16*16; k++ ) - aux[ k ] = 10;*/ - printf( "1: mesh x: %d\n", mesh->getDimensions().x() ); - printf( "1: mesh y: %d\n", mesh->getDimensions().y() ); - //aux.save("aux_proc1.tnl"); - } - if( i == 0 ) { - printf( "0: mesh x: %d\n", mesh->getDimensions().x() ); - printf( "0: mesh y: %d\n", mesh->getDimensions().y() ); - //aux.save("aux_proc0.tnl"); - /*for( int k = 0; k < mesh->getDimensions().x()*mesh->getDimensions().y(); k++ ) - aux[ k ] = 10; - for( int k = 0; k < mesh->getDimensions().x(); k++ ){ - for( int l = 0; l < mesh->getDimensions().y(); l++ ) - printf("%f.2\t",aux[ k * 16 + l ] ); - printf("\n"); - }*/ - } - - /*bool a = Communicators::MpiCommunicator::IsInitialized(); - if( a ) - printf("Je Init\n"); - else - printf("Neni Init\n");*/ -#endif + aux.template synchronize< Communicator >(); //synchronize initialized overlaps + std::cout << "Calculating the values ..." << std::endl; while( iteration < this->maxIterations ) { // calculatedBefore indicates weather we calculated in the last passage of the while cycle @@ -290,41 +259,8 @@ solve( const MeshPointer& mesh, // Need for calling functions from kernel BaseType ptr; - int BlockIterD = 1; - /*auxPtr = helpFunc; - - CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, - interfaceMapPtr.template getData< Device >(), - auxPtr.template getData< Device>(), - helpFunc.template modifyData< Device>(), - BlockIterDevice, - oddEvenBlock.getView() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - auxPtr = helpFunc; - - oddEvenBlock= (oddEvenBlock == 0) ? 
1: 0; - - CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, - interfaceMapPtr.template getData< Device >(), - auxPtr.template getData< Device>(), - helpFunc.template modifyData< Device>(), - BlockIterDevice, - oddEvenBlock.getView() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - auxPtr = helpFunc; - - oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; - - CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - - BlockIterD = dBlock.getElement( 0 );*/ + // True if we should calculate again. + int calculateCudaBlocksAgain = 1; // Array that identifies which blocks should be calculated. // All blocks should calculate in first passage ( setValue(1) ) @@ -343,16 +279,9 @@ solve( const MeshPointer& mesh, MeshFunctionPointer helpFunc( mesh ); helpFunc.template modifyData() = auxPtr.template getData(); - //int pocBloku = 0; - Devices::Cuda::synchronizeDevice(); - CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, - interfaceMapPtr.template getData< Device >(), - auxPtr.template modifyData< Device>(), - helpFunc.template modifyData< Device>(), - BlockIterDevice.getView() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - + // number of iterations of while calculateCudaBlocksAgain + int numIter = 0; + //int oddEvenBlock = 0; while( calculateCudaBlocksAgain ) { @@ -390,44 +319,16 @@ solve( const MeshPointer& mesh, Devices::Cuda::synchronizeDevice(); CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(), - blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps ); + blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps ); cudaDeviceSynchronize(); 
TNL_CHECK_CUDA_DEVICE; - - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - - - aux = *auxPtr; - interfaceMap = *interfaceMapPtr; -#endif - } - - -/**----------------------MPI-TO-DO---------------------------------------------**/ - -#ifdef HAVE_MPI - //int i = MPI::GetRank( MPI::AllGroup ); - //TNL::Meshes::DistributedMeshes::DistributedMesh< MeshType > Mesh; - int neighCount = 0; // should this thread calculate again? - int calculpom[4] = {0,0,0,0}; - - if( i == 0 ){ - BlockIterPom1 = BlockIterDevice; - for( int i =0; i< numBlocksX; i++ ){ - for( int j = 0; j < numBlocksY; j++ ) - { - std::cout << BlockIterPom1[j*numBlocksX + i]; - } - std::cout << std::endl; - } - std::cout << std::endl; - } -#endif + + // Switching helpFunc and auxPtr. + auxPtr.swap( helpFunc ); // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now. Devices::Cuda::synchronizeDevice(); - GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator, blockCalculationIndicatorHelp, numBlocksX, numBlocksY ); + GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; blockCalculationIndicator = blockCalculationIndicatorHelp; @@ -445,46 +346,24 @@ solve( const MeshPointer& mesh, /**-----------------------------------------------------------------------------------------------------------*/ numIter ++; } - if( numIter%2 == 1 ){ - auxPtr = helpFunc; - } - /*cudaFree( BlockIterDevice ); - cudaFree( dBlock ); - delete BlockIter;*/ - - if( neigh[1] != -1 ) - { - req[neighCount] = MPI::ISend( &calculated, 1, neigh[1], 0, MPI::AllGroup ); - neighCount++; - - - req[neighCount] = MPI::IRecv( &calculpom[1], 1, neigh[1], 0, MPI::AllGroup ); - neighCount++; - } - - if( neigh[2] != -1 ) - { - req[neighCount] = MPI::ISend( &calculated, 1, neigh[2], 0, MPI::AllGroup ); - neighCount++; - - req[neighCount] = 
MPI::IRecv( &calculpom[2], 1, neigh[2], 0, MPI::AllGroup ); - neighCount++; - } - - if( neigh[5] != -1 ) + if( numIter%2 == 1 ) // Need to check parity for MPI overlaps to synchronize ( otherwise doesnt work ) { - req[neighCount] = MPI::ISend( &calculated, 1, neigh[5], 0, MPI::AllGroup ); - neighCount++; - - req[neighCount] = MPI::IRecv( &calculpom[3], 1, neigh[5], 0, MPI::AllGroup ); - neighCount++; + helpFunc.swap( auxPtr ); + Devices::Cuda::synchronizeDevice(); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; } - - MPI::WaitAll(req,neighCount); -#if ForDebug - printf( "%d: Sending Calculated = %d.\n", i, calculated ); -#endif - MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR, MPI::AllGroup ); + aux = *auxPtr; + interfaceMap = *interfaceMapPtr; +#endif + } + + +/**----------------------MPI-TO-DO---------------------------------------------**/ +#ifdef HAVE_MPI + if( CommunicatorType::isDistributed() ){ + getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh ); + aux.template synchronize< Communicator >(); } #endif @@ -518,9 +397,16 @@ setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, #endif } -template < typename Index > -__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY ) + + + +template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > +bool +FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: +goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, + MeshFunctionType& aux, const InterfaceMapType& interfaceMap, + const AnisotropyPointer& anisotropy ) { bool calculated = false; const MeshType& mesh = aux.getMesh(); @@ -548,97 +434,15 @@ __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, I return calculated; } -template < typename Index > 
-__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks ) -{ - int i = threadIdx.x; - int blId = blockIdx.x; - int blockSize = blockDim.x; - /*if ( i == 0 && blId == 0 ){ - printf( "nBlocks = %d\n", nBlocks ); - for( int j = nBlocks-1; j > -1 ; j--){ - printf( "%d: cislo = %d \n", j, BlockIterDevice[ j ] ); - } - }*/ - __shared__ int sArray[ 1024 ]; - sArray[ i ] = 0; - if( blId * 1024 + i < nBlocks ) - sArray[ i ] = BlockIterDevice[ blId * 1024 + i ]; - __syncthreads(); - /*if ( i == 0 && blId == 0 ){ - printf( "nBlocks = %d\n", nBlocks ); - for( int j = 4; j > -1 ; j--){ - printf( "%d: cislo = %d \n", j, sArray[ j ] ); - } - }*/ - /*extern __shared__ volatile int sArray[]; - unsigned int i = threadIdx.x; - unsigned int gid = blockIdx.x * blockSize * 2 + threadIdx.x; - unsigned int gridSize = blockSize * 2 * gridDim.x; - sArray[ i ] = 0; - while( gid < nBlocks ) - { - sArray[ i ] += BlockIterDevice[ gid ] + BlockIterDevice[ gid + blockSize ]; - gid += gridSize; - } - __syncthreads();*/ - - if ( blockSize == 1024) { - if (i < 512) - sArray[ i ] += sArray[ i + 512 ]; - } - __syncthreads(); - if (blockSize >= 512) { - if (i < 256) { - sArray[ i ] += sArray[ i + 256 ]; - } - } - __syncthreads(); - if (blockSize >= 256) { - if (i < 128) { - sArray[ i ] += sArray[ i + 128 ]; - } - } - __syncthreads(); - if (blockSize >= 128) { - if (i < 64) { - sArray[ i ] += sArray[ i + 64 ]; - } - } - __syncthreads(); - if (i < 32 ) - { - if( blockSize >= 64 ){ sArray[ i ] += sArray[ i + 32 ];} - __syncthreads(); - if( blockSize >= 32 ){ sArray[ i ] += sArray[ i + 16 ];} - __syncthreads(); - if( blockSize >= 16 ){ sArray[ i ] += sArray[ i + 8 ];} - if( blockSize >= 8 ){ sArray[ i ] += sArray[ i + 4 ];} - __syncthreads(); - if( blockSize >= 4 ){ sArray[ i ] += sArray[ i + 2 ];} - __syncthreads(); - if( blockSize >= 2 ){ sArray[ i ] += sArray[ i + 1 ];} - 
__syncthreads(); - } - __syncthreads(); - - if( i == 0 ) - dBlock[ blId ] = sArray[ 0 ]; -} -template < int sizeSArray, typename Real, typename Device, typename Index > -__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, - const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, - const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, - CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) ); - TNL_CHECK_CUDA_DEVICE; - - CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks ); - TNL_CHECK_CUDA_DEVICE; +#ifdef HAVE_MPI +template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > +void +FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: +getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh ) { Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh(); @@ -687,4 +491,3 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< calculateFromNeighbours[2] || calculateFromNeighbours[3]; } #endif - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock ) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h index 4895c7693..325b626f7 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h @@ -262,24 +262,86 @@ solve( const MeshPointer& mesh, // IF YOU CHANGE THIS, YOU NEED TO CHANGE 
THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2) const int cudaBlockSize( 8 ); - CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr, - interfaceMapPtr.template getData< Device >(), - auxPtr.template getData< Device>(), - helpFunc.template modifyData< Device>(), - BlockIterDevice.getView() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + // Getting the number of blocks in grid in each direction (without overlaps bcs we dont calculate on overlaps) + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize ); + int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); + if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) + std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; - GetNeighbours3D<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - BlockIterDevice = BlockIterPom; + // Making the variables for global function CudaUpdateCellCaller. 
+ dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); - CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY * numBlocksZ ) ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + BaseType ptr; // tnlDirectEikonalMethodBase type for calling of function inside CudaUpdateCellCaller + + + int BlockIterD = 1; //variable that tells us weather we should calculate the main cuda body again + + // Array containing information about each block in grid, answering question (Have we calculated in this block?) + TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice( numBlocksX * numBlocksY * numBlocksZ ); + BlockIterDevice.setValue( 1 ); // calculate all in the first passage + + // Helping Array for GetNeighbours3D + TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom( numBlocksX * numBlocksY * numBlocksZ ); + BlockIterPom.setValue( 0 ); //doesnt matter what number + + + + // number of neighbours in one block (1024 threads) for GetNeighbours3D + int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 
1:0); + + + //MeshFunctionPointer helpFunc1( mesh ); + MeshFunctionPointer helpFunc( mesh ); + helpFunc.template modifyData() = auxPtr.template getData(); + Devices::Cuda::synchronizeDevice(); + + int numIter = 0; // number of passages of following while cycle - CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks ); + while( BlockIterD ) //main body of cuda code + { + + Devices::Cuda::synchronizeDevice(); + // main function that calculates all values in each blocks + // calculated values are in helpFunc + CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + auxPtr.template getData< Device>(), + helpFunc.template modifyData< Device>(), + BlockIterDevice.getView(), vecLowerOverlaps, vecUpperOverlaps ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + // Switching pointers to helpFunc and auxPtr so real results are in memory of helpFunc but here under variable auxPtr + auxPtr.swap( helpFunc ); + + Devices::Cuda::synchronizeDevice(); + // Neighbours of blocks that calculatedBefore in this passage should calculate in the next! 
+ // BlockIterDevice contains blocks that calculatedBefore in this passage and BlockIterPom those that should calculate in next (are neighbours) + GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + BlockIterDevice = BlockIterPom; + Devices::Cuda::synchronizeDevice(); + + // .containsValue(1) is actually parallel reduction implemented in TNL + BlockIterD = BlockIterDevice.containsValue(1); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + numIter++; + if( BlockIterD ){ + // if we calculated in this passage, we should send the info via MPI so neighbours should calculate after synchronization + calculatedBefore = 1; + } + } + if( numIter%2 == 1 ){ + + // We need auxPtr to point on memory of original auxPtr (not to helpFunc) + // last passage of previous while cycle didnt calculate any number anyway so switching names doesnt effect values + auxPtr.swap( helpFunc ); + Devices::Cuda::synchronizeDevice(); + } cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; @@ -375,10 +437,15 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, return calculated; } -template < typename Index > -__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, - int numBlockX, int numBlockY, int numBlockZ ) + + + +#ifdef HAVE_MPI +template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > +void +FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: +getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh ) { Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh(); @@ -397,22 +464,6 @@ __global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, 
Devices::Cuda, requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup ); } -} - -template < int sizeSArray, typename Real, typename Device, typename Index > -__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, - const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, - const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice ) -{ - int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; - int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z; - int i = threadIdx.x + blockDim.x*blockIdx.x + vLower[0]; // WITH OVERLAPS!!! i,j,k aren't coordinates of all values - int j = blockDim.y*blockIdx.y + threadIdx.y + vLower[1]; - int k = blockDim.z*blockIdx.z + threadIdx.z + vLower[2]; - int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri; - const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); if( neighbours[1] != -1 ) // EAST { -- GitLab From e162a57a89c80ef7d685191b013cb6436d0b8576 Mon Sep 17 00:00:00 2001 From: fencl Date: Sat, 5 Oct 2019 10:33:58 +0200 Subject: [PATCH 14/14] 2D MPI GPU method adjusted. 
--- .../Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h index c470a77ef..cddf4f9cb 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h @@ -297,7 +297,7 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, if( i < mesh.getDimensions().x() - vecUpperOverlaps[ 0 ] && j < mesh.getDimensions().y() - vecUpperOverlaps[ 1 ] && - i>vecLowerOverlaps[ 0 ] -1 && j> vecLowerOverlaps[ 0 ]-1 ) + i>vecLowerOverlaps[ 0 ] -1 && j> vecLowerOverlaps[ 1 ]-1 ) { const Real& hx = mesh.getSpaceSteps().x(); const Real& hy = mesh.getSpaceSteps().y(); -- GitLab