diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h index 1b46ecb3dbea0438be1065fc43808ecb157c93ab..b2cfc65dcee8b3dc7a4f980c20feba6c5eac8d5d 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h @@ -36,6 +36,9 @@ class DirectEikonalSolverConfig { config.addDelimiter( "Direct eikonal equation solver settings:" ); config.addRequiredEntry< String >( "input-file", "Input file." ); + config.addEntry< String >( "distributed-grid-io-type", "Choose Distributed Grid IO Type", "MpiIO"); + config.addEntryEnum< String >( "LocalCopy" ); + config.addEntryEnum< String >( "MpiIO" ); }; }; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test index 0dc50246f5ced2ec65616f7eecc65ebf860a2cdb..24b782a820641a8ce62a11b7221da22d2473e735 100755 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test @@ -1,19 +1,33 @@ #!/bin/bash device="host" -dimensions="2D 3D" -#dimensions="3D" +#dimensions="2D 3D" +dimensions="2D" sizes1D="16 32 64 128 256 512 1024 2048 4096" #sizes1D="256" -sizes2D="16 32 64 128 256 512 1024" -#sizes2D="8" -sizes3D="16 32 64 128 256" +sizes2D="16 32 64 128 256 512 1024 2048 4096" +#sizes2D="16" +sizes3D="8 16 32 64 128 256" +#sizes3D="128 256" testFunctions="paraboloid" +#testFunctions="sin-wave-sdf" +#testFunctions="sin-bumps-sdf" snapshotPeriod=0.1 finalTime=1.5 -solverName="tnl-direct-eikonal-solver" -#solverName="gdb --args tnl-direct-eikonal-solver-dbg --catch-exceptions no" -# +realType="double" +#mpiRun="mpirun -np 4 -oversubscribe " +mpiRun="" + +## CAREFULL: If you set LocalCopy of MPI, you have to start with mpiRun even tnl-init +## This isnt problem with MpiIO. +## CAREFULL: For smoothly calculated error, you have to choose the right output function which +## is for both MpiIO, LocalCopy different. + +solverName=${mpiRun}"tnl-direct-eikonal-solver" +#solverName="gdb --args "${mpiRun}"tnl-direct-eikonal-solver-dbg --catch-exceptions no --mpi-gdb-debug false" +#scale=2.0 +#finalSdf="aux-0.tnl aux-1.tnl" +finalSdf="aux-final.tnl" setupTestFunction() { @@ -22,16 +36,17 @@ setupTestFunction() # then origin=-1.0 proportions=2.0 +# origin=-1.0 +# proportions=2.0 amplitude=1.0 - waveLength=0.2 - waveLengthX=0.2 - waveLengthY=0.2 - waveLengthZ=0.2 - wavesNumber=3.0 - wavesNumberX=0.5 - wavesNumberY=2.0 + waveLength=0.4 + waveLengthX=0.5 + waveLengthY=0.5 + waveLengthZ=0.5 + wavesNumber=1.25 + wavesNumberX=0.5 wavesNumberY=2.0 wavesNumberZ=3.0 - phase=0.1 + phase=-1.5 phaseX=0.0 phaseY=0.0 phaseZ=0.0 @@ -44,6 +59,7 @@ setupGrid() { dimensions=$1 gridSize=$2 + #scale=$3 tnl-grid-setup --dimensions ${dimensions} \ --origin-x ${origin} \ --origin-y ${origin} \ @@ -53,47 +69,51 @@ setupGrid() --proportions-z ${proportions} \ --size-x ${gridSize} \ --size-y ${gridSize} \ - --size-z ${gridSize} + --real-type ${realType} \ + --size-z ${gridSize} +#$((2*${gridSize})) } setInitialCondition() { testFunction=$1 tnl-init --test-function ${testFunction} \ - --output-file initial-u.tnl \ - --amplitude ${amplitude} \ - --wave-length ${waveLength} \ - --wave-length-x ${waveLengthX} \ - --wave-length-y ${waveLengthY} \ - --wave-length-z ${waveLengthZ} \ - --waves-number ${wavesNumber} \ - --waves-number-x ${wavesNumberX} \ - --waves-number-y ${wavesNumberY} \ - --waves-number-z ${wavesNumberZ} \ - --phase ${phase} \ - --phase-x ${phaseX} \ - --phase-y ${phaseY} \ - --phase-z ${phaseZ} \ - --sigma ${sigma} \ - --radius ${radius} + --real-type ${realType} \ + --output-file initial-u.tnl \ + --amplitude ${amplitude} \ + --wave-length ${waveLength} \ + --wave-length-x ${waveLengthX} \ + --wave-length-y ${waveLengthY} \ + --wave-length-z ${waveLengthZ} \ + --waves-number ${wavesNumber} \ + --waves-number-x ${wavesNumberX} \ + --waves-number-y ${wavesNumberY} \ + --waves-number-z ${wavesNumberZ} \ + --phase ${phase} \ + --phase-x ${phaseX} \ + --phase-y ${phaseY} \ + --phase-z ${phaseZ} \ + --sigma ${sigma} \ + --radius ${radius} - tnl-init --test-function ${testFunction}-sdf \ - --output-file exact-u.tnl \ - --amplitude ${amplitude} \ - --wave-length ${waveLength} \ - --wave-length-x ${waveLengthX} \ - --wave-length-y ${waveLengthY} \ - --wave-length-z ${waveLengthZ} \ - --waves-number ${wavesNumber} \ - --waves-number-x ${wavesNumberX} \ - --waves-number-y ${wavesNumberY} \ - --waves-number-z ${wavesNumberZ} \ - --phase ${phase} \ - --phase-x ${phaseX} \ - --phase-y ${phaseY} \ - --phase-z ${phaseZ} \ - --sigma ${sigma} \ - --radius ${radius} + tnl-init --test-function ${testFunction}-sdf \ + --real-type ${realType} \ + --output-file exact-u.tnl \ + --amplitude ${amplitude} \ + --wave-length ${waveLength} \ + --wave-length-x ${waveLengthX} \ + --wave-length-y ${waveLengthY} \ + --wave-length-z ${waveLengthZ} \ + --waves-number ${wavesNumber} \ + --waves-number-x ${wavesNumberX} \ + --waves-number-y ${wavesNumberY} \ + --waves-number-z ${wavesNumberZ} \ + --phase ${phase} \ + --phase-x ${phaseZ} \ + --phase-y ${phaseZ} \ + --phase-z ${phaseZ} \ + --sigma ${sigma} \ + --radius ${radius} \ } @@ -111,17 +131,22 @@ solve() --min-iterations 20 \ --convergence-residue 1.0e-12 \ --snapshot-period ${snapshotPeriod} \ + --real-type ${realType} \ --final-time ${finalTime} } computeError() { +for sweep in ${finalSdf} +do tnl-diff --mesh mesh.tnl \ - --input-files aux-final.tnl exact-u.tnl \ + --input-files exact-u.tnl u-00000.tnl \ --mode sequence \ --snapshot-period ${snapshotPeriod} \ --output-file errors.txt \ --write-difference yes +#aux-final.tnl \ +done } runTest() diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..55129c4e1008e69ef3b3d238acb8fc587cce4076 --- /dev/null +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h @@ -0,0 +1,204 @@ +/* + * File: tnlDirectEikonalMethodBase1D_impl.h + * Author: Fencl + * + * Created on March 15, 2019 + */ + +#pragma once + +template< typename Real, + typename Device, + typename Index > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: +initInterface( const MeshFunctionPointer& _input, + MeshFunctionPointer& _output, + InterfaceMapPointer& _interfaceMap ) +{ + if( std::is_same< Device, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + const MeshType& mesh = _input->getMesh(); + + const int cudaBlockSize( 16 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + dim3 blockSize( cudaBlockSize ); + dim3 gridSize( numBlocksX ); + Devices::Cuda::synchronizeDevice(); + CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), + _output.template modifyData< Device >(), + _interfaceMap.template modifyData< Device >() ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; +#endif + } + if( std::is_same< Device, Devices::Host >::value ) + { + const MeshType& mesh = _input->getMesh(); + typedef typename MeshType::Cell Cell; + const MeshFunctionType& input = _input.getData(); + MeshFunctionType& output = _output.modifyData(); + InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); + Cell cell( mesh ); + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + output[ cell.getIndex() ] = + input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : + -std::numeric_limits< RealType >::max(); + interfaceMap[ cell.getIndex() ] = false; + } + + + const RealType& h = mesh.getSpaceSteps().x(); + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x() - 1; + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + const RealType& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + const auto& neighbors = cell.getNeighborEntities(); + Real pom = 0; + //const IndexType& c = cell.getIndex(); + const IndexType e = neighbors.template getEntityIndex< 1 >(); + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( h * c )/( c - input[ e ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + + pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; + if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) + output[ e ] = pom; + + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ e ] = true; + } + } + } + } +} + +template< typename Real, + typename Device, + typename Index > +template< typename MeshEntity > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: +updateCell( MeshFunctionType& u, + const MeshEntity& cell, + const RealType v ) +{ + const auto& neighborEntities = cell.template getNeighborEntities< 1 >(); + const MeshType& mesh = cell.getMesh(); + const RealType& h = mesh.getSpaceSteps().x(); + const RealType value = u( cell ); + RealType a, tmp = std::numeric_limits< RealType >::max(); + + if( cell.getCoordinates().x() == 0 ) + a = u[ neighborEntities.template getEntityIndex< 1 >() ]; + else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) + a = u[ neighborEntities.template getEntityIndex< -1 >() ]; + else + { + a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ], + u[ neighborEntities.template getEntityIndex< 1 >() ] ); + } + + if( fabs( a ) == std::numeric_limits< RealType >::max() ) + return; + + tmp = a + TNL::sign( value ) * h/v; + + u[ cell.getIndex() ] = argAbsMin( value, tmp ); +} + +template< typename Real, + typename Device, + typename Index > +__cuda_callable__ +bool +tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: +updateCell( volatile Real sArray[18], int thri, const Real h, const Real v ) +{ + const RealType value = sArray[ thri ]; + RealType a, tmp = std::numeric_limits< RealType >::max(); + + a = TNL::argAbsMin( sArray[ thri+1 ], + sArray[ thri-1 ] ); + + if( fabs( a ) == std::numeric_limits< RealType >::max() ) + return false; + + tmp = a + TNL::sign( value ) * h/v; + + + sArray[ thri ] = argAbsMin( value, tmp ); + + tmp = value - sArray[ thri ]; + if ( fabs( tmp ) > 0.001*h ) + return true; + else + return false; +} + +#ifdef HAVE_CUDA +template < typename Real, typename Device, typename Index > +__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, + Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap ) +{ + int i = threadIdx.x + blockDim.x*blockIdx.x; + const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + + if( i < mesh.getDimensions().x() ) + { + typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell; + Cell cell( mesh ); + cell.getCoordinates().x() = i; + cell.refresh(); + const Index cind = cell.getIndex(); + + + output[ cind ] = + input( cell ) >= 0 ? std::numeric_limits< Real >::max() : + - std::numeric_limits< Real >::max(); + interfaceMap[ cind ] = false; + + const Real& h = mesh.getSpaceSteps().x(); + cell.refresh(); + const Real& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const Index e = neighbors.template getEntityIndex< 1 >(); + const Index w = neighbors.template getEntityIndex< -1 >(); + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( h * c )/( c - input[ e ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ w ] <= 0 ) + { + pom = TNL::sign( c )*( h * c )/( c - input[ w ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + } + } + +} +#endif + + diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..cddf4f9cb7a97f8a74eb94f7377b2cf740db03a5 --- /dev/null +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h @@ -0,0 +1,803 @@ +/* + * File: tnlDirectEikonalMethodBase2D_impl.h + * Author: Fencl + * + * Created on March 15, 2019 + */ + +#pragma once + +template< typename Real, + typename Device, + typename Index > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +initInterface( const MeshFunctionPointer& _input, + MeshFunctionPointer& _output, + InterfaceMapPointer& _interfaceMap, + const StaticVector vecLowerOverlaps, + const StaticVector vecUpperOverlaps ) +{ + + if( std::is_same< Device, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + const MeshType& mesh = _input->getMesh(); + + const int cudaBlockSize( 16 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); + dim3 blockSize( cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY ); + Devices::Cuda::synchronizeDevice(); + CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), + _output.template modifyData< Device >(), + _interfaceMap.template modifyData< Device >(), + vecLowerOverlaps, vecUpperOverlaps); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; +#endif + } + if( std::is_same< Device, Devices::Host >::value ) + { + MeshFunctionType input = _input.getData(); + MeshFunctionType& output = _output.modifyData(); + InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); + const MeshType& mesh = input.getMesh(); + typedef typename MeshType::Cell Cell; + Cell cell( mesh ); + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh.getDimensions().y(); + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + output[ cell.getIndex() ] = + input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : + - std::numeric_limits< RealType >::max(); + interfaceMap[ cell.getIndex() ] = false; + } + + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + for( cell.getCoordinates().y() = 0 + vecLowerOverlaps[1]; + cell.getCoordinates().y() < mesh.getDimensions().y() - vecUpperOverlaps[1]; + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0 + vecLowerOverlaps[0]; + cell.getCoordinates().x() < mesh.getDimensions().x() - vecUpperOverlaps[0]; + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + const RealType& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const IndexType e = neighbors.template getEntityIndex< 1, 0 >(); + const IndexType n = neighbors.template getEntityIndex< 0, 1 >(); + if( c * input[ n ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + pom = pom - TNL::sign( c )*hy; + if( TNL::abs( output[ n ] ) > TNL::abs( pom ) ) + output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy; + + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ n ] = true; + } + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + + pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; + if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) + output[ e ] = pom; + + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ e ] = true; + } + } + } + } +} + +template< typename Real, + typename Device, + typename Index > +template< typename MeshEntity > +__cuda_callable__ +bool +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +updateCell( MeshFunctionType& u, + const MeshEntity& cell, + const RealType v) +{ + const auto& neighborEntities = cell.template getNeighborEntities< 2 >(); + const MeshType& mesh = cell.getMesh(); + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + const RealType value = u( cell ); + RealType a, b, tmp = std::numeric_limits< RealType >::max(); + + if( cell.getCoordinates().x() == 0 ) + a = u[ neighborEntities.template getEntityIndex< 1, 0 >() ]; + else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) + a = u[ neighborEntities.template getEntityIndex< -1, 0 >() ]; + else + { + a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0 >() ], + u[ neighborEntities.template getEntityIndex< 1, 0 >() ] ); + } + + if( cell.getCoordinates().y() == 0 ) + b = u[ neighborEntities.template getEntityIndex< 0, 1 >()]; + else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) + b = u[ neighborEntities.template getEntityIndex< 0, -1 >() ]; + else + { + b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1 >() ], + u[ neighborEntities.template getEntityIndex< 0, 1 >() ] ); + } + + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() ) + return false; + + RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), hx, hy, 0.0 }; + + tmp = getNewValue( pom , value, v ); + + u[ cell.getIndex() ] = tmp; + + + tmp = value - u[ cell.getIndex() ]; + + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; + +} + +template< typename Real, + typename Device, + typename Index > +template< int sizeSArray > +__cuda_callable__ +bool +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy, + const Real v ) +{ + const RealType value = sArray[ thrj * sizeSArray + thri ]; + RealType a, b, tmp = std::numeric_limits< RealType >::max(); + + b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ], + sArray[ (thrj-1) * sizeSArray + thri ] ); + + a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ], + sArray[ thrj * sizeSArray + thri-1 ] ); + + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() ) + return false; + + RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; + + tmp = getNewValue( pom , value, v ); + + sArray[ thrj * sizeSArray + thri ] = tmp; + tmp = value - sArray[ thrj * sizeSArray + thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; +} + +template< typename Real, + typename Device, + typename Index > +__cuda_callable__ +Real +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +getNewValue( RealType valuesAndSteps[], const RealType originalValue, const RealType v ) +{ + RealType newValue = std::numeric_limits< RealType >::max(); + sortMinims( valuesAndSteps ); + + // calculation of real value taken from ZHAO + newValue = valuesAndSteps[ 0 ] + TNL::sign( originalValue ) * valuesAndSteps[ 3 ]/v; + if( fabs( newValue ) < fabs( valuesAndSteps[ 1 ] ) ) + { + newValue = argAbsMin( originalValue, newValue ); + } + else + { + newValue = ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] * valuesAndSteps[ 1 ] + + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] * valuesAndSteps[ 0 ] + + TNL::sign( originalValue ) * valuesAndSteps[ 3 ] * valuesAndSteps[ 4 ] * + TNL::sqrt( ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] )/( v * v ) - + ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) * + ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) ) )/ + ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] ); + newValue = argAbsMin( originalValue, newValue ); + } + + return newValue; +} + + +template < typename T1 > +__cuda_callable__ void sortMinims( T1 pom[] ) +{ + T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; + if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){ + tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2]; + tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5]; + + } + else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){ + tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1]; + tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4]; + } + else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){ + tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2]; + tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5]; + } + else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){ + tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0]; + tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3]; + } + else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){ + tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1]; + tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4]; + } + else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){ + tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0]; + tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3]; + } + + for( unsigned int i = 0; i < 6; i++ ) + { + pom[ i ] = tmp[ i ]; + } +} + +#ifdef HAVE_CUDA +template < typename Real, typename Device, typename Index > +__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, + const Containers::StaticVector< 2, Index > vecLowerOverlaps, + const Containers::StaticVector< 2, Index > vecUpperOverlaps ) +{ + int i = threadIdx.x + blockDim.x*blockIdx.x; + int j = blockDim.y*blockIdx.y + threadIdx.y; + const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + { + typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell; + Cell cell( mesh ); + cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; + cell.refresh(); + const Index cind = cell.getIndex(); + + + output[ cind ] = + input( cell ) >= 0 ? std::numeric_limits< Real >::max() : + - std::numeric_limits< Real >::max(); + interfaceMap[ cind ] = false; + + if( i < mesh.getDimensions().x() - vecUpperOverlaps[ 0 ] && + j < mesh.getDimensions().y() - vecUpperOverlaps[ 1 ] && + i>vecLowerOverlaps[ 0 ] -1 && j> vecLowerOverlaps[ 1 ]-1 ) + { + const Real& hx = mesh.getSpaceSteps().x(); + const Real& hy = mesh.getSpaceSteps().y(); + cell.refresh(); + const Real& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real tmp = 0; + const Index e = neighbors.template getEntityIndex< 1, 0 >(); + const Index w = neighbors.template getEntityIndex< -1, 0 >(); + const Index n = neighbors.template getEntityIndex< 0, 1 >(); + const Index s = neighbors.template getEntityIndex< 0, -1 >(); + + if( c * input[ n ] <= 0 ) + { + tmp = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) ) + output[ cind ] = tmp; + + interfaceMap[ cell.getIndex() ] = true; + } + + + if( c * input[ e ] <= 0 ) + { + tmp = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) ) + output[ cind ] = tmp; + + interfaceMap[ cind ] = true; + } + if( c * input[ w ] <= 0 ) + { + tmp = TNL::sign( c )*( hx * c )/( c - input[ w ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) ) + output[ cind ] = tmp; + + interfaceMap[ cind ] = true; + } + if( c * input[ s ] <= 0 ) + { + tmp = TNL::sign( c )*( hy * c )/( c - input[ s ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) ) + output[ cind ] = tmp; + + interfaceMap[ cind ] = true; + } + } + } + } +} + + +template < typename Index > +__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator, + TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY ) +{ + int i = blockIdx.x * 1024 + threadIdx.x; + + if( i < numBlockX * numBlockY ) + { + int pom = 0;//BlockIterPom[ i ] = 0; + int m=0, k=0; + m = i%numBlockX; + k = i/numBlockX; + if( m > 0 && blockCalculationIndicator[ i - 1 ] ){ + pom = 1;//blockCalculationIndicatorHelp[ i ] = 1; + }else if( m < numBlockX -1 && blockCalculationIndicator[ i + 1 ] ){ + pom = 1;//blockCalculationIndicatorHelp[ i ] = 1; + }else if( k > 0 && blockCalculationIndicator[ i - numBlockX ] ){ + pom = 1;// blockCalculationIndicatorHelp[ i ] = 1; + }else if( k < numBlockY -1 && blockCalculationIndicator[ i + numBlockX ] ){ + pom = 1;//blockCalculationIndicatorHelp[ i ] = 1; + } + + if( blockCalculationIndicator[ i ] != 1 ) + blockCalculationIndicatorHelp[ i ] = pom;//BlockIterPom[ i ]; + else + blockCalculationIndicatorHelp[ i ] = 1; + } +} + + + + +template < int sizeSArray, typename Real, typename Device, typename Index > +__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, + const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, + const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, + TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator, + const Containers::StaticVector< 2, Index > vecLowerOverlaps, + const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock ) +{ + // Setting up threads + int thri = threadIdx.x; int thrj = threadIdx.y; + int i = threadIdx.x + blockDim.x*blockIdx.x + vecLowerOverlaps[0]; + int j = blockDim.y*blockIdx.y + threadIdx.y + vecLowerOverlaps[1]; + const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); + +/** FOR CHESS METHOD */ + //if( (blockIdx.y%2 + blockIdx.x) % 2 == oddEvenBlock ) + //{ +/**------------------------------------------*/ + +/** FOR FIM METHOD */ + if( blockCalculationIndicator[ blockIdx.y * gridDim.x + blockIdx.x ] ) + { + __syncthreads(); +/**-----------------------------------------*/ + + const int dimX = mesh.getDimensions().x(); const int dimY = mesh.getDimensions().y(); + const Real hx = mesh.getSpaceSteps().x(); const Real hy = mesh.getSpaceSteps().y(); + if( thri==0 && thrj == 0) + { + blockCalculationIndicator[ blockIdx.y * gridDim.x + blockIdx.x ] = 0; + } + __syncthreads(); + int maxThreadsInXDirection; + int maxThreadsInYDirection; + + // Maximum threads in each direction can differ + // e.g. cudaBlockSize = 16, dimX = 50, then: + // blockIdx maxThreadsInXDirection calculation [from, to] sArray [from, to] + // 0 16 [ 0,15] [ 0,16] //"-1" set to inf + // 1 16 [16,31] [15,32] + // 2 16 [32,47] [31,48] + // 3 2 [48,50] [47,50] // rest set to inf + // same for YDirection because blocks are squared + maxThreadsInXDirection = blockDim.x + 1; + maxThreadsInYDirection = blockDim.y + 1; + + if( gridDim.x - 1 == blockIdx.x ) // care about number of values if we are in last block + maxThreadsInXDirection = (dimX-vecUpperOverlaps[0]-vecLowerOverlaps[0]) - (blockIdx.x)*blockDim.x+1; + + if( gridDim.y - 1 == blockIdx.y ) // care about number of values if we are in last block + maxThreadsInYDirection = (dimY-vecUpperOverlaps[1]-vecLowerOverlaps[1]) - (blockIdx.y)*blockDim.y+1; + __syncthreads(); + + // Setting changed array that contains info: "Did the value of this thread changed in last passage?" + // Will be used in parallel reduction ( inside block level ) + int currentIndex = thrj * blockDim.x + thri; + __shared__ volatile bool changed[ ( sizeSArray - 2 ) * ( sizeSArray - 2 ) ]; + changed[ currentIndex ] = false; + if( thrj == 0 && thri == 0 ) + changed[ 0 ] = true; // fist must be true to start while cycle + + + //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ]; + __shared__ volatile Real sArray[ sizeSArray * sizeSArray ]; + sArray[ (thrj+1) * sizeSArray + thri +1 ] = std::numeric_limits< Real >::max(); + + + //filling sArray edges + if( thri == 0 ) // + { + if( dimX - vecLowerOverlaps[ 0 ] > (blockIdx.x+1) * blockDim.x && thrj+1 < maxThreadsInYDirection ) + sArray[ (thrj+1)*sizeSArray + maxThreadsInXDirection ] = + aux[ (blockIdx.y*blockDim.y+vecLowerOverlaps[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 // this to get to right possition + + (thrj+1)*dimX + maxThreadsInXDirection + vecLowerOverlaps[0] ]; // rest to get the right sArray overlap + else + sArray[ (thrj+1)*sizeSArray + maxThreadsInXDirection ] = std::numeric_limits< Real >::max(); + } + + if( thri == 1 ) + { + if( ( blockIdx.x != 0 || vecLowerOverlaps[0] != 0 ) && thrj+1 < maxThreadsInYDirection ) + sArray[(thrj+1)*sizeSArray + 0] = + aux[ (blockIdx.y*blockDim.y+vecLowerOverlaps[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + + (thrj+1)*dimX + vecLowerOverlaps[0] ]; + else + sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max(); + } + + if( thri == 2 ) + { + if( dimY - vecLowerOverlaps[ 1 ] > (blockIdx.y+1) * blockDim.y && thrj+1 < maxThreadsInXDirection ) + sArray[ maxThreadsInYDirection * sizeSArray + thrj+1 ] = + aux[ ( blockIdx.y * blockDim.y + vecLowerOverlaps[ 1 ] ) * dimX - dimX + blockIdx.x * blockDim.x - 1 + + maxThreadsInYDirection * dimX + thrj + 1 + vecLowerOverlaps[0] ]; + else + sArray[ maxThreadsInYDirection*sizeSArray + thrj+1 ] = std::numeric_limits< Real >::max(); + + } + + if( thri == 3 ) + { + if( ( blockIdx.y != 0 || vecLowerOverlaps[1] != 0 ) && thrj+1 < maxThreadsInXDirection ) + sArray[0*sizeSArray + thrj+1] = + aux[ ( blockIdx.y * blockDim.y + vecLowerOverlaps[ 1 ] ) * dimX - dimX + blockIdx.x * blockDim.x - 1 + + thrj + 1 + vecLowerOverlaps[ 0 ] ]; + else + sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); + } + + // Filling sArray inside + if( i - vecLowerOverlaps[ 0 ] < dimX && j - vecLowerOverlaps[ 1 ] < dimY && + thri + 1 < maxThreadsInXDirection + vecUpperOverlaps[ 0 ] && + thrj + 1 < maxThreadsInYDirection + vecUpperOverlaps[ 1 ] ) + { + sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ] = aux[ j * dimX + i ]; + } + __syncthreads(); + + //main while cycle ( CALCULATES TILL VALUES ARE CHANGING ) + while( changed[ 0 ] ) + { + __syncthreads(); + + changed[ currentIndex] = false; + + //calculation of update cell + if( i < dimX - vecUpperOverlaps[ 0 ] && j < dimY - vecUpperOverlaps[ 1 ] ) + { + if( ! interfaceMap[ j * dimX + i ] ) + { + changed[ currentIndex ] = ptr.updateCell( sArray, thri + 1, thrj + 1, hx, hy ); + } + } + __syncthreads(); + + //pyramid reduction + if( blockDim.x * blockDim.y == 1024 ) + { + if( currentIndex < 512 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; + } + } + __syncthreads(); + if( blockDim.x * blockDim.y >= 512 ) + { + if( currentIndex < 256 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; + } + } + __syncthreads(); + if( blockDim.x * blockDim.y >= 256 ) + { + if( currentIndex < 128 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; + } + } + __syncthreads(); + if( blockDim.x * blockDim.y >= 128 ) + { + if( currentIndex < 64 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; + } + } + __syncthreads(); + if( currentIndex < 32 ) + { + if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; + if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; + if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; + if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; + if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; + if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; + } + // result of reduction is in changed[ 0 ] + + // If we calculated in passage, then the blockCalculationIndicator for this block has to be 1 + // means that we calculated in this block + if( thri == 0 && thrj == 0 && changed[ 0 ] ){ + blockCalculationIndicator[ blockIdx.y * gridDim.x + blockIdx.x ] = 1; + } + __syncthreads(); + } + + + + if( i < dimX && j < dimY && thri+1 < maxThreadsInXDirection && thrj+1 < maxThreadsInYDirection ) + helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ]; + __syncthreads(); + } + else + { + if( i < mesh.getDimensions().x() - vecUpperOverlaps[0] && j < mesh.getDimensions().y() - vecUpperOverlaps[1] ) + helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ]; + } +} +#endif + + + +/// ====================OPEN=MP============================================ +template< typename Real, + typename Device, + typename Index > +template< int sizeSArray > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +updateBlocks( InterfaceMapType interfaceMap, + MeshFunctionType aux, + MeshFunctionType helpFunc, + ArrayContainerView BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) +{ +#pragma omp parallel for schedule( dynamic ) + for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) + { + if( BlockIterHost[ i ] ) + { + MeshType mesh = interfaceMap.template getMesh< Devices::Host >(); + + int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); + //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl; + int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0); + int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0); + //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl; + int xkolik = numThreadsPerBlock + 1; + int ykolik = numThreadsPerBlock + 1; + + int blIdx = i%numOfBlockx; + int blIdy = i/numOfBlockx; + //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl; + + if( numOfBlockx - 1 == blIdx ) + xkolik = dimX - (blIdx)*numThreadsPerBlock+1; + + if( numOfBlocky -1 == blIdy ) + ykolik = dimY - (blIdy)*numThreadsPerBlock+1; + //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl; + + + /*bool changed[numThreadsPerBlock*numThreadsPerBlock]; + changed[ 0 ] = 1;*/ + Real hx = mesh.getSpaceSteps().x(); + Real hy = mesh.getSpaceSteps().y(); + + bool changed = false; + BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0; + + + Real *sArray; + sArray = new Real[ sizeSArray * sizeSArray ]; + if( sArray == nullptr ) + std::cout << "Error while allocating memory for sArray." << std::endl; + + for( IndexType thri = 0; thri < sizeSArray; thri++ ){ + for( IndexType thrj = 0; thrj < sizeSArray; thrj++ ) + sArray[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max(); + } + + + //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); + for( IndexType thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ ) + { + if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik ) + sArray[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ]; + + + if( blIdx != 0 && thrj+1 < ykolik ) + sArray[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ]; + + if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik ) + sArray[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ]; + + if( blIdy != 0 && thrj+1 < xkolik ) + sArray[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ]; + } + + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = 0; l < numThreadsPerBlock; l++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) + sArray[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]; + } + + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = 0; l < numThreadsPerBlock; l++ ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){ + //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl; + if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) + { + changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy) || changed; + + } + } + } + } + /*aux.save( "aux-1pruch.tnl" ); + for( int k = 0; k < sizeSArray; k++ ){ + for( int l = 0; l < sizeSArray; l++ ) { + std::cout << sArray[ k * sizeSArray + l] << " "; + } + std::cout << std::endl; + }*/ + + for( IndexType k = 0; k < numThreadsPerBlock; k++ ) + for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) + { + if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) + { + this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy); + } + } + } + /*aux.save( "aux-2pruch.tnl" ); + for( int k = 0; k < sizeSArray; k++ ){ + for( int l = 0; l < sizeSArray; l++ ) { + std::cout << sArray[ k * sizeSArray + l] << " "; + } + std::cout << std::endl; + }*/ + + for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ) + for( IndexType l = 0; l < numThreadsPerBlock; l++ ) { + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) + { + if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) + { + this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy); + } + } + } + /*aux.save( "aux-3pruch.tnl" ); + for( int k = 0; k < sizeSArray; k++ ){ + for( int l = 0; l < sizeSArray; l++ ) { + std::cout << sArray[ k * sizeSArray + l] << " "; + } + std::cout << std::endl; + }*/ + + for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ + for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) + { + if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) + { + this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx, hy, 1.0); + } + } + } + } + /*aux.save( "aux-4pruch.tnl" ); + for( int k = 0; k < sizeSArray; k++ ){ + for( int l = 0; l < sizeSArray; l++ ) { + std::cout << sArray[ k * sizeSArray + l] << " "; + } + std::cout << std::endl; + }*/ + + + if( changed ){ + BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1; + } + + + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = 0; l < numThreadsPerBlock; l++ ) { + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) + helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray[ (k + 1)* sizeSArray + l + 1 ]; + //std::cout<< sArray[k+1][l+1]; + } + //std::cout< +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY ) +{ + int* BlockIterPom; + BlockIterPom = new int [numBlockX * numBlockY]; + + for(int i = 0; i < numBlockX * numBlockY; i++) + { + BlockIterPom[ i ] = 0;//BlockIterPom[ i ] = 0; + int m=0, k=0; + m = i%numBlockX; + k = i/numBlockX; + if( m > 0 && BlockIterHost[ i - 1 ] ){ + BlockIterPom[ i ] = 1; + }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){ + BlockIterPom[ i ] = 1; + }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){ + BlockIterPom[ i ] = 1; + }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){ + BlockIterPom[ i ] = 1; + } + } + + for(int i = 0; i < numBlockX * numBlockY; i++) + { + if( !BlockIterHost[ i ] ) + BlockIterHost[ i ] = BlockIterPom[ i ]; + } + delete[] BlockIterPom; +} + + + diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..32548abcfe66affd71a79d3ed5f2f21d67df644e --- /dev/null +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h @@ -0,0 +1,1091 @@ +/* + * File: tnlDirectEikonalMethodBase3D_impl.h + * Author: Fencl + * + * Created on March 15, 2019 + */ + +#pragma once + +template< typename Real, + typename Device, + typename Index > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: +initInterface( const MeshFunctionPointer& _input, + MeshFunctionPointer& _output, + InterfaceMapPointer& _interfaceMap, + StaticVector vLower, StaticVector vUpper ) +{ + if( std::is_same< Device, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + const MeshType& mesh = _input->getMesh(); + + const int cudaBlockSize( 8 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); + int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize ); + if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) + std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; + dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); + Devices::Cuda::synchronizeDevice(); + CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(), + _output.template modifyData< Device >(), + _interfaceMap.template modifyData< Device >(), vLower, vUpper ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; +#endif + } + if( std::is_same< Device, Devices::Host >::value ) + { + const MeshFunctionType& input = _input.getData(); + MeshFunctionType& output = _output.modifyData(); + InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); + + const MeshType& mesh = input.getMesh(); + typedef typename MeshType::Cell Cell; + + Cell cell( mesh ); + for( cell.getCoordinates().z() = 0; + cell.getCoordinates().z() < mesh.getDimensions().z(); + cell.getCoordinates().z() ++ ) + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh.getDimensions().y(); + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + output[ cell.getIndex() ] = + input( cell ) > 0 ? std::numeric_limits< RealType >::max() : + - std::numeric_limits< RealType >::max(); + interfaceMap[ cell.getIndex() ] = false; + } + + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + const RealType& hz = mesh.getSpaceSteps().z(); + for( cell.getCoordinates().z() = 0 + vLower[2]; + cell.getCoordinates().z() < mesh.getDimensions().z() - vUpper[2]; + cell.getCoordinates().z() ++ ) + for( cell.getCoordinates().y() = 0 + vLower[1]; + cell.getCoordinates().y() < mesh.getDimensions().y() - vUpper[1]; + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0 + vLower[0]; + cell.getCoordinates().x() < mesh.getDimensions().x() - vUpper[0]; + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + const RealType& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const IndexType e = neighbors.template getEntityIndex< 1, 0, 0 >(); + const IndexType n = neighbors.template getEntityIndex< 0, 1, 0 >(); + const IndexType t = neighbors.template getEntityIndex< 0, 0, 1 >(); + + + if( c * input[ n ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + pom = pom - TNL::sign( c )*hy; + if( TNL::abs( output[ n ] ) > TNL::abs( pom ) ) + output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy; + + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ n ] = true; + } + + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + pom = pom - TNL::sign( c )*hx; + if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) + output[ e ] = pom; + + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ e ] = true; + } + + if( c * input[ t ] <= 0 ) + { + pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + pom = pom - TNL::sign( c )*hz; + if( TNL::abs( output[ t ] ) > TNL::abs( pom ) ) + output[ t ] = pom; + + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ t ] = true; + } + } + } + } +} + +template< typename Real, + typename Device, + typename Index > +template< typename MeshEntity > +__cuda_callable__ +bool +tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: +updateCell( MeshFunctionType& u, + const MeshEntity& cell, + const RealType v ) +{ + const auto& neighborEntities = cell.template getNeighborEntities< 3 >(); + const MeshType& mesh = cell.getMesh(); + + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + const RealType& hz = mesh.getSpaceSteps().z(); + const RealType value = u( cell ); + //std::cout << value << std::endl; + RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); + + + if( cell.getCoordinates().x() == 0 ) + a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ]; + else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) + a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ]; + else + { + a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ], + u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] ); + } + + if( cell.getCoordinates().y() == 0 ) + b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ]; + else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) + b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ]; + else + { + b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ], + u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] ); + } + + if( cell.getCoordinates().z() == 0 ) + c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ]; + else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 ) + c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ]; + else + { + c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ], + u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] ); + } + + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() && + fabs( c ) == std::numeric_limits< RealType >::max() ) + return false; + + RealType pom[6] = { a, b, c, hx, hy, hz}; + tmp = getNewValue( pom , value, v ); + + u[ cell.getIndex() ] = tmp; + tmp = value - u[ cell.getIndex() ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; + /*sortMinims( pom ); + tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; + + if( fabs( tmp ) < fabs( pom[ 1 ] ) ) + { + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + tmp = value - u[ cell.getIndex() ]; + if ( fabs( tmp ) > 0.001*hx ){ + return true; + }else{ + return false; + } + } + else + { + tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - + ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); + if( fabs( tmp ) < fabs( pom[ 2 ]) ) + { + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + tmp = value - u[ cell.getIndex() ]; + if ( fabs( tmp ) > 0.001*hx ){ + return true; + }else{ + return false; + } + } + else + { + tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + + TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - + hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - + hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + tmp = value - u[ cell.getIndex() ]; + if ( fabs( tmp ) > 0.001*hx ){ + return true; + }else{ + return false; + } + } + }*/ +} + +template< typename Real, + typename Device, + typename Index > +template< int sizeSArray > +__cuda_callable__ +bool +tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: +updateCell( volatile Real *sArray, int thri, int thrj, int thrk, + const Real hx, const Real hy, const Real hz, const Real v ) +{ + const RealType value = sArray[thrk *sizeSArray * sizeSArray + thrj * sizeSArray + thri]; + + RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); + + c = TNL::argAbsMin( sArray[ (thrk+1)* sizeSArray*sizeSArray + thrj * sizeSArray + thri ], + sArray[ (thrk-1) * sizeSArray *sizeSArray + thrj* sizeSArray + thri ] ); + + b = TNL::argAbsMin( sArray[ thrk* sizeSArray*sizeSArray + (thrj+1) * sizeSArray + thri ], + sArray[ thrk* sizeSArray * sizeSArray + (thrj-1)* sizeSArray +thri ] ); + + a = TNL::argAbsMin( sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri+1 ], + sArray[ thrk* sizeSArray * sizeSArray + thrj* sizeSArray +thri-1 ] ); + + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() && + fabs( c ) == std::numeric_limits< RealType >::max() ) + return false; + + RealType pom[6] = { a, b, c, hx, hy, hz}; + + tmp = getNewValue( pom , value, v ); + + sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = tmp; + tmp = value - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; + /*sortMinims( pom ); + + // calculation of real value taken from ZHAO + tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; + if( fabs( tmp ) < fabs( pom[ 1 ] ) ) + { + sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; + } + else + { + tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - + ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); + if( fabs( tmp ) < fabs( pom[ 2 ]) ) + { + sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; + } + else + { + tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + + TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - + hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - + hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); + sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; + } + }*/ + +} + + +template< typename Real, + typename Device, + typename Index > +__cuda_callable__ +Real +tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: +getNewValue( RealType valuesAndSteps[], const RealType originalValue, const RealType v ) +{ + RealType newValue = std::numeric_limits< RealType >::max(); + sortMinims( valuesAndSteps ); + + // calculation of real value taken from ZHAO + newValue = valuesAndSteps[ 0 ] + TNL::sign( originalValue ) * valuesAndSteps[ 3 ]/v; + if( fabs( newValue ) < fabs( valuesAndSteps[ 1 ] ) ) + { + newValue = argAbsMin( originalValue, newValue ); + } + else + { + newValue = ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] * valuesAndSteps[ 1 ] + + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] * valuesAndSteps[ 0 ] + + TNL::sign( originalValue ) * valuesAndSteps[ 3 ] * valuesAndSteps[ 4 ] * + TNL::sqrt( ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] )/( v * v ) - + ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) * + ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) ) )/ + ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] ); + if( fabs( newValue ) < fabs( valuesAndSteps[ 2 ]) ) + { + newValue = argAbsMin( originalValue, newValue ); + } + else + { + // Value from algorithm by Zhao + newValue = ( valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[5] * valuesAndSteps[5] * valuesAndSteps[0] + + valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[5] * valuesAndSteps[5] * valuesAndSteps[1] + + valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[2] + + TNL::sign(originalValue) * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[5] * TNL::sqrt( + (valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[5] * valuesAndSteps[5] + + valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[5] * valuesAndSteps[5] + + valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[4]) / (v * v) - + valuesAndSteps[5] * valuesAndSteps[5] * (valuesAndSteps[0] - valuesAndSteps[1]) * (valuesAndSteps[0] - valuesAndSteps[1]) - + valuesAndSteps[4] * valuesAndSteps[4] * (valuesAndSteps[0] - valuesAndSteps[2]) * (valuesAndSteps[0] - valuesAndSteps[2]) - + valuesAndSteps[3] * valuesAndSteps[3] * (valuesAndSteps[1] - valuesAndSteps[2]) * (valuesAndSteps[1] - valuesAndSteps[2]))) / ( + valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[4] + + valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[5] * valuesAndSteps[5] + + valuesAndSteps[5] * valuesAndSteps[5] * valuesAndSteps[3] * valuesAndSteps[3]); + newValue = argAbsMin( originalValue, newValue ); + } + } + + return newValue; +} + + +#ifdef HAVE_CUDA +template < typename Real, typename Device, typename Index > +__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, + Containers::StaticVector< 3, Index > vLower, Containers::StaticVector< 3, Index > vUpper ) +{ + int i = threadIdx.x + blockDim.x*blockIdx.x; + int j = blockDim.y*blockIdx.y + threadIdx.y; + int k = blockDim.z*blockIdx.z + threadIdx.z; + const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) + { + typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell; + Cell cell( mesh ); + cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k; + cell.refresh(); + const Index cind = cell.getIndex(); + + + output[ cind ] = + input( cell ) >= 0 ? std::numeric_limits< Real >::max() : + - std::numeric_limits< Real >::max(); + interfaceMap[ cind ] = false; + cell.refresh(); + + if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] && + k < mesh.getDimensions().y() - vUpper[2] && i>vLower[0]-1 && j> vLower[1]-1 && k>vLower[2]-1 ) + { + const Real& hx = mesh.getSpaceSteps().x(); + const Real& hy = mesh.getSpaceSteps().y(); + const Real& hz = mesh.getSpaceSteps().z(); + const Real& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const Index e = neighbors.template getEntityIndex< 1, 0, 0 >(); + const Index w = neighbors.template getEntityIndex< -1, 0, 0 >(); + const Index n = neighbors.template getEntityIndex< 0, 1, 0 >(); + const Index s = neighbors.template getEntityIndex< 0, -1, 0 >(); + const Index t = neighbors.template getEntityIndex< 0, 0, 1 >(); + const Index b = neighbors.template getEntityIndex< 0, 0, -1 >(); + + if( c * input[ n ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ w ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ s ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ b ] <= 0 ) + { + pom = TNL::sign( c )*( hz * c )/( c - input[ b ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ t ] <= 0 ) + { + pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + } + } + } +} + + +template < typename Index > +__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, + TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, + int numBlockX, int numBlockY, int numBlockZ ) +{ + int i = blockIdx.x * 1024 + threadIdx.x; + + if( i < numBlockX * numBlockY * numBlockZ ) + { + int pom = 0;//BlockIterPom[ i ] = 0; + int m=0, l=0, k=0; + l = i/( numBlockX * numBlockY ); + k = (i-l*numBlockX * numBlockY )/(numBlockX ); + m = (i-l*numBlockX * numBlockY )%( numBlockX ); + if( m > 0 && BlockIterDevice[ i - 1 ] ){ // left neighbour + pom = 1;//BlockIterPom[ i ] = 1; + }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ // right neighbour + pom = 1;//BlockIterPom[ i ] = 1; + }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ // bottom neighbour + pom = 1;// BlockIterPom[ i ] = 1; + }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ // top neighbour + pom = 1;//BlockIterPom[ i ] = 1; + }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){ // neighbour behind + pom = 1; + }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){ // neighbour in front + pom = 1; + } + + if( !BlockIterDevice[ i ] ) // only in CudaUpdateCellCaller can BlockIterDevice gain 0 + BlockIterPom[ i ] = pom; + else + BlockIterPom[ i ] = 1; + } +} + + +template < int sizeSArray, typename Real, typename Device, typename Index > +__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, + const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, + const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, + TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, + Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps ) +{ + int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; + int i = threadIdx.x + blockDim.x*blockIdx.x + vecLowerOverlaps[0]; // WITH OVERLAPS!!! i,j,k aren't coordinates of all values + int j = blockDim.y*blockIdx.y + threadIdx.y + vecLowerOverlaps[1]; + int k = blockDim.z*blockIdx.z + threadIdx.z + vecLowerOverlaps[2]; + int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri; + const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); + + // should this block calculate? + if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] ) + { + __syncthreads(); + + // Array indicates weather some threads calculated (for parallel reduction) + __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2) ]; + changed[ currentIndex ] = false; + + if( thrj == 0 && thri == 0 && thrk == 0 ) + changed[ 0 ] = true; // first indicates weather we should calculate again (princip of parallel reduction) + + //getting stepps and size of mesh + const Real hx = mesh.getSpaceSteps().x(); const int dimX = mesh.getDimensions().x(); + const Real hy = mesh.getSpaceSteps().y(); const int dimY = mesh.getDimensions().y(); + const Real hz = mesh.getSpaceSteps().z(); const int dimZ = mesh.getDimensions().z(); + + if( thrj == 1 && thri == 1 && thrk == 1 ) + { + // we dont know if we will calculate in here, more info down in code + BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0; + } + + // sArray contains values of one block (coppied from aux) and edges (not MPI) of those blocks + __shared__ volatile Real sArray[ sizeSArray * sizeSArray * sizeSArray ]; + sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = std::numeric_limits< Real >::max(); + + + int xkolik = blockDim.x + 1;// maximum of threads in x direction (for all blocks different) + int ykolik = blockDim.y + 1; + int zkolik = blockDim.z + 1; + __syncthreads(); + + if( gridDim.x - 1 == blockIdx.x ) + xkolik = (dimX-vecUpperOverlaps[0]-vecLowerOverlaps[0]) - blockIdx.x*blockDim.x+1; + if( gridDim.y -1 == blockIdx.y ) + ykolik = (dimY-vecUpperOverlaps[1]-vecLowerOverlaps[1]) - (blockIdx.y)*blockDim.y+1; + if( gridDim.z-1 == blockIdx.z ) + zkolik = (dimZ-vecUpperOverlaps[2]-vecLowerOverlaps[2]) - (blockIdx.z)*blockDim.z+1; + __syncthreads(); + + //filling sArray edges + if( thri == 0 ) //x bottom + { + if( (blockIdx.x != 0 || vecLowerOverlaps[0] !=0) && thrj+1 < ykolik && thrk+1 < zkolik ) + sArray[ (thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0 ] = + aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX + + blockIdx.x*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY + vecLowerOverlaps[0] ]; + else + sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max(); + } + + if( thri == 1 ) //xtop + { + if( dimX - vecLowerOverlaps[ 0 ] > (blockIdx.x+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = + aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX + + blockIdx.x*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY + vecLowerOverlaps[0] ]; + else + sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max(); + } + if( thri == 2 ) //y bottom + { + if( (blockIdx.y != 0 || vecLowerOverlaps[1] !=0) && thrj+1 < xkolik && thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = + aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX + + blockIdx.x*blockDim.x - dimX + thrj + thrk*dimX*dimY + vecLowerOverlaps[0] ]; + else + sArray[ (thrk+1) * sizeSArray * sizeSArray + 0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); + } + + if( thri == 3 ) //y top + { + if( dimY - vecLowerOverlaps[ 1 ] > (blockIdx.y+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = + aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + ((blockIdx.y+1) * blockDim.y+vecLowerOverlaps[1])*dimX + + blockIdx.x*blockDim.x + thrj + thrk*dimX*dimY + vecLowerOverlaps[0] ]; + else + sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); + } + if( thri == 4 ) //z bottom + { + if( (blockIdx.z != 0 || vecLowerOverlaps[2] !=0) && thrj+1 < ykolik && thrk+1 < xkolik ) + sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = + aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX + + blockIdx.x*blockDim.x - dimX * dimY + thrj * dimX + thrk + vecLowerOverlaps[0] ]; + else + sArray[0 * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thrk+1] = std::numeric_limits< Real >::max(); + } + + if( thri == 5 ) //z top + { + if( dimZ - vecLowerOverlaps[ 2 ] > (blockIdx.z+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik ) + sArray[ zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = + aux[ ((blockIdx.z+1)*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX + + blockIdx.x*blockDim.x + thrj * dimX + thrk + vecLowerOverlaps[0] ]; + else + sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = std::numeric_limits< Real >::max(); + } + + // Copy all other values that aren't edges + if( i - vecLowerOverlaps[0] < dimX && j - vecLowerOverlaps[1] < dimY && k - vecLowerOverlaps[2] < dimZ && + thri+1 < xkolik + vecUpperOverlaps[0] && thrj+1 < ykolik + vecUpperOverlaps[1] && thrk+1 < zkolik + vecUpperOverlaps[2] ) + { + sArray[(thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = aux[ k*dimX*dimY + j*dimX + i ]; + } + __syncthreads(); + + //main while cycle. each value can get information only from neighbour but that information has to spread there + while( changed[ 0 ] ) + { + __syncthreads(); + + changed[ currentIndex ] = false; + + //calculation of update cell + if( i < dimX - vecUpperOverlaps[0] && j < dimY - vecUpperOverlaps[1] && k < dimZ - vecUpperOverlaps[2] ) + { + if( ! interfaceMap[ k*dimX*dimY + j * dimX + i ] ) + { + // calculate new value depending on neighbours in sArray on (thri+1, thrj+1) coordinates + changed[ currentIndex ] = ptr.updateCell< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); + } + } + __syncthreads(); + + //pyramid reduction (parallel reduction) + if( blockDim.x*blockDim.y*blockDim.z == 1024 ) + { + if( currentIndex < 512 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; + } + } + __syncthreads(); + if( blockDim.x*blockDim.y*blockDim.z >= 512 ) + { + if( currentIndex < 256 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; + } + } + __syncthreads(); + if( blockDim.x*blockDim.y*blockDim.z >= 256 ) + { + if( currentIndex < 128 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; + } + } + __syncthreads(); + if( blockDim.x*blockDim.y*blockDim.z >= 128 ) + { + if( currentIndex < 64 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; + } + } + __syncthreads(); + if( currentIndex < 32 ) + { + if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; + if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; + if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; + if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; + if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; + if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; + } + __syncthreads(); + + // if we calculated, then the BlockIterDevice should contain the info about this whole block! (only one number for one block) + if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 ) + { + BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 1; + } + __syncthreads(); + } + + // copy results into helpFunc (not into aux bcs of conflicts) + if( i < dimX && j < dimY && k < dimZ && thri+1 < xkolik && thrj+1 < ykolik && thrk+1 < zkolik ) + helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thri+1 ]; + + } + else // if not, then it should at least copy the values from aux to helpFunc. + { + if( i < mesh.getDimensions().x() - vecUpperOverlaps[0] && j < mesh.getDimensions().y() - vecUpperOverlaps[1] + && k < mesh.getDimensions().z() - vecUpperOverlaps[2]) + helpFunc[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ] = + aux[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ]; + } +} +#endif + + +/// ==========================OPEN=MP================================= +template< typename Real, + typename Device, + typename Index > +template< int sizeSArray > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: +updateBlocks( const InterfaceMapType interfaceMap, + const MeshFunctionType aux, + MeshFunctionType& helpFunc, + ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) +{ + #pragma omp parallel for schedule( dynamic ) + for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) + { + if( BlockIterHost[ i ] ) + { + MeshType mesh = interfaceMap.template getMesh< Devices::Host >(); + + int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); + int dimZ = mesh.getDimensions().z(); + //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl; + int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0); + int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0); + int numOfBlockz = dimZ/numThreadsPerBlock + ((dimZ%numThreadsPerBlock != 0) ? 1:0); + //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl; + int xkolik = numThreadsPerBlock + 1; + int ykolik = numThreadsPerBlock + 1; + int zkolik = numThreadsPerBlock + 1; + + + int blIdz = i/( numOfBlockx * numOfBlocky ); + int blIdy = (i-blIdz*numOfBlockx * numOfBlocky )/(numOfBlockx ); + int blIdx = (i-blIdz*numOfBlockx * numOfBlocky )%( numOfBlockx ); + //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl; + + if( numOfBlockx - 1 == blIdx ) + xkolik = dimX - (blIdx)*numThreadsPerBlock+1; + if( numOfBlocky -1 == blIdy ) + ykolik = dimY - (blIdy)*numThreadsPerBlock+1; + if( numOfBlockz-1 == blIdz ) + zkolik = dimZ - (blIdz)*numThreadsPerBlock+1; + //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl; + + + /*bool changed[numThreadsPerBlock*numThreadsPerBlock]; + changed[ 0 ] = 1;*/ + Real hx = mesh.getSpaceSteps().x(); + Real hy = mesh.getSpaceSteps().y(); + Real hz = mesh.getSpaceSteps().z(); + + bool changed = false; + BlockIterHost[ i ] = 0; + + + Real *sArray; + sArray = new Real[ sizeSArray * sizeSArray * sizeSArray ]; + if( sArray == nullptr ) + std::cout << "Error while allocating memory for sArray." << std::endl; + + for( IndexType k = 0; k < sizeSArray; k++ ) + for( IndexType l = 0; l < sizeSArray; l++ ) + for( IndexType m = 0; m < sizeSArray; m++ ){ + sArray[ m * sizeSArray * sizeSArray + k * sizeSArray + l ] = std::numeric_limits< Real >::max(); + } + + + for( IndexType thrk = 0; thrk < numThreadsPerBlock; thrk++ ) + for( IndexType thrj = 0; thrj < numThreadsPerBlock; thrj++ ) + { + if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik ) + sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = + aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX -1 + thrk*dimX*dimY ]; + + if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = + aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy *numThreadsPerBlock*dimX+ blIdx*numThreadsPerBlock + numThreadsPerBlock + thrj * dimX + thrk*dimX*dimY ]; + + if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = + aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX + thrj + thrk*dimX*dimY ]; + + if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik && thrk+1 < zkolik ) + sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = + aux[ blIdz*numThreadsPerBlock * dimX * dimY + (blIdy+1) * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj + thrk*dimX*dimY ]; + + if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik ) + sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = + aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX * dimY + thrj * dimX + thrk ]; + + if( dimZ > (blIdz+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < xkolik ) + sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = + aux[ (blIdz+1)*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX + thrk ]; + } + + for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = 0; l < numThreadsPerBlock; l++ ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + sArray[(m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1] = + aux[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ]; + } + } + } + /*string s; + int numWhile = 0; + for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = 0; l < numThreadsPerBlock; l++ ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){ + //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl; + if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) + { + //printf("In with point m = %d, k = %d, l = %d\n", m, k, l); + changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz) || changed; + + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = 0; l template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + { + if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) + { + this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s ); + */ + for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + { + if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) + { + this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ + for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ + for( IndexType l = 0; l template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ + for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ + for( IndexType l = 0; l template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ + for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ + for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + { + if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) + { + this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + + for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ + for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ + for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + { + if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) + { + this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); + } + } + } + } + } + /*for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + for( int m = 0; m < numThreadsPerBlock; m++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) + helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + } + numWhile++; + s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; + helpFunc.save( s );*/ + + if( changed ){ + BlockIterHost[ i ] = 1; + } + + + for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ + for( IndexType l = 0; l < numThreadsPerBlock; l++ ) { + for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){ + helpFunc[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] = + sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; + //std::cout << helpFunc[ m*dimX*dimY + k*dimX + l ] << " "; + } + } + //std::cout << std::endl; + } + //std::cout << std::endl; + } + //helpFunc.save( "helpF.tnl"); + delete []sArray; + } + } +} + +template< typename Real, + typename Device, + typename Index > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: +getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ ) +{ + int* BlockIterPom; + BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ]; + + for( int i = 0; i< BlockIterHost.getSize(); i++) + { + BlockIterPom[ i ] = 0; + + int m=0, l=0, k=0; + l = i/( numBlockX * numBlockY ); + k = (i-l*numBlockX * numBlockY )/(numBlockX ); + m = (i-l*numBlockX * numBlockY )%( numBlockX ); + + if( m > 0 && BlockIterHost[ i - 1 ] ){ + BlockIterPom[ i ] = 1; + }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){ + BlockIterPom[ i ] = 1; + }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){ + BlockIterPom[ i ] = 1; + }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){ + BlockIterPom[ i ] = 1; + }else if( l > 0 && BlockIterHost[ i - numBlockX*numBlockY ] ){ + BlockIterPom[ i ] = 1; + }else if( l < numBlockZ-1 && BlockIterHost[ i + numBlockX*numBlockY ] ){ + BlockIterPom[ i ] = 1; + } + } + for( int i = 0; i< BlockIterHost.getSize(); i++) + { + BlockIterHost[ i ] = BlockIterPom[ i ]; + } +} diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h index 24a3885540e091047cd2580468ff24d3f1da38d6..7cba99f6586b84e31fbe1a49da4223c9e621dd5b 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h @@ -7,9 +7,9 @@ #pragma once -#include -#include -#include +//#include +//#include +//#include using namespace TNL; @@ -62,30 +62,44 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > typedef Functions::MeshFunction< MeshType > MeshFunctionType; typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType; typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer; + using ArrayContainerView = typename ArrayContainer::ViewType; + typedef Containers::StaticVector< 2, Index > StaticVector; + + using MeshPointer = Pointers::SharedPointer< MeshType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >; using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >; + // CALLER FOR HOST AND CUDA void initInterface( const MeshFunctionPointer& input, MeshFunctionPointer& output, - InterfaceMapPointer& interfaceMap ); - + InterfaceMapPointer& interfaceMap, + StaticVector vLower, StaticVector vUpper ); + + // FOR HOST template< typename MeshEntity > - __cuda_callable__ void updateCell( MeshFunctionType& u, + __cuda_callable__ bool updateCell( MeshFunctionType& u, const MeshEntity& cell, const RealType velocity = 1.0 ); + // FOR CUDA template< int sizeSArray > - __cuda_callable__ bool updateCell( volatile Real *sArray, - int thri, int thrj, const Real hx, const Real hy, - const Real velocity = 1.0 ); - + __cuda_callable__ bool updateCell( volatile RealType *sArray, + int thri, int thrj, const RealType hx, const RealType hy, + const RealType velocity = 1.0 ); + +// FOR OPENMP WILL BE REMOVED + void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY ); + template< int sizeSArray > - void updateBlocks( const InterfaceMapType& interfaceMap, - MeshFunctionType& aux, - MeshFunctionType& helpFunc, - ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ); + void updateBlocks( InterfaceMapType interfaceMap, + MeshFunctionType aux, + MeshFunctionType helpFunc, + ArrayContainerView BlockIterHost, int numThreadsPerBlock ); + + protected: - void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY ); + __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[], + const RealType originalValue, const RealType v ); }; template< typename Real, @@ -101,36 +115,49 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > typedef Functions::MeshFunction< MeshType > MeshFunctionType; typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType; typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer; + using ArrayContainerView = typename ArrayContainer::ViewType; + typedef Containers::StaticVector< 3, Index > StaticVector; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >; using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >; + // CALLER FOR HOST AND CUDA void initInterface( const MeshFunctionPointer& input, MeshFunctionPointer& output, - InterfaceMapPointer& interfaceMap ); + InterfaceMapPointer& interfaceMap, + StaticVector vLower, StaticVector vUpper ); + // FOR HOST template< typename MeshEntity > - __cuda_callable__ void updateCell( MeshFunctionType& u, + __cuda_callable__ bool updateCell( MeshFunctionType& u, const MeshEntity& cell, const RealType velocity = 1.0); + // FOR CUDA template< int sizeSArray > - void updateBlocks( const InterfaceMapType& interfaceMap, - const MeshFunctionType& aux, - MeshFunctionType& helpFunc, - ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ); + __cuda_callable__ bool updateCell( volatile Real *sArray, + int thri, int thrj, int thrk, const RealType hx, const RealType hy, const RealType hz, + const RealType velocity = 1.0 ); - void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY, int numBlockZ ); + // OPENMP WILL BE REMOVED + void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ ); template< int sizeSArray > - __cuda_callable__ bool updateCell3D( volatile Real *sArray, - int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz, - const Real velocity = 1.0 ); + void updateBlocks( const InterfaceMapType interfaceMap, + const MeshFunctionType aux, + MeshFunctionType& helpFunc, + ArrayContainer BlockIterHost, int numThreadsPerBlock ); + + protected: + + __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[], + const RealType originalValue, const RealType v ); }; template < typename T1 > __cuda_callable__ void sortMinims( T1 pom[] ); #ifdef HAVE_CUDA +// 1D template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output, @@ -142,42 +169,53 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux, bool *BlockIterDevice ); + + + +// 2D +template < typename Real, typename Device, typename Index > +__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, + const Containers::StaticVector< 2, Index > vecLowerOverlas, + const Containers::StaticVector< 2, Index > vecUpperOerlaps ); + template < int sizeSArray, typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock =0); + TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator, + const Containers::StaticVector< 2, Index > vecLowerOverlaps, + const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock =0); template < typename Index > -__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks ); +__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator, + TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY ); -template < typename Index > -__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY ); -template < typename Real, typename Device, typename Index > -__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ); +// 3D template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ); + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, + Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps ); template < int sizeSArray, typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice ); + TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, + Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps ); template < typename Index > -__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, +__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY, int numBlockZ ); #endif -#include "tnlDirectEikonalMethodsBase_impl.h" +#include "tnlDirectEikonalMethodBase1D_impl.h" +#include "tnlDirectEikonalMethodBase2D_impl.h" +#include "tnlDirectEikonalMethodBase3D_impl.h" diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h deleted file mode 100644 index 26444bcfa591d702fbf7fdb325db0fd846c31fa1..0000000000000000000000000000000000000000 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +++ /dev/null @@ -1,1639 +0,0 @@ -/* - * File: tnlDirectEikonalMethodsBase_impl.h - * Author: oberhuber - * - * Created on July 14, 2016, 3:22 PM - */ - -#pragma once - -#include - -#include -#include "tnlFastSweepingMethod.h" -#include "tnlDirectEikonalMethodsBase.h" - -template< typename Real, - typename Device, - typename Index > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: -initInterface( const MeshFunctionPointer& _input, - MeshFunctionPointer& _output, - InterfaceMapPointer& _interfaceMap ) -{ - if( std::is_same< Device, Devices::Cuda >::value ) - { -#ifdef HAVE_CUDA - const MeshType& mesh = _input->getMesh(); - - const int cudaBlockSize( 16 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); - dim3 blockSize( cudaBlockSize ); - dim3 gridSize( numBlocksX ); - Devices::Cuda::synchronizeDevice(); - CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), - _output.template modifyData< Device >(), - _interfaceMap.template modifyData< Device >() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; -#endif - } - if( std::is_same< Device, Devices::Host >::value ) - { - const MeshType& mesh = _input->getMesh(); - typedef typename MeshType::Cell Cell; - const MeshFunctionType& input = _input.getData(); - MeshFunctionType& output = _output.modifyData(); - InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); - Cell cell( mesh ); - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - output[ cell.getIndex() ] = - input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : - -std::numeric_limits< RealType >::max(); - interfaceMap[ cell.getIndex() ] = false; - } - - - const RealType& h = mesh.getSpaceSteps().x(); - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x() - 1; - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - const RealType& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - const auto& neighbors = cell.getNeighborEntities(); - Real pom = 0; - //const IndexType& c = cell.getIndex(); - const IndexType e = neighbors.template getEntityIndex< 1 >(); - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( h * c )/( c - input[ e ]); - if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) - output[ cell.getIndex() ] = pom; - - pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; - if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) - output[ e ] = pom; - - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ e ] = true; - } - } - } - } -} - -template< typename Real, - typename Device, - typename Index > -template< int sizeSArray > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -updateBlocks( const InterfaceMapType& interfaceMap, - MeshFunctionType& aux, - MeshFunctionType& helpFunc, - ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) -{ -#pragma omp parallel for schedule( dynamic ) - for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) - { - if( BlockIterHost[ i ] ) - { - MeshType mesh = interfaceMap.template getMesh< Devices::Host >(); - - int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); - //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl; - int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0); - int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0); - //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl; - int xkolik = numThreadsPerBlock + 1; - int ykolik = numThreadsPerBlock + 1; - - int blIdx = i%numOfBlockx; - int blIdy = i/numOfBlockx; - //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl; - - if( numOfBlockx - 1 == blIdx ) - xkolik = dimX - (blIdx)*numThreadsPerBlock+1; - - if( numOfBlocky -1 == blIdy ) - ykolik = dimY - (blIdy)*numThreadsPerBlock+1; - //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl; - - - /*bool changed[numThreadsPerBlock*numThreadsPerBlock]; - changed[ 0 ] = 1;*/ - Real hx = mesh.getSpaceSteps().x(); - Real hy = mesh.getSpaceSteps().y(); - - bool changed = false; - BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0; - - - Real *sArray; - sArray = new Real[ sizeSArray * sizeSArray ]; - if( sArray == nullptr ) - std::cout << "Error while allocating memory for sArray." << std::endl; - - for( IndexType thri = 0; thri < sizeSArray; thri++ ){ - for( IndexType thrj = 0; thrj < sizeSArray; thrj++ ) - sArray[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max(); - } - - - //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); - for( IndexType thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ ) - { - if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik ) - sArray[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ]; - - - if( blIdx != 0 && thrj+1 < ykolik ) - sArray[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ]; - - if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik ) - sArray[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ]; - - if( blIdy != 0 && thrj+1 < xkolik ) - sArray[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ]; - } - - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = 0; l < numThreadsPerBlock; l++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) - sArray[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]; - } - - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = 0; l < numThreadsPerBlock; l++ ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){ - //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl; - if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) - { - changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy) || changed; - - } - } - } - } - /*aux.save( "aux-1pruch.tnl" ); - for( int k = 0; k < sizeSArray; k++ ){ - for( int l = 0; l < sizeSArray; l++ ) { - std::cout << sArray[ k * sizeSArray + l] << " "; - } - std::cout << std::endl; - }*/ - - for( IndexType k = 0; k < numThreadsPerBlock; k++ ) - for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) - { - if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) - { - this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy); - } - } - } - /*aux.save( "aux-2pruch.tnl" ); - for( int k = 0; k < sizeSArray; k++ ){ - for( int l = 0; l < sizeSArray; l++ ) { - std::cout << sArray[ k * sizeSArray + l] << " "; - } - std::cout << std::endl; - }*/ - - for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ) - for( IndexType l = 0; l < numThreadsPerBlock; l++ ) { - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) - { - if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) - { - this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy); - } - } - } - /*aux.save( "aux-3pruch.tnl" ); - for( int k = 0; k < sizeSArray; k++ ){ - for( int l = 0; l < sizeSArray; l++ ) { - std::cout << sArray[ k * sizeSArray + l] << " "; - } - std::cout << std::endl; - }*/ - - for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ - for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) - { - if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) - { - this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx, hy, 1.0); - } - } - } - } - /*aux.save( "aux-4pruch.tnl" ); - for( int k = 0; k < sizeSArray; k++ ){ - for( int l = 0; l < sizeSArray; l++ ) { - std::cout << sArray[ k * sizeSArray + l] << " "; - } - std::cout << std::endl; - }*/ - - - if( changed ){ - BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1; - } - - - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = 0; l < numThreadsPerBlock; l++ ) { - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) - helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray[ (k + 1)* sizeSArray + l + 1 ]; - //std::cout<< sArray[k+1][l+1]; - } - //std::cout< -template< int sizeSArray > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: -updateBlocks( const InterfaceMapType& interfaceMap, - const MeshFunctionType& aux, - MeshFunctionType& helpFunc, - ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) -{ -//#pragma omp parallel for schedule( dynamic ) - for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) - { - if( BlockIterHost[ i ] ) - { - MeshType mesh = interfaceMap.template getMesh< Devices::Host >(); - - int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); - int dimZ = mesh.getDimensions().z(); - //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl; - int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0); - int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0); - int numOfBlockz = dimZ/numThreadsPerBlock + ((dimZ%numThreadsPerBlock != 0) ? 1:0); - //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl; - int xkolik = numThreadsPerBlock + 1; - int ykolik = numThreadsPerBlock + 1; - int zkolik = numThreadsPerBlock + 1; - - - int blIdz = i/( numOfBlockx * numOfBlocky ); - int blIdy = (i-blIdz*numOfBlockx * numOfBlocky )/(numOfBlockx ); - int blIdx = (i-blIdz*numOfBlockx * numOfBlocky )%( numOfBlockx ); - //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl; - - if( numOfBlockx - 1 == blIdx ) - xkolik = dimX - (blIdx)*numThreadsPerBlock+1; - if( numOfBlocky -1 == blIdy ) - ykolik = dimY - (blIdy)*numThreadsPerBlock+1; - if( numOfBlockz-1 == blIdz ) - zkolik = dimZ - (blIdz)*numThreadsPerBlock+1; - //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl; - - - /*bool changed[numThreadsPerBlock*numThreadsPerBlock]; - changed[ 0 ] = 1;*/ - Real hx = mesh.getSpaceSteps().x(); - Real hy = mesh.getSpaceSteps().y(); - Real hz = mesh.getSpaceSteps().z(); - - bool changed = false; - BlockIterHost[ i ] = 0; - - - Real *sArray; - sArray = new Real[ sizeSArray * sizeSArray * sizeSArray ]; - if( sArray == nullptr ) - std::cout << "Error while allocating memory for sArray." << std::endl; - - for( IndexType k = 0; k < sizeSArray; k++ ) - for( IndexType l = 0; l < sizeSArray; l++ ) - for( IndexType m = 0; m < sizeSArray; m++ ){ - sArray[ m * sizeSArray * sizeSArray + k * sizeSArray + l ] = std::numeric_limits< Real >::max(); - } - - - for( IndexType thrk = 0; thrk < numThreadsPerBlock; thrk++ ) - for( IndexType thrj = 0; thrj < numThreadsPerBlock; thrj++ ) - { - if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik ) - sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = - aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX -1 + thrk*dimX*dimY ]; - - if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = - aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy *numThreadsPerBlock*dimX+ blIdx*numThreadsPerBlock + numThreadsPerBlock + thrj * dimX + thrk*dimX*dimY ]; - - if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = - aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX + thrj + thrk*dimX*dimY ]; - - if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = - aux[ blIdz*numThreadsPerBlock * dimX * dimY + (blIdy+1) * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj + thrk*dimX*dimY ]; - - if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik ) - sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = - aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX * dimY + thrj * dimX + thrk ]; - - if( dimZ > (blIdz+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < xkolik ) - sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = - aux[ (blIdz+1)*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX + thrk ]; - } - - for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = 0; l < numThreadsPerBlock; l++ ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - sArray[(m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1] = - aux[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ]; - } - } - } - /*string s; - int numWhile = 0; - for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = 0; l < numThreadsPerBlock; l++ ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){ - //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl; - if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) - { - //printf("In with point m = %d, k = %d, l = %d\n", m, k, l); - changed = this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz) || changed; - - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = 0; l template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - { - if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) - { - this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s ); - */ - for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - { - if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) - { - this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ - for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ - for( IndexType l = 0; l template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ - for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ - for( IndexType l = 0; l template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ - for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ - for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - { - if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) - { - this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - - for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){ - for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){ - for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - { - if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] ) - { - this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz); - } - } - } - } - } - /*for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) - for( int m = 0; m < numThreadsPerBlock; m++ ) - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ) - helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - } - numWhile++; - s = "helpFunc-"+ std::to_string(numWhile) + ".tnl"; - helpFunc.save( s );*/ - - if( changed ){ - BlockIterHost[ i ] = 1; - } - - - for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ - for( IndexType l = 0; l < numThreadsPerBlock; l++ ) { - for( IndexType m = 0; m < numThreadsPerBlock; m++ ){ - if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){ - helpFunc[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] = - sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ]; - //std::cout << helpFunc[ m*dimX*dimY + k*dimX + l ] << " "; - } - } - //std::cout << std::endl; - } - //std::cout << std::endl; - } - //helpFunc.save( "helpF.tnl"); - delete []sArray; - } - } -} -template< typename Real, - typename Device, - typename Index > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: -getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY, int numBlockZ ) -{ - int* BlockIterPom; - BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ]; - - for( int i = 0; i< BlockIterHost.getSize(); i++) - { - BlockIterPom[ i ] = 0; - - int m=0, l=0, k=0; - l = i/( numBlockX * numBlockY ); - k = (i-l*numBlockX * numBlockY )/(numBlockX ); - m = (i-l*numBlockX * numBlockY )%( numBlockX ); - - if( m > 0 && BlockIterHost[ i - 1 ] ){ - BlockIterPom[ i ] = 1; - }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){ - BlockIterPom[ i ] = 1; - }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){ - BlockIterPom[ i ] = 1; - }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){ - BlockIterPom[ i ] = 1; - }else if( l > 0 && BlockIterHost[ i - numBlockX*numBlockY ] ){ - BlockIterPom[ i ] = 1; - }else if( l < numBlockZ-1 && BlockIterHost[ i + numBlockX*numBlockY ] ){ - BlockIterPom[ i ] = 1; - } - } - for( int i = 0; i< BlockIterHost.getSize(); i++) - { - BlockIterHost[ i ] = BlockIterPom[ i ]; - } -} - - -template< typename Real, - typename Device, - typename Index > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY ) -{ - int* BlockIterPom; - BlockIterPom = new int [numBlockX * numBlockY]; - - for(int i = 0; i < numBlockX * numBlockY; i++) - { - BlockIterPom[ i ] = 0;//BlockIterPom[ i ] = 0; - int m=0, k=0; - m = i%numBlockX; - k = i/numBlockX; - if( m > 0 && BlockIterHost[ i - 1 ] ){ - BlockIterPom[ i ] = 1; - }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){ - BlockIterPom[ i ] = 1; - }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){ - BlockIterPom[ i ] = 1; - }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){ - BlockIterPom[ i ] = 1; - } - //BlockIterPom[ i ]; - } - - for(int i = 0; i < numBlockX * numBlockY; i++) - { - if( !BlockIterHost[ i ] ) - BlockIterHost[ i ] = BlockIterPom[ i ]; - } - /*else - BlockIter[ i ] = 0;*/ - /*for( int i = numBlockX-1; i > -1; i-- ) - { - for( int j = 0; j< numBlockY; j++ ) - std::cout << BlockIterHost[ i*numBlockY + j ]; - std::cout << std::endl; - } - std::cout << std::endl;*/ - delete[] BlockIterPom; -} - -template< typename Real, - typename Device, - typename Index > -template< typename MeshEntity > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: -updateCell( MeshFunctionType& u, - const MeshEntity& cell, - const RealType v ) -{ - const auto& neighborEntities = cell.template getNeighborEntities< 1 >(); - const MeshType& mesh = cell.getMesh(); - const RealType& h = mesh.getSpaceSteps().x(); - const RealType value = u( cell ); - RealType a, tmp = std::numeric_limits< RealType >::max(); - - if( cell.getCoordinates().x() == 0 ) - a = u[ neighborEntities.template getEntityIndex< 1 >() ]; - else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) - a = u[ neighborEntities.template getEntityIndex< -1 >() ]; - else - { - a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ], - u[ neighborEntities.template getEntityIndex< 1 >() ] ); - } - - if( fabs( a ) == std::numeric_limits< RealType >::max() ) - return; - - tmp = a + TNL::sign( value ) * h/v; - - u[ cell.getIndex() ] = argAbsMin( value, tmp ); -} - -template< typename Real, - typename Device, - typename Index > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -initInterface( const MeshFunctionPointer& _input, - MeshFunctionPointer& _output, - InterfaceMapPointer& _interfaceMap ) -{ - - if( std::is_same< Device, Devices::Cuda >::value ) - { -#ifdef HAVE_CUDA - const MeshType& mesh = _input->getMesh(); - - const int cudaBlockSize( 16 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); - dim3 blockSize( cudaBlockSize, cudaBlockSize ); - dim3 gridSize( numBlocksX, numBlocksY ); - Devices::Cuda::synchronizeDevice(); - CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), - _output.template modifyData< Device >(), - _interfaceMap.template modifyData< Device >() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; -#endif - } - if( std::is_same< Device, Devices::Host >::value ) - { - MeshFunctionType input = _input.getData(); - - /*double A[320][320]; - std::ifstream fileInit("/home/maty/Downloads/initData.txt"); - - for (int i = 0; i < 320; i++) - for (int j = 0; j < 320; j++) - fileInit >> A[j]; - fileInit.close(); - for (int i = 0; i < 320; i++) - for (int j = 0; j < 320; j++) - input[i*320 + j] = A[j];*/ - - - MeshFunctionType& output = _output.modifyData(); - InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); - const MeshType& mesh = input.getMesh(); - typedef typename MeshType::Cell Cell; - Cell cell( mesh ); - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh.getDimensions().y(); - cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - output[ cell.getIndex() ] = - input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : - - std::numeric_limits< RealType >::max(); - interfaceMap[ cell.getIndex() ] = false; - } - - const RealType& hx = mesh.getSpaceSteps().x(); - const RealType& hy = mesh.getSpaceSteps().y(); - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh.getDimensions().y(); - cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - const RealType& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const IndexType e = neighbors.template getEntityIndex< 1, 0 >(); - const IndexType n = neighbors.template getEntityIndex< 0, 1 >(); - //Try init with exact data: - /*if( c * input[ n ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ n ] = input[ n ]; - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ n ] = true; - } - if( c * input[ e ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ e ] = input[ e ]; - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ e ] = true; - }*/ - if( c * input[ n ] <= 0 ) - { - /*if( c >= 0 ) - {*/ - pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); - if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) - output[ cell.getIndex() ] = pom; - pom = pom - TNL::sign( c )*hy; - if( TNL::abs( output[ n ] ) > TNL::abs( pom ) ) - output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy; - /*}else - { - pom = - ( hy * c )/( c - input[ n ]); - if( output[ cell.getIndex() ] < pom ) - output[ cell.getIndex() ] = pom; - if( output[ n ] > hy + pom ) - output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]); - }*/ - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ n ] = true; - } - if( c * input[ e ] <= 0 ) - { - /*if( c >= 0 ) - {*/ - pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); - if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) - output[ cell.getIndex() ] = pom; - - pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; - if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) - output[ e ] = pom; - /*}else - { - pom = - (hx * c)/( c - input[ e ]); - if( output[ cell.getIndex() ] < pom ) - output[ cell.getIndex() ] = pom; - - pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]); - if( output[ e ] > pom ) - output[ e ] = pom; - }*/ - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ e ] = true; - } - } - } - } -} - -template< typename Real, - typename Device, - typename Index > -template< typename MeshEntity > -__cuda_callable__ -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -updateCell( MeshFunctionType& u, - const MeshEntity& cell, - const RealType v) -{ - const auto& neighborEntities = cell.template getNeighborEntities< 2 >(); - const MeshType& mesh = cell.getMesh(); - const RealType& hx = mesh.getSpaceSteps().x(); - const RealType& hy = mesh.getSpaceSteps().y(); - const RealType value = u( cell ); - RealType a, b, tmp = std::numeric_limits< RealType >::max(); - - if( cell.getCoordinates().x() == 0 ) - a = u[ neighborEntities.template getEntityIndex< 1, 0 >() ]; - else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) - a = u[ neighborEntities.template getEntityIndex< -1, 0 >() ]; - else - { - a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0 >() ], - u[ neighborEntities.template getEntityIndex< 1, 0 >() ] ); - } - - if( cell.getCoordinates().y() == 0 ) - b = u[ neighborEntities.template getEntityIndex< 0, 1 >()]; - else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) - b = u[ neighborEntities.template getEntityIndex< 0, -1 >() ]; - else - { - b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1 >() ], - u[ neighborEntities.template getEntityIndex< 0, 1 >() ] ); - } - if( fabs( a ) == std::numeric_limits< RealType >::max() && - fabs( b ) == std::numeric_limits< RealType >::max() ) - return; - /*if( fabs( a ) == TypeInfo< Real >::getMaxValue() || - fabs( b ) == TypeInfo< Real >::getMaxValue() || - fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) ) - { - tmp = - fabs( a ) >= fabs( b ) ? b + TNL::sign( value ) * hy : - a + TNL::sign( value ) * hx; - }*/ - /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() && - fabs( b ) != TypeInfo< Real >::getMaxValue() && - fabs( a - b ) < TNL::sqrt( (hx * hx + hy * hy)/v ) ) - { - tmp = ( hx * hx * b + hy * hy * a + - sign( value ) * hx * hy * TNL::sqrt( ( hx * hx + hy * hy )/v - - ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy ); - u[ cell.getIndex() ] = tmp; - } - else - { - tmp = - fabs( a ) > fabs( b ) ? b + TNL::sign( value ) * hy/v : - a + TNL::sign( value ) * hx/v; - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - //tmp = TypeInfo< RealType >::getMaxValue(); - }*/ - RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; - sortMinims( pom ); - tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; - - - if( fabs( tmp ) < fabs( pom[ 1 ] ) ) - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - else - { - tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + - TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - } -} - -template< typename Real, - typename Device, - typename Index > -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: -initInterface( const MeshFunctionPointer& _input, - MeshFunctionPointer& _output, - InterfaceMapPointer& _interfaceMap ) -{ - if( std::is_same< Device, Devices::Cuda >::value ) - { -#ifdef HAVE_CUDA - const MeshType& mesh = _input->getMesh(); - - const int cudaBlockSize( 8 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); - int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize ); - if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) - std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; - dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); - dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); - Devices::Cuda::synchronizeDevice(); - CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(), - _output.template modifyData< Device >(), - _interfaceMap.template modifyData< Device >() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; -#endif - } - if( std::is_same< Device, Devices::Host >::value ) - { - const MeshFunctionType& input = _input.getData(); - MeshFunctionType& output = _output.modifyData(); - InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); - const MeshType& mesh = input.getMesh(); - typedef typename MeshType::Cell Cell; - Cell cell( mesh ); - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh.getDimensions().z(); - cell.getCoordinates().z() ++ ) - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh.getDimensions().y(); - cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - output[ cell.getIndex() ] = - input( cell ) > 0 ? 10://std::numeric_limits< RealType >::max() : - -10;//- std::numeric_limits< RealType >::max(); - interfaceMap[ cell.getIndex() ] = false; - } - - const RealType& hx = mesh.getSpaceSteps().x(); - const RealType& hy = mesh.getSpaceSteps().y(); - const RealType& hz = mesh.getSpaceSteps().z(); - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh.getDimensions().z(); - cell.getCoordinates().z() ++ ) - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh.getDimensions().y(); - cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - const RealType& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const IndexType e = neighbors.template getEntityIndex< 1, 0, 0 >(); - const IndexType n = neighbors.template getEntityIndex< 0, 1, 0 >(); - const IndexType t = neighbors.template getEntityIndex< 0, 0, 1 >(); - //Try exact initiation - /*const IndexType w = neighbors.template getEntityIndex< -1, 0, 0 >(); - const IndexType s = neighbors.template getEntityIndex< 0, -1, 0 >(); - const IndexType b = neighbors.template getEntityIndex< 0, 0, -1 >(); - if( c * input[ e ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ e ] = input[ e ]; - interfaceMap[ e ] = true; - interfaceMap[ cell.getIndex() ] = true; - } - else if( c * input[ n ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ n ] = input[ n ]; - interfaceMap[ n ] = true; - interfaceMap[ cell.getIndex() ] = true; - } - else if( c * input[ t ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ t ] = input[ t ]; - interfaceMap[ t ] = true; - interfaceMap[ cell.getIndex() ] = true; - }*/ - if( c * input[ n ] <= 0 ) - { - if( c >= 0 ) - { - pom = ( hy * c )/( c - input[ n ]); - if( output[ cell.getIndex() ] > pom ) - output[ cell.getIndex() ] = pom; - - if ( output[ n ] < pom - hy) - output[ n ] = pom - hy; // ( hy * c )/( c - input[ n ]) - hy; - - }else - { - pom = - ( hy * c )/( c - input[ n ]); - if( output[ cell.getIndex() ] < pom ) - output[ cell.getIndex() ] = pom; - if( output[ n ] > hy + pom ) - output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]); - - } - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ n ] = true; - } - if( c * input[ e ] <= 0 ) - { - if( c >= 0 ) - { - pom = ( hx * c )/( c - input[ e ]); - if( output[ cell.getIndex() ] > pom ) - output[ cell.getIndex() ] = pom; - - pom = pom - hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; - if( output[ e ] < pom ) - output[ e ] = pom; - - }else - { - pom = - (hx * c)/( c - input[ e ]); - if( output[ cell.getIndex() ] < pom ) - output[ cell.getIndex() ] = pom; - - pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]); - if( output[ e ] > pom ) - output[ e ] = pom; - } - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ e ] = true; - } - if( c * input[ t ] <= 0 ) - { - if( c >= 0 ) - { - pom = ( hz * c )/( c - input[ t ]); - if( output[ cell.getIndex() ] > pom ) - output[ cell.getIndex() ] = pom; - - pom = pom - hz; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; - if( output[ t ] < pom ) - output[ t ] = pom; - - }else - { - pom = - (hz * c)/( c - input[ t ]); - if( output[ cell.getIndex() ] < pom ) - output[ cell.getIndex() ] = pom; - - pom = pom + hz; //output[ e ] = hx - (hx * c)/( c - input[ e ]); - if( output[ t ] > pom ) - output[ t ] = pom; - - } - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ t ] = true; - } - } - /*output[ cell.getIndex() ] = - c > 0 ? TypeInfo< RealType >::getMaxValue() : - -TypeInfo< RealType >::getMaxValue(); - interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245 - } - } -} - -template< typename Real, - typename Device, - typename Index > -template< typename MeshEntity > -__cuda_callable__ -void -tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: -updateCell( MeshFunctionType& u, - const MeshEntity& cell, - const RealType v ) -{ - const auto& neighborEntities = cell.template getNeighborEntities< 3 >(); - const MeshType& mesh = cell.getMesh(); - - const RealType& hx = mesh.getSpaceSteps().x(); - const RealType& hy = mesh.getSpaceSteps().y(); - const RealType& hz = mesh.getSpaceSteps().z(); - const RealType value = u( cell ); - //std::cout << value << std::endl; - RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); - - - if( cell.getCoordinates().x() == 0 ) - a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ]; - else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) - a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ]; - else - { - a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ], - u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] ); - } - if( cell.getCoordinates().y() == 0 ) - b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ]; - else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) - b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ]; - else - { - b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ], - u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] ); - }if( cell.getCoordinates().z() == 0 ) - c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ]; - else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 ) - c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ]; - else - { - c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ], - u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] ); - } - if( fabs( a ) == std::numeric_limits< RealType >::max() && - fabs( b ) == std::numeric_limits< RealType >::max() && - fabs( c ) == std::numeric_limits< RealType >::max() ) - return; - - - /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() && - fabs( b ) != TypeInfo< Real >::getMaxValue() && - fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) ) - { - tmp = ( hx * hx * a + hy * hy * b + - sign( value ) * hx * hy * sqrt( ( hx * hx + hy * hy )/v - - ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy ); - } - if( fabs( a ) != TypeInfo< Real >::getMaxValue() && - fabs( c ) != TypeInfo< Real >::getMaxValue() && - fabs( a - c ) >= TNL::sqrt( (hx * hx + hz * hz)/v ) ) - { - tmp = ( hx * hx * a + hz * hz * c + - sign( value ) * hx * hz * sqrt( ( hx * hx + hz * hz )/v - - ( a - c ) * ( a - c ) ) )/( hx * hx + hz * hz ); - } - if( fabs( b ) != TypeInfo< Real >::getMaxValue() && - fabs( c ) != TypeInfo< Real >::getMaxValue() && - fabs( b - c ) >= TNL::sqrt( (hy * hy + hz * hz)/v ) ) - { - tmp = ( hy * hy * b + hz * hz * c + - sign( value ) * hy * hz * sqrt( ( hy * hy + hz * hz )/v - - ( b - c ) * ( b - c ) ) )/( hy * hy + hz * hz ); - }*/ - RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; - sortMinims( pom ); - tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; - if( fabs( tmp ) < fabs( pom[ 1 ] ) ) - { - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - } - else - { - tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + - TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); - if( fabs( tmp ) < fabs( pom[ 2 ]) ) - { - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - } - else - { - tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + - TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - - hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - - hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - } - } -} - -template < typename T1 > -__cuda_callable__ void sortMinims( T1 pom[] ) -{ - T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; - if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){ - tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2]; - tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5]; - - } - else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){ - tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1]; - tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4]; - } - else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){ - tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2]; - tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5]; - } - else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){ - tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0]; - tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3]; - } - else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){ - tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1]; - tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4]; - } - else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){ - tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0]; - tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3]; - } - - for( unsigned int i = 0; i < 6; i++ ) - { - pom[ i ] = tmp[ i ]; - } -} - -template< typename Real, - typename Device, - typename Index > -template< int sizeSArray > -__cuda_callable__ -bool -tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy, - const Real v ) -{ - const RealType value = sArray[ thrj * sizeSArray + thri ]; - RealType a, b, tmp = std::numeric_limits< RealType >::max(); - - b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ], - sArray[ (thrj-1) * sizeSArray + thri ] ); - - a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ], - sArray[ thrj * sizeSArray + thri-1 ] ); - - if( fabs( a ) == std::numeric_limits< RealType >::max() && - fabs( b ) == std::numeric_limits< RealType >::max() ) - return false; - - RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; - sortMinims( pom ); - tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; - - - if( fabs( tmp ) < fabs( pom[ 1 ] ) ) - { - sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrj * sizeSArray + thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; - } - else - { - tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + - TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); - sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrj * sizeSArray + thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; - } - - return false; -} -template< typename Real, - typename Device, - typename Index > -template< int sizeSArray > -__cuda_callable__ -bool -tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: -updateCell3D( volatile Real *sArray, int thri, int thrj, int thrk, - const Real hx, const Real hy, const Real hz, const Real v ) -{ - const RealType value = sArray[thrk *sizeSArray * sizeSArray + thrj * sizeSArray + thri]; - - RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); - - c = TNL::argAbsMin( sArray[ (thrk+1)* sizeSArray*sizeSArray + thrj * sizeSArray + thri ], - sArray[ (thrk-1) * sizeSArray *sizeSArray + thrj* sizeSArray + thri ] ); - - b = TNL::argAbsMin( sArray[ thrk* sizeSArray*sizeSArray + (thrj+1) * sizeSArray + thri ], - sArray[ thrk* sizeSArray * sizeSArray + (thrj-1)* sizeSArray +thri ] ); - - a = TNL::argAbsMin( sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri+1 ], - sArray[ thrk* sizeSArray * sizeSArray + thrj* sizeSArray +thri-1 ] ); - - /*if( thrk == 8 ) - printf("Calculating a = %f, b = %f, c = %f\n" , a, b, c );*/ - - if( fabs( a ) == 10&& //std::numeric_limits< RealType >::max() && - fabs( b ) == 10&&//std::numeric_limits< RealType >::max() && - fabs( c ) == 10)//std::numeric_limits< RealType >::max() ) - return false; - - RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; - - sortMinims( pom ); - - tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; - if( fabs( tmp ) < fabs( pom[ 1 ] ) ) - { - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; - } - else - { - tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + - TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); - if( fabs( tmp ) < fabs( pom[ 2 ]) ) - { - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; - } - else - { - tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + - TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - - hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - - hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; - } - } - - return false; -} - -#ifdef HAVE_CUDA -template < typename Real, typename Device, typename Index > -__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, - Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap ) -{ - int i = threadIdx.x + blockDim.x*blockIdx.x; - const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); - - if( i < mesh.getDimensions().x() ) - { - typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell; - Cell cell( mesh ); - cell.getCoordinates().x() = i; - cell.refresh(); - const Index cind = cell.getIndex(); - - - output[ cind ] = - input( cell ) >= 0 ? std::numeric_limits< Real >::max() : - - std::numeric_limits< Real >::max(); - interfaceMap[ cind ] = false; - - const Real& h = mesh.getSpaceSteps().x(); - cell.refresh(); - const Real& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const Index e = neighbors.template getEntityIndex< 1 >(); - const Index w = neighbors.template getEntityIndex< -1 >(); - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( h * c )/( c - input[ e ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ w ] <= 0 ) - { - pom = TNL::sign( c )*( h * c )/( c - input[ w ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - } - } - -} -template < typename Real, typename Device, typename Index > -__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ) -{ - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = blockDim.y*blockIdx.y + threadIdx.y; - const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); - - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) - { - typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell; - Cell cell( mesh ); - cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; - cell.refresh(); - const Index cind = cell.getIndex(); - - - output[ cind ] = - input( cell ) >= 0 ? std::numeric_limits< Real >::max() : - - std::numeric_limits< Real >::max(); - interfaceMap[ cind ] = false; - - const Real& hx = mesh.getSpaceSteps().x(); - const Real& hy = mesh.getSpaceSteps().y(); - cell.refresh(); - const Real& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const Index e = neighbors.template getEntityIndex< 1, 0 >(); - const Index w = neighbors.template getEntityIndex< -1, 0 >(); - const Index n = neighbors.template getEntityIndex< 0, 1 >(); - const Index s = neighbors.template getEntityIndex< 0, -1 >(); - - if( c * input[ n ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cell.getIndex() ] = true; - } - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ w ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ s ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - } - } -} - -template < typename Real, typename Device, typename Index > -__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ) -{ - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = blockDim.y*blockIdx.y + threadIdx.y; - int k = blockDim.z*blockIdx.z + threadIdx.z; - const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); - - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) - { - typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell; - Cell cell( mesh ); - cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k; - cell.refresh(); - const Index cind = cell.getIndex(); - - - output[ cind ] = - input( cell ) >= 0 ? std::numeric_limits< Real >::max() : - - std::numeric_limits< Real >::max(); - interfaceMap[ cind ] = false; - cell.refresh(); - - const Real& hx = mesh.getSpaceSteps().x(); - const Real& hy = mesh.getSpaceSteps().y(); - const Real& hz = mesh.getSpaceSteps().z(); - const Real& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const Index e = neighbors.template getEntityIndex< 1, 0, 0 >(); - const Index w = neighbors.template getEntityIndex< -1, 0, 0 >(); - const Index n = neighbors.template getEntityIndex< 0, 1, 0 >(); - const Index s = neighbors.template getEntityIndex< 0, -1, 0 >(); - const Index t = neighbors.template getEntityIndex< 0, 0, 1 >(); - const Index b = neighbors.template getEntityIndex< 0, 0, -1 >(); - - if( c * input[ n ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ w ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ s ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ b ] <= 0 ) - { - pom = TNL::sign( c )*( hz * c )/( c - input[ b ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ t ] <= 0 ) - { - pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - } - } -} - - - - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool -tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: -updateCell( volatile Real sArray[18], int thri, const Real h, const Real v ) -{ - const RealType value = sArray[ thri ]; - RealType a, tmp = std::numeric_limits< RealType >::max(); - - a = TNL::argAbsMin( sArray[ thri+1 ], - sArray[ thri-1 ] ); - - if( fabs( a ) == std::numeric_limits< RealType >::max() ) - return false; - - tmp = a + TNL::sign( value ) * h/v; - - - sArray[ thri ] = argAbsMin( value, tmp ); - - tmp = value - sArray[ thri ]; - if ( fabs( tmp ) > 0.001*h ) - return true; - else - return false; -} -#endif diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h index 61465bee9f9784130a547dfe7aaf8bdac6ebdc2d..41aea10a0ef559042e3a9d94c80ab983e47de737 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h @@ -71,6 +71,8 @@ class tnlDirectEikonalProblem bool setInitialCondition( const Config::ParameterContainer& parameters, DofVectorPointer& dofs ); + + bool makeSnapshot( ); bool solve( DofVectorPointer& dosf ); diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h index 7437803e262b517632d43c590f485b5e923b2698..c36c4dca98d4c6bbdf27265a57eeef073512dc59 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h @@ -12,6 +12,9 @@ */ #pragma once +#include + +#include "tnlDirectEikonalProblem.h" template< typename Mesh, typename Communicator, @@ -76,6 +79,11 @@ tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >:: setup( const Config::ParameterContainer& parameters, const String& prefix ) { + String param=parameters.getParameter< String >( "distributed-grid-io-type" ); + if(param=="MpiIO") + distributedIOType=Meshes::DistributedMeshes::MpiIO; + if(param=="LocalCopy") + distributedIOType=Meshes::DistributedMeshes::LocalCopy; return true; } @@ -115,16 +123,15 @@ setInitialCondition( const Config::ParameterContainer& parameters, { this->bindDofs( dofs ); String inputFile = parameters.getParameter< String >( "input-file" ); - this->initialData->setMesh( this->getMesh() ); - std::cout<<"setInitialCondition" <initialData->setMesh( this->getMesh() ); + if( CommunicatorType::isDistributed() ) { - std::cout<<"Nodes Distribution: " << u->getMesh().getDistributedMesh()->printProcessDistr() << std::endl; + std::cout<<"Nodes Distribution: " << initialData->getMesh().getDistributedMesh()->printProcessDistr() << std::endl; if(distributedIOType==Meshes::DistributedMeshes::MpiIO) - Meshes::DistributedMeshes::DistributedGridIO ::load(inputFile, *u ); + Meshes::DistributedMeshes::DistributedGridIO ::load(inputFile, *initialData ); if(distributedIOType==Meshes::DistributedMeshes::LocalCopy) - Meshes::DistributedMeshes::DistributedGridIO ::load(inputFile, *u ); - u->template synchronize(); + Meshes::DistributedMeshes::DistributedGridIO ::load(inputFile, *initialData ); + initialData->template synchronize(); } else { @@ -141,6 +148,35 @@ setInitialCondition( const Config::ParameterContainer& parameters, return true; } +template< typename Mesh, + typename Communicator, + typename Anisotropy, + typename Real, + typename Index > +bool +tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >:: +makeSnapshot( ) +{ + std::cout << std::endl << "Writing output." << std::endl; + + //this->bindDofs( dofs ); + + FileName fileName; + fileName.setFileNameBase( "u-" ); + fileName.setExtension( "tnl" ); + + if(CommunicatorType::isDistributed()) + { + if(distributedIOType==Meshes::DistributedMeshes::MpiIO) + Meshes::DistributedMeshes::DistributedGridIO ::save(fileName.getFileName(), *u ); + if(distributedIOType==Meshes::DistributedMeshes::LocalCopy) + Meshes::DistributedMeshes::DistributedGridIO ::save(fileName.getFileName(), *u ); + } + else + this->u->save( fileName.getFileName() ); + return true; +} + template< typename Mesh, typename Communicator, @@ -151,7 +187,9 @@ bool tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >:: solve( DofVectorPointer& dofs ) { - FastSweepingMethod< MeshType, AnisotropyType > fsm; - fsm.solve( this->getMesh(), anisotropy, initialData ); + FastSweepingMethod< MeshType, Communicator,AnisotropyType > fsm; + fsm.solve( this->getMesh(), u, anisotropy, initialData ); + + makeSnapshot(); return true; } diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h index 57b1886e8b67c943051623c0a07f301faa76cad8..8c58ee6102ebe52cf71cdc6a8a6bff94d5792536 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h @@ -10,13 +10,14 @@ #pragma once -#include -#include -#include +//#include +//#include +//#include #include "tnlDirectEikonalMethodsBase.h" template< typename Mesh, + typename Communicator, typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > > class FastSweepingMethod { @@ -25,8 +26,9 @@ class FastSweepingMethod template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > -class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy > +class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy > : public tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > { //static_assert( std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." ); @@ -47,7 +49,7 @@ class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy > using typename BaseType::MeshFunctionType; using typename BaseType::InterfaceMapPointer; using typename BaseType::MeshFunctionPointer; - + FastSweepingMethod(); @@ -56,6 +58,7 @@ class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy > void setMaxIterations( const IndexType& maxIterations ); void solve( const MeshPointer& mesh, + MeshFunctionPointer& Aux, const AnisotropyPointer& anisotropy, MeshFunctionPointer& u ); @@ -68,8 +71,9 @@ class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy > template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > -class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy > +class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy > : public tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > { //static_assert( std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." ); @@ -82,15 +86,19 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy > typedef Index IndexType; typedef Anisotropy AnisotropyType; typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType; + typedef Communicator CommunicatorType; + typedef Containers::StaticVector< 2, Index > StaticVector; + using MeshPointer = Pointers::SharedPointer< MeshType >; using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >; + using MPI = Communicators::MpiCommunicator; using typename BaseType::InterfaceMapType; using typename BaseType::MeshFunctionType; using typename BaseType::InterfaceMapPointer; using typename BaseType::MeshFunctionPointer; using typename BaseType::ArrayContainer; - + FastSweepingMethod(); const IndexType& getMaxIterations() const; @@ -98,19 +106,30 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy > void setMaxIterations( const IndexType& maxIterations ); void solve( const MeshPointer& mesh, + MeshFunctionPointer& Aux, const AnisotropyPointer& anisotropy, - MeshFunctionPointer& u ); + const MeshFunctionPointer& u ); protected: const IndexType maxIterations; + + void setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, + const MeshPointer& mesh); + + bool goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, + MeshFunctionType& aux, const InterfaceMapType& interfaceMap, + const AnisotropyPointer& anisotropy ); + + void getInfoFromNeighbours( int& calculated, int& calculateAgain, const MeshPointer& mesh ); }; template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > -class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy > +class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy > : public tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > { //static_assert( std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." ); @@ -123,8 +142,12 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy > typedef Index IndexType; typedef Anisotropy AnisotropyType; typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > BaseType; + typedef Communicator CommunicatorType; + typedef Containers::StaticVector< 3, Index > StaticVector; + using MeshPointer = Pointers::SharedPointer< MeshType >; using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >; + using MPI = Communicators::MpiCommunicator; using typename BaseType::InterfaceMapType; using typename BaseType::MeshFunctionType; @@ -140,6 +163,7 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy > void setMaxIterations( const IndexType& maxIterations ); void solve( const MeshPointer& mesh, + MeshFunctionPointer& Aux, const AnisotropyPointer& anisotropy, MeshFunctionPointer& u ); @@ -147,6 +171,15 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy > protected: const IndexType maxIterations; + + void setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, + const MeshPointer& mesh); + + bool goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, + MeshFunctionType& aux, const InterfaceMapType& interfaceMap, + const AnisotropyPointer& anisotropy ); + + void getInfoFromNeighbours( int& calculated, int& calculateAgain, const MeshPointer& mesh ); }; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h index 890c6cb4c9373917abf1d668ebf60a30cfbc17b3..f2f033ccbee3ffa5b71567ea1b54e2307ebe1713 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h @@ -18,8 +18,9 @@ template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > -FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >:: FastSweepingMethod() : maxIterations( 1 ) { @@ -29,9 +30,10 @@ FastSweepingMethod() template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > const Index& -FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >:: getMaxIterations() const { @@ -40,9 +42,10 @@ getMaxIterations() const template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > void -FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >:: setMaxIterations( const IndexType& maxIterations ) { @@ -51,10 +54,12 @@ setMaxIterations( const IndexType& maxIterations ) template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > void -FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >:: solve( const MeshPointer& mesh, + MeshFunctionPointer& Aux, const AnisotropyPointer& anisotropy, MeshFunctionPointer& u ) { @@ -104,7 +109,7 @@ solve( const MeshPointer& mesh, dim3 blockSize( cudaBlockSize ); dim3 gridSize( numBlocksX ); - tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > ptr; + BaseType ptr; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index 89cb608810fd37823475c2fb08c1d0ad03397db2..e5638c11dd71d88d72d8d0590c8e51c0df6baaab 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -13,22 +13,12 @@ #pragma once -#include "tnlFastSweepingMethod.h" -#include -#include - - - - -#include -#include -#include - template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > -FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: FastSweepingMethod() : maxIterations( 1 ) { @@ -38,9 +28,10 @@ FastSweepingMethod() template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > const Index& -FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: getMaxIterations() const { @@ -49,9 +40,10 @@ getMaxIterations() const template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > void -FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: setMaxIterations( const IndexType& maxIterations ) { @@ -60,711 +52,442 @@ setMaxIterations( const IndexType& maxIterations ) template< typename Real, typename Device, typename Index, - typename Anisotropy > + typename Communicator, + typename Anisotropy > void -FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: solve( const MeshPointer& mesh, + MeshFunctionPointer& Aux, const AnisotropyPointer& anisotropy, - MeshFunctionPointer& u ) + const MeshFunctionPointer& u ) { MeshFunctionPointer auxPtr; InterfaceMapPointer interfaceMapPtr; auxPtr->setMesh( mesh ); interfaceMapPtr->setMesh( mesh ); + + // Setting overlaps ( WITHOUT MPI SHOULD BE 0 ) + StaticVector vecLowerOverlaps, vecUpperOverlaps; + setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh ); + std::cout << "Initiating the interface cells ..." << std::endl; - BaseType::initInterface( u, auxPtr, interfaceMapPtr ); + BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps ); - auxPtr->save( "aux-ini.tnl" ); + //auxPtr->save( "aux-ini.tnl" ); typename MeshType::Cell cell( *mesh ); IndexType iteration( 0 ); InterfaceMapType interfaceMap = *interfaceMapPtr; MeshFunctionType aux = *auxPtr; + aux.template synchronize< Communicator >(); //synchronize initialized overlaps - -#ifdef HAVE_MPI - int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup ); - //printf( "Hello world from rank: %d ", i ); - //Communicators::MpiCommunicator::Request r = Communicators::MpiCommunicator::ISend( auxPtr, 0, 0, Communicators::MpiCommunicator::AllGroup ); - if( i == 1 ) { - /*for( int k = 0; k < 16*16; k++ ) - aux[ k ] = 10;*/ - printf( "1: mesh x: %d\n", mesh->getDimensions().x() ); - printf( "1: mesh y: %d\n", mesh->getDimensions().y() ); - //aux.save("aux_proc1.tnl"); - } - if( i == 0 ) { - printf( "0: mesh x: %d\n", mesh->getDimensions().x() ); - printf( "0: mesh y: %d\n", mesh->getDimensions().y() ); - //aux.save("aux_proc0.tnl"); - /*for( int k = 0; k < mesh->getDimensions().x()*mesh->getDimensions().y(); k++ ) - aux[ k ] = 10; - for( int k = 0; k < mesh->getDimensions().x(); k++ ){ - for( int l = 0; l < mesh->getDimensions().y(); l++ ) - printf("%f.2\t",aux[ k * 16 + l ] ); - printf("\n"); - }*/ - } - - /*bool a = Communicators::MpiCommunicator::IsInitialized(); - if( a ) - printf("Je Init\n"); - else - printf("Neni Init\n");*/ -#endif - + std::cout << "Calculating the values ..." << std::endl; while( iteration < this->maxIterations ) { - if( std::is_same< DeviceType, Devices::Host >::value ) - { - int numThreadsPerBlock = -1; - - numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0)); - //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); - if( numThreadsPerBlock <= 16 ) - numThreadsPerBlock = 16; - else if(numThreadsPerBlock <= 32 ) - numThreadsPerBlock = 32; - else if(numThreadsPerBlock <= 64 ) - numThreadsPerBlock = 64; - else if(numThreadsPerBlock <= 128 ) - numThreadsPerBlock = 128; - else if(numThreadsPerBlock <= 256 ) - numThreadsPerBlock = 256; - else if(numThreadsPerBlock <= 512 ) - numThreadsPerBlock = 512; - else - numThreadsPerBlock = 1024; - //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); - - if( numThreadsPerBlock == -1 ){ - printf("Fail in setting numThreadsPerBlock.\n"); - break; - } - - - - int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); - int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0); - - //std::cout << "numBlocksX = " << numBlocksX << std::endl; - - /*Real **sArray = new Real*[numBlocksX*numBlocksY]; - for( int i = 0; i < numBlocksX * numBlocksY; i++ ) - sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];*/ - - ArrayContainer BlockIterHost; - BlockIterHost.setSize( numBlocksX * numBlocksY ); - BlockIterHost.setValue( 1 ); - int IsCalculationDone = 1; - - MeshFunctionPointer helpFunc( mesh ); - MeshFunctionPointer helpFunc1( mesh ); - helpFunc1 = auxPtr; - auxPtr = helpFunc; - helpFunc = helpFunc1; - //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl; - /*for( int k = numBlocksX-1; k >-1; k-- ){ - for( int l = 0; l < numBlocksY; l++ ){ - std::cout<< BlockIterHost[ l*numBlocksX + k ]; - } - std::cout<template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); - case 32: - this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); - case 64: - this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); - case 128: - this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); - case 256: - this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); - case 512: - this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); - default: - this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); - } - - - //Reduction - for( int i = 0; i < BlockIterHost.getSize(); i++ ){ - if( IsCalculationDone == 0 ){ - IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; - //break; - } - } - numWhile++; - /*std::cout <<"numWhile = "<< numWhile <-1; j-- ){ - for( int i = 0; i < numBlocksX; i++ ) - std::cout << BlockIterHost[ j * numBlocksX + i ]; - std::cout << std::endl; - } - std::cout << std::endl;*/ - - this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY ); - - /*for( int j = numBlocksY-1; j>-1; j-- ){ - for( int i = 0; i < numBlocksX; i++ ) - std::cout << "BlockIterHost = "<< j*numBlocksX + i<< " ," << BlockIterHost[ j * numBlocksX + i ]; - std::cout << std::endl; - } - std::cout << std::endl;*/ - - //std::cout<getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - //aux.save( "aux-1.tnl" ); - - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "2 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - //aux.save( "aux-2.tnl" ); - - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0 ; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - //std::cerr << "3 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - //aux.save( "aux-3.tnl" ); - - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "4 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - for( int j = 0; - j < mesh->getDimensions().y(); - j++ ) - { - for( int i = 0; - i < mesh->getDimensions().x(); - i++ ) - { - std::cout << aux[ i * mesh->getDimensions().y() + j ] << " "; - } - std::cout << std::endl; - }*/ - - } - if( std::is_same< DeviceType, Devices::Cuda >::value ) + // calculatedBefore indicates weather we calculated in the last passage of the while cycle + // calculatedBefore is same for all ranks + // without MPI should be FALSE at the end of while cycle body + int calculatedBefore = 1; + + // calculateMPIAgain indicates if the thread should calculate again in upcoming passage of while cycle + // calculateMPIAgain is a value that can differ in every rank + // without MPI should be FALSE at the end of while cycle body + int calculateMPIAgain = 1; + + while( calculatedBefore ) { - // TODO: CUDA code -#ifdef HAVE_CUDA - TNL_CHECK_CUDA_DEVICE; - // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel. - const int cudaBlockSize( 16 ); - - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); - dim3 blockSize( cudaBlockSize, cudaBlockSize ); - dim3 gridSize( numBlocksX, numBlocksY ); - - tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; - - int BlockIterD = 1; - - TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; - BlockIterDevice.setSize( numBlocksX * numBlocksY ); - BlockIterDevice.setValue( 1 ); - TNL_CHECK_CUDA_DEVICE; - - - TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; - BlockIterPom.setSize( numBlocksX * numBlocksY ); - BlockIterPom.setValue( 0 ); - /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1; - BlockIterPom1.setSize( numBlocksX * numBlocksY ); - BlockIterPom1.setValue( 0 );*/ - /*int *BlockIterDevice; - cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/ - int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); - //std::cout << "nBlocksNeigh = " << nBlocksNeigh << std::endl; - //free( BlockIter ); - /*int *BlockIterPom; - cudaMalloc((void**) &BlockIterPom, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/ - - int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); + calculatedBefore = 0; - TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; - dBlock.setSize( nBlocks ); - TNL_CHECK_CUDA_DEVICE; - /*int *dBlock; - cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/ - - - MeshFunctionPointer helpFunc1( mesh ); - MeshFunctionPointer helpFunc( mesh ); - - helpFunc1 = auxPtr; - auxPtr = helpFunc; - helpFunc = helpFunc1; - - int numIter = 0; - - //int oddEvenBlock = 0; - while( BlockIterD ) + if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host? { - /** HERE IS CHESS METHOD **/ + calculateMPIAgain = 0; - /*auxPtr = helpFunc; + /**--HERE-IS-PARALLEL-OMP-CODE--!!!WITHOUT MPI!!!--------------------**/ + /* + int numThreadsPerBlock = -1; - CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, - interfaceMapPtr.template getData< Device >(), - auxPtr.template getData< Device>(), - helpFunc.template modifyData< Device>(), - BlockIterDevice, - oddEvenBlock.getView() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - auxPtr = helpFunc; + numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0)); + //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); + if( numThreadsPerBlock <= 16 ) + numThreadsPerBlock = 16; + else if(numThreadsPerBlock <= 32 ) + numThreadsPerBlock = 32; + else if(numThreadsPerBlock <= 64 ) + numThreadsPerBlock = 64; + else if(numThreadsPerBlock <= 128 ) + numThreadsPerBlock = 128; + else if(numThreadsPerBlock <= 256 ) + numThreadsPerBlock = 256; + else if(numThreadsPerBlock <= 512 ) + numThreadsPerBlock = 512; + else + numThreadsPerBlock = 1024; + //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); + + if( numThreadsPerBlock == -1 ){ + printf("Fail in setting numThreadsPerBlock.\n"); + break; + } + + + + int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); + int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0); + + //std::cout << "numBlocksX = " << numBlocksX << std::endl; - oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; + //Real **sArray = new Real*[numBlocksX*numBlocksY]; + //for( int i = 0; i < numBlocksX * numBlocksY; i++ ) + // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)]; - CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, - interfaceMapPtr.template getData< Device >(), - auxPtr.template getData< Device>(), - helpFunc.template modifyData< Device>(), - BlockIterDevice, - oddEvenBlock.getView() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + ArrayContainer BlockIterHost; + BlockIterHost.setSize( numBlocksX * numBlocksY ); + BlockIterHost.setValue( 1 ); + int IsCalculationDone = 1; + + MeshFunctionPointer helpFunc( mesh ); + MeshFunctionPointer helpFunc1( mesh ); + helpFunc1 = auxPtr; + auxPtr = helpFunc; + helpFunc = helpFunc1; + //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl; + //for( int k = numBlocksX-1; k >-1; k-- ){ + // for( int l = 0; l < numBlocksY; l++ ){ + // std::cout<< BlockIterHost[ l*numBlocksX + k ]; + // } + // std::cout<template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 32: + this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 64: + this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 128: + this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 256: + this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 512: + this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + default: + this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + } - oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; - CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + //Reduction + for( int i = 0; i < BlockIterHost.getSize(); i++ ){ + if( IsCalculationDone == 0 ){ + IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; + //break; + } + } + numWhile++; + //std::cout <<"numWhile = "<< numWhile <-1; j-- ){ + // for( int i = 0; i < numBlocksX; i++ ) + // std::cout << BlockIterHost[ j * numBlocksX + i ]; + // std::cout << std::endl; + // } + // std::cout << std::endl; + + this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY ); + + //std::cout<getDimensions().y() - vecUpperOverlaps[1]; + boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; + calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + //aux.save("aux-1.tnl"); - helpFunc1 = auxPtr; - auxPtr = helpFunc; - helpFunc = helpFunc1; - TNL_CHECK_CUDA_DEVICE; + // UP and LEFL + boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = -1 + vecLowerOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + //aux.save( "aux-2.tnl" ); - //int pocBloku = 0; - Devices::Cuda::synchronizeDevice(); - CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, - interfaceMapPtr.template getData< Device >(), - auxPtr.template modifyData< Device>(), - helpFunc.template modifyData< Device>(), - BlockIterDevice.getView() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + // DOWN and RIGHT + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; + boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + //aux.save( "aux-3.tnl" ); - //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl; - //BlockIterPom1 = BlockIterDevice; - ///for( int i =0; i< numBlocksX; i++ ){ - // for( int j = 0; j < numBlocksY; j++ ) - // { - // std::cout << BlockIterPom1[j*numBlocksX + i]; - // } - // std::cout << std::endl; - //} - //std::cout << std::endl; + // DOWN and LEFT + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY ); - cudaDeviceSynchronize(); + } + if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain ) // should we calculate on CUDA? + { + calculateMPIAgain = 0; + +#ifdef HAVE_CUDA TNL_CHECK_CUDA_DEVICE; - BlockIterDevice = BlockIterPom; - - //std::cout<< "Probehlo" << std::endl; + // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel. + // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2) + const int cudaBlockSize( 16 ); - //TNL::swap( auxPtr, helpFunc ); + // Setting number of threads and blocks for kernel + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize ); + dim3 blockSize( cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY ); + // Need for calling functions from kernel + BaseType ptr; - CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) ); - TNL_CHECK_CUDA_DEVICE; + // True if we should calculate again. + int calculateCudaBlocksAgain = 1; - CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks ); + // Array that identifies which blocks should be calculated. + // All blocks should calculate in first passage ( setValue(1) ) + TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicator( numBlocksX * numBlocksY ); + blockCalculationIndicator.setValue( 1 ); TNL_CHECK_CUDA_DEVICE; + // Array into which we identify the neighbours and then copy it into blockCalculationIndicator + TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicatorHelp(numBlocksX * numBlocksY ); + blockCalculationIndicatorHelp.setValue( 0 ); - BlockIterD = dBlock.getElement( 0 ); - //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + // number of Blocks for kernel that calculates neighbours. + int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); + // Helping meshFunction that switches with AuxPtr in every calculation of CudaUpdateCellCaller<<<>>>() + MeshFunctionPointer helpFunc( mesh ); + helpFunc.template modifyData() = auxPtr.template getData(); - /**-----------------------------------------------------------------------------------------------------------*/ - /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) - BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ - numIter ++; - } - if( numIter == 1 ){ - auxPtr = helpFunc; + // number of iterations of while calculateCudaBlocksAgain + int numIter = 0; + + //int oddEvenBlock = 0; + while( calculateCudaBlocksAgain ) + { + /** HERE IS CHESS METHOD (NO MPI) **/ + + /* + CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + auxPtr.template getData< Device>(), + helpFunc.template modifyData< Device>(), + blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps, + oddEvenBlock ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; + + CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + helpFunc.template getData< Device>(), + auxPtr.template modifyData< Device>(), + blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps, + oddEvenBlock ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; + + calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1); + */ + /**------------------------------------------------------------------------------------------------*/ + + + /** HERE IS FIM FOR MPI AND WITHOUT MPI **/ + Devices::Cuda::synchronizeDevice(); + CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), + auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(), + blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + // Switching helpFunc and auxPtr. + auxPtr.swap( helpFunc ); + + // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now. + Devices::Cuda::synchronizeDevice(); + GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + blockCalculationIndicator = blockCalculationIndicatorHelp; + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + // "Parallel reduction" to see if we should calculate again calculateCudaBlocksAgain + calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1); + + // When we change something then we should caclucate again in the next passage of MPI ( calculated = true ) + if( calculateCudaBlocksAgain ){ + calculatedBefore = 1; + } + +/**-----------------------------------------------------------------------------------------------------------*/ + numIter ++; + } + if( numIter%2 == 1 ) // Need to check parity for MPI overlaps to synchronize ( otherwise doesnt work ) + { + helpFunc.swap( auxPtr ); + Devices::Cuda::synchronizeDevice(); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + } + aux = *auxPtr; + interfaceMap = *interfaceMapPtr; +#endif } - /*cudaFree( BlockIterDevice ); - cudaFree( dBlock ); - delete BlockIter;*/ - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + - aux = *auxPtr; - interfaceMap = *interfaceMapPtr; +/**----------------------MPI-TO-DO---------------------------------------------**/ +#ifdef HAVE_MPI + if( CommunicatorType::isDistributed() ){ + getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh ); + + aux.template synchronize< Communicator >(); + } #endif + if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculated 0! + calculatedBefore = 0; } iteration++; } - //#endif - aux.save("aux-final.tnl"); + Aux=auxPtr; // copy it for MakeSnapshot } -#ifdef HAVE_CUDA - +// PROTECTED FUNCTIONS: -template < typename Index > -__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY ) +template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > +void +FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: +setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, + const MeshPointer& mesh) { - int i = blockIdx.x * 1024 + threadIdx.x; - - if( i < numBlockX * numBlockY ) + vecLowerOverlaps[0] = 0; vecLowerOverlaps[1] = 0; vecUpperOverlaps[0] = 0; vecUpperOverlaps[1] = 0; +#ifdef HAVE_MPI + if( CommunicatorType::isDistributed() ) //If we started solver with MPI { - int pom = 0;//BlockIterPom[ i ] = 0; - int m=0, k=0; - m = i%numBlockX; - k = i/numBlockX; - if( m > 0 && BlockIterDevice[ i - 1 ] ){ - pom = 1;//BlockIterPom[ i ] = 1; - }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ - pom = 1;//BlockIterPom[ i ] = 1; - }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ - pom = 1;// BlockIterPom[ i ] = 1; - }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ - pom = 1;//BlockIterPom[ i ] = 1; - } - - BlockIterPom[ i ] = pom;//BlockIterPom[ i ]; + //Distributed mesh for MPI overlaps (without MPI null pointer) + const Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh(); + vecLowerOverlaps = meshPom->getLowerOverlap(); + vecUpperOverlaps = meshPom->getUpperOverlap(); } +#endif } -template < typename Index > -__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks ) + + + +template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > +bool +FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: +goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, + MeshFunctionType& aux, const InterfaceMapType& interfaceMap, + const AnisotropyPointer& anisotropy ) { - int i = threadIdx.x; - int blId = blockIdx.x; - int blockSize = blockDim.x; - /*if ( i == 0 && blId == 0 ){ - printf( "nBlocks = %d \n", nBlocks ); - for( int j = nBlocks-1; j > -1 ; j--){ - printf( "cislo = %d \n", BlockIterDevice[ j ] ); - } - }*/ - __shared__ int sArray[ 1024 ]; - sArray[ i ] = 0; - if( blId * 1024 + i < nBlocks ) - sArray[ i ] = BlockIterDevice[ blId * 1024 + i ]; - __syncthreads(); - /*extern __shared__ volatile int sArray[]; - unsigned int i = threadIdx.x; - unsigned int gid = blockIdx.x * blockSize * 2 + threadIdx.x; - unsigned int gridSize = blockSize * 2 * gridDim.x; - sArray[ i ] = 0; - while( gid < nBlocks ) - { - sArray[ i ] += BlockIterDevice[ gid ] + BlockIterDevice[ gid + blockSize ]; - gid += gridSize; - } - __syncthreads();*/ + bool calculated = false; + const MeshType& mesh = aux.getMesh(); + const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1; + const IndexType stepY = boundsFrom[1] < boundsTo[1]? 1 : -1; - if ( blockSize == 1024) { - if (i < 512) - sArray[ i ] += sArray[ i + 512 ]; - } - __syncthreads(); - if (blockSize >= 512) { - if (i < 256) { - sArray[ i ] += sArray[ i + 256 ]; - } - } - __syncthreads(); - if (blockSize >= 256) { - if (i < 128) { - sArray[ i ] += sArray[ i + 128 ]; - } - } - __syncthreads(); - if (blockSize >= 128) { - if (i < 64) { - sArray[ i ] += sArray[ i + 64 ]; - } - } - __syncthreads(); - if (i < 32 ) + typename MeshType::Cell cell( mesh ); + cell.refresh(); + + for( cell.getCoordinates().y() = boundsFrom[1]; + TNL::abs( cell.getCoordinates().y() - boundsTo[1] ) > 0; + cell.getCoordinates().y() += stepY ) { - if( blockSize >= 64 ) sArray[ i ] += sArray[ i + 32 ]; - if( blockSize >= 32 ) sArray[ i ] += sArray[ i + 16 ]; - if( blockSize >= 16 ) sArray[ i ] += sArray[ i + 8 ]; - if( blockSize >= 8 ) sArray[ i ] += sArray[ i + 4 ]; - if( blockSize >= 4 ) sArray[ i ] += sArray[ i + 2 ]; - if( blockSize >= 2 ) sArray[ i ] += sArray[ i + 1 ]; + for( cell.getCoordinates().x() = boundsFrom[0]; + TNL::abs( cell.getCoordinates().x() - boundsTo[0] ) > 0; + cell.getCoordinates().x() += stepX ) + { + cell.refresh(); + if( ! interfaceMap( cell ) ) + { + calculated = this->updateCell( aux, cell ) || calculated; + } + } } - - if( i == 0 ) - dBlock[ blId ] = sArray[ 0 ]; + return calculated; } -template < int sizeSArray, typename Real, typename Device, typename Index > -__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, - const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, - const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock ) + +#ifdef HAVE_MPI +template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > +void +FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: +getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh ) { - int thri = threadIdx.x; int thrj = threadIdx.y; - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = blockDim.y*blockIdx.y + threadIdx.y; - /** FOR CHESS METHOD */ - //if( (blockIdx.y%2 + blockIdx.x) % 2 == oddEvenBlock ) - //{ - /**------------------------------------------*/ + Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh(); + int calculateFromNeighbours[4] = {0,0,0,0}; + const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh + MPI::Request *requestsInformation; + requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ]; - /** FOR FIM METHOD */ + int neighCount = 0; // should this thread calculate again? - if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] ) - { - __syncthreads(); - /**-----------------------------------------*/ - const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); - __shared__ int dimX; - __shared__ int dimY; - __shared__ Real hx; - __shared__ Real hy; - if( thri==0 && thrj == 0) - { - dimX = mesh.getDimensions().x(); - dimY = mesh.getDimensions().y(); - hx = mesh.getSpaceSteps().x(); - hy = mesh.getSpaceSteps().y(); - BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 0; - } - __syncthreads(); - int numOfBlockx; - int numOfBlocky; - int xkolik; - int ykolik; - - xkolik = blockDim.x + 1; - ykolik = blockDim.y + 1; - numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); - numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); - - if( numOfBlockx - 1 == blockIdx.x ) - xkolik = dimX - (blockIdx.x)*blockDim.x+1; - - if( numOfBlocky -1 == blockIdx.y ) - ykolik = dimY - (blockIdx.y)*blockDim.y+1; - __syncthreads(); - - int currentIndex = thrj * blockDim.x + thri; - //__shared__ volatile bool changed[ blockDim.x*blockDim.y ]; - __shared__ volatile bool changed[ (sizeSArray-2)*(sizeSArray-2)]; - changed[ currentIndex ] = false; - if( thrj == 0 && thri == 0 ) - changed[ 0 ] = true; - - - //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ]; - __shared__ volatile Real sArray[ sizeSArray * sizeSArray ]; - sArray[ thrj * sizeSArray + thri ] = std::numeric_limits< Real >::max(); - - //filling sArray edges - if( thri == 0 ) - { - if( dimX > (blockIdx.x+1) * blockDim.x && thrj+1 < ykolik ) - sArray[(thrj+1)*sizeSArray + xkolik] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + xkolik ]; - else - sArray[(thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max(); - } - - if( thri == 1 ) - { - if( blockIdx.x != 0 && thrj+1 < ykolik ) - sArray[(thrj+1)*sizeSArray + 0] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX ]; - else - sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max(); - } - - if( thri == 2 ) - { - if( dimY > (blockIdx.y+1) * blockDim.y && thrj+1 < xkolik ) - sArray[ ykolik*sizeSArray + thrj+1 ] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + ykolik*dimX + thrj+1 ]; - else - sArray[ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); - - } - - if( thri == 3 ) - { - if( blockIdx.y != 0 && thrj+1 < xkolik ) - sArray[0*sizeSArray + thrj+1] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + thrj+1 ]; - else - sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); - } - - if( i < dimX && j < dimY ) - { - sArray[(thrj+1)*sizeSArray + thri+1] = aux[ j*dimX + i ]; - } - __syncthreads(); - - while( changed[ 0 ] ) - { - __syncthreads(); - - changed[ currentIndex] = false; - - //calculation of update cell - if( i < dimX && j < dimY ) - { - if( ! interfaceMap[ j * dimX + i ] ) - { - changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy); - } - } - __syncthreads(); - - //pyramid reduction - if( blockDim.x*blockDim.y == 1024 ) - { - if( currentIndex < 512 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y >= 512 ) - { - if( currentIndex < 256 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y >= 256 ) - { - if( currentIndex < 128 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y >= 128 ) - { - if( currentIndex < 64 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; - } - } - __syncthreads(); - if( currentIndex < 32 ) - { - if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; - if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; - if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; - if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; - if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; - if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; - } - if( thri == 0 && thrj == 0 && changed[ 0 ] ){ - BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 1; - } - /*if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 ) - { - for( int k = 15; k>-1; k-- ){ - for( int l = 0; l < 16; l++ ) - printf( "%f\t", sArray[k * 16 + l]); - printf( "\n"); - } - printf( "\n"); - }*/ - __syncthreads(); - } - if( i < dimX && j < dimY ) - helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ]; - - } + if( neighbours[0] != -1 ) // LEFT + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup ); + } + + if( neighbours[1] != -1 ) // RIGHT + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup ); + } + + if( neighbours[2] != -1 ) //UP + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup ); + } + + if( neighbours[5] != -1 ) //DOWN + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup ); + } + MPI::WaitAll( requestsInformation, neighCount ); + + MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR, MPI::AllGroup ); + calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] || + calculateFromNeighbours[2] || calculateFromNeighbours[3]; } #endif diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h index 2a1183bc2007c1c4c6498c1027b7cdc7ef3a3e1d..325b626f7bf5262637f8e1b43ec9e156bbeca26b 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h @@ -13,13 +13,13 @@ #pragma once -#include "tnlFastSweepingMethod.h" template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > -FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: FastSweepingMethod() : maxIterations( 1 ) { @@ -29,9 +29,10 @@ FastSweepingMethod() template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > const Index& -FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: getMaxIterations() const { @@ -40,9 +41,10 @@ getMaxIterations() const template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > void -FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: setMaxIterations( const IndexType& maxIterations ) { @@ -51,10 +53,12 @@ setMaxIterations( const IndexType& maxIterations ) template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > void -FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: +FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: solve( const MeshPointer& mesh, + MeshFunctionPointer& Aux, const AnisotropyPointer& anisotropy, MeshFunctionPointer& u ) { @@ -62,8 +66,13 @@ solve( const MeshPointer& mesh, InterfaceMapPointer interfaceMapPtr; auxPtr->setMesh( mesh ); interfaceMapPtr->setMesh( mesh ); + + // getting overlaps ( WITHOUT MPI SHOULD BE 0 ) + Containers::StaticVector< 3, IndexType > vecLowerOverlaps, vecUpperOverlaps; + setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh ); + std::cout << "Initiating the interface cells ..." << std::endl; - BaseType::initInterface( u, auxPtr, interfaceMapPtr ); + BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps ); auxPtr->save( "aux-ini.tnl" ); typename MeshType::Cell cell( *mesh ); @@ -71,594 +80,436 @@ solve( const MeshPointer& mesh, IndexType iteration( 0 ); MeshFunctionType aux = *auxPtr; InterfaceMapType interfaceMap = * interfaceMapPtr; + aux.template synchronize< Communicator >(); //synchronization of intial conditions + while( iteration < this->maxIterations ) { - if( std::is_same< DeviceType, Devices::Host >::value ) + // indicates weather we calculated in the last passage of the while cycle + // calculatedBefore is same for all ranks + // without MPI should be FALSE at the end of while cycle body + int calculatedBefore = 1; + + // indicates if the MPI process should calculate again in upcoming passage of cycle + // calculateMPIAgain is a value that can differ in every rank + // without MPI should be FALSE at the end of while cycle body + int calculateMPIAgain = 1; + + while( calculatedBefore ) { - int numThreadsPerBlock = 64; - + calculatedBefore = 0; - int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); - int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0); - int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 1:0); - //std::cout << "numBlocksX = " << numBlocksX << std::endl; - - /*Real **sArray = new Real*[numBlocksX*numBlocksY]; - for( int i = 0; i < numBlocksX * numBlocksY; i++ ) - sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];*/ - - ArrayContainer BlockIterHost; - BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ ); - BlockIterHost.setValue( 1 ); - int IsCalculationDone = 1; - - MeshFunctionPointer helpFunc( mesh ); - MeshFunctionPointer helpFunc1( mesh ); - helpFunc1 = auxPtr; - auxPtr = helpFunc; - helpFunc = helpFunc1; - //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl; - /*for( int k = numBlocksX-1; k >-1; k-- ){ - for( int l = 0; l < numBlocksY; l++ ){ - std::cout<< BlockIterHost[ l*numBlocksX + k ]; - } - std::cout<template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); + if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host? + { + calculateMPIAgain = 0; - //Reduction - for( int i = 0; i < BlockIterHost.getSize(); i++ ){ - if( IsCalculationDone == 0 ){ - IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; - //break; - } - } - numWhile++; - std::cout <<"numWhile = "<< numWhile <-1; j-- ){ - for( int i = 0; i < numBlocksX; i++ ){ - //std::cout << (*auxPtr)[ k * numBlocksX * numBlocksY + j * numBlocksX + i ] << " "; - std::cout << BlockIterHost[ k * numBlocksX * numBlocksY + j * numBlocksX + i ]; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - std::cout << std::endl;*/ +/** HERE IS FSM FOR OPENMP (NO MPI) - isnt worthy */ + /*int numThreadsPerBlock = -1; + + numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0)); + //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); + if( numThreadsPerBlock <= 16 ) + numThreadsPerBlock = 16; + else if(numThreadsPerBlock <= 32 ) + numThreadsPerBlock = 32; + else if(numThreadsPerBlock <= 64 ) + numThreadsPerBlock = 64; + else if(numThreadsPerBlock <= 128 ) + numThreadsPerBlock = 128; + else if(numThreadsPerBlock <= 256 ) + numThreadsPerBlock = 256; + else if(numThreadsPerBlock <= 512 ) + numThreadsPerBlock = 512; + else + numThreadsPerBlock = 1024; + //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); + + if( numThreadsPerBlock == -1 ){ + printf("Fail in setting numThreadsPerBlock.\n"); + break; + } + + int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); + int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0); + int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 1:0); + //std::cout << "numBlocksX = " << numBlocksX << std::endl; + + //Real **sArray = new Real*[numBlocksX*numBlocksY]; + // for( int i = 0; i < numBlocksX * numBlocksY; i++ ) + // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)]; + + ArrayContainer BlockIterHost; + BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ ); + BlockIterHost.setValue( 1 ); + int IsCalculationDone = 1; + + MeshFunctionPointer helpFunc( mesh ); + MeshFunctionPointer helpFunc1( mesh ); + helpFunc1 = auxPtr; + auxPtr = helpFunc; + helpFunc = helpFunc1; + //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl; + //for( int k = numBlocksX-1; k >-1; k-- ){ + //for( int l = 0; l < numBlocksY; l++ ){ + // std::cout<< BlockIterHost[ l*numBlocksX + k ]; + // } + // std::cout<template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 32: + this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 64: + this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 128: + this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 256: + this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + case 512: + this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + default: + this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); + } + //Reduction + for( int i = 0; i < BlockIterHost.getSize(); i++ ){ + if( IsCalculationDone == 0 ){ + IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; + //break; + } + } + numWhile++; + + + this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ ); + + //string s( "aux-"+ std::to_string(numWhile) + ".tnl"); + //aux.save( s ); + } + if( numWhile == 1 ){ + auxPtr = helpFunc; + } + aux = *auxPtr;*/ +/**------------------------------------------------------------------------------*/ - this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ ); - /*for( int k = 0; k < numBlocksZ; k++ ){ - for( int j = numBlocksY-1; j>-1; j-- ){ - for( int i = 0; i < numBlocksX; i++ ){ - //std::cout << (*auxPtr)[ k * numBlocksX * numBlocksY + j * numBlocksX + i ] << " "; - std::cout << BlockIterHost[ k * numBlocksX * numBlocksY + j * numBlocksX + i ]; - } - std::cout << std::endl; - } - std::cout << std::endl; - }*/ +/** HERE IS FSM WITH MPI AND WITHOUT MPI */ + StaticVector boundsFrom; StaticVector boundsTo; - /*for( int j = numBlocksY-1; j>-1; j-- ){ - for( int i = 0; i < numBlocksX; i++ ) - std::cout << "BlockIterHost = "<< j*numBlocksX + i<< " ," << BlockIterHost[ j * numBlocksX + i ]; - std::cout << std::endl; - } - std::cout << std::endl;*/ + // TOP, NORTH and EAST + boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; + boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; + boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; + calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - //std::cout<getDimensions().z() - vecUpperOverlaps[2]; + boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + // TOP, SOUTH and EAST + boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; + boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + // TOP, SOUTH and WEST + boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + // BOTTOM, NOTH and EAST + boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; + boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; + boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + // BOTTOM, NOTH and WEST + boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; + boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + // BOTTOM, SOUTH and EAST + boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; + boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + // BOTTOM, SOUTH and WEST + boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; + boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + + /**----------------------------------------------------------------------------------*/ } - aux = *auxPtr; - - /*for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh->getDimensions().z(); - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-1.tnl" ); - - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh->getDimensions().z(); - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "2 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-2.tnl" ); - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh->getDimensions().z(); - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0 ; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - //std::cerr << "3 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-3.tnl" ); - - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh->getDimensions().z(); - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "4 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-4.tnl" ); - - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; - cell.getCoordinates().z() >= 0; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - //std::cerr << "5 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-5.tnl" ); - - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; - cell.getCoordinates().z() >= 0; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "6 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-6.tnl" ); - - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; - cell.getCoordinates().z() >= 0; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0 ; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - //std::cerr << "7 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-7.tnl" ); - - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; - cell.getCoordinates().z() >= 0; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "8 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - }*/ - } - if( std::is_same< DeviceType, Devices::Cuda >::value ) - { - // TODO: CUDA code -#ifdef HAVE_CUDA - const int cudaBlockSize( 8 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); - int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); - if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) - std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; - dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); - dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); - - tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; - - - int BlockIterD = 1; - - TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; - BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ ); - BlockIterDevice.setValue( 1 ); - TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; - BlockIterPom.setSize( numBlocksX * numBlocksY * numBlocksZ ); - BlockIterPom.setValue( 0 ); - /*int *BlockIterDevice; - cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/ - int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0); - - TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; - dBlock.setSize( nBlocks ); - dBlock.setValue( 0 ); - - int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0); - /*int *dBlock; - cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ - MeshFunctionPointer helpFunc1( mesh ); - MeshFunctionPointer helpFunc( mesh ); - - helpFunc1 = auxPtr; - auxPtr = helpFunc; - helpFunc = helpFunc1; - int numIter = 0; - - while( BlockIterD ) + if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain ) { - helpFunc1 = auxPtr; - auxPtr = helpFunc; - helpFunc = helpFunc1; - TNL_CHECK_CUDA_DEVICE; +#ifdef HAVE_CUDA + // cudaBlockSize is a size of blocks. It's the number raised to the 3 power. + // the number should be less than 10^3 (num of threads in one grid is maximally 1024) + // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2) + const int cudaBlockSize( 8 ); - CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr, - interfaceMapPtr.template getData< Device >(), - auxPtr.template getData< Device>(), - helpFunc.template modifyData< Device>(), - BlockIterDevice.getView() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + // Getting the number of blocks in grid in each direction (without overlaps bcs we dont calculate on overlaps) + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize ); + int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); + if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) + std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; - GetNeighbours3D<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - BlockIterDevice = BlockIterPom; + // Making the variables for global function CudaUpdateCellCaller. + dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); - CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY * numBlocksZ ) ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + BaseType ptr; // tnlDirectEikonalMethodBase type for calling of function inside CudaUpdateCellCaller + + + int BlockIterD = 1; //variable that tells us weather we should calculate the main cuda body again - CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks ); + // Array containing information about each block in grid, answering question (Have we calculated in this block?) + TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice( numBlocksX * numBlocksY * numBlocksZ ); + BlockIterDevice.setValue( 1 ); // calculate all in the first passage + + // Helping Array for GetNeighbours3D + TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom( numBlocksX * numBlocksY * numBlocksZ ); + BlockIterPom.setValue( 0 ); //doesnt matter what number + + + + // number of neighbours in one block (1024 threads) for GetNeighbours3D + int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0); + + + //MeshFunctionPointer helpFunc1( mesh ); + MeshFunctionPointer helpFunc( mesh ); + helpFunc.template modifyData() = auxPtr.template getData(); + Devices::Cuda::synchronizeDevice(); + + int numIter = 0; // number of passages of following while cycle + + while( BlockIterD ) //main body of cuda code + { + + Devices::Cuda::synchronizeDevice(); + // main function that calculates all values in each blocks + // calculated values are in helpFunc + CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + auxPtr.template getData< Device>(), + helpFunc.template modifyData< Device>(), + BlockIterDevice.getView(), vecLowerOverlaps, vecUpperOverlaps ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + // Switching pointers to helpFunc and auxPtr so real results are in memory of helpFunc but here under variable auxPtr + auxPtr.swap( helpFunc ); + + Devices::Cuda::synchronizeDevice(); + // Neighbours of blocks that calculatedBefore in this passage should calculate in the next! + // BlockIterDevice contains blocks that calculatedBefore in this passage and BlockIterPom those that should calculate in next (are neighbours) + GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + BlockIterDevice = BlockIterPom; + Devices::Cuda::synchronizeDevice(); + + // .containsValue(1) is actually parallel reduction implemented in TNL + BlockIterD = BlockIterDevice.containsValue(1); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + numIter++; + if( BlockIterD ){ + // if we calculated in this passage, we should send the info via MPI so neighbours should calculate after synchronization + calculatedBefore = 1; + } + } + if( numIter%2 == 1 ){ + + // We need auxPtr to point on memory of original auxPtr (not to helpFunc) + // last passage of previous while cycle didnt calculate any number anyway so switching names doesnt effect values + auxPtr.swap( helpFunc ); + Devices::Cuda::synchronizeDevice(); + } cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); - numIter++; - /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) - BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ - + aux = *auxPtr; + interfaceMap = *interfaceMapPtr; +#endif } - if( numIter == 1 ){ - auxPtr = helpFunc; + +#ifdef HAVE_MPI + if( CommunicatorType::isDistributed() ) + { + getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh ); + + // synchronizate the overlaps + aux.template synchronize< Communicator >(); + } - //cudaFree( BlockIterDevice ); - //cudaFree( dBlock ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - aux = *auxPtr; - interfaceMap = *interfaceMapPtr; #endif + + if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculatedBefore 0! + calculatedBefore = 0; //otherwise we would go throw the FSM code and CUDA FSM code again uselessly } - //aux.save( "aux-8.tnl" ); iteration++; } + // Saving the results into Aux for MakeSnapshot function. + Aux = auxPtr; aux.save("aux-final.tnl"); } -#ifdef HAVE_CUDA -template < typename Index > -__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, - int numBlockX, int numBlockY, int numBlockZ ) +// PROTECTED FUNCTIONS: + +template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > +void +FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: +setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, + const MeshPointer& mesh) { - int i = blockIdx.x * 1024 + threadIdx.x; - - if( i < numBlockX * numBlockY * numBlockZ ) + vecLowerOverlaps[0] = 0; vecLowerOverlaps[1] = 0; vecLowerOverlaps[2] = 0; + vecUpperOverlaps[0] = 0; vecUpperOverlaps[1] = 0; vecUpperOverlaps[2] = 0; +#ifdef HAVE_MPI + if( CommunicatorType::isDistributed() ) //If we started solver with MPI { - int pom = 0;//BlockIterPom[ i ] = 0; - int m=0, l=0, k=0; - l = i/( numBlockX * numBlockY ); - k = (i-l*numBlockX * numBlockY )/(numBlockX ); - m = (i-l*numBlockX * numBlockY )%( numBlockX ); - if( m > 0 && BlockIterDevice[ i - 1 ] ){ - pom = 1;//BlockIterPom[ i ] = 1; - }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ - pom = 1;//BlockIterPom[ i ] = 1; - }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ - pom = 1;// BlockIterPom[ i ] = 1; - }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ - pom = 1;//BlockIterPom[ i ] = 1; - }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){ - pom = 1; - }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){ - pom = 1; - } - - BlockIterPom[ i ] = pom;//BlockIterPom[ i ]; + //Distributed mesh for MPI overlaps (without MPI null pointer) + const Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh(); + vecLowerOverlaps = meshPom->getLowerOverlap(); + vecUpperOverlaps = meshPom->getUpperOverlap(); } +#endif } -template < int sizeSArray, typename Real, typename Device, typename Index > -__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, - const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, - const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, - TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice ) + + + +template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > +bool +FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: +goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, + MeshFunctionType& aux, const InterfaceMapType& interfaceMap, + const AnisotropyPointer& anisotropy ) { - int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; - int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z; - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = blockDim.y*blockIdx.y + threadIdx.y; - int k = blockDim.z*blockIdx.z + threadIdx.z; - int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri; + bool calculated = false; + const MeshType& mesh = aux.getMesh(); + const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1; + const IndexType stepY = boundsFrom[1] < boundsTo[1]? 1 : -1; + const IndexType stepZ = boundsFrom[2] < boundsTo[2]? 1 : -1; - if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] ) + typename MeshType::Cell cell( mesh ); + cell.refresh(); + + for( cell.getCoordinates().z() = boundsFrom[2]; + TNL::abs( cell.getCoordinates().z() - boundsTo[2] ) > 0; + cell.getCoordinates().z() += stepZ ) { - __syncthreads(); - - __shared__ volatile bool changed[ 8*8*8/*(sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)*/]; - - changed[ currentIndex ] = false; - if( thrj == 0 && thri == 0 && thrk == 0 ) - changed[ 0 ] = true; - - const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); - __shared__ Real hx; __shared__ int dimX; - __shared__ Real hy; __shared__ int dimY; - __shared__ Real hz; __shared__ int dimZ; - - if( thrj == 1 && thri == 1 && thrk == 1 ) - { - //printf( "We are in the calculation. Block = %d.\n",blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ); - hx = mesh.getSpaceSteps().x(); - hy = mesh.getSpaceSteps().y(); - hz = mesh.getSpaceSteps().z(); - dimX = mesh.getDimensions().x(); - dimY = mesh.getDimensions().y(); - dimZ = mesh.getDimensions().z(); - BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0; - } - __shared__ volatile Real sArray[ 10*10*10/*sizeSArray * sizeSArray * sizeSArray*/ ]; - sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = std::numeric_limits< Real >::max(); - - //filling sArray edges - int numOfBlockx; - int numOfBlocky; - int numOfBlockz; - int xkolik; - int ykolik; - int zkolik; - xkolik = blockDim.x + 1; - ykolik = blockDim.y + 1; - zkolik = blockDim.z + 1; - numOfBlockx = gridDim.x; - numOfBlocky = gridDim.y; - numOfBlockz = gridDim.z; - __syncthreads(); - - if( numOfBlockx - 1 == blIdx ) - xkolik = dimX - (blIdx)*blockDim.x+1; - if( numOfBlocky -1 == blIdy ) - ykolik = dimY - (blIdy)*blockDim.y+1; - if( numOfBlockz-1 == blIdz ) - zkolik = dimZ - (blIdz)*blockDim.z+1; - __syncthreads(); - - if( thri == 0 ) - { - if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik ) - sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ]; - else - sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max(); - } - - if( thri == 1 ) - { - if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ]; - else - sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max(); - } - if( thri == 2 ) - { - if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ]; - else - sArray[ (thrk+1) * sizeSArray * sizeSArray + 0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); - } - - if( thri == 3 ) - { - if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik ) - sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ]; - else - sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); - } - if( thri == 4 ) - { - if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik ) - sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ]; - else - sArray[0 * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thrk+1] = std::numeric_limits< Real >::max(); - } - - if( thri == 5 ) - { - if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik ) - sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ]; - else - sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = std::numeric_limits< Real >::max(); - } - - if( i < dimX && j < dimY && k < dimZ ) + for( cell.getCoordinates().y() = boundsFrom[1]; + TNL::abs( cell.getCoordinates().y() - boundsTo[1] ) > 0; + cell.getCoordinates().y() += stepY ) { - sArray[(thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = aux[ k*dimX*dimY + j*dimX + i ]; - } - __syncthreads(); - - while( changed[ 0 ] ) - { - __syncthreads(); - - changed[ currentIndex ] = false; - - //calculation of update cell - if( i < dimX && j < dimY && k < dimZ ) - { - if( ! interfaceMap[ k*dimX*dimY + j * dimX + i ] ) - { - changed[ currentIndex ] = ptr.updateCell3D< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); - } - } - __syncthreads(); - - //pyramid reduction - if( blockDim.x*blockDim.y*blockDim.z == 1024 ) - { - if( currentIndex < 512 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y*blockDim.z >= 512 ) - { - if( currentIndex < 256 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y*blockDim.z >= 256 ) - { - if( currentIndex < 128 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y*blockDim.z >= 128 ) + for( cell.getCoordinates().x() = boundsFrom[0]; + TNL::abs( cell.getCoordinates().x() - boundsTo[0] ) > 0; + cell.getCoordinates().x() += stepX ) { - if( currentIndex < 64 ) + cell.refresh(); + if( ! interfaceMap( cell ) ) { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; + calculated = this->updateCell( aux, cell ) || calculated; } } - __syncthreads(); - if( currentIndex < 32 ) - { - if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; - if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; - if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; - if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; - if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; - if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; - } - __syncthreads(); - - /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0) - { - //for(int m = 0; m < 8; m++){ - int m = 4; - for(int n = 0; n<8; n++){ - for(int b=0; b<8; b++) - printf(" %i ", changed[m*64 + n*8 + b]); - printf("\n"); - } - printf("\n \n"); - } - //}*/ - - if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 ) - { - //printf( "Setting block calculation. Block = %d.\n",blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ); - BlockIterDevice[ blIdz * gridDim.x * gridDim.y + blIdy * gridDim.x + blIdx ] = 1; - } - __syncthreads(); } - - if( i < dimX && j < dimY && k < dimZ ) - helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thri+1 ]; - - } -} + } + return calculated; +} + + + + +#ifdef HAVE_MPI +template< typename Real, typename Device, typename Index, + typename Communicator, typename Anisotropy > +void +FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: +getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh ) +{ + Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh(); + + int calculateFromNeighbours[6] = {0,0,0,0,0,0}; + + const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh + MPI::Request *requestsInformation; + requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ]; + + int neighCount = 0; // should this thread calculate again? + + if( neighbours[0] != -1 ) // WEST + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup ); + } + + if( neighbours[1] != -1 ) // EAST + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup ); + } + + if( neighbours[2] != -1 ) //NORTH + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup ); + } + + if( neighbours[5] != -1 ) //SOUTH + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup ); + } + + if( neighbours[8] != -1 ) // TOP + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[8], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[4], 1, neighbours[8], 0, MPI::AllGroup ); + } + + if( neighbours[17] != -1 ) //BOTTOM + { + requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[17], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = + MPI::IRecv( &calculateFromNeighbours[5], 1, neighbours[17], 0, MPI::AllGroup ); + } + + MPI::WaitAll( requestsInformation, neighCount ); + + MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR, MPI::AllGroup ); + calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] || + calculateFromNeighbours[2] || calculateFromNeighbours[3] || + calculateFromNeighbours[4] || calculateFromNeighbours[5]; +} #endif