diff --git a/CMakeLists.txt b/CMakeLists.txt index b69b572cca9a8cafbf2ebe93e4cecafb6a15c595..f0b812d80bcf8fa36dbe66678ffffb9a5e03a14c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,8 @@ option(WITH_GMP "Build with GMP support" OFF) option(WITH_TESTS "Build tests" ON) option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF) option(WITH_EXAMPLES "Compile the 'examples' directory" ON) +option(WITH_TOOLS "Compile the 'src/Tools' directory" ON) +option(WITH_PYTHON "Compile the Python bindings" ON) option(WITH_TEMPLATES_INSTANTIATION "Enable explicit template instantiation" OFF) # install paths relative to the cmake's prefix @@ -111,7 +113,6 @@ if( ${CXX_COMPILER_NAME} STREQUAL "mpic++" ) message( "MPI compiler detected." ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" ) set( CUDA_HOST_COMPILER "mpic++" ) - endif() #### @@ -148,6 +149,22 @@ if( ${WITH_CUDA} ) set( CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} ) endif() endif() + # An extra CUDA_ARCH_HOST_COMPILER variable for compiling tnl-cuda-arch alone, + # because it SHOULD NOT be compiled using mpic++, which would cause weird + # RPATH_CHANGE error in cmake. + # FIXME: find better solution to switch between MPI-enabled and MPI-disabled binaries in cmake + if( NOT $ENV{CUDA_ARCH_HOST_COMPILER} STREQUAL "" ) + message( "-- Setting CUDA_ARCH_HOST_COMPILER to '$ENV{CUDA_ARCH_HOST_COMPILER}'" ) + set( CUDA_ARCH_HOST_COMPILER $ENV{CUDA_ARCH_HOST_COMPILER} ) + else() + if( EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/bin/g++" ) + message( "-- Setting CUDA_ARCH_HOST_COMPILER to '${CUDA_TOOLKIT_ROOT_DIR}/bin/g++'" ) + set( CUDA_ARCH_HOST_COMPILER "${CUDA_TOOLKIT_ROOT_DIR}/bin/g++" ) + else() + message( "-- Setting CUDA_ARCH_HOST_COMPILER to '${CMAKE_CXX_COMPILER}'" ) + set( CUDA_ARCH_HOST_COMPILER ${CMAKE_CXX_COMPILER} ) + endif() + endif() set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ;-DHAVE_CUDA) # disable false compiler warnings # reference for the -Xcudafe flag: http://stackoverflow.com/questions/14831051/how-to-disable-compiler-warnings-with-nvcc/17095910#17095910 @@ -171,7 +188,7 @@ if( ${WITH_CUDA} ) set( CUDA_ARCH_SOURCE ${PROJECT_SOURCE_DIR}/src/Tools/tnl-cuda-arch.cu) message( "Compiling tnl-cuda-arch ..." ) file( MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} ) - execute_process( COMMAND nvcc --compiler-bindir ${CUDA_HOST_COMPILER} --std=c++11 ${CUDA_ARCH_SOURCE} -o ${CUDA_ARCH_EXECUTABLE} + execute_process( COMMAND nvcc --compiler-bindir ${CUDA_ARCH_HOST_COMPILER} --std=c++11 ${CUDA_ARCH_SOURCE} -o ${CUDA_ARCH_EXECUTABLE} RESULT_VARIABLE CUDA_ARCH_RESULT OUTPUT_VARIABLE CUDA_ARCH_OUTPUT ERROR_VARIABLE CUDA_ARCH_OUTPUT ) @@ -437,6 +454,8 @@ message( " WITH_GMP=${WITH_GMP}" ) message( " WITH_TESTS=${WITH_TESTS}" ) message( " WITH_COVERAGE=${WITH_COVERAGE}" ) message( " WITH_EXAMPLES=${WITH_EXAMPLES}" ) +message( " WITH_TOOLS=${WITH_TOOLS}" ) +message( " WITH_PYTHON=${WITH_PYTHON}" ) message( " WITH_TEMPLATES_INSTANTIATION=${WITH_TEMPLATES_INSTANTIATION}" ) # Print compiler options message( "-- Compiler options:" ) diff --git a/build b/build index 943259b4b4b992b63467951e896e7b5d162a0454..40bfa6d4f501dcbbf5415ed22efa475c9f966a2b 100755 --- a/build +++ b/build @@ -22,6 +22,7 @@ WITH_GMP="no" WITH_TESTS="yes" WITH_COVERAGE="no" WITH_EXAMPLES="yes" +WITH_PYTHON="yes" WITH_TOOLS="yes" WITH_TEMPLATE_INSTANTIATION="no" @@ -94,8 +95,8 @@ then echo " --with-tests=yes/no Enables unit tests. 'yes' by default." echo " --with-coverage=yes/no Enables code coverage reports for unit tests. 'no' by default (lcov is required)." echo " --with-examples=yes/no Compile the 'examples' directory. 'yes' by default." - echo " --with-tools=yes/no Compile the 'tools' directory. 'yes' by default." - echo " --with-python=yes/no Compile with the python bindings. 'yes' by default." + echo " --with-tools=yes/no Compile the 'src/Tools' directory. 'yes' by default." + echo " --with-python=yes/no Compile the Python bindings. 'yes' by default." echo " --with-templates-instantiation=yes/no Precompiles some TNL templates during the build. 'no' by default." echo " --cmake=CMAKE Path to cmake. 'cmake' by default." echo " --verbose It enables verbose build." diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9353b18958722e21016c1e781cd3ff8c9b9ab420..540606d3782c71df92031ad96abd519783e70b63 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,10 +4,10 @@ endif( WITH_PYTHON STREQUAL "yes" ) ADD_SUBDIRECTORY( TNL ) -if( WITH_TOOLS STREQUAL "yes" ) +if( ${WITH_TOOLS} ) ADD_SUBDIRECTORY( Tools ) -endif( WITH_TOOLS STREQUAL "yes" ) +endif() -if( WITH_TESTS STREQUAL "yes" ) +if( ${WITH_TESTS} ) ADD_SUBDIRECTORY( UnitTests ) -endif( WITH_TESTS STREQUAL "yes" ) +endif() diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h index dbccc4fba370aa6bb852c8fcd810cb4902262af8..1d34160d4d6ecaecbefb067d1838e4a83b29aeef 100644 --- a/src/TNL/Communicators/MpiCommunicator.h +++ b/src/TNL/Communicators/MpiCommunicator.h @@ -39,6 +39,7 @@ namespace TNL { namespace Communicators { +namespace { class MpiCommunicator { @@ -526,7 +527,8 @@ template<> class MPITypeResolver<long double> }; #endif -}//namespace Communicators +} // namespace <unnamed> +} // namespace Communicators } // namespace TNL #define TNL_MPI_PRINT( message ) \ diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/CMakeLists.txt b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/CMakeLists.txt index 4a257df7f9f029742aea06ca417c1d62d9c5e604..edb30a70cae874f2ac46808c8c5543d4171244da 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/CMakeLists.txt +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/CMakeLists.txt @@ -10,8 +10,13 @@ set( tnl_hamilton_jacobi_SOURCES #ADD_EXECUTABLE(tnl-hamilton-jacobi main.cpp) #target_link_libraries (tnl-hamilton-jacobi tnl ) -ADD_EXECUTABLE(tnl-direct-eikonal-solver tnl-direct-eikonal-solver.cpp ) -target_link_libraries (tnl-direct-eikonal-solver tnl ) +IF( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE(tnl-direct-eikonal-solver tnl-direct-eikonal-solver.cu ) + target_link_libraries (tnl-direct-eikonal-solver tnl ) +ELSE( BUILD_CUDA ) + ADD_EXECUTABLE(tnl-direct-eikonal-solver tnl-direct-eikonal-solver.cpp ) + target_link_libraries (tnl-direct-eikonal-solver tnl ) +ENDIF( BUILD_CUDA ) INSTALL( TARGETS #tnl-hamilton-jacobi diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h index ed3d686eb99379af1589d734eac9b5812cccdedf..e829cc64f398dafcd3d6b890e9fe7756b94b4676 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h @@ -1,64 +1,57 @@ /*************************************************************************** - MainBuildConfig.h - description + HamiltonJacobiBuildConfigTag.h - description ------------------- begin : Jul 7, 2014 copyright : (C) 2014 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ -/*************************************************************************** - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU General Public License as published by * - * the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - ***************************************************************************/ +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include <TNL/Solvers/BuildConfigTags.h> -#ifndef MAINBUILDCONFIG_H_ -#define MAINBUILDCONFIG_H_ +namespace TNL { -#include <solvers/tnlBuildConfigTags.h> +class HamiltonJacobiBuildConfig {}; -class MainBuildConfig -{ - public: +namespace Solvers { - static void print() {std::cerr << "MainBuildConfig" <<std::endl; } -}; /**** * Turn off support for float and long double. */ -template<> struct tnlConfigTagReal< MainBuildConfig, float > { enum { enabled = false }; }; -template<> struct tnlConfigTagReal< MainBuildConfig, long double > { enum { enabled = false }; }; +//template<> struct ConfigTagReal< HamiltonJacobiBuildConfig, float > { enum { enabled = false }; }; +template<> struct ConfigTagReal< HamiltonJacobiBuildConfig, long double > { enum { enabled = false }; }; /**** * Turn off support for short int and long int indexing. */ -template<> struct tnlConfigTagIndex< MainBuildConfig, short int >{ enum { enabled = false }; }; -template<> struct tnlConfigTagIndex< MainBuildConfig, long int >{ enum { enabled = false }; }; +template<> struct ConfigTagIndex< HamiltonJacobiBuildConfig, short int >{ enum { enabled = false }; }; +template<> struct ConfigTagIndex< HamiltonJacobiBuildConfig, long int >{ enum { enabled = false }; }; /**** - * Use of tnlGrid is enabled for allowed dimensions and Real, Device and Index types. + * Use of Grid is enabled for allowed dimensions and Real, Device and Index types. */ -template< int Dimensions, typename Real, typename Device, typename Index > - struct tnlConfigTagMesh< MainBuildConfig, tnlGrid< Dimensions, Real, Device, Index > > - { enum { enabled = tnlConfigTagDimensions< MainBuildConfig, Dimensions >::enabled && - tnlConfigTagReal< MainBuildConfig, Real >::enabled && - tnlConfigTagDevice< MainBuildConfig, Device >::enabled && - tnlConfigTagIndex< MainBuildConfig, Index >::enabled }; }; +/*template< int Dimension, typename Real, typename Device, typename Index > + struct ConfigTagMesh< HamiltonJacobiBuildConfig, Meshes::Grid< Dimension, Real, Device, Index > > + { enum { enabled = ConfigTagDimension< HamiltonJacobiBuildConfig, Dimension >::enabled && + ConfigTagReal< HamiltonJacobiBuildConfig, Real >::enabled && + ConfigTagDevice< HamiltonJacobiBuildConfig, Device >::enabled && + ConfigTagIndex< HamiltonJacobiBuildConfig, Index >::enabled }; };*/ /**** * Please, chose your preferred time discretisation here. */ -template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlExplicitTimeDiscretisationTag >{ enum { enabled = true }; }; -template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlSemiImplicitTimeDiscretisationTag >{ enum { enabled = false}; }; -template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlImplicitTimeDiscretisationTag >{ enum { enabled = false }; }; +template<> struct ConfigTagTimeDiscretisation< HamiltonJacobiBuildConfig, ExplicitTimeDiscretisationTag >{ enum { enabled = true }; }; +template<> struct ConfigTagTimeDiscretisation< HamiltonJacobiBuildConfig, SemiImplicitTimeDiscretisationTag >{ enum { enabled = true }; }; +template<> struct ConfigTagTimeDiscretisation< HamiltonJacobiBuildConfig, ImplicitTimeDiscretisationTag >{ enum { enabled = false }; }; /**** * Only the Runge-Kutta-Merson solver is enabled by default. */ -template<> struct tnlConfigTagExplicitSolver< MainBuildConfig, tnlExplicitEulerSolverTag >{ enum { enabled = false }; }; +//template<> struct ConfigTagExplicitSolver< HamiltonJacobiBuildConfig, ExplicitEulerSolverTag >{ enum { enabled = false }; }; -#endif /* MAINBUILDCONFIG_H_ */ +} // namespace Solvers +} // namespace TNL diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h index dccd06e3ccbd33dd093577ba11b245ab338ae44b..1b46ecb3dbea0438be1065fc43808ecb157c93ab 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h @@ -20,14 +20,16 @@ #include <TNL/Functions/MeshFunction.h> #include <TNL/Meshes/Grid.h> #include "tnlDirectEikonalProblem.h" +#include "MainBuildConfig.h" using namespace TNL; //typedef tnlDefaultBuildMeshConfig BuildConfig; -typedef Solvers::FastBuildConfigTag BuildConfig; +//typedef Solvers::FastBuildConfig BuildConfig; +typedef HamiltonJacobiBuildConfig BuildConfig; template< typename MeshConfig > -class tnlDirectEikonalSolverConfig +class DirectEikonalSolverConfig { public: static void configSetup( Config::ConfigDescription& config ) @@ -44,7 +46,7 @@ template< typename Real, typename MeshConfig, typename SolverStarter, typename CommunicatorType > -class tnlDirectEikonalSolverSetter +class DirectEikonalSolverSetter { public: @@ -66,7 +68,7 @@ class tnlDirectEikonalSolverSetter int main( int argc, char* argv[] ) { - if( ! Solvers::Solver< tnlDirectEikonalSolverSetter, tnlDirectEikonalSolverConfig, BuildConfig >::run( argc, argv ) ) + if( ! Solvers::Solver< DirectEikonalSolverSetter, DirectEikonalSolverConfig, BuildConfig >::run( argc, argv ) ) return EXIT_FAILURE; return EXIT_SUCCESS; } diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test index 6d997ce230a9de4ad1deefc1ccaa1f926021a16f..0dc50246f5ced2ec65616f7eecc65ebf860a2cdb 100755 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test @@ -2,7 +2,7 @@ device="host" dimensions="2D 3D" -dimensions="2D" +#dimensions="3D" sizes1D="16 32 64 128 256 512 1024 2048 4096" #sizes1D="256" sizes2D="16 32 64 128 256 512 1024" @@ -12,7 +12,7 @@ testFunctions="paraboloid" snapshotPeriod=0.1 finalTime=1.5 solverName="tnl-direct-eikonal-solver" -#solverName="gdb --args tnl-direct-eikonal-solver-dbg" +#solverName="gdb --args tnl-direct-eikonal-solver-dbg --catch-exceptions no" # setupTestFunction() @@ -59,7 +59,7 @@ setupGrid() setInitialCondition() { testFunction=$1 - tnl-init --test-function ${testFunction}-sdf \ + tnl-init --test-function ${testFunction} \ --output-file initial-u.tnl \ --amplitude ${amplitude} \ --wave-length ${waveLength} \ @@ -78,7 +78,7 @@ setInitialCondition() --radius ${radius} tnl-init --test-function ${testFunction}-sdf \ - --output-file final-u.tnl \ + --output-file exact-u.tnl \ --amplitude ${amplitude} \ --wave-length ${waveLength} \ --wave-length-x ${waveLengthX} \ @@ -108,7 +108,6 @@ solve() --time-step 10 \ --time-step-order 2 \ --discrete-solver ${discreteSolver} \ - --merson-adaptivity 1.0e-5 \ --min-iterations 20 \ --convergence-residue 1.0e-12 \ --snapshot-period ${snapshotPeriod} \ @@ -118,7 +117,7 @@ solve() computeError() { tnl-diff --mesh mesh.tnl \ - --input-files final-u.tnl u-*.tnl \ + --input-files aux-final.tnl exact-u.tnl \ --mode sequence \ --snapshot-period ${snapshotPeriod} \ --output-file errors.txt \ @@ -172,8 +171,8 @@ runTest() echo "=========================================================================" echo "=== STARTING THE SOLVER ===" echo "=========================================================================" - solve explicit merson - #solve semi-implicit gmres + #solve explicit merson + solve semi-implicit cg mv computation-in-progress computation-done fi echo "=========================================================================" diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h index c6385f64a9668272bee961d0bb5dc02b182655cd..a406a5b1772033bed2da5ab17330c892119923dc 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h @@ -9,6 +9,7 @@ #include <TNL/Meshes/Grid.h> #include <TNL/Functions/MeshFunction.h> +#include <TNL/Devices/Cuda.h> using namespace TNL; @@ -30,15 +31,21 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > typedef Index IndexType; typedef Functions::MeshFunction< MeshType > MeshFunctionType; typedef Functions::MeshFunction< MeshType, 1, bool > InterfaceMapType; + using MeshFunctionPointer = SharedPointer< MeshFunctionType >; + using InterfaceMapPointer = SharedPointer< InterfaceMapType >; - void initInterface( const MeshFunctionType& input, - MeshFunctionType& output, - InterfaceMapType& interfaceMap ); + void initInterface( const MeshFunctionPointer& input, + MeshFunctionPointer& output, + InterfaceMapPointer& interfaceMap ); template< typename MeshEntity > - void updateCell( MeshFunctionType& u, - const MeshEntity& cell ); + __cuda_callable__ void updateCell( MeshFunctionType& u, + const MeshEntity& cell, + const RealType velocity = 1.0 ); + __cuda_callable__ bool updateCell( volatile Real sArray[18], + int thri, const Real h, + const Real velocity = 1.0 ); }; @@ -48,21 +55,27 @@ template< typename Real, class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > { public: - typedef Meshes::Grid< 2, Real, Device, Index > MeshType; typedef Real RealType; typedef Device DevcieType; typedef Index IndexType; typedef Functions::MeshFunction< MeshType > MeshFunctionType; typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType; + using MeshFunctionPointer = SharedPointer< MeshFunctionType >; + using InterfaceMapPointer = SharedPointer< InterfaceMapType >; - void initInterface( const MeshFunctionType& input, - MeshFunctionType& output, - InterfaceMapType& interfaceMap ); + void initInterface( const MeshFunctionPointer& input, + MeshFunctionPointer& output, + InterfaceMapPointer& interfaceMap ); template< typename MeshEntity > - void updateCell( MeshFunctionType& u, - const MeshEntity& cell ); + __cuda_callable__ void updateCell( MeshFunctionType& u, + const MeshEntity& cell, + const RealType velocity = 1.0 ); + + __cuda_callable__ bool updateCell( volatile Real sArray[18][18], + int thri, int thrj, const Real hx, const Real hy, + const Real velocity = 1.0 ); }; template< typename Real, @@ -71,21 +84,73 @@ template< typename Real, class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > { public: - typedef Meshes::Grid< 3, Real, Device, Index > MeshType; typedef Real RealType; typedef Device DevcieType; typedef Index IndexType; typedef Functions::MeshFunction< MeshType > MeshFunctionType; typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType; + using MeshFunctionPointer = SharedPointer< MeshFunctionType >; + using InterfaceMapPointer = SharedPointer< InterfaceMapType >; - void initInterface( const MeshFunctionType& input, - MeshFunctionType& output, - InterfaceMapType& interfaceMap ); + void initInterface( const MeshFunctionPointer& input, + MeshFunctionPointer& output, + InterfaceMapPointer& interfaceMap ); template< typename MeshEntity > - void updateCell( MeshFunctionType& u, - const MeshEntity& cell ); + __cuda_callable__ void updateCell( MeshFunctionType& u, + const MeshEntity& cell, + const RealType velocity = 1.0); + + __cuda_callable__ bool updateCell( volatile Real sArray[10][10][10], + int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz, + const Real velocity = 1.0 ); }; +template < typename T1, typename T2 > +T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double v = 1); + +template < typename T1 > +__cuda_callable__ void sortMinims( T1 pom[] ); + + +#ifdef HAVE_CUDA +template < typename Real, typename Device, typename Index > +__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, + Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap ); + +template < typename Real, typename Device, typename Index > +__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > ptr, + const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap, + Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux, + bool *BlockIterDevice ); + +template < typename Real, typename Device, typename Index > +__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, + const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, + Real *aux, + int *BlockIterDevice); +__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks ); + +template < typename Real, typename Device, typename Index > +__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a ); + +template < typename Real, typename Device, typename Index > +__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ); + +template < typename Real, typename Device, typename Index > +__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ); + +template < typename Real, typename Device, typename Index > +__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, + const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, + int *BlockIterDevice ); +#endif + #include "tnlDirectEikonalMethodsBase_impl.h" diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h index 5decdba55d2d7fc418453fcfd6781a8c0055cd61..649a5ad43a3041fcd479ab7828f7fb1a85634b1c 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h @@ -1,4 +1,4 @@ -/* + /* * File: tnlDirectEikonalMethodsBase_impl.h * Author: oberhuber * @@ -9,43 +9,84 @@ #include <limits> +#include <iostream> +#include "tnlFastSweepingMethod.h" + template< typename Real, typename Device, typename Index > void tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: -initInterface( const MeshFunctionType& input, - MeshFunctionType& output, - InterfaceMapType& interfaceMap ) +initInterface( const MeshFunctionPointer& _input, + MeshFunctionPointer& _output, + InterfaceMapPointer& _interfaceMap ) { - const MeshType& mesh = input.getMesh(); - typedef typename MeshType::Cell Cell; - Cell cell( mesh ); - for( cell.getCoordinates().x() = 1; - cell.getCoordinates().x() < mesh.getDimensions().x() - 1; - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - const RealType& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - const auto& neighbors = cell.getNeighborEntities(); - //const IndexType& c = cell.getIndex(); - const IndexType e = neighbors.template getEntityIndex< 1 >(); - const IndexType w = neighbors.template getEntityIndex< -1 >(); - - if( c * input[ e ] <= 0 || c * input[ w ] <= 0 ) - { - output[ cell.getIndex() ] = c; - interfaceMap[ cell.getIndex() ] = true; - continue; - } - } - output[ cell.getIndex() ] = - c > 0 ? std::numeric_limits< RealType >::max() : - -std::numeric_limits< RealType >::max(); - interfaceMap[ cell.getIndex() ] = false; - } + if( std::is_same< Device, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + const MeshType& mesh = _input->getMesh(); + + const int cudaBlockSize( 16 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + dim3 blockSize( cudaBlockSize ); + dim3 gridSize( numBlocksX ); + Devices::Cuda::synchronizeDevice(); + CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), + _output.template modifyData< Device >(), + _interfaceMap.template modifyData< Device >() ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; +#endif + } + if( std::is_same< Device, Devices::Host >::value ) + { + const MeshType& mesh = _input->getMesh(); + typedef typename MeshType::Cell Cell; + const MeshFunctionType& input = _input.getData(); + MeshFunctionType& output = _output.modifyData(); + InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); + Cell cell( mesh ); + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + output[ cell.getIndex() ] = + input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : + -std::numeric_limits< RealType >::max(); + interfaceMap[ cell.getIndex() ] = false; + } + + + const RealType& h = mesh.getSpaceSteps().x(); + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x() - 1; + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + const RealType& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + const auto& neighbors = cell.getNeighborEntities(); + Real pom = 0; + //const IndexType& c = cell.getIndex(); + const IndexType e = neighbors.template getEntityIndex< 1 >(); + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( h * c )/( c - input[ e ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + + pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; + if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) + output[ e ] = pom; + + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ e ] = true; + } + } + } + } } template< typename Real, @@ -55,51 +96,175 @@ template< typename Real, void tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: updateCell( MeshFunctionType& u, - const MeshEntity& cell ) + const MeshEntity& cell, + const RealType v ) { -} + const auto& neighborEntities = cell.template getNeighborEntities< 1 >(); + const MeshType& mesh = cell.getMesh(); + const RealType& h = mesh.getSpaceSteps().x(); + const RealType value = u( cell ); + RealType a, tmp = std::numeric_limits< RealType >::max(); + if( cell.getCoordinates().x() == 0 ) + a = u[ neighborEntities.template getEntityIndex< 1 >() ]; + else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) + a = u[ neighborEntities.template getEntityIndex< -1 >() ]; + else + { + a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ], + u[ neighborEntities.template getEntityIndex< 1 >() ] ); + } + + if( fabs( a ) == std::numeric_limits< RealType >::max() ) + return; + + tmp = a + TNL::sign( value ) * h/v; + + u[ cell.getIndex() ] = argAbsMin( value, tmp ); +} template< typename Real, typename Device, typename Index > void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -initInterface( const MeshFunctionType& input, - MeshFunctionType& output, - InterfaceMapType& interfaceMap ) +initInterface( const MeshFunctionPointer& _input, + MeshFunctionPointer& _output, + InterfaceMapPointer& _interfaceMap ) { - const MeshType& mesh = input.getMesh(); - typedef typename MeshType::Cell Cell; - Cell cell( mesh ); - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh.getDimensions().y(); - cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - const RealType& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - const IndexType e = neighbors.template getEntityIndex< 1, 0 >(); - const IndexType w = neighbors.template getEntityIndex< -1, 0 >(); - const IndexType n = neighbors.template getEntityIndex< 0, 1 >(); - const IndexType s = neighbors.template getEntityIndex< 0, -1 >(); - if( c * input[ e ] <= 0 || c * input[ w ] <= 0 || - c * input[ n ] <= 0 || c * input[ s ] <= 0 ) - { - output[ cell.getIndex() ] = c; - interfaceMap[ cell.getIndex() ] = true; - continue; - } - } - output[ cell.getIndex() ] = - c > 0 ? std::numeric_limits< RealType >::max() : - -std::numeric_limits< RealType >::max(); - interfaceMap[ cell.getIndex() ] = false; + + if( std::is_same< Device, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + const MeshType& mesh = _input->getMesh(); + + const int cudaBlockSize( 16 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); + dim3 blockSize( cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY ); + Devices::Cuda::synchronizeDevice(); + CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), + _output.template modifyData< Device >(), + _interfaceMap.template modifyData< Device >() ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; +#endif + } + if( std::is_same< Device, Devices::Host >::value ) + { + MeshFunctionType input = _input.getData(); + + /*double A[320][320]; + std::ifstream fileInit("/home/maty/Downloads/initData.txt"); + + for (int i = 0; i < 320; i++) + for (int j = 0; j < 320; j++) + fileInit >> A[i][j]; + fileInit.close(); + for (int i = 0; i < 320; i++) + for (int j = 0; j < 320; j++) + input[i*320 + j] = A[i][j];*/ + + + MeshFunctionType& output = _output.modifyData(); + InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); + const MeshType& mesh = input.getMesh(); + typedef typename MeshType::Cell Cell; + Cell cell( mesh ); + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh.getDimensions().y(); + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + output[ cell.getIndex() ] = + input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : + - std::numeric_limits< RealType >::max(); + interfaceMap[ cell.getIndex() ] = false; + } + + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh.getDimensions().y(); + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + const RealType& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const IndexType e = neighbors.template getEntityIndex< 1, 0 >(); + const IndexType n = neighbors.template getEntityIndex< 0, 1 >(); + //Try init with exact data: + /*if( c * input[ n ] <= 0 ) + { + output[ cell.getIndex() ] = c; + output[ n ] = input[ n ]; + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ n ] = true; + } + if( c * input[ e ] <= 0 ) + { + output[ cell.getIndex() ] = c; + output[ e ] = input[ e ]; + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ e ] = true; + }*/ + if( c * input[ n ] <= 0 ) + { + /*if( c >= 0 ) + {*/ + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + pom = pom - TNL::sign( c )*hy; + if( TNL::abs( output[ n ] ) > TNL::abs( pom ) ) + output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy; + /*}else + { + pom = - ( hy * c )/( c - input[ n ]); + if( output[ cell.getIndex() ] < pom ) + output[ cell.getIndex() ] = pom; + if( output[ n ] > hy + pom ) + output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]); + }*/ + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ n ] = true; + } + if( c * input[ e ] <= 0 ) + { + /*if( c >= 0 ) + {*/ + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + + pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; + if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) + output[ e ] = pom; + /*}else + { + pom = - (hx * c)/( c - input[ e ]); + if( output[ cell.getIndex() ] < pom ) + output[ cell.getIndex() ] = pom; + + pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]); + if( output[ e ] > pom ) + output[ e ] = pom; + }*/ + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ e ] = true; + } + } + } } } @@ -107,26 +272,28 @@ template< typename Real, typename Device, typename Index > template< typename MeshEntity > +__cuda_callable__ void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: updateCell( MeshFunctionType& u, - const MeshEntity& cell ) + const MeshEntity& cell, + const RealType v) { const auto& neighborEntities = cell.template getNeighborEntities< 2 >(); const MeshType& mesh = cell.getMesh(); - - const RealType& h = mesh.getSpaceSteps().x(); + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); const RealType value = u( cell ); - Real a, b, tmp; - + RealType a, b, tmp = std::numeric_limits< RealType >::max(); + if( cell.getCoordinates().x() == 0 ) a = u[ neighborEntities.template getEntityIndex< 1, 0 >() ]; else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) a = u[ neighborEntities.template getEntityIndex< -1, 0 >() ]; else { - a = argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0 >() ], - u[ neighborEntities.template getEntityIndex< 1, 0 >() ] ); + a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0 >() ], + u[ neighborEntities.template getEntityIndex< 1, 0 >() ] ); } if( cell.getCoordinates().y() == 0 ) @@ -135,98 +302,770 @@ updateCell( MeshFunctionType& u, b = u[ neighborEntities.template getEntityIndex< 0, -1 >() ]; else { - b = argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1 >() ], - u[ neighborEntities.template getEntityIndex< 0, 1 >() ] ); + b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1 >() ], + u[ neighborEntities.template getEntityIndex< 0, 1 >() ] ); } - - if( fabs( a ) == std::numeric_limits< Real >::max() && - fabs( b ) == std::numeric_limits< Real >::max() ) + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() ) return; - if( fabs( a ) == std::numeric_limits< Real >::max() || - fabs( b ) == std::numeric_limits< Real >::max() || - fabs( a - b ) >= h ) + /*if( fabs( a ) == TypeInfo< Real >::getMaxValue() || + fabs( b ) == TypeInfo< Real >::getMaxValue() || + fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) ) { - tmp = argAbsMin( a, b ) + sign( value ) * h; - /* std::cerr << "a = " << a << " b = " << b << " h = " << h - << " ArgAbsMin( a, b ) = " << ArgAbsMin( a, b ) << " sign( value ) = " << sign( value ) - << " sign( value ) * h = " << sign( value ) * h - << " ArgAbsMin( a, b ) + sign( value ) * h = " << ArgAbsMin( a, b ) + sign( value ) * h - << " tmp = " << tmp << std::endl; - tmp = ArgAbsMin( a, b ) + sign( value ) * h; - tmp = ArgAbsMin( a, b ) + sign( value ) * h; - tmp = ArgAbsMin( a, b ) + sign( value ) * h; - res = ArgAbsMin( a, b ) + sign( value ) * h; - std::cerr << " tmp = " << tmp << std::endl; - std::cerr << " res = " << res << std::endl;*/ - + tmp = + fabs( a ) >= fabs( b ) ? b + TNL::sign( value ) * hy : + a + TNL::sign( value ) * hx; + }*/ + /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() && + fabs( b ) != TypeInfo< Real >::getMaxValue() && + fabs( a - b ) < TNL::sqrt( (hx * hx + hy * hy)/v ) ) + { + tmp = ( hx * hx * b + hy * hy * a + + sign( value ) * hx * hy * TNL::sqrt( ( hx * hx + hy * hy )/v - + ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy ); + u[ cell.getIndex() ] = tmp; } else - tmp = 0.5 * ( a + b + sign( value ) * sqrt( 2.0 * h * h - ( a - b ) * ( a - b ) ) ); - - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - //std::cerr << ArgAbsMin( value, tmp ) << " "; + { + tmp = + fabs( a ) > fabs( b ) ? b + TNL::sign( value ) * hy/v : + a + TNL::sign( value ) * hx/v; + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + //tmp = TypeInfo< RealType >::getMaxValue(); + }*/ + RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; + sortMinims( pom ); + tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; + + + if( fabs( tmp ) < fabs( pom[ 1 ] ) ) + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + else + { + tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - + ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + } } - template< typename Real, typename Device, typename Index > void tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: -initInterface( const MeshFunctionType& input, - MeshFunctionType& output, - InterfaceMapType& interfaceMap ) +initInterface( const MeshFunctionPointer& _input, + MeshFunctionPointer& _output, + InterfaceMapPointer& _interfaceMap ) { - const MeshType& mesh = input.getMesh(); - typedef typename MeshType::Cell Cell; - Cell cell( mesh ); - for( cell.getCoordinates().z() = 1; - cell.getCoordinates().z() < mesh.getDimensions().z() - 1; - cell.getCoordinates().z() ++ ) - for( cell.getCoordinates().y() = 1; - cell.getCoordinates().y() < mesh.getDimensions().y() - 1; - cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 1; - cell.getCoordinates().x() < mesh.getDimensions().x() - 1; - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - const RealType& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - //const IndexType& c = cell.getIndex(); - const IndexType e = neighbors.template getEntityIndex< 1, 0, 0 >(); - const IndexType w = neighbors.template getEntityIndex< -1, 0, 0 >(); - const IndexType n = neighbors.template getEntityIndex< 0, 1, 0 >(); - const IndexType s = neighbors.template getEntityIndex< 0, -1, 0 >(); - const IndexType t = neighbors.template getEntityIndex< 0, 0, 1 >(); - const IndexType b = neighbors.template getEntityIndex< 0, 0, -1 >(); - - if( c * input[ e ] <= 0 || c * input[ w ] <= 0 || - c * input[ n ] <= 0 || c * input[ s ] <= 0 || - c * input[ t ] <= 0 || c * input[ b ] <= 0 ) - { - output[ cell.getIndex() ] = c; - interfaceMap[ cell.getIndex() ] = true; - continue; - } - } - output[ cell.getIndex() ] = - c > 0 ? std::numeric_limits< RealType >::max() : - -std::numeric_limits< RealType >::max(); - interfaceMap[ cell.getIndex() ] = false; - } + if( std::is_same< Device, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + const MeshType& mesh = _input->getMesh(); + + const int cudaBlockSize( 8 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); + int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize ); + if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) + std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; + dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); + Devices::Cuda::synchronizeDevice(); + CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(), + _output.template modifyData< Device >(), + _interfaceMap.template modifyData< Device >() ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; +#endif + } + if( std::is_same< Device, Devices::Host >::value ) + { + const MeshFunctionType& input = _input.getData(); + MeshFunctionType& output = _output.modifyData(); + InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); + const MeshType& mesh = input.getMesh(); + typedef typename MeshType::Cell Cell; + Cell cell( mesh ); + for( cell.getCoordinates().z() = 0; + cell.getCoordinates().z() < mesh.getDimensions().z(); + cell.getCoordinates().z() ++ ) + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh.getDimensions().y(); + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + output[ cell.getIndex() ] = + input( cell ) > 0 ? std::numeric_limits< RealType >::max() : + - std::numeric_limits< RealType >::max(); + interfaceMap[ cell.getIndex() ] = false; + } + + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + const RealType& hz = mesh.getSpaceSteps().z(); + for( cell.getCoordinates().z() = 0; + cell.getCoordinates().z() < mesh.getDimensions().z(); + cell.getCoordinates().z() ++ ) + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh.getDimensions().y(); + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + const RealType& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const IndexType e = neighbors.template getEntityIndex< 1, 0, 0 >(); + const IndexType n = neighbors.template getEntityIndex< 0, 1, 0 >(); + const IndexType t = neighbors.template getEntityIndex< 0, 0, 1 >(); + //Try exact initiation + /*const IndexType w = neighbors.template getEntityIndex< -1, 0, 0 >(); + const IndexType s = neighbors.template getEntityIndex< 0, -1, 0 >(); + const IndexType b = neighbors.template getEntityIndex< 0, 0, -1 >(); + if( c * input[ e ] <= 0 ) + { + output[ cell.getIndex() ] = c; + output[ e ] = input[ e ]; + interfaceMap[ e ] = true; + interfaceMap[ cell.getIndex() ] = true; + } + else if( c * input[ n ] <= 0 ) + { + output[ cell.getIndex() ] = c; + output[ n ] = input[ n ]; + interfaceMap[ n ] = true; + interfaceMap[ cell.getIndex() ] = true; + } + else if( c * input[ t ] <= 0 ) + { + output[ cell.getIndex() ] = c; + output[ t ] = input[ t ]; + interfaceMap[ t ] = true; + interfaceMap[ cell.getIndex() ] = true; + }*/ + if( c * input[ n ] <= 0 ) + { + if( c >= 0 ) + { + pom = ( hy * c )/( c - input[ n ]); + if( output[ cell.getIndex() ] > pom ) + output[ cell.getIndex() ] = pom; + + if ( output[ n ] < pom - hy) + output[ n ] = pom - hy; // ( hy * c )/( c - input[ n ]) - hy; + + }else + { + pom = - ( hy * c )/( c - input[ n ]); + if( output[ cell.getIndex() ] < pom ) + output[ cell.getIndex() ] = pom; + if( output[ n ] > hy + pom ) + output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]); + + } + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ n ] = true; + } + if( c * input[ e ] <= 0 ) + { + if( c >= 0 ) + { + pom = ( hx * c )/( c - input[ e ]); + if( output[ cell.getIndex() ] > pom ) + output[ cell.getIndex() ] = pom; + + pom = pom - hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; + if( output[ e ] < pom ) + output[ e ] = pom; + + }else + { + pom = - (hx * c)/( c - input[ e ]); + if( output[ cell.getIndex() ] < pom ) + output[ cell.getIndex() ] = pom; + + pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]); + if( output[ e ] > pom ) + output[ e ] = pom; + } + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ e ] = true; + } + if( c * input[ t ] <= 0 ) + { + if( c >= 0 ) + { + pom = ( hz * c )/( c - input[ t ]); + if( output[ cell.getIndex() ] > pom ) + output[ cell.getIndex() ] = pom; + + pom = pom - hz; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; + if( output[ t ] < pom ) + output[ t ] = pom; + + }else + { + pom = - (hz * c)/( c - input[ t ]); + if( output[ cell.getIndex() ] < pom ) + output[ cell.getIndex() ] = pom; + + pom = pom + hz; //output[ e ] = hx - (hx * c)/( c - input[ e ]); + if( output[ t ] > pom ) + output[ t ] = pom; + + } + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ t ] = true; + } + } + /*output[ cell.getIndex() ] = + c > 0 ? TypeInfo< RealType >::getMaxValue() : + -TypeInfo< RealType >::getMaxValue(); + interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245 + } + } } template< typename Real, typename Device, typename Index > template< typename MeshEntity > +__cuda_callable__ void tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: updateCell( MeshFunctionType& u, - const MeshEntity& cell ) + const MeshEntity& cell, + const RealType v ) +{ + const auto& neighborEntities = cell.template getNeighborEntities< 3 >(); + const MeshType& mesh = cell.getMesh(); + + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + const RealType& hz = mesh.getSpaceSteps().z(); + const RealType value = u( cell ); + //std::cout << value << std::endl; + RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); + + + if( cell.getCoordinates().x() == 0 ) + a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ]; + else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) + a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ]; + else + { + a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ], + u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] ); + } + if( cell.getCoordinates().y() == 0 ) + b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ]; + else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) + b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ]; + else + { + b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ], + u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] ); + }if( cell.getCoordinates().z() == 0 ) + c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ]; + else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 ) + c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ]; + else + { + c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ], + u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] ); + } + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() && + fabs( c ) == std::numeric_limits< RealType >::max() ) + return; + + + /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() && + fabs( b ) != TypeInfo< Real >::getMaxValue() && + fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) ) + { + tmp = ( hx * hx * a + hy * hy * b + + sign( value ) * hx * hy * sqrt( ( hx * hx + hy * hy )/v - + ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy ); + } + if( fabs( a ) != TypeInfo< Real >::getMaxValue() && + fabs( c ) != TypeInfo< Real >::getMaxValue() && + fabs( a - c ) >= TNL::sqrt( (hx * hx + hz * hz)/v ) ) + { + tmp = ( hx * hx * a + hz * hz * c + + sign( value ) * hx * hz * sqrt( ( hx * hx + hz * hz )/v - + ( a - c ) * ( a - c ) ) )/( hx * hx + hz * hz ); + } + if( fabs( b ) != TypeInfo< Real >::getMaxValue() && + fabs( c ) != TypeInfo< Real >::getMaxValue() && + fabs( b - c ) >= TNL::sqrt( (hy * hy + hz * hz)/v ) ) + { + tmp = ( hy * hy * b + hz * hz * c + + sign( value ) * hy * hz * sqrt( ( hy * hy + hz * hz )/v - + ( b - c ) * ( b - c ) ) )/( hy * hy + hz * hz ); + }*/ + RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; + sortMinims( pom ); + tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; + if( fabs( tmp ) < fabs( pom[ 1 ] ) ) + { + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + } + else + { + tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - + ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); + if( fabs( tmp ) < fabs( pom[ 2 ]) ) + { + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + } + else + { + tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + + TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - + hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - + hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + } + } +} + +template < typename T1, typename T2 > +T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double v) +{ + T1 tmp; + if( fabs( a ) != std::numeric_limits< T1 >::max && + fabs( b ) != std::numeric_limits< T1 >::max && + fabs( a - b ) < ha/v )//TNL::sqrt( (ha * ha + hb * hb)/2 )/v ) + { + tmp = ( ha * ha * b + hb * hb * a + + TNL::sign( value ) * ha * hb * TNL::sqrt( ( ha * ha + hb * hb )/( v * v ) - + ( a - b ) * ( a - b ) ) )/( ha * ha + hb * hb ); + } + else + { + tmp = std::numeric_limits< T1 >::max; + } + + return tmp; +} + +template < typename T1 > +__cuda_callable__ void sortMinims( T1 pom[] ) { + T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; + if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){ + tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2]; + tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5]; + + } + else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){ + tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1]; + tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4]; + } + else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){ + tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2]; + tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5]; + } + else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){ + tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0]; + tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3]; + } + else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){ + tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1]; + tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4]; + } + else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){ + tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0]; + tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3]; + } + + for( int i = 0; i < 6; i++ ) + { + pom[ i ] = tmp[ i ]; + } +} + + + +#ifdef HAVE_CUDA +template < typename Real, typename Device, typename Index > +__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, + Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap ) +{ + int i = threadIdx.x + blockDim.x*blockIdx.x; + const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + + if( i < mesh.getDimensions().x() ) + { + typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell; + Cell cell( mesh ); + cell.getCoordinates().x() = i; + cell.refresh(); + const Index cind = cell.getIndex(); + + + output[ cind ] = + input( cell ) >= 0 ? std::numeric_limits< Real >::max() : + - std::numeric_limits< Real >::max(); + interfaceMap[ cind ] = false; + + const Real& h = mesh.getSpaceSteps().x(); + cell.refresh(); + const Real& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const Index e = neighbors.template getEntityIndex< 1 >(); + const Index w = neighbors.template getEntityIndex< -1 >(); + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( h * c )/( c - input[ e ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ w ] <= 0 ) + { + pom = TNL::sign( c )*( h * c )/( c - input[ w ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + } + } + +} +template < typename Real, typename Device, typename Index > +__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ) +{ + int i = threadIdx.x + blockDim.x*blockIdx.x; + int j = blockDim.y*blockIdx.y + threadIdx.y; + const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + { + typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell; + Cell cell( mesh ); + cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; + cell.refresh(); + const Index cind = cell.getIndex(); + + + output[ cind ] = + input( cell ) >= 0 ? std::numeric_limits< Real >::max() : + - std::numeric_limits< Real >::max(); + interfaceMap[ cind ] = false; + + const Real& hx = mesh.getSpaceSteps().x(); + const Real& hy = mesh.getSpaceSteps().y(); + cell.refresh(); + const Real& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const Index e = neighbors.template getEntityIndex< 1, 0 >(); + const Index w = neighbors.template getEntityIndex< -1, 0 >(); + const Index n = neighbors.template getEntityIndex< 0, 1 >(); + const Index s = neighbors.template getEntityIndex< 0, -1 >(); + + if( c * input[ n ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cell.getIndex() ] = true; + } + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ w ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ s ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + } + } +} + +template < typename Real, typename Device, typename Index > +__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ) +{ + int i = threadIdx.x + blockDim.x*blockIdx.x; + int j = blockDim.y*blockIdx.y + threadIdx.y; + int k = blockDim.z*blockIdx.z + threadIdx.z; + const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) + { + typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell; + Cell cell( mesh ); + cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k; + cell.refresh(); + const Index cind = cell.getIndex(); + + + output[ cind ] = + input( cell ) >= 0 ? std::numeric_limits< Real >::max() : + - std::numeric_limits< Real >::max(); + interfaceMap[ cind ] = false; + cell.refresh(); + + const Real& hx = mesh.getSpaceSteps().x(); + const Real& hy = mesh.getSpaceSteps().y(); + const Real& hz = mesh.getSpaceSteps().z(); + const Real& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const Index e = neighbors.template getEntityIndex< 1, 0, 0 >(); + const Index w = neighbors.template getEntityIndex< -1, 0, 0 >(); + const Index n = neighbors.template getEntityIndex< 0, 1, 0 >(); + const Index s = neighbors.template getEntityIndex< 0, -1, 0 >(); + const Index t = neighbors.template getEntityIndex< 0, 0, 1 >(); + const Index b = neighbors.template getEntityIndex< 0, 0, -1 >(); + + if( c * input[ n ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ w ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ s ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ b ] <= 0 ) + { + pom = TNL::sign( c )*( hz * c )/( c - input[ b ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ t ] <= 0 ) + { + pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + } + } +} + + +template< typename Real, + typename Device, + typename Index > +__cuda_callable__ +bool +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy, + const Real v ) +{ + const RealType value = sArray[ thrj ][ thri ]; + RealType a, b, tmp = std::numeric_limits< RealType >::max(); + + b = TNL::argAbsMin( sArray[ thrj+1 ][ thri ], + sArray[ thrj-1 ][ thri ] ); + + a = TNL::argAbsMin( sArray[ thrj ][ thri+1 ], + sArray[ thrj ][ thri-1 ] ); + + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() ) + return false; + + RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; + sortMinims( pom ); + tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; + + + if( fabs( tmp ) < fabs( pom[ 1 ] ) ) + { + sArray[ thrj ][ thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrj ][ thri ]; + if ( fabs( tmp ) > 0.01*hx ) + return true; + else + return false; + } + else + { + tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - + ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); + sArray[ thrj ][ thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrj ][ thri ]; + if ( fabs( tmp ) > 0.01*hx ) + return true; + else + return false; + } + + return false; +} + +template< typename Real, + typename Device, + typename Index > +__cuda_callable__ +bool +tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: +updateCell( volatile Real sArray[18], int thri, const Real h, const Real v ) +{ + const RealType value = sArray[ thri ]; + RealType a, tmp = std::numeric_limits< RealType >::max(); + + a = TNL::argAbsMin( sArray[ thri+1 ], + sArray[ thri-1 ] ); + + if( fabs( a ) == std::numeric_limits< RealType >::max() ) + return false; + + tmp = a + TNL::sign( value ) * h/v; + + + sArray[ thri ] = argAbsMin( value, tmp ); + + tmp = value - sArray[ thri ]; + if ( fabs( tmp ) > 0.01*h ) + return true; + else + return false; +} + +template< typename Real, + typename Device, + typename Index > +__cuda_callable__ +bool +tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: +updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk, + const Real hx, const Real hy, const Real hz, const Real v ) +{ + const RealType value = sArray[thrk][thrj][thri]; + //std::cout << value << std::endl; + RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); + + c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ], + sArray[ thrk-1 ][ thrj ][ thri ] ); + + b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ], + sArray[ thrk ][ thrj-1 ][ thri ] ); + + a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ], + sArray[ thrk ][ thrj ][ thri-1 ] ); + + + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() && + fabs( c ) == std::numeric_limits< RealType >::max() ) + return false; + RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; + + sortMinims( pom ); + + tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; + if( fabs( tmp ) < fabs( pom[ 1 ] ) ) + { + sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrk ][ thrj ][ thri ]; + if ( fabs( tmp ) > 0.01*hx ) + return true; + else + return false; + } + else + { + tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - + ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); + if( fabs( tmp ) < fabs( pom[ 2 ]) ) + { + sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrk ][ thrj ][ thri ]; + if ( fabs( tmp ) > 0.01*hx ) + return true; + else + return false; + } + else + { + tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + + TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - + hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - + hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); + sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrk ][ thrj ][ thri ]; + if ( fabs( tmp ) > 0.01*hx ) + return true; + else + return false; + } + } + + return false; } +#endif diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h index 7737073a623eab27ec8888a72b9b1e91703e5b55..f7e1f1d386b061937417c0906590290f5a6703fa 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h @@ -38,6 +38,8 @@ class tnlDirectEikonalProblem typedef Functions::MeshFunction< Mesh > MeshFunctionType; typedef Problems::PDEProblem< Mesh, Communicator, RealType, DeviceType, IndexType > BaseType; using AnisotropyType = Anisotropy; + using AnisotropyPointer = SharedPointer< AnisotropyType, DeviceType >; + using MeshFunctionPointer = SharedPointer< MeshFunctionType >; using typename BaseType::MeshType; using typename BaseType::DofVectorType; @@ -71,11 +73,11 @@ class tnlDirectEikonalProblem protected: - MeshFunctionType u; + MeshFunctionPointer u; - MeshFunctionType initialData; + MeshFunctionPointer initialData; - AnisotropyType anisotropy; + AnisotropyPointer anisotropy; }; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h index 3938e2754892cf35d0d7c4ac4813347b29886593..3d5fa15dc23d82eb886d344bc7cf4fff7f4e34d3 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h @@ -100,7 +100,7 @@ void tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >:: bindDofs( const DofVectorPointer& dofs ) { - this->u.bind( this->getMesh(), dofs ); + this->u->bind( this->getMesh(), dofs ); } template< typename Mesh, @@ -114,8 +114,8 @@ setInitialCondition( const Config::ParameterContainer& parameters, DofVectorPointer& dofs ) { String inputFile = parameters.getParameter< String >( "input-file" ); - this->initialData.setMesh( this->getMesh() ); - if( !this->initialData.boundLoad( inputFile ) ) + this->initialData->setMesh( this->getMesh() ); + if( !this->initialData->boundLoad( inputFile ) ) return false; return true; } diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h index 17d139b0ec8b01c0dbf87188968be67ee5fa5cca..54e577dacde239171ae299ee62b078bd7a406c35 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h @@ -15,6 +15,7 @@ #include <TNL/SharedPointer.h> #include "tnlDirectEikonalMethodsBase.h" + template< typename Mesh, typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > > class FastSweepingMethod @@ -28,20 +29,24 @@ template< typename Real, class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy > : public tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > { - static_assert( std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." ); + //static_assert( std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." ); public: typedef Meshes::Grid< 1, Real, Device, Index > MeshType; typedef Real RealType; - typedef TNL::Devices::Host DeviceType; + typedef Device DeviceType; typedef Index IndexType; typedef Anisotropy AnisotropyType; typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > BaseType; using MeshPointer = SharedPointer< MeshType >; + using AnisotropyPointer = SharedPointer< AnisotropyType, DeviceType >; using typename BaseType::InterfaceMapType; using typename BaseType::MeshFunctionType; + using typename BaseType::InterfaceMapPointer; + using typename BaseType::MeshFunctionPointer; + FastSweepingMethod(); @@ -50,8 +55,8 @@ class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy > void setMaxIterations( const IndexType& maxIterations ); void solve( const MeshPointer& mesh, - const AnisotropyType& anisotropy, - MeshFunctionType& u ); + const AnisotropyPointer& anisotropy, + MeshFunctionPointer& u ); protected: @@ -66,20 +71,23 @@ template< typename Real, class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy > : public tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > { - static_assert( std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." ); + //static_assert( std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." ); public: typedef Meshes::Grid< 2, Real, Device, Index > MeshType; typedef Real RealType; - typedef TNL::Devices::Host DeviceType; + typedef Device DeviceType; typedef Index IndexType; typedef Anisotropy AnisotropyType; typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType; using MeshPointer = SharedPointer< MeshType >; + using AnisotropyPointer = SharedPointer< AnisotropyType, DeviceType >; using typename BaseType::InterfaceMapType; using typename BaseType::MeshFunctionType; + using typename BaseType::InterfaceMapPointer; + using typename BaseType::MeshFunctionPointer; FastSweepingMethod(); @@ -88,9 +96,8 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy > void setMaxIterations( const IndexType& maxIterations ); void solve( const MeshPointer& mesh, - const AnisotropyType& anisotropy, - MeshFunctionType& u ); - + const AnisotropyPointer& anisotropy, + MeshFunctionPointer& u ); protected: @@ -104,20 +111,23 @@ template< typename Real, class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy > : public tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > { - static_assert( std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." ); + //static_assert( std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." ); public: typedef Meshes::Grid< 3, Real, Device, Index > MeshType; typedef Real RealType; - typedef TNL::Devices::Host DeviceType; + typedef Device DeviceType; typedef Index IndexType; typedef Anisotropy AnisotropyType; typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > BaseType; using MeshPointer = SharedPointer< MeshType >; + using AnisotropyPointer = SharedPointer< AnisotropyType, DeviceType >; using typename BaseType::InterfaceMapType; using typename BaseType::MeshFunctionType; + using typename BaseType::InterfaceMapPointer; + using typename BaseType::MeshFunctionPointer; FastSweepingMethod(); @@ -126,8 +136,8 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy > void setMaxIterations( const IndexType& maxIterations ); void solve( const MeshPointer& mesh, - const AnisotropyType& anisotropy, - MeshFunctionType& u ); + const AnisotropyPointer& anisotropy, + MeshFunctionPointer& u ); protected: @@ -136,7 +146,6 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy > }; - #include "tnlFastSweepingMethod1D_impl.h" #include "tnlFastSweepingMethod2D_impl.h" #include "tnlFastSweepingMethod3D_impl.h" diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h index 874945eafd7f0cb1eff4667eba6ef026ba026634..890c6cb4c9373917abf1d668ebf60a30cfbc17b3 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h @@ -55,16 +55,193 @@ template< typename Real, void FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >:: solve( const MeshPointer& mesh, - const AnisotropyType& anisotropy, - MeshFunctionType& u ) -{ - MeshFunctionType aux; - InterfaceMapType interfaceMap; - aux.setMesh( mesh ); - interfaceMap.setMesh( mesh ); + const AnisotropyPointer& anisotropy, + MeshFunctionPointer& u ) +{ + MeshFunctionPointer auxPtr; + InterfaceMapPointer interfaceMapPtr; + auxPtr->setMesh( mesh ); + interfaceMapPtr->setMesh( mesh ); std::cout << "Initiating the interface cells ..." << std::endl; - BaseType::initInterface( u, aux, interfaceMap ); - aux.save( "aux-ini.tnl" ); + BaseType::initInterface( u, auxPtr, interfaceMapPtr ); + + auxPtr->save( "aux-ini.tnl" ); + + typename MeshType::Cell cell( *mesh ); + + IndexType iteration( 0 ); + InterfaceMapType interfaceMap = *interfaceMapPtr; + MeshFunctionType aux = *auxPtr; + while( iteration < this->maxIterations ) + { + if( std::is_same< DeviceType, Devices::Host >::value ) + { + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + + + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) + { + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + if( std::is_same< DeviceType, Devices::Cuda >::value ) + { + // TODO: CUDA code +#ifdef HAVE_CUDA + const int cudaBlockSize( 16 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); + dim3 blockSize( cudaBlockSize ); + dim3 gridSize( numBlocksX ); + + tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > ptr; + + + + bool* BlockIter = (bool*)malloc( ( numBlocksX ) * sizeof( bool ) ); + + bool *BlockIterDevice; + cudaMalloc(&BlockIterDevice, ( numBlocksX ) * sizeof( bool ) ); + + while( BlockIter[ 0 ] ) + { + for( int i = 0; i < numBlocksX; i++ ) + BlockIter[ i ] = false; + cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX ) * sizeof( bool ), cudaMemcpyHostToDevice); + + CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + auxPtr.template modifyData< Device>(), + BlockIterDevice ); + cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX ) * sizeof( bool ), cudaMemcpyDeviceToHost); + + for( int i = 1; i < numBlocksX; i++ ) + BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ]; + + } + delete[] BlockIter; + cudaFree( BlockIterDevice ); + cudaDeviceSynchronize(); + + TNL_CHECK_CUDA_DEVICE; + + aux = *auxPtr; + interfaceMap = *interfaceMapPtr; +#endif + } + iteration++; + } + + aux.save("aux-final.tnl"); +} + +#ifdef HAVE_CUDA +template < typename Real, typename Device, typename Index > +__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > ptr, + const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap, + Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux, + bool *BlockIterDevice ) +{ + int thri = threadIdx.x; + int blIdx = blockIdx.x; + int i = thri + blockDim.x*blIdx; + + __shared__ volatile bool changed[16]; + changed[ thri ] = false; + + if( thri == 0 ) + changed[ 0 ] = true; + + const Meshes::Grid< 1, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); + __shared__ Real h; + if( thri == 1 ) + { + h = mesh.getSpaceSteps().x(); + } + + __shared__ volatile Real sArray[ 18 ]; + sArray[thri] = std::numeric_limits< Real >::max(); + + //filling sArray edges + int dimX = mesh.getDimensions().x(); + __shared__ volatile int numOfBlockx; + __shared__ int xkolik; + if( thri == 0 ) + { + xkolik = blockDim.x + 1; + numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); + + if( numOfBlockx - 1 == blIdx ) + xkolik = dimX - (blIdx)*blockDim.x+1; + } + __syncthreads(); + + if( thri == 0 ) + { + if( dimX > (blIdx+1) * blockDim.x ) + sArray[xkolik] = aux[ blIdx*blockDim.x - 1 + xkolik ]; + else + sArray[xkolik] = std::numeric_limits< Real >::max(); + } + + if( thri == 1 ) + { + if( blIdx != 0 ) + sArray[0] = aux[ blIdx*blockDim.x - 1 ]; + else + sArray[0] = std::numeric_limits< Real >::max(); + } + + + if( i < mesh.getDimensions().x() ) + { + sArray[thri+1] = aux[ i ]; + } + __syncthreads(); + while( changed[ 0 ] ) + { + __syncthreads(); + + changed[ thri ] = false; + + //calculation of update cell + if( i < mesh.getDimensions().x() ) + { + if( ! interfaceMap[ i ] ) + { + changed[ thri ] = ptr.updateCell( sArray, thri+1, h ); + } + } + __syncthreads(); + + + + //pyramid reduction + if( thri < 8 ) changed[ thri ] = changed[ thri ] || changed[ thri + 8 ]; + if( thri < 4 ) changed[ thri ] = changed[ thri ] || changed[ thri + 4 ]; + if( thri < 2 ) changed[ thri ] = changed[ thri ] || changed[ thri + 2 ]; + if( thri < 1 ) changed[ thri ] = changed[ thri ] || changed[ thri + 1 ]; + __syncthreads(); + + if( changed[ 0 ] && thri == 0 ) + BlockIterDevice[ blIdx ] = true; + __syncthreads(); + } + + if( i < mesh.getDimensions().x() && (!interfaceMap[ i ]) ) + aux[ i ] = sArray[ thri + 1 ]; } +#endif + diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index 13e5c824912ef387c47ddaca1c5eb98acfe30fe6..6703843c179ea0215c40cf2d21b9cd66fde99de6 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -14,6 +14,11 @@ #pragma once #include "tnlFastSweepingMethod.h" +#include <TNL/Devices/Cuda.h> + + +#include <iostream> +#include <fstream> template< typename Real, typename Device, @@ -21,7 +26,7 @@ template< typename Real, typename Anisotropy > FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: FastSweepingMethod() -: maxIterations( 1 ) +: maxIterations( 100 ) { } @@ -55,87 +60,466 @@ template< typename Real, void FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: solve( const MeshPointer& mesh, - const AnisotropyType& anisotropy, - MeshFunctionType& u ) + const AnisotropyPointer& anisotropy, + MeshFunctionPointer& u ) { - MeshFunctionType aux; - InterfaceMapType interfaceMap; - aux.setMesh( mesh ); - interfaceMap.setMesh( mesh ); + /*MeshFunctionType v; + v.setMesh(mesh); + double A[320][320]; + for (int i = 0; i < 320; i++) + for (int j = 0; j < 320; j++) + A[i][j] = 0; + + std::ifstream file("/home/maty/Downloads/mapa2.txt"); + + for (int i = 0; i < 320; i++) + for (int j = 0; j < 320; j++) + file >> A[i][j]; + file.close(); + for (int i = 0; i < 320; i++) + for (int j = 0; j < 320; j++) + v[i*320 + j] = A[i][j]; + v.save("mapa.tnl");*/ + + + MeshFunctionPointer auxPtr; + InterfaceMapPointer interfaceMapPtr; + auxPtr->setMesh( mesh ); + interfaceMapPtr->setMesh( mesh ); std::cout << "Initiating the interface cells ..." << std::endl; - BaseType::initInterface( u, aux, interfaceMap ); - //aux.save( "aux-ini.tnl" ); + BaseType::initInterface( u, auxPtr, interfaceMapPtr ); + + auxPtr->save( "aux-ini.tnl" ); typename MeshType::Cell cell( *mesh ); IndexType iteration( 0 ); + InterfaceMapType interfaceMap = *interfaceMapPtr; + MeshFunctionType aux = *auxPtr; while( iteration < this->maxIterations ) { - - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) + if( std::is_same< DeviceType, Devices::Host >::value ) { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh->getDimensions().y(); + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + + //aux.save( "aux-1.tnl" ); + + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh->getDimensions().y(); + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) + { + //std::cerr << "2 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + + //aux.save( "aux-2.tnl" ); + + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; + cell.getCoordinates().y() >= 0 ; + cell.getCoordinates().y()-- ) { - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + //std::cerr << "3 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } } - } - //aux.save( "aux-1.tnl" ); - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) + //aux.save( "aux-3.tnl" ); + + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; + cell.getCoordinates().y() >= 0; + cell.getCoordinates().y()-- ) { - //std::cerr << "2 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) + { + //std::cerr << "4 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } } - } - //aux.save( "aux-2.tnl" ); - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0 ; - cell.getCoordinates().y()-- ) + //aux.save( "aux-4.tnl" ); + + /*for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().y(); + cell.getCoordinates().x()++ ) { + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh->getDimensions().x(); + cell.getCoordinates().y()++ ) + { + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + + + aux.save( "aux-5.tnl" ); + for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x() < mesh->getDimensions().y(); cell.getCoordinates().x()++ ) - { - //std::cerr << "3 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } + { + for( cell.getCoordinates().y() = mesh->getDimensions().x() - 1; + cell.getCoordinates().y() >= 0 ; + cell.getCoordinates().y()-- ) + { + //std::cerr << "2 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } } - //aux.save( "aux-3.tnl" ); - + aux.save( "aux-6.tnl" ); - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + for( cell.getCoordinates().x() = mesh->getDimensions().y() - 1; cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) + cell.getCoordinates().x()-- ) { - //std::cerr << "4 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh->getDimensions().x(); + cell.getCoordinates().y()++ ) + { + //std::cerr << "3 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } } - } - //aux.save( "aux-4.tnl" ); + aux.save( "aux-7.tnl" ); + + for( cell.getCoordinates().x() = mesh->getDimensions().y() - 1; + cell.getCoordinates().x() >= 0; + cell.getCoordinates().x()-- ) + { + for( cell.getCoordinates().y() = mesh->getDimensions().x() - 1; + cell.getCoordinates().y() >= 0 ; + cell.getCoordinates().y()-- ) + { + //std::cerr << "4 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + }*/ + } + if( std::is_same< DeviceType, Devices::Cuda >::value ) + { + // TODO: CUDA code +#ifdef HAVE_CUDA + + Real *dAux; + cudaMalloc(&dAux, ( mesh->getDimensions().x() * mesh->getDimensions().y() ) * sizeof( Real ) ); + + + + + const int cudaBlockSize( 16 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); + dim3 blockSize( cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY ); + + tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; + + aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 ); + + //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) ); + + int *BlockIterDevice; + int BlockIterD = 1; + + cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) ); + int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0); + int *dBlock; + cudaMalloc(&dBlock, nBlocks * sizeof( int ) ); + + while( BlockIterD ) + { + /*for( int i = 0; i < numBlocksX * numBlocksY; i++ ) + BlockIter[ i ] = false;*/ + + CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + dAux, + BlockIterDevice ); + + CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); + CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); + + cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); + + /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) + BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ + + } + aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 ); + cudaFree( dAux ); + cudaFree( BlockIterDevice ); + cudaFree( dBlock ); + cudaDeviceSynchronize(); + + TNL_CHECK_CUDA_DEVICE; + + //aux = *auxPtr; + //interfaceMap = *interfaceMapPtr; +#endif + } iteration++; } + aux.save("aux-final.tnl"); } +#ifdef HAVE_CUDA +template < typename Real, typename Device, typename Index > +__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a ) +{ + int i = threadIdx.x + blockDim.x*blockIdx.x; + int j = blockDim.y*blockIdx.y + threadIdx.y; + const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >(); + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 1 ) + { + dAux[ j*mesh.getDimensions().x() + i ] = aux[ j*mesh.getDimensions().x() + i ]; + } + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 0 ) + { + aux[ j*mesh.getDimensions().x() + i ] = dAux[ j*mesh.getDimensions().x() + i ]; + } + +} + +__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks ) +{ + int i = threadIdx.x; + int blId = blockIdx.x; + __shared__ volatile int sArray[ 512 ]; + sArray[ i ] = false; + if(blId * 1024 + i < nBlocks ) + sArray[ i ] = BlockIterDevice[ blId * 1024 + i ]; + + if (blockDim.x * blockDim.y == 1024) { + if (i < 512) + sArray[ i ] += sArray[ i ]; + } + __syncthreads(); + if (blockDim.x * blockDim.y >= 512) { + if (i < 256) { + sArray[ i ] += sArray[ i ]; + } + } + if (blockDim.x * blockDim.y >= 256) { + if (i < 128) { + sArray[ i ] += sArray[ i + 128 ]; + } + } + __syncthreads(); + if (blockDim.x * blockDim.y >= 128) { + if (i < 64) { + sArray[ i ] += sArray[ i + 64 ]; + } + } + __syncthreads(); + if (i < 32 ) + { + if( blockDim.x * blockDim.y >= 64 ) sArray[ i ] += sArray[ i + 32 ]; + if( blockDim.x * blockDim.y >= 32 ) sArray[ i ] += sArray[ i + 16 ]; + if( blockDim.x * blockDim.y >= 16 ) sArray[ i ] += sArray[ i + 8 ]; + if( blockDim.x * blockDim.y >= 8 ) sArray[ i ] += sArray[ i + 4 ]; + if( blockDim.x * blockDim.y >= 4 ) sArray[ i ] += sArray[ i + 2 ]; + if( blockDim.x * blockDim.y >= 2 ) sArray[ i ] += sArray[ i + 1 ]; + } + + if( i == 0 ) + dBlock[ blId ] = sArray[ 0 ]; +} + + + +template < typename Real, typename Device, typename Index > +__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, + const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, + Real *aux, + int *BlockIterDevice ) +{ + int thri = threadIdx.x; int thrj = threadIdx.y; + int blIdx = blockIdx.x; int blIdy = blockIdx.y; + int i = thri + blockDim.x*blIdx; + int j = blockDim.y*blIdy + thrj; + int currentIndex = thrj * blockDim.x + thri; + + //__shared__ volatile bool changed[ blockDim.x*blockDim.y ]; + __shared__ volatile bool changed[16*16]; + changed[ currentIndex ] = false; + + if( thrj == 0 && thri == 0 ) + changed[ 0 ] = true; + + const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); + __shared__ Real hx; + __shared__ Real hy; + if( thrj == 1 && thri == 1 ) + { + hx = mesh.getSpaceSteps().x(); + hy = mesh.getSpaceSteps().y(); + } + + //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ]; + __shared__ volatile Real sArray[18][18]; + sArray[thrj][thri] = std::numeric_limits< Real >::max(); + + //filling sArray edges + int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); + __shared__ volatile int numOfBlockx; + __shared__ volatile int numOfBlocky; + __shared__ int xkolik; + __shared__ int ykolik; + if( thri == 0 && thrj == 0 ) + { + xkolik = blockDim.x + 1; + ykolik = blockDim.y + 1; + numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); + numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); + + if( numOfBlockx - 1 == blIdx ) + xkolik = dimX - (blIdx)*blockDim.x+1; + + if( numOfBlocky -1 == blIdy ) + ykolik = dimY - (blIdy)*blockDim.y+1; + BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; + } + __syncthreads(); + + if( thri == 0 ) + { + if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik ) + sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ]; + else + sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max(); + } + + if( thri == 1 ) + { + if( blIdx != 0 && thrj+1 < ykolik ) + sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ]; + else + sArray[thrj+1][0] = std::numeric_limits< Real >::max(); + } + + if( thri == 2 ) + { + if( dimY > (blIdy+1) * blockDim.y && thri+1 < xkolik ) + sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ]; + else + sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max(); + } + + if( thri == 3 ) + { + if( blIdy != 0 && thrj+1 < xkolik ) + sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ]; + else + sArray[0][thrj+1] = std::numeric_limits< Real >::max(); + } + + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + { + sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ]; + } + __syncthreads(); + + while( changed[ 0 ] ) + { + __syncthreads(); + + changed[ currentIndex] = false; + + //calculation of update cell + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + { + if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] ) + { + changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy); + } + } + __syncthreads(); + + //pyramid reduction + if( blockDim.x*blockDim.y == 1024 ) + { + if( currentIndex < 512 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; + } + } + __syncthreads(); + if( blockDim.x*blockDim.y >= 512 ) + { + if( currentIndex < 256 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; + } + } + __syncthreads(); + if( blockDim.x*blockDim.y >= 256 ) + { + if( currentIndex < 128 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; + } + } + __syncthreads(); + if( blockDim.x*blockDim.y >= 128 ) + { + if( currentIndex < 64 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; + } + } + __syncthreads(); + if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU + { + if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; + if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; + if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; + if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; + if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; + if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; + } + if( changed[ 0 ] && thri == 0 && thrj == 0 ) + BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1; + __syncthreads(); + } + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) ) + aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; +} +#endif diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h index 6fad87c7ea63f50a13e7b9e38c65994998e2b441..b024979cc6247022bd3ee156c877c2b47f71f99a 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h @@ -55,15 +55,457 @@ template< typename Real, void FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: solve( const MeshPointer& mesh, - const AnisotropyType& anisotropy, - MeshFunctionType& u ) + const AnisotropyPointer& anisotropy, + MeshFunctionPointer& u ) { - MeshFunctionType aux; - InterfaceMapType interfaceMap; - aux.setMesh( mesh ); - interfaceMap.setMesh( mesh ); + MeshFunctionPointer auxPtr; + InterfaceMapPointer interfaceMapPtr; + auxPtr->setMesh( mesh ); + interfaceMapPtr->setMesh( mesh ); std::cout << "Initiating the interface cells ..." << std::endl; - BaseType::initInterface( u, aux, interfaceMap ); - aux.save( "aux-ini.tnl" ); + BaseType::initInterface( u, auxPtr, interfaceMapPtr ); +#ifdef HAVE_CUDA + cudaDeviceSynchronize(); +#endif + auxPtr->save( "aux-ini.tnl" ); + + typename MeshType::Cell cell( *mesh ); + + IndexType iteration( 0 ); + MeshFunctionType aux = *auxPtr; + InterfaceMapType interfaceMap = * interfaceMapPtr; + while( iteration < this->maxIterations ) + { + if( std::is_same< DeviceType, Devices::Host >::value ) + { + for( cell.getCoordinates().z() = 0; + cell.getCoordinates().z() < mesh->getDimensions().z(); + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh->getDimensions().y(); + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } + //aux.save( "aux-1.tnl" ); + + for( cell.getCoordinates().z() = 0; + cell.getCoordinates().z() < mesh->getDimensions().z(); + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh->getDimensions().y(); + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) + { + //std::cerr << "2 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } + //aux.save( "aux-2.tnl" ); + for( cell.getCoordinates().z() = 0; + cell.getCoordinates().z() < mesh->getDimensions().z(); + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; + cell.getCoordinates().y() >= 0 ; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + //std::cerr << "3 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } + //aux.save( "aux-3.tnl" ); + + for( cell.getCoordinates().z() = 0; + cell.getCoordinates().z() < mesh->getDimensions().z(); + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; + cell.getCoordinates().y() >= 0; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) + { + //std::cerr << "4 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } + //aux.save( "aux-4.tnl" ); + + for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; + cell.getCoordinates().z() >= 0; + cell.getCoordinates().z()-- ) + { + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh->getDimensions().y(); + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + //std::cerr << "5 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } + //aux.save( "aux-5.tnl" ); + + for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; + cell.getCoordinates().z() >= 0; + cell.getCoordinates().z()-- ) + { + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh->getDimensions().y(); + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) + { + //std::cerr << "6 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } + //aux.save( "aux-6.tnl" ); + + for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; + cell.getCoordinates().z() >= 0; + cell.getCoordinates().z()-- ) + { + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; + cell.getCoordinates().y() >= 0 ; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + //std::cerr << "7 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } + //aux.save( "aux-7.tnl" ); + + for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; + cell.getCoordinates().z() >= 0; + cell.getCoordinates().z()-- ) + { + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; + cell.getCoordinates().y() >= 0; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) + { + //std::cerr << "8 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } + } + if( std::is_same< DeviceType, Devices::Cuda >::value ) + { + // TODO: CUDA code +#ifdef HAVE_CUDA + const int cudaBlockSize( 8 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); + int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); + if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) + std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; + dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); + + tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; + + int *BlockIterDevice; + int BlockIterD = 1; + + cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) ); + int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0); + int *dBlock; + cudaMalloc(&dBlock, nBlocks * sizeof( int ) ); + + while( BlockIterD ) + { + CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + auxPtr.template modifyData< Device>(), + BlockIterDevice ); + CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); + CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); + + cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); + + /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) + BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ + + } + cudaFree( BlockIterDevice ); + cudaFree( dBlock ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + aux = *auxPtr; + interfaceMap = *interfaceMapPtr; +#endif + } + + //aux.save( "aux-8.tnl" ); + iteration++; + + } + aux.save("aux-final.tnl"); } +#ifdef HAVE_CUDA +template < typename Real, typename Device, typename Index > +__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, + const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, + int *BlockIterDevice ) +{ + int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; + int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z; + int i = threadIdx.x + blockDim.x*blockIdx.x; + int j = blockDim.y*blockIdx.y + threadIdx.y; + int k = blockDim.z*blockIdx.z + threadIdx.z; + int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri; + + __shared__ volatile bool changed[8*8*8]; + changed[ currentIndex ] = false; + + if( thrj == 0 && thri == 0 && thrk == 0 ) + changed[ 0 ] = true; + + const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); + __shared__ Real hx; + __shared__ Real hy; + __shared__ Real hz; + if( thrj == 1 && thri == 1 && thrk == 1 ) + { + hx = mesh.getSpaceSteps().x(); + hy = mesh.getSpaceSteps().y(); + hz = mesh.getSpaceSteps().z(); + } + __shared__ volatile Real sArray[10][10][10]; + sArray[thrk][thrj][thri] = std::numeric_limits< Real >::max(); + if(thri == 0 ) + { + sArray[8][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); + sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); + sArray[thrk+1][thrj+1][8] = std::numeric_limits< Real >::max(); + sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max(); + sArray[thrj+1][8][thrk+1] = std::numeric_limits< Real >::max(); + sArray[thrj+1][9][thrk+1] = std::numeric_limits< Real >::max(); + } + + //filling sArray edges + int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); + int dimZ = mesh.getDimensions().z(); + __shared__ volatile int numOfBlockx; + __shared__ volatile int numOfBlocky; + __shared__ volatile int numOfBlockz; + __shared__ int xkolik; + __shared__ int ykolik; + __shared__ int zkolik; + if( thri == 0 && thrj == 0 && thrk == 0 ) + { + xkolik = blockDim.x + 1; + ykolik = blockDim.y + 1; + zkolik = blockDim.z + 1; + numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); + numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); + numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0); + + if( numOfBlockx - 1 == blIdx ) + xkolik = dimX - (blIdx)*blockDim.x+1; + + if( numOfBlocky -1 == blIdy ) + ykolik = dimY - (blIdy)*blockDim.y+1; + if( numOfBlockz-1 == blIdz ) + zkolik = dimZ - (blIdz)*blockDim.z+1; + + BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0; + } + __syncthreads(); + + if( thri == 0 ) + { + if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik ) + sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ]; + else + sArray[thrk+1][thrj+1][0] = std::numeric_limits< Real >::max(); + } + + if( thri == 1 ) + { + if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik ) + sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ]; + else + sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max(); + } + if( thri == 2 ) + { + if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik ) + sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ]; + else + sArray[thrk+1][0][thrj+1] = std::numeric_limits< Real >::max(); + } + + if( thri == 3 ) + { + if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik ) + sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ]; + else + sArray[thrk+1][9][thrj+1] = std::numeric_limits< Real >::max(); + } + if( thri == 4 ) + { + if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik ) + sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ]; + else + sArray[0][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); + } + + if( thri == 5 ) + { + if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik ) + sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ]; + else + sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); + } + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) + { + sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ]; + } + __shared__ volatile int loopcounter; + loopcounter = 0; + __syncthreads(); + while( changed[ 0 ] ) + { + __syncthreads(); + + changed[ currentIndex ] = false; + + //calculation of update cell + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ ) + { + if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] ) + { + changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); + } + } + __syncthreads(); + + //pyramid reduction + if( blockDim.x*blockDim.y*blockDim.z == 1024 ) + { + if( currentIndex < 512 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; + } + } + __syncthreads(); + if( blockDim.x*blockDim.y*blockDim.z >= 512 ) + { + if( currentIndex < 256 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; + } + } + __syncthreads(); + if( blockDim.x*blockDim.y*blockDim.z >= 256 ) + { + if( currentIndex < 128 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; + } + } + __syncthreads(); + if( blockDim.x*blockDim.y*blockDim.z >= 128 ) + { + if( currentIndex < 64 ) + { + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; + } + } + __syncthreads(); + if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU + { + if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; + if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; + if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; + if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; + if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; + if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; + } + __syncthreads(); + + /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0) + { + for(int m = 0; m < 8; m++){ + for(int n = 0; n<8; n++){ + for(int b=0; b<8; b++) + printf(" %i ", changed[m*64 + n*8 + b]); + printf("\n"); + } + printf("\n \n"); + } + }*/ + if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 ) + { + //loopcounter++; + BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1; + } + __syncthreads(); + /*if(thri == 0 && thrj==0 && thrk==0) + printf("%i \n",loopcounter); + if(loopcounter == 500) + break;*/ + } + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ k*dimX*dimY+j * mesh.getDimensions().x() + i ]) ) + aux[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ]; +} +#endif diff --git a/src/TNL/Math.h b/src/TNL/Math.h index 1f754bcecaa88ffc973387b56aa9d692aaca344c..67e9f5e082af0b238e1d045bc516bf578efe82a0 100644 --- a/src/TNL/Math.h +++ b/src/TNL/Math.h @@ -72,7 +72,12 @@ template< class T > __cuda_callable__ inline T abs( const T& n ) { -#if defined(__MIC__) +#if defined(__CUDA_ARCH__) + if( std::is_integral< T >::value ) + return ::abs( n ); + else + return ::fabs( n ); +#elif defined(__MIC__) if( n < ( T ) 0 ) return -n; return n; diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp index 7d34ea0ccd210a2b86605ec12df4a1b99b3c32b0..01cf6e58e76dcdc3ffbe286dd39bf1b3f27523c6 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp +++ b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp @@ -389,7 +389,7 @@ setupNeighbors() if( this->isThereNeighbor( direction ) ) this->neighbors[ i ] = this->getRankOfProcCoord( coordinates ); else - this->neighbors[ i ] =- 1; + this->neighbors[ i ] = -1; // Handling periodic neighbors for( int d = 0; d < Dimension; d++ ) diff --git a/src/Tools/tnl-diff.h b/src/Tools/tnl-diff.h index 40b7c5fe01e8950a4db38f4b498813a949bd1036..5f4ca182a9fb23783c42f0f47ee13fcdefe9ea8e 100644 --- a/src/Tools/tnl-diff.h +++ b/src/Tools/tnl-diff.h @@ -300,6 +300,7 @@ bool computeDifference( const MeshPointer& meshPointer, const String& objectType if( objectType == "Containers::Vector" || objectType == "tnlVector" || objectType == "tnlSharedVector" ) // TODO: remove deprecated type name return computeDifferenceOfVectors< MeshPointer, Element, Real, Index >( meshPointer, parameters ); + std::cerr << "Unknown object type " << objectType << "." << std::endl; return false; } @@ -490,4 +491,4 @@ bool resolveGridRealType( const Containers::List< String >& parsedMeshType, return false; } -#endif /* TNL_DIFF_H_ */ +#endif /* TNL_DIFF_H_ */ \ No newline at end of file