diff --git a/CMakeLists.txt b/CMakeLists.txt
index b69b572cca9a8cafbf2ebe93e4cecafb6a15c595..f0b812d80bcf8fa36dbe66678ffffb9a5e03a14c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,8 @@ option(WITH_GMP "Build with GMP support" OFF)
 option(WITH_TESTS "Build tests" ON)
 option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF)
 option(WITH_EXAMPLES "Compile the 'examples' directory" ON)
+option(WITH_TOOLS "Compile the 'src/Tools' directory" ON)
+option(WITH_PYTHON "Compile the Python bindings" ON)
 option(WITH_TEMPLATES_INSTANTIATION "Enable explicit template instantiation" OFF)
 
 # install paths relative to the cmake's prefix
@@ -111,7 +113,6 @@ if( ${CXX_COMPILER_NAME} STREQUAL "mpic++" )
    message( "MPI compiler detected."    )
    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" )
    set( CUDA_HOST_COMPILER "mpic++" )
-
 endif()
 
 ####
@@ -148,6 +149,22 @@ if( ${WITH_CUDA} )
                set( CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} )
             endif()
         endif()
+        # An extra CUDA_ARCH_HOST_COMPILER variable for compiling tnl-cuda-arch alone,
+        # because it SHOULD NOT be compiled using mpic++, which would cause weird
+        # RPATH_CHANGE error in cmake.
+        # FIXME: find better solution to switch between MPI-enabled and MPI-disabled binaries in cmake
+        if( NOT $ENV{CUDA_ARCH_HOST_COMPILER} STREQUAL "" )
+           message( "-- Setting CUDA_ARCH_HOST_COMPILER to '$ENV{CUDA_ARCH_HOST_COMPILER}'" )
+           set( CUDA_ARCH_HOST_COMPILER $ENV{CUDA_ARCH_HOST_COMPILER} )
+        else()
+            if( EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/bin/g++" )
+               message( "-- Setting CUDA_ARCH_HOST_COMPILER to '${CUDA_TOOLKIT_ROOT_DIR}/bin/g++'" )
+               set( CUDA_ARCH_HOST_COMPILER "${CUDA_TOOLKIT_ROOT_DIR}/bin/g++" )
+            else()
+               message( "-- Setting CUDA_ARCH_HOST_COMPILER to '${CMAKE_CXX_COMPILER}'" )
+               set( CUDA_ARCH_HOST_COMPILER ${CMAKE_CXX_COMPILER} )
+            endif()
+        endif()
         set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ;-DHAVE_CUDA)
         # disable false compiler warnings
         #   reference for the -Xcudafe flag: http://stackoverflow.com/questions/14831051/how-to-disable-compiler-warnings-with-nvcc/17095910#17095910
@@ -171,7 +188,7 @@ if( ${WITH_CUDA} )
                 set( CUDA_ARCH_SOURCE ${PROJECT_SOURCE_DIR}/src/Tools/tnl-cuda-arch.cu)
                 message( "Compiling tnl-cuda-arch ..." )
                 file( MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} )
-                execute_process( COMMAND nvcc --compiler-bindir ${CUDA_HOST_COMPILER} --std=c++11 ${CUDA_ARCH_SOURCE} -o ${CUDA_ARCH_EXECUTABLE}
+                execute_process( COMMAND nvcc --compiler-bindir ${CUDA_ARCH_HOST_COMPILER} --std=c++11 ${CUDA_ARCH_SOURCE} -o ${CUDA_ARCH_EXECUTABLE}
                                  RESULT_VARIABLE CUDA_ARCH_RESULT
                                  OUTPUT_VARIABLE CUDA_ARCH_OUTPUT
                                  ERROR_VARIABLE CUDA_ARCH_OUTPUT )
@@ -437,6 +454,8 @@ message( "   WITH_GMP=${WITH_GMP}" )
 message( "   WITH_TESTS=${WITH_TESTS}" )
 message( "   WITH_COVERAGE=${WITH_COVERAGE}" )
 message( "   WITH_EXAMPLES=${WITH_EXAMPLES}" )
+message( "   WITH_TOOLS=${WITH_TOOLS}" )
+message( "   WITH_PYTHON=${WITH_PYTHON}" )
 message( "   WITH_TEMPLATES_INSTANTIATION=${WITH_TEMPLATES_INSTANTIATION}" )
 # Print compiler options
 message( "-- Compiler options:" )
diff --git a/build b/build
index 943259b4b4b992b63467951e896e7b5d162a0454..40bfa6d4f501dcbbf5415ed22efa475c9f966a2b 100755
--- a/build
+++ b/build
@@ -22,6 +22,7 @@ WITH_GMP="no"
 WITH_TESTS="yes"
 WITH_COVERAGE="no"
 WITH_EXAMPLES="yes"
+WITH_PYTHON="yes"
 WITH_TOOLS="yes"
 
 WITH_TEMPLATE_INSTANTIATION="no"
@@ -94,8 +95,8 @@ then
     echo "   --with-tests=yes/no                   Enables unit tests. 'yes' by default."
     echo "   --with-coverage=yes/no                Enables code coverage reports for unit tests. 'no' by default (lcov is required)."
     echo "   --with-examples=yes/no                Compile the 'examples' directory. 'yes' by default."
-    echo "   --with-tools=yes/no                   Compile the 'tools' directory. 'yes' by default."
-    echo "   --with-python=yes/no                  Compile with the python bindings. 'yes' by default."
+    echo "   --with-tools=yes/no                   Compile the 'src/Tools' directory. 'yes' by default."
+    echo "   --with-python=yes/no                  Compile the Python bindings. 'yes' by default."
     echo "   --with-templates-instantiation=yes/no Precompiles some TNL templates during the build. 'no' by default."
     echo "   --cmake=CMAKE                         Path to cmake. 'cmake' by default."
     echo "   --verbose                             It enables verbose build."
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9353b18958722e21016c1e781cd3ff8c9b9ab420..540606d3782c71df92031ad96abd519783e70b63 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -4,10 +4,10 @@ endif( WITH_PYTHON STREQUAL "yes" )
 
 ADD_SUBDIRECTORY( TNL )
 
-if( WITH_TOOLS STREQUAL "yes" )
+if( ${WITH_TOOLS} )
    ADD_SUBDIRECTORY( Tools )
-endif( WITH_TOOLS STREQUAL "yes" )
+endif()
 
-if( WITH_TESTS STREQUAL "yes" )
+if( ${WITH_TESTS} )
    ADD_SUBDIRECTORY( UnitTests )
-endif( WITH_TESTS STREQUAL "yes" )
+endif()
diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index dbccc4fba370aa6bb852c8fcd810cb4902262af8..1d34160d4d6ecaecbefb067d1838e4a83b29aeef 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -39,6 +39,7 @@
 
 namespace TNL {
 namespace Communicators {
+namespace {
 
 class MpiCommunicator
 {
@@ -526,7 +527,8 @@ template<> class MPITypeResolver<long double>
 };
 #endif
 
-}//namespace Communicators
+} // namespace <unnamed>
+} // namespace Communicators
 } // namespace TNL
 
 #define TNL_MPI_PRINT( message )                                                                                         \
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/CMakeLists.txt b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/CMakeLists.txt
index 4a257df7f9f029742aea06ca417c1d62d9c5e604..edb30a70cae874f2ac46808c8c5543d4171244da 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/CMakeLists.txt
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/CMakeLists.txt
@@ -10,8 +10,13 @@ set( tnl_hamilton_jacobi_SOURCES
 #ADD_EXECUTABLE(tnl-hamilton-jacobi main.cpp)
 #target_link_libraries (tnl-hamilton-jacobi tnl )
 
-ADD_EXECUTABLE(tnl-direct-eikonal-solver tnl-direct-eikonal-solver.cpp )
-target_link_libraries (tnl-direct-eikonal-solver tnl )
+IF( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE(tnl-direct-eikonal-solver tnl-direct-eikonal-solver.cu )
+    target_link_libraries (tnl-direct-eikonal-solver tnl )
+ELSE(  BUILD_CUDA )               
+    ADD_EXECUTABLE(tnl-direct-eikonal-solver tnl-direct-eikonal-solver.cpp )
+    target_link_libraries (tnl-direct-eikonal-solver tnl )
+ENDIF( BUILD_CUDA )
 
 
 INSTALL( TARGETS #tnl-hamilton-jacobi 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h
index ed3d686eb99379af1589d734eac9b5812cccdedf..e829cc64f398dafcd3d6b890e9fe7756b94b4676 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h
@@ -1,64 +1,57 @@
 /***************************************************************************
-                          MainBuildConfig.h  -  description
+                          HamiltonJacobiBuildConfigTag.h  -  description
                              -------------------
     begin                : Jul 7, 2014
     copyright            : (C) 2014 by Tomas Oberhuber
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Solvers/BuildConfigTags.h>
 
-#ifndef MAINBUILDCONFIG_H_
-#define MAINBUILDCONFIG_H_
+namespace TNL {
 
-#include <solvers/tnlBuildConfigTags.h>
+class HamiltonJacobiBuildConfig {};
 
-class MainBuildConfig
-{
-   public:
+namespace Solvers {   
 
-      static void print() {std::cerr << "MainBuildConfig" <<std::endl; }
-};
 
 /****
  * Turn off support for float and long double.
  */
-template<> struct tnlConfigTagReal< MainBuildConfig, float > { enum { enabled = false }; };
-template<> struct tnlConfigTagReal< MainBuildConfig, long double > { enum { enabled = false }; };
+//template<> struct ConfigTagReal< HamiltonJacobiBuildConfig, float > { enum { enabled = false }; };
+template<> struct ConfigTagReal< HamiltonJacobiBuildConfig, long double > { enum { enabled = false }; };
 
 /****
  * Turn off support for short int and long int indexing.
  */
-template<> struct tnlConfigTagIndex< MainBuildConfig, short int >{ enum { enabled = false }; };
-template<> struct tnlConfigTagIndex< MainBuildConfig, long int >{ enum { enabled = false }; };
+template<> struct ConfigTagIndex< HamiltonJacobiBuildConfig, short int >{ enum { enabled = false }; };
+template<> struct ConfigTagIndex< HamiltonJacobiBuildConfig, long int >{ enum { enabled = false }; };
 
 /****
- * Use of tnlGrid is enabled for allowed dimensions and Real, Device and Index types.
+ * Use of Grid is enabled for allowed dimensions and Real, Device and Index types.
  */
-template< int Dimensions, typename Real, typename Device, typename Index >
-   struct tnlConfigTagMesh< MainBuildConfig, tnlGrid< Dimensions, Real, Device, Index > >
-      { enum { enabled = tnlConfigTagDimensions< MainBuildConfig, Dimensions >::enabled  &&
-                         tnlConfigTagReal< MainBuildConfig, Real >::enabled &&
-                         tnlConfigTagDevice< MainBuildConfig, Device >::enabled &&
-                         tnlConfigTagIndex< MainBuildConfig, Index >::enabled }; };
+/*template< int Dimension, typename Real, typename Device, typename Index >
+   struct ConfigTagMesh< HamiltonJacobiBuildConfig, Meshes::Grid< Dimension, Real, Device, Index > >
+      { enum { enabled = ConfigTagDimension< HamiltonJacobiBuildConfig, Dimension >::enabled  &&
+                         ConfigTagReal< HamiltonJacobiBuildConfig, Real >::enabled &&
+                         ConfigTagDevice< HamiltonJacobiBuildConfig, Device >::enabled &&
+                         ConfigTagIndex< HamiltonJacobiBuildConfig, Index >::enabled }; };*/
 
 /****
  * Please, chose your preferred time discretisation  here.
  */
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlExplicitTimeDiscretisationTag >{ enum { enabled = true }; };
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlSemiImplicitTimeDiscretisationTag >{ enum { enabled = false}; };
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlImplicitTimeDiscretisationTag >{ enum { enabled = false }; };
+template<> struct ConfigTagTimeDiscretisation< HamiltonJacobiBuildConfig, ExplicitTimeDiscretisationTag >{ enum { enabled = true }; };
+template<> struct ConfigTagTimeDiscretisation< HamiltonJacobiBuildConfig, SemiImplicitTimeDiscretisationTag >{ enum { enabled = true }; };
+template<> struct ConfigTagTimeDiscretisation< HamiltonJacobiBuildConfig, ImplicitTimeDiscretisationTag >{ enum { enabled = false }; };
 
 /****
  * Only the Runge-Kutta-Merson solver is enabled by default.
  */
-template<> struct tnlConfigTagExplicitSolver< MainBuildConfig, tnlExplicitEulerSolverTag >{ enum { enabled = false }; };
+//template<> struct ConfigTagExplicitSolver< HamiltonJacobiBuildConfig, ExplicitEulerSolverTag >{ enum { enabled = false }; };
 
-#endif /* MAINBUILDCONFIG_H_ */
+} // namespace Solvers
+} // namespace TNL
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h
index dccd06e3ccbd33dd093577ba11b245ab338ae44b..1b46ecb3dbea0438be1065fc43808ecb157c93ab 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h
@@ -20,14 +20,16 @@
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Meshes/Grid.h>
 #include "tnlDirectEikonalProblem.h"
+#include "MainBuildConfig.h"
 
 using namespace TNL;
 
 //typedef tnlDefaultBuildMeshConfig BuildConfig;
-typedef Solvers::FastBuildConfigTag BuildConfig;
+//typedef Solvers::FastBuildConfig BuildConfig;
+typedef HamiltonJacobiBuildConfig BuildConfig;
 
 template< typename MeshConfig >
-class tnlDirectEikonalSolverConfig
+class DirectEikonalSolverConfig
 {
    public:
       static void configSetup( Config::ConfigDescription& config )
@@ -44,7 +46,7 @@ template< typename Real,
           typename MeshConfig,
           typename SolverStarter,
           typename CommunicatorType >
-class tnlDirectEikonalSolverSetter
+class DirectEikonalSolverSetter
 {
    public:
 
@@ -66,7 +68,7 @@ class tnlDirectEikonalSolverSetter
 
 int main( int argc, char* argv[] )
 {
-   if( ! Solvers::Solver< tnlDirectEikonalSolverSetter, tnlDirectEikonalSolverConfig, BuildConfig >::run( argc, argv ) )
+   if( ! Solvers::Solver< DirectEikonalSolverSetter, DirectEikonalSolverConfig, BuildConfig >::run( argc, argv ) )
       return EXIT_FAILURE;
    return EXIT_SUCCESS;
 }
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test
index 6d997ce230a9de4ad1deefc1ccaa1f926021a16f..0dc50246f5ced2ec65616f7eecc65ebf860a2cdb 100755
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test
@@ -2,7 +2,7 @@
 
 device="host"
 dimensions="2D 3D"
-dimensions="2D"
+#dimensions="3D"
 sizes1D="16 32 64 128 256 512 1024 2048 4096"
 #sizes1D="256"
 sizes2D="16 32 64 128 256 512 1024"
@@ -12,7 +12,7 @@ testFunctions="paraboloid"
 snapshotPeriod=0.1
 finalTime=1.5
 solverName="tnl-direct-eikonal-solver"
-#solverName="gdb --args tnl-direct-eikonal-solver-dbg"
+#solverName="gdb --args tnl-direct-eikonal-solver-dbg --catch-exceptions no"
 #
 
 setupTestFunction()
@@ -59,7 +59,7 @@ setupGrid()
 setInitialCondition()
 {
    testFunction=$1
-   tnl-init --test-function ${testFunction}-sdf \
+   tnl-init --test-function ${testFunction} \
             --output-file initial-u.tnl \
             --amplitude ${amplitude} \
             --wave-length ${waveLength} \
@@ -78,7 +78,7 @@ setInitialCondition()
             --radius ${radius}
    
     tnl-init --test-function ${testFunction}-sdf \
-            --output-file final-u.tnl \
+            --output-file exact-u.tnl \
             --amplitude ${amplitude} \
             --wave-length ${waveLength} \
             --wave-length-x ${waveLengthX} \
@@ -108,7 +108,6 @@ solve()
                  --time-step 10 \
                  --time-step-order 2 \
                  --discrete-solver ${discreteSolver} \
-                 --merson-adaptivity 1.0e-5 \
                  --min-iterations 20 \
                  --convergence-residue 1.0e-12 \
                  --snapshot-period ${snapshotPeriod} \
@@ -118,7 +117,7 @@ solve()
 computeError()
 {
    tnl-diff --mesh mesh.tnl \
-            --input-files final-u.tnl u-*.tnl \
+            --input-files aux-final.tnl exact-u.tnl \
             --mode sequence \
             --snapshot-period ${snapshotPeriod} \
             --output-file errors.txt \
@@ -172,8 +171,8 @@ runTest()
                echo "========================================================================="
                echo "===                   STARTING THE SOLVER                             ==="
                echo "========================================================================="
-               solve explicit merson
-               #solve semi-implicit gmres
+               #solve explicit merson
+               solve semi-implicit cg
                mv computation-in-progress computation-done
             fi            
             echo "========================================================================="
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index c6385f64a9668272bee961d0bb5dc02b182655cd..a406a5b1772033bed2da5ab17330c892119923dc 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -9,6 +9,7 @@
 
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Functions/MeshFunction.h>
+#include <TNL/Devices/Cuda.h>
 
 using namespace TNL;
 
@@ -30,15 +31,21 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >
       typedef Index IndexType;
       typedef Functions::MeshFunction< MeshType > MeshFunctionType;
       typedef Functions::MeshFunction< MeshType, 1, bool > InterfaceMapType;
+      using MeshFunctionPointer = SharedPointer< MeshFunctionType >;
+      using InterfaceMapPointer = SharedPointer< InterfaceMapType >;
       
-      void initInterface( const MeshFunctionType& input,
-                          MeshFunctionType& output,
-                          InterfaceMapType& interfaceMap );
+      void initInterface( const MeshFunctionPointer& input,
+                          MeshFunctionPointer& output,
+                          InterfaceMapPointer& interfaceMap );
       
       template< typename MeshEntity >
-      void updateCell( MeshFunctionType& u,
-                       const MeshEntity& cell );
+      __cuda_callable__ void updateCell( MeshFunctionType& u,
+                                         const MeshEntity& cell,
+                                         const RealType velocity = 1.0  );
       
+      __cuda_callable__ bool updateCell( volatile Real sArray[18],
+                                         int thri, const Real h,
+                                         const Real velocity = 1.0 );
 };
 
 
@@ -48,21 +55,27 @@ template< typename Real,
 class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
 {
    public:
-      
       typedef Meshes::Grid< 2, Real, Device, Index > MeshType;
       typedef Real RealType;
       typedef Device DevcieType;
       typedef Index IndexType;
       typedef Functions::MeshFunction< MeshType > MeshFunctionType;
       typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
+      using MeshFunctionPointer = SharedPointer< MeshFunctionType >;
+      using InterfaceMapPointer = SharedPointer< InterfaceMapType >;      
 
-      void initInterface( const MeshFunctionType& input,
-                          MeshFunctionType& output,
-                          InterfaceMapType& interfaceMap );
+      void initInterface( const MeshFunctionPointer& input,
+                          MeshFunctionPointer& output,
+                          InterfaceMapPointer& interfaceMap );
       
       template< typename MeshEntity >
-      void updateCell( MeshFunctionType& u,
-                       const MeshEntity& cell );
+      __cuda_callable__ void updateCell( MeshFunctionType& u,
+                                         const MeshEntity& cell,
+                                         const RealType velocity = 1.0 );
+      
+      __cuda_callable__ bool updateCell( volatile Real sArray[18][18],
+                                         int thri, int thrj, const Real hx, const Real hy,
+                                         const Real velocity = 1.0 );
 };
 
 template< typename Real,
@@ -71,21 +84,73 @@ template< typename Real,
 class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
 {
    public:
-      
       typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
       typedef Real RealType;
       typedef Device DevcieType;
       typedef Index IndexType;
       typedef Functions::MeshFunction< MeshType > MeshFunctionType;
       typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType;
+      using MeshFunctionPointer = SharedPointer< MeshFunctionType >;
+      using InterfaceMapPointer = SharedPointer< InterfaceMapType >;      
 
-      void initInterface( const MeshFunctionType& input,
-                          MeshFunctionType& output,
-                          InterfaceMapType& interfaceMap );
+      void initInterface( const MeshFunctionPointer& input,
+                          MeshFunctionPointer& output,
+                          InterfaceMapPointer& interfaceMap );
       
       template< typename MeshEntity >
-      void updateCell( MeshFunctionType& u,
-                       const MeshEntity& cell );      
+      __cuda_callable__ void updateCell( MeshFunctionType& u,
+                                         const MeshEntity& cell,
+                                         const RealType velocity = 1.0);
+      
+      __cuda_callable__ bool updateCell( volatile Real sArray[10][10][10],
+                                         int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz,
+                                         const Real velocity = 1.0 );
 };
 
+template < typename T1, typename T2 >
+T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double v = 1);
+
+template < typename T1 >
+__cuda_callable__ void sortMinims( T1 pom[] );
+
+
+#ifdef HAVE_CUDA
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
+                                Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
+                                Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap  );
+
+template < typename Real, typename Device, typename Index >
+__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > ptr,
+                                      const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap,
+                                      Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux,
+                                      bool *BlockIterDevice );
+
+template < typename Real, typename Device, typename Index >
+__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
+                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+                                      Real *aux,
+                                      int *BlockIterDevice);
+__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks );
+
+template < typename Real, typename Device, typename Index >
+__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );
+
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
+                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
+                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap );
+
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
+                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
+                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );
+
+template < typename Real, typename Device, typename Index >
+__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
+                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+                                      int *BlockIterDevice );
+#endif
+
 #include "tnlDirectEikonalMethodsBase_impl.h"
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 5decdba55d2d7fc418453fcfd6781a8c0055cd61..649a5ad43a3041fcd479ab7828f7fb1a85634b1c 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -1,4 +1,4 @@
-/* 
+ /* 
  * File:   tnlDirectEikonalMethodsBase_impl.h
  * Author: oberhuber
  *
@@ -9,43 +9,84 @@
 
 #include <limits>
 
+#include <iostream>
+#include "tnlFastSweepingMethod.h"
+
 template< typename Real,
           typename Device,
           typename Index >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
-initInterface( const MeshFunctionType& input,
-               MeshFunctionType& output,
-               InterfaceMapType& interfaceMap  )
+initInterface( const MeshFunctionPointer& _input,
+               MeshFunctionPointer& _output,
+               InterfaceMapPointer& _interfaceMap  )
 {
-   const MeshType& mesh = input.getMesh();
-   typedef typename MeshType::Cell Cell;
-   Cell cell( mesh );
-   for( cell.getCoordinates().x() = 1;
-        cell.getCoordinates().x() < mesh.getDimensions().x() - 1;
-        cell.getCoordinates().x() ++ )
-   {
-      cell.refresh();
-      const RealType& c = input( cell );      
-      if( ! cell.isBoundaryEntity()  )
-      {
-         const auto& neighbors = cell.getNeighborEntities();
-         //const IndexType& c = cell.getIndex();
-         const IndexType e = neighbors.template getEntityIndex<  1 >();
-         const IndexType w = neighbors.template getEntityIndex< -1 >();
-
-         if( c * input[ e ] <= 0 || c * input[ w ] <= 0 )
-         {
-            output[ cell.getIndex() ] = c;
-            interfaceMap[ cell.getIndex() ] = true;
-            continue;
-         }
-      }
-      output[ cell.getIndex() ] =
-      c > 0 ? std::numeric_limits< RealType >::max() :
-             -std::numeric_limits< RealType >::max();
-      interfaceMap[ cell.getIndex() ] = false;
-   }
+    if( std::is_same< Device, Devices::Cuda >::value )
+    {
+#ifdef HAVE_CUDA
+        const MeshType& mesh = _input->getMesh();
+        
+        const int cudaBlockSize( 16 );
+        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+        dim3 blockSize( cudaBlockSize );
+        dim3 gridSize( numBlocksX );
+        Devices::Cuda::synchronizeDevice();
+        CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+                                                   _output.template modifyData< Device >(),
+                                                   _interfaceMap.template modifyData< Device >() );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+#endif
+    }
+    if( std::is_same< Device, Devices::Host >::value )
+    {
+        const MeshType& mesh = _input->getMesh();
+        typedef typename MeshType::Cell Cell;
+        const MeshFunctionType& input = _input.getData();
+        MeshFunctionType& output = _output.modifyData();
+        InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
+        Cell cell( mesh );
+        for( cell.getCoordinates().x() = 0;
+            cell.getCoordinates().x() < mesh.getDimensions().x();
+            cell.getCoordinates().x() ++ )
+           {
+               cell.refresh();
+               output[ cell.getIndex() ] =
+               input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
+                                  -std::numeric_limits< RealType >::max();
+               interfaceMap[ cell.getIndex() ] = false;
+           }
+        
+        
+        const RealType& h = mesh.getSpaceSteps().x();
+        for( cell.getCoordinates().x() = 0;
+             cell.getCoordinates().x() < mesh.getDimensions().x() - 1;
+             cell.getCoordinates().x() ++ )
+        {
+           cell.refresh();
+           const RealType& c = input( cell );      
+           if( ! cell.isBoundaryEntity()  )
+           {
+              const auto& neighbors = cell.getNeighborEntities();
+              Real pom = 0;
+              //const IndexType& c = cell.getIndex();
+              const IndexType e = neighbors.template getEntityIndex<  1 >();
+              if( c * input[ e ] <= 0 )
+              {
+                pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
+                if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
+                    output[ cell.getIndex() ] = pom;
+
+                pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+                if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
+                    output[ e ] = pom; 
+
+                interfaceMap[ cell.getIndex() ] = true;
+                interfaceMap[ e ] = true;
+              }
+           }
+        }
+    }
 }
 
 template< typename Real,
@@ -55,51 +96,175 @@ template< typename Real,
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
 updateCell( MeshFunctionType& u,
-            const MeshEntity& cell )
+            const MeshEntity& cell, 
+            const RealType v )
 {
-}
+    const auto& neighborEntities = cell.template getNeighborEntities< 1 >();
+    const MeshType& mesh = cell.getMesh();
+    const RealType& h = mesh.getSpaceSteps().x();
+    const RealType value = u( cell );
+    RealType a, tmp = std::numeric_limits< RealType >::max();
 
+    if( cell.getCoordinates().x() == 0 )
+       a = u[ neighborEntities.template getEntityIndex< 1 >() ];
+    else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
+       a = u[ neighborEntities.template getEntityIndex< -1 >() ];
+    else
+    {
+       a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ],
+                           u[ neighborEntities.template getEntityIndex<  1 >() ] );
+    }
+
+    if( fabs( a ) == std::numeric_limits< RealType >::max() )
+      return;
+   
+    tmp = a + TNL::sign( value ) * h/v;
+    
+    u[ cell.getIndex() ] = argAbsMin( value, tmp );
+}
 
 template< typename Real,
           typename Device,
           typename Index >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-initInterface( const MeshFunctionType& input,
-               MeshFunctionType& output,
-               InterfaceMapType& interfaceMap  )
+initInterface( const MeshFunctionPointer& _input,
+               MeshFunctionPointer& _output,
+               InterfaceMapPointer& _interfaceMap )
 {
-   const MeshType& mesh = input.getMesh();
-   typedef typename MeshType::Cell Cell;
-   Cell cell( mesh );
-   for( cell.getCoordinates().y() = 0;
-        cell.getCoordinates().y() < mesh.getDimensions().y();
-        cell.getCoordinates().y() ++ )
-      for( cell.getCoordinates().x() = 0;
-           cell.getCoordinates().x() < mesh.getDimensions().x();
-           cell.getCoordinates().x() ++ )
-      {
-         cell.refresh();
-         const RealType& c = input( cell );
-         if( ! cell.isBoundaryEntity()  )
-         {
-            auto neighbors = cell.getNeighborEntities();
-            const IndexType e = neighbors.template getEntityIndex<  1,  0 >();
-            const IndexType w = neighbors.template getEntityIndex< -1,  0 >();
-            const IndexType n = neighbors.template getEntityIndex<  0,  1 >();
-            const IndexType s = neighbors.template getEntityIndex<  0, -1 >();            
-            if( c * input[ e ] <= 0 || c * input[ w ] <= 0 ||
-                c * input[ n ] <= 0 || c * input[ s ] <= 0 )
-            {
-               output[ cell.getIndex() ] = c;
-               interfaceMap[ cell.getIndex() ] = true;
-               continue;
-            }
-         }
-         output[ cell.getIndex() ] =
-            c > 0 ? std::numeric_limits< RealType >::max() :
-                   -std::numeric_limits< RealType >::max();  
-         interfaceMap[ cell.getIndex() ] = false;
+            
+    if( std::is_same< Device, Devices::Cuda >::value )
+    {
+#ifdef HAVE_CUDA
+        const MeshType& mesh = _input->getMesh();
+        
+        const int cudaBlockSize( 16 );
+        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
+        dim3 blockSize( cudaBlockSize, cudaBlockSize );
+        dim3 gridSize( numBlocksX, numBlocksY );
+        Devices::Cuda::synchronizeDevice();
+        CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+                                                   _output.template modifyData< Device >(),
+                                                   _interfaceMap.template modifyData< Device >() );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+#endif
+    }
+    if( std::is_same< Device, Devices::Host >::value )
+    {
+        MeshFunctionType input = _input.getData();
+        
+        /*double A[320][320];
+        std::ifstream fileInit("/home/maty/Downloads/initData.txt");
+
+        for (int i = 0; i < 320; i++)
+            for (int j = 0; j < 320; j++)
+                fileInit >> A[i][j];
+        fileInit.close();
+        for (int i = 0; i < 320; i++)
+            for (int j = 0; j < 320; j++)
+                input[i*320 + j] = A[i][j];*/
+        
+        
+         MeshFunctionType& output = _output.modifyData();
+         InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
+        const MeshType& mesh = input.getMesh();
+        typedef typename MeshType::Cell Cell;
+        Cell cell( mesh );
+        for( cell.getCoordinates().y() = 0;
+             cell.getCoordinates().y() < mesh.getDimensions().y();
+             cell.getCoordinates().y() ++ )
+            for( cell.getCoordinates().x() = 0;
+                 cell.getCoordinates().x() < mesh.getDimensions().x();
+                 cell.getCoordinates().x() ++ )
+                {
+                    cell.refresh();
+                    output[ cell.getIndex() ] =
+                    input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
+                                       - std::numeric_limits< RealType >::max();
+                    interfaceMap[ cell.getIndex() ] = false;
+                }
+
+       const RealType& hx = mesh.getSpaceSteps().x();
+       const RealType& hy = mesh.getSpaceSteps().y();     
+       for( cell.getCoordinates().y() = 0;
+            cell.getCoordinates().y() < mesh.getDimensions().y();
+            cell.getCoordinates().y() ++ )
+          for( cell.getCoordinates().x() = 0;
+               cell.getCoordinates().x() < mesh.getDimensions().x();
+               cell.getCoordinates().x() ++ )
+          {
+             cell.refresh();
+             const RealType& c = input( cell );
+             if( ! cell.isBoundaryEntity()  )
+             {
+                auto neighbors = cell.getNeighborEntities();
+                Real pom = 0;
+                const IndexType e = neighbors.template getEntityIndex<  1,  0 >();
+                const IndexType n = neighbors.template getEntityIndex<  0,  1 >();
+                //Try init with exact data:
+                /*if( c * input[ n ] <= 0 )
+                {
+                    output[ cell.getIndex() ] = c;
+                    output[ n ] = input[ n ];
+                    interfaceMap[ cell.getIndex() ] = true;
+                    interfaceMap[ n ] = true;
+                }   
+                if( c * input[ e ] <= 0 )
+                {   
+                    output[ cell.getIndex() ] = c;
+                    output[ e ] = input[ e ];
+                    interfaceMap[ cell.getIndex() ] = true;
+                    interfaceMap[ e ] = true;
+                }*/
+                if( c * input[ n ] <= 0 )
+                {
+                    /*if( c >= 0 )
+                    {*/
+                        pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+                        if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
+                            output[ cell.getIndex() ] = pom;
+                        pom = pom - TNL::sign( c )*hy;
+                        if( TNL::abs( output[ n ] ) > TNL::abs( pom ) )
+                            output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy;
+                    /*}else
+                    {
+                        pom = - ( hy * c )/( c - input[ n ]);
+                        if( output[ cell.getIndex() ] < pom )
+                            output[ cell.getIndex() ] = pom;
+                        if( output[ n ] > hy + pom )
+                            output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
+                    }*/
+                    interfaceMap[ cell.getIndex() ] = true;
+                    interfaceMap[ n ] = true;
+                }
+                if( c * input[ e ] <= 0 )
+                {
+                    /*if( c >= 0 )
+                    {*/
+                        pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+                        if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
+                            output[ cell.getIndex() ] = pom;
+
+                        pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+                        if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
+                            output[ e ] = pom; 
+                    /*}else
+                    {
+                        pom = - (hx * c)/( c - input[ e ]);
+                        if( output[ cell.getIndex() ] < pom )
+                            output[ cell.getIndex() ] = pom;
+
+                        pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
+                        if( output[ e ] > pom )
+                            output[ e ] = pom;
+                    }*/
+                    interfaceMap[ cell.getIndex() ] = true;
+                    interfaceMap[ e ] = true;
+                }
+             }
+          }
       }
 }
 
@@ -107,26 +272,28 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename MeshEntity >
+__cuda_callable__
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 updateCell( MeshFunctionType& u,
-            const MeshEntity& cell )
+            const MeshEntity& cell,   
+            const RealType v)
 {
    const auto& neighborEntities = cell.template getNeighborEntities< 2 >();
    const MeshType& mesh = cell.getMesh();
-  
-   const RealType& h = mesh.getSpaceSteps().x(); 
+   const RealType& hx = mesh.getSpaceSteps().x();
+   const RealType& hy = mesh.getSpaceSteps().y();
    const RealType value = u( cell );
-   Real a, b, tmp;
-
+   RealType a, b, tmp = std::numeric_limits< RealType >::max();
+   
    if( cell.getCoordinates().x() == 0 )
       a = u[ neighborEntities.template getEntityIndex< 1,  0 >() ];
    else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
       a = u[ neighborEntities.template getEntityIndex< -1,  0 >() ];
    else
    {
-      a = argAbsMin( u[ neighborEntities.template getEntityIndex< -1,  0 >() ],
-                     u[ neighborEntities.template getEntityIndex<  1,  0 >() ] );
+      a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1,  0 >() ],
+                          u[ neighborEntities.template getEntityIndex<  1,  0 >() ] );
    }
 
    if( cell.getCoordinates().y() == 0 )
@@ -135,98 +302,770 @@ updateCell( MeshFunctionType& u,
       b = u[ neighborEntities.template getEntityIndex< 0,  -1 >() ];
    else
    {
-      b = argAbsMin( u[ neighborEntities.template getEntityIndex< 0,  -1 >() ],
-                     u[ neighborEntities.template getEntityIndex< 0,   1 >() ] );
+      b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0,  -1 >() ],
+                          u[ neighborEntities.template getEntityIndex< 0,   1 >() ] );
    }
-
-   if( fabs( a ) == std::numeric_limits< Real >::max() && 
-       fabs( b ) == std::numeric_limits< Real >::max() )
+   if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+       fabs( b ) == std::numeric_limits< RealType >::max() )
       return;
-   if( fabs( a ) == std::numeric_limits< Real >::max() ||
-       fabs( b ) == std::numeric_limits< Real >::max() ||
-       fabs( a - b ) >= h )
+   /*if( fabs( a ) == TypeInfo< Real >::getMaxValue() ||
+       fabs( b ) == TypeInfo< Real >::getMaxValue() ||
+       fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) )
    {
-      tmp = argAbsMin( a, b ) + sign( value ) * h;
-      /*   std::cerr << "a = " << a << " b = " << b << " h = " << h 
-             << " ArgAbsMin( a, b ) = " << ArgAbsMin( a, b ) << " sign( value ) = " << sign( value )
-             << " sign( value ) * h = " << sign( value ) * h
-             << " ArgAbsMin( a, b ) + sign( value ) * h = " << ArgAbsMin( a, b ) + sign( value ) * h           
-             << " tmp = " << tmp << std::endl;
-      tmp = ArgAbsMin( a, b ) + sign( value ) * h;
-      tmp = ArgAbsMin( a, b ) + sign( value ) * h;
-      tmp = ArgAbsMin( a, b ) + sign( value ) * h;
-      res = ArgAbsMin( a, b ) + sign( value ) * h;
-      std::cerr << " tmp = " << tmp << std::endl;
-      std::cerr << " res = " << res << std::endl;*/
-
+      tmp = 
+        fabs( a ) >= fabs( b ) ? b + TNL::sign( value ) * hy :
+                                 a + TNL::sign( value ) * hx;
+   }*/
+   /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
+       fabs( b ) != TypeInfo< Real >::getMaxValue() &&
+       fabs( a - b ) < TNL::sqrt( (hx * hx + hy * hy)/v ) )
+   {
+       tmp = ( hx * hx * b + hy * hy * a + 
+            sign( value ) * hx * hy * TNL::sqrt( ( hx * hx + hy * hy )/v - 
+            ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy );
+       u[ cell.getIndex() ] =  tmp;
    }
    else
-      tmp = 0.5 * ( a + b + sign( value ) * sqrt( 2.0 * h * h - ( a - b ) * ( a - b ) ) );
-
-   u[ cell.getIndex() ] = argAbsMin( value, tmp );
-   //std::cerr << ArgAbsMin( value, tmp ) << " ";   
+   {
+       tmp = 
+          fabs( a ) > fabs( b ) ? b + TNL::sign( value ) * hy/v :
+                                   a + TNL::sign( value ) * hx/v;
+       u[ cell.getIndex() ] = argAbsMin( value, tmp );
+       //tmp = TypeInfo< RealType >::getMaxValue();
+   }*/
+    RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
+    sortMinims( pom );
+    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
+    
+                                
+    if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+        u[ cell.getIndex() ] = argAbsMin( value, tmp );
+    else
+    {
+        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+        u[ cell.getIndex() ] = argAbsMin( value, tmp );
+    }
 }
 
-
 template< typename Real,
           typename Device,
           typename Index >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-initInterface( const MeshFunctionType& input,
-               MeshFunctionType& output,
-               InterfaceMapType& interfaceMap  )
+initInterface( const MeshFunctionPointer& _input,
+               MeshFunctionPointer& _output,
+               InterfaceMapPointer& _interfaceMap  )
 {
-   const MeshType& mesh = input.getMesh();
-   typedef typename MeshType::Cell Cell;
-   Cell cell( mesh );
-   for( cell.getCoordinates().z() = 1;
-        cell.getCoordinates().z() < mesh.getDimensions().z() - 1;
-        cell.getCoordinates().z() ++ )   
-      for( cell.getCoordinates().y() = 1;
-           cell.getCoordinates().y() < mesh.getDimensions().y() - 1;
-           cell.getCoordinates().y() ++ )
-         for( cell.getCoordinates().x() = 1;
-              cell.getCoordinates().x() < mesh.getDimensions().x() - 1;
-              cell.getCoordinates().x() ++ )
-         {
-            cell.refresh();
-            const RealType& c = input( cell );
-            if( ! cell.isBoundaryEntity() )
-            {
-               auto neighbors = cell.getNeighborEntities();
-               //const IndexType& c = cell.getIndex();
-               const IndexType e = neighbors.template getEntityIndex<  1,  0,  0 >();
-               const IndexType w = neighbors.template getEntityIndex< -1,  0,  0 >();
-               const IndexType n = neighbors.template getEntityIndex<  0,  1,  0 >();
-               const IndexType s = neighbors.template getEntityIndex<  0, -1,  0 >();
-               const IndexType t = neighbors.template getEntityIndex<  0,  0,  1 >();
-               const IndexType b = neighbors.template getEntityIndex<  0,  0, -1 >();
-
-               if( c * input[ e ] <= 0 || c * input[ w ] <= 0 ||
-                   c * input[ n ] <= 0 || c * input[ s ] <= 0 ||
-                   c * input[ t ] <= 0 || c * input[ b ] <= 0 )
-               {
-                  output[ cell.getIndex() ] = c;
-                  interfaceMap[ cell.getIndex() ] = true;
-                  continue;
-               }
-            }
-            output[ cell.getIndex() ] =
-               c > 0 ? std::numeric_limits< RealType >::max() :
-                      -std::numeric_limits< RealType >::max();
-            interfaceMap[ cell.getIndex() ] = false;
-         }
+    if( std::is_same< Device, Devices::Cuda >::value )
+    {
+#ifdef HAVE_CUDA
+        const MeshType& mesh = _input->getMesh();
+        
+        const int cudaBlockSize( 8 );
+        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
+        int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize );
+        if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
+            std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
+        dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+        dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
+        Devices::Cuda::synchronizeDevice();
+        CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+                                                     _output.template modifyData< Device >(),
+                                                     _interfaceMap.template modifyData< Device >() );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+#endif
+    }
+    if( std::is_same< Device, Devices::Host >::value )
+    {
+        const MeshFunctionType& input =  _input.getData();
+        MeshFunctionType& output =  _output.modifyData();
+        InterfaceMapType& interfaceMap =  _interfaceMap.modifyData();
+        const MeshType& mesh = input.getMesh();
+        typedef typename MeshType::Cell Cell;
+        Cell cell( mesh );
+        for( cell.getCoordinates().z() = 0;
+             cell.getCoordinates().z() < mesh.getDimensions().z();
+             cell.getCoordinates().z() ++ )
+             for( cell.getCoordinates().y() = 0;
+                  cell.getCoordinates().y() < mesh.getDimensions().y();
+                  cell.getCoordinates().y() ++ )
+                 for( cell.getCoordinates().x() = 0;
+                      cell.getCoordinates().x() < mesh.getDimensions().x();
+                      cell.getCoordinates().x() ++ )
+                 {
+                     cell.refresh();
+                     output[ cell.getIndex() ] =
+                     input( cell ) > 0 ? std::numeric_limits< RealType >::max() :
+                                        - std::numeric_limits< RealType >::max();
+                     interfaceMap[ cell.getIndex() ] = false;
+                 }
+
+        const RealType& hx = mesh.getSpaceSteps().x();
+        const RealType& hy = mesh.getSpaceSteps().y();
+        const RealType& hz = mesh.getSpaceSteps().z();
+        for( cell.getCoordinates().z() = 0;
+             cell.getCoordinates().z() < mesh.getDimensions().z();
+             cell.getCoordinates().z() ++ )   
+           for( cell.getCoordinates().y() = 0;
+                cell.getCoordinates().y() < mesh.getDimensions().y();
+                cell.getCoordinates().y() ++ )
+              for( cell.getCoordinates().x() = 0;
+                   cell.getCoordinates().x() < mesh.getDimensions().x();
+                   cell.getCoordinates().x() ++ )
+              {
+                 cell.refresh();
+                 const RealType& c = input( cell );
+                 if( ! cell.isBoundaryEntity() )
+                 {
+                    auto neighbors = cell.getNeighborEntities();
+                    Real pom = 0;
+                    const IndexType e = neighbors.template getEntityIndex<  1,  0,  0 >();
+                    const IndexType n = neighbors.template getEntityIndex<  0,  1,  0 >();
+                    const IndexType t = neighbors.template getEntityIndex<  0,  0,  1 >();
+                    //Try exact initiation
+                    /*const IndexType w = neighbors.template getEntityIndex< -1,  0,  0 >();
+                    const IndexType s = neighbors.template getEntityIndex<  0, -1,  0 >();
+                    const IndexType b = neighbors.template getEntityIndex<  0,  0, -1 >();
+                    if( c * input[ e ] <= 0 )
+                    {
+                       output[ cell.getIndex() ] = c;
+                       output[ e ] = input[ e ];
+                       interfaceMap[ e ] = true;   
+                       interfaceMap[ cell.getIndex() ] = true;
+                    }
+                    else if( c * input[ n ] <= 0 )
+                    {
+                       output[ cell.getIndex() ] = c;
+                       output[ n ] = input[ n ];
+                       interfaceMap[ n ] = true;   
+                       interfaceMap[ cell.getIndex() ] = true;
+                    }
+                    else if( c * input[ t ] <= 0 )
+                    {
+                       output[ cell.getIndex() ] = c;
+                       output[ t ] = input[ t ];
+                       interfaceMap[ t ] = true;   
+                       interfaceMap[ cell.getIndex() ] = true;
+                    }*/
+                    if( c * input[ n ] <= 0 )
+                    {
+                        if( c >= 0 )
+                        {
+                        pom = ( hy * c )/( c - input[ n ]);
+                        if( output[ cell.getIndex() ] > pom ) 
+                            output[ cell.getIndex() ] = pom;
+
+                        if ( output[ n ] < pom - hy)
+                             output[ n ] = pom - hy; // ( hy * c )/( c - input[ n ]) - hy;
+
+                        }else
+                        {
+                          pom = - ( hy * c )/( c - input[ n ]);
+                          if( output[ cell.getIndex() ] < pom )
+                              output[ cell.getIndex() ] = pom;
+                          if( output[ n ] > hy + pom )
+                              output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
+
+                        }
+                    interfaceMap[ cell.getIndex() ] = true;
+                    interfaceMap[ n ] = true;
+                    }
+                    if( c * input[ e ] <= 0 )
+                    {
+                        if( c >= 0 )
+                        {
+                            pom = ( hx * c )/( c - input[ e ]);
+                            if( output[ cell.getIndex() ] > pom )
+                                output[ cell.getIndex() ] = pom;
+
+                            pom = pom - hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+                            if( output[ e ] < pom )
+                                output[ e ] = pom;      
+
+                        }else
+                        {
+                            pom = - (hx * c)/( c - input[ e ]);
+                            if( output[ cell.getIndex() ] < pom )
+                                output[ cell.getIndex() ] = pom;
+
+                            pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
+                            if( output[ e ] > pom )
+                                output[ e ] = pom;
+                        }
+                    interfaceMap[ cell.getIndex() ] = true;
+                    interfaceMap[ e ] = true;
+                    }
+                    if( c * input[ t ] <= 0 )
+                    {
+                        if( c >= 0 )
+                        {
+                            pom = ( hz * c )/( c - input[ t ]);
+                            if( output[ cell.getIndex() ] > pom )
+                                output[ cell.getIndex() ] = pom;
+
+                            pom = pom - hz; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+                            if( output[ t ] < pom )
+                                output[ t ] = pom; 
+
+                        }else
+                        {
+                            pom = - (hz * c)/( c - input[ t ]);
+                            if( output[ cell.getIndex() ] < pom )
+                                output[ cell.getIndex() ] = pom;
+
+                            pom = pom + hz; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
+                            if( output[ t ] > pom )
+                                output[ t ] = pom;
+
+                        }
+                    interfaceMap[ cell.getIndex() ] = true;
+                    interfaceMap[ t ] = true;
+                    }    
+                 }
+                 /*output[ cell.getIndex() ] =
+                    c > 0 ? TypeInfo< RealType >::getMaxValue() :
+                           -TypeInfo< RealType >::getMaxValue();
+                 interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245
+              }
+    }
 }
 
 template< typename Real,
           typename Device,
           typename Index >
    template< typename MeshEntity >
+__cuda_callable__
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
 updateCell( MeshFunctionType& u,
-            const MeshEntity& cell )
+            const MeshEntity& cell, 
+            const RealType v )
+{
+   const auto& neighborEntities = cell.template getNeighborEntities< 3 >();
+   const MeshType& mesh = cell.getMesh();
+  
+   const RealType& hx = mesh.getSpaceSteps().x();
+   const RealType& hy = mesh.getSpaceSteps().y();
+   const RealType& hz = mesh.getSpaceSteps().z();
+   const RealType value = u( cell );
+   //std::cout << value << std::endl;
+   RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
+   
+   
+   if( cell.getCoordinates().x() == 0 )
+      a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ];
+   else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
+      a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ];
+   else
+   {
+      a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ],
+                        u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] );
+   }
+   if( cell.getCoordinates().y() == 0 )
+      b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ];
+   else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
+      b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ];
+   else
+   {
+      b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ],
+                        u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] );
+   }if( cell.getCoordinates().z() == 0 )
+      c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ];
+   else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 )
+      c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ];
+   else
+   {
+      c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ],
+                         u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] );
+   }
+   if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+       fabs( b ) == std::numeric_limits< RealType >::max() &&
+       fabs( c ) == std::numeric_limits< RealType >::max() )
+      return;
+   
+   
+       /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
+           fabs( b ) != TypeInfo< Real >::getMaxValue() &&
+           fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) )
+       {
+           tmp = ( hx * hx * a + hy * hy * b + 
+                sign( value ) * hx * hy * sqrt( ( hx * hx + hy * hy )/v - 
+                ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy );
+       }
+       if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
+           fabs( c ) != TypeInfo< Real >::getMaxValue() &&
+           fabs( a - c ) >= TNL::sqrt( (hx * hx + hz * hz)/v ) )
+       {
+           tmp = ( hx * hx * a + hz * hz * c + 
+                sign( value ) * hx * hz * sqrt( ( hx * hx + hz * hz )/v - 
+                ( a - c ) * ( a - c ) ) )/( hx * hx + hz * hz );
+       }
+       if( fabs( b ) != TypeInfo< Real >::getMaxValue() &&
+           fabs( c ) != TypeInfo< Real >::getMaxValue() &&
+           fabs( b - c ) >= TNL::sqrt( (hy * hy + hz * hz)/v ) )
+       {
+           tmp = ( hy * hy * b + hz * hz * c + 
+                sign( value ) * hy * hz * sqrt( ( hy * hy + hz * hz )/v - 
+                ( b - c ) * ( b - c ) ) )/( hy * hy + hz * hz );
+       }*/
+    RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
+    sortMinims( pom );   
+    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
+    if( fabs( tmp ) < fabs( pom[ 1 ] ) )
+    {
+        u[ cell.getIndex() ] = argAbsMin( value, tmp ); 
+    }
+    else
+    {
+        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+        if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
+        {
+            u[ cell.getIndex() ] = argAbsMin( value, tmp );
+        }
+        else
+        {
+            tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
+                TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
+                hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+                hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
+            u[ cell.getIndex() ] = argAbsMin( value, tmp );
+        }
+    }
+}
+
+template < typename T1, typename T2 >
+T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double v)
+{
+   T1 tmp;
+   if( fabs( a ) != std::numeric_limits< T1 >::max &&
+       fabs( b ) != std::numeric_limits< T1 >::max &&
+       fabs( a - b ) < ha/v )//TNL::sqrt( (ha * ha + hb * hb)/2 )/v )
+   {
+      tmp = ( ha * ha * b + hb * hb * a + 
+            TNL::sign( value ) * ha * hb * TNL::sqrt( ( ha * ha + hb * hb )/( v * v ) - 
+            ( a - b ) * ( a - b ) ) )/( ha * ha + hb * hb );
+   }
+   else
+   {
+       tmp = std::numeric_limits< T1 >::max;
+   }
+   
+   return tmp;
+}
+
+template < typename T1 >
+__cuda_callable__ void sortMinims( T1 pom[] )
 {
+    T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; 
+    if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){
+        tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2];
+        tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5];
+        
+    }
+    else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){
+        tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1];
+        tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4];
+    }
+    else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){
+        tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2];
+        tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5];
+    }
+    else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){
+        tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0];
+        tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3];
+    }
+    else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){
+        tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1];
+        tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4];
+    }
+    else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){
+        tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0];
+        tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3];
+    }
+    
+    for( int i = 0; i < 6; i++ )
+    {
+        pom[ i ] = tmp[ i ];
+    }   
+}
+
+
+
+#ifdef HAVE_CUDA
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
+                                Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
+                                Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap  )
+{
+    int i = threadIdx.x + blockDim.x*blockIdx.x;
+    const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+    
+    if( i < mesh.getDimensions().x()  )
+    {
+        typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell;
+        Cell cell( mesh );
+        cell.getCoordinates().x() = i;
+        cell.refresh();
+        const Index cind = cell.getIndex();
+
+
+        output[ cind ] =
+               input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+                                    - std::numeric_limits< Real >::max();
+        interfaceMap[ cind ] = false; 
+
+        const Real& h = mesh.getSpaceSteps().x();
+        cell.refresh();
+        const Real& c = input( cell );
+        if( ! cell.isBoundaryEntity()  )
+        {
+           auto neighbors = cell.getNeighborEntities();
+           Real pom = 0;
+           const Index e = neighbors.template getEntityIndex< 1 >();
+           const Index w = neighbors.template getEntityIndex< -1 >();
+           if( c * input[ e ] <= 0 )
+           {
+               pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+                   output[ cind ] = pom;                       
+
+               interfaceMap[ cind ] = true;
+           }
+           if( c * input[ w ] <= 0 )
+           {
+               pom = TNL::sign( c )*( h * c )/( c - input[ w ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
+
+               interfaceMap[ cind ] = true;
+           }
+        }
+    }
+           
+}
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
+                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
+                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ) 
+{
+    int i = threadIdx.x + blockDim.x*blockIdx.x;
+    int j = blockDim.y*blockIdx.y + threadIdx.y;
+    const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+    
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+    {
+        typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
+        Cell cell( mesh );
+        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
+        cell.refresh();
+        const Index cind = cell.getIndex();
+
+
+        output[ cind ] =
+               input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+                                    - std::numeric_limits< Real >::max();
+        interfaceMap[ cind ] = false; 
+
+        const Real& hx = mesh.getSpaceSteps().x();
+        const Real& hy = mesh.getSpaceSteps().y();
+        cell.refresh();
+        const Real& c = input( cell );
+        if( ! cell.isBoundaryEntity()  )
+        {
+           auto neighbors = cell.getNeighborEntities();
+           Real pom = 0;
+           const Index e = neighbors.template getEntityIndex<  1,  0 >();
+           const Index w = neighbors.template getEntityIndex<  -1,  0 >();
+           const Index n = neighbors.template getEntityIndex<  0,  1 >();
+           const Index s = neighbors.template getEntityIndex<  0,  -1 >();
+           
+           if( c * input[ n ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
+
+               interfaceMap[ cell.getIndex() ] = true;
+           }
+           if( c * input[ e ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+                   output[ cind ] = pom;                       
+
+               interfaceMap[ cind ] = true;
+           }
+           if( c * input[ w ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
+
+               interfaceMap[ cind ] = true;
+           }
+           if( c * input[ s ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
+
+               interfaceMap[ cind ] = true;
+           }
+        }
+    }
+}
+
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
+                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
+                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap )
+{
+    int i = threadIdx.x + blockDim.x*blockIdx.x;
+    int j = blockDim.y*blockIdx.y + threadIdx.y;
+    int k = blockDim.z*blockIdx.z + threadIdx.z;
+    const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+    
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
+    {
+        typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
+        Cell cell( mesh );
+        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
+        cell.refresh();
+        const Index cind = cell.getIndex();
+
+
+        output[ cind ] =
+               input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+                                    - std::numeric_limits< Real >::max();
+        interfaceMap[ cind ] = false; 
+        cell.refresh();
+
+        const Real& hx = mesh.getSpaceSteps().x();
+        const Real& hy = mesh.getSpaceSteps().y();
+        const Real& hz = mesh.getSpaceSteps().z();
+        const Real& c = input( cell );
+        if( ! cell.isBoundaryEntity()  )
+        {
+           auto neighbors = cell.getNeighborEntities();
+           Real pom = 0;
+           const Index e = neighbors.template getEntityIndex<  1, 0, 0 >();
+           const Index w = neighbors.template getEntityIndex<  -1, 0, 0 >();
+           const Index n = neighbors.template getEntityIndex<  0, 1, 0 >();
+           const Index s = neighbors.template getEntityIndex<  0, -1, 0 >();
+           const Index t = neighbors.template getEntityIndex<  0, 0, 1 >();
+           const Index b = neighbors.template getEntityIndex<  0, 0, -1 >();
+           
+           if( c * input[ n ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
+
+               interfaceMap[ cind ] = true;
+           }
+           if( c * input[ e ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+                   output[ cind ] = pom;                       
+
+               interfaceMap[ cind ] = true;
+           }
+           if( c * input[ w ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
+
+               interfaceMap[ cind ] = true;
+           }
+           if( c * input[ s ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
+
+               interfaceMap[ cind ] = true;
+           }
+           if( c * input[ b ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hz * c )/( c - input[ b ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
+
+               interfaceMap[ cind ] = true;
+           }
+           if( c * input[ t ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
+
+               interfaceMap[ cind ] = true;
+           }
+        }
+    }
+}
+
+
+template< typename Real,
+          typename Device,
+          typename Index >
+__cuda_callable__
+bool
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy,
+            const Real v )
+{
+   const RealType value = sArray[ thrj ][ thri ];
+   RealType a, b, tmp = std::numeric_limits< RealType >::max();
+      
+   b = TNL::argAbsMin( sArray[ thrj+1 ][ thri ],
+                        sArray[ thrj-1 ][ thri ] );
+    
+   a = TNL::argAbsMin( sArray[ thrj ][ thri+1 ],
+                        sArray[ thrj ][ thri-1 ] );
+
+    if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+        fabs( b ) == std::numeric_limits< RealType >::max() )
+       return false;
+   
+    RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
+    sortMinims( pom );
+    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
+    
+                                
+    if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+    {
+        sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
+        tmp = value - sArray[ thrj ][ thri ];
+        if ( fabs( tmp ) >  0.01*hx )
+            return true;
+        else
+            return false;
+    }
+    else
+    {
+        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+        sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
+        tmp = value - sArray[ thrj ][ thri ];
+        if ( fabs( tmp ) > 0.01*hx )
+            return true;
+        else
+            return false;
+    }
+    
+    return false;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+__cuda_callable__
+bool
+tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
+updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
+{
+   const RealType value = sArray[ thri ];
+   RealType a, tmp = std::numeric_limits< RealType >::max();
+      
+   a = TNL::argAbsMin( sArray[ thri+1 ],
+                       sArray[ thri-1 ] );
+
+    if( fabs( a ) == std::numeric_limits< RealType >::max() )
+       return false;
+   
+    tmp = a + TNL::sign( value ) * h/v;
+    
+                                
+    sArray[ thri ] = argAbsMin( value, tmp );
+    
+    tmp = value - sArray[ thri ];
+    if ( fabs( tmp ) >  0.01*h )
+        return true;
+    else
+        return false;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+__cuda_callable__ 
+bool 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
+        const Real hx, const Real hy, const Real hz, const Real v )
+{
+   const RealType value = sArray[thrk][thrj][thri];
+   //std::cout << value << std::endl;
+   RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
+   
+   c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ],
+                        sArray[ thrk-1 ][ thrj ][ thri ] );
+    
+   b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ],
+                        sArray[ thrk ][ thrj-1 ][ thri ] );
+   
+   a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ],
+                        sArray[ thrk ][ thrj ][ thri-1 ] );
+   
+   
+   if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+       fabs( b ) == std::numeric_limits< RealType >::max() &&
+       fabs( c ) == std::numeric_limits< RealType >::max() )
+      return false;
    
+    RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
+    
+    sortMinims( pom );
+    
+    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
+    if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+    {
+        sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
+        tmp = value - sArray[ thrk ][ thrj ][ thri ];
+        if ( fabs( tmp ) >  0.01*hx )
+            return true;
+        else
+            return false;
+    }
+    else
+    {
+        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+        if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
+        {
+            sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
+            tmp = value - sArray[ thrk ][ thrj ][ thri ];
+            if ( fabs( tmp ) > 0.01*hx )
+                return true;
+            else
+                return false;
+        }
+        else
+        {
+            tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
+                TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
+                hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+                hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
+            sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
+            tmp = value - sArray[ thrk ][ thrj ][ thri ];
+            if ( fabs( tmp ) > 0.01*hx )
+                return true;
+            else
+                return false;
+        }
+    }
+    
+    return false;
 }
+#endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h
index 7737073a623eab27ec8888a72b9b1e91703e5b55..f7e1f1d386b061937417c0906590290f5a6703fa 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h
@@ -38,6 +38,8 @@ class tnlDirectEikonalProblem
       typedef Functions::MeshFunction< Mesh > MeshFunctionType;
       typedef Problems::PDEProblem< Mesh, Communicator, RealType, DeviceType, IndexType > BaseType;
       using AnisotropyType = Anisotropy;
+      using AnisotropyPointer = SharedPointer< AnisotropyType, DeviceType >;
+      using MeshFunctionPointer = SharedPointer< MeshFunctionType >;
 
       using typename BaseType::MeshType;
       using typename BaseType::DofVectorType;
@@ -71,11 +73,11 @@ class tnlDirectEikonalProblem
 
       protected:
          
-         MeshFunctionType u;
+         MeshFunctionPointer u;
          
-         MeshFunctionType initialData;
+         MeshFunctionPointer initialData;
          
-         AnisotropyType anisotropy;
+         AnisotropyPointer anisotropy;
 
 };
 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
index 3938e2754892cf35d0d7c4ac4813347b29886593..3d5fa15dc23d82eb886d344bc7cf4fff7f4e34d3 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
@@ -100,7 +100,7 @@ void
 tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >::
 bindDofs( const DofVectorPointer& dofs )
 {
-   this->u.bind( this->getMesh(), dofs );
+   this->u->bind( this->getMesh(), dofs );
 }
 
 template< typename Mesh,
@@ -114,8 +114,8 @@ setInitialCondition( const Config::ParameterContainer& parameters,
                      DofVectorPointer& dofs )
 {
    String inputFile = parameters.getParameter< String >( "input-file" );
-   this->initialData.setMesh( this->getMesh() );
-   if( !this->initialData.boundLoad( inputFile ) )
+   this->initialData->setMesh( this->getMesh() );
+   if( !this->initialData->boundLoad( inputFile ) )
       return false;
    return true;
 }
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
index 17d139b0ec8b01c0dbf87188968be67ee5fa5cca..54e577dacde239171ae299ee62b078bd7a406c35 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
@@ -15,6 +15,7 @@
 #include <TNL/SharedPointer.h>
 #include "tnlDirectEikonalMethodsBase.h"
 
+
 template< typename Mesh,
           typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > >
 class FastSweepingMethod
@@ -28,20 +29,24 @@ template< typename Real,
 class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >
    : public tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >
 {
-   static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
+   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
    
    public:
       
       typedef Meshes::Grid< 1, Real, Device, Index > MeshType;
       typedef Real RealType;
-      typedef TNL::Devices::Host DeviceType;
+      typedef Device DeviceType;
       typedef Index IndexType;
       typedef Anisotropy AnisotropyType;
       typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > BaseType;
       using MeshPointer = SharedPointer< MeshType >;
+      using AnisotropyPointer = SharedPointer< AnisotropyType, DeviceType >;
       
       using typename BaseType::InterfaceMapType;
       using typename BaseType::MeshFunctionType;
+      using typename BaseType::InterfaceMapPointer;
+      using typename BaseType::MeshFunctionPointer;
+      
       
       FastSweepingMethod();
       
@@ -50,8 +55,8 @@ class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >
       void setMaxIterations( const IndexType& maxIterations );
       
       void solve( const MeshPointer& mesh,
-                  const AnisotropyType& anisotropy,
-                  MeshFunctionType& u );
+                  const AnisotropyPointer& anisotropy,
+                  MeshFunctionPointer& u );
       
       
    protected:
@@ -66,20 +71,23 @@ template< typename Real,
 class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
    : public tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
 {
-   static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
+   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
    
    public:
       
       typedef Meshes::Grid< 2, Real, Device, Index > MeshType;
       typedef Real RealType;
-      typedef TNL::Devices::Host DeviceType;
+      typedef Device DeviceType;
       typedef Index IndexType;
       typedef Anisotropy AnisotropyType;
       typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType;
       using MeshPointer = SharedPointer< MeshType >;
+      using AnisotropyPointer = SharedPointer< AnisotropyType, DeviceType >;
 
       using typename BaseType::InterfaceMapType;
       using typename BaseType::MeshFunctionType;
+      using typename BaseType::InterfaceMapPointer;
+      using typename BaseType::MeshFunctionPointer;      
 
       FastSweepingMethod();
       
@@ -88,9 +96,8 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
       void setMaxIterations( const IndexType& maxIterations );
       
       void solve( const MeshPointer& mesh,
-                  const AnisotropyType& anisotropy,
-                  MeshFunctionType& u );
-      
+                  const AnisotropyPointer& anisotropy,
+                  MeshFunctionPointer& u );
       
    protected:
       
@@ -104,20 +111,23 @@ template< typename Real,
 class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
    : public tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
 {
-   static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
+   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
    
    public:
       
       typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
       typedef Real RealType;
-      typedef TNL::Devices::Host DeviceType;
+      typedef Device DeviceType;
       typedef Index IndexType;
       typedef Anisotropy AnisotropyType;
       typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > BaseType;
       using MeshPointer = SharedPointer< MeshType >;
+      using AnisotropyPointer = SharedPointer< AnisotropyType, DeviceType >;
       
       using typename BaseType::InterfaceMapType;
       using typename BaseType::MeshFunctionType;
+      using typename BaseType::InterfaceMapPointer;
+      using typename BaseType::MeshFunctionPointer;      
       
       FastSweepingMethod();
       
@@ -126,8 +136,8 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
       void setMaxIterations( const IndexType& maxIterations );
       
       void solve( const MeshPointer& mesh,
-                  const AnisotropyType& anisotropy,
-                  MeshFunctionType& u );
+                  const AnisotropyPointer& anisotropy,
+                  MeshFunctionPointer& u );
       
       
    protected:
@@ -136,7 +146,6 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
 };
 
 
-
 #include "tnlFastSweepingMethod1D_impl.h"
 #include "tnlFastSweepingMethod2D_impl.h"
 #include "tnlFastSweepingMethod3D_impl.h"
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
index 874945eafd7f0cb1eff4667eba6ef026ba026634..890c6cb4c9373917abf1d668ebf60a30cfbc17b3 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
@@ -55,16 +55,193 @@ template< typename Real,
 void
 FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >::
 solve( const MeshPointer& mesh,
-       const AnisotropyType& anisotropy,
-       MeshFunctionType& u )
-{
-   MeshFunctionType aux;
-   InterfaceMapType interfaceMap;
-   aux.setMesh( mesh );
-   interfaceMap.setMesh( mesh );
+       const AnisotropyPointer& anisotropy,
+       MeshFunctionPointer& u )
+{   
+   MeshFunctionPointer auxPtr;
+   InterfaceMapPointer interfaceMapPtr;
+   auxPtr->setMesh( mesh );
+   interfaceMapPtr->setMesh( mesh );
    std::cout << "Initiating the interface cells ..." << std::endl;
-   BaseType::initInterface( u, aux, interfaceMap );
-   aux.save( "aux-ini.tnl" );
+   BaseType::initInterface( u, auxPtr, interfaceMapPtr );
+        
+   auxPtr->save( "aux-ini.tnl" );
+
+   typename MeshType::Cell cell( *mesh );
+   
+   IndexType iteration( 0 );
+   InterfaceMapType interfaceMap = *interfaceMapPtr;
+   MeshFunctionType aux = *auxPtr;
+   while( iteration < this->maxIterations )
+   {
+      if( std::is_same< DeviceType, Devices::Host >::value )
+      {
+          for( cell.getCoordinates().x() = 0;
+               cell.getCoordinates().x() < mesh->getDimensions().x();
+               cell.getCoordinates().x()++ )
+             {
+                cell.refresh();
+                if( ! interfaceMap( cell ) )
+                   this->updateCell( aux, cell );
+             }
+
+         
+        for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+             cell.getCoordinates().x() >= 0 ;
+             cell.getCoordinates().x()-- )		
+           {
+              cell.refresh();
+              if( ! interfaceMap( cell ) )            
+                 this->updateCell( aux, cell );
+           }
+      }
+      if( std::is_same< DeviceType, Devices::Cuda >::value )
+      {
+         // TODO: CUDA code
+#ifdef HAVE_CUDA
+          const int cudaBlockSize( 16 );
+          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+          dim3 blockSize( cudaBlockSize );
+          dim3 gridSize( numBlocksX );
+          
+          tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > ptr;
+          
+          
+          
+          bool* BlockIter = (bool*)malloc( ( numBlocksX ) * sizeof( bool ) );
+          
+          bool *BlockIterDevice;
+          cudaMalloc(&BlockIterDevice, ( numBlocksX ) * sizeof( bool ) );
+          
+          while( BlockIter[ 0 ] )
+          {
+           for( int i = 0; i < numBlocksX; i++ )
+                BlockIter[ i ] = false;
+           cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX ) * sizeof( bool ), cudaMemcpyHostToDevice);
+                       
+            CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
+                                                             interfaceMapPtr.template getData< Device >(),
+                                                             auxPtr.template modifyData< Device>(),
+                                                             BlockIterDevice );
+            cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX ) * sizeof( bool ), cudaMemcpyDeviceToHost);
+                                   
+            for( int i = 1; i < numBlocksX; i++ )
+                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];
+            
+          }
+          delete[] BlockIter;
+          cudaFree( BlockIterDevice );
+          cudaDeviceSynchronize();
+          
+          TNL_CHECK_CUDA_DEVICE;
+              
+          aux = *auxPtr;
+          interfaceMap = *interfaceMapPtr;
+#endif
+      }
+      iteration++;
+   }
+   
+   aux.save("aux-final.tnl");
+}
+
+#ifdef HAVE_CUDA
+template < typename Real, typename Device, typename Index >
+__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > ptr,
+                                      const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap,
+                                      Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux,
+                                      bool *BlockIterDevice )
+{
+    int thri = threadIdx.x;
+    int blIdx = blockIdx.x;
+    int i = thri + blockDim.x*blIdx;
+    
+    __shared__ volatile bool changed[16];
+    changed[ thri ] = false;
+    
+    if( thri == 0 )
+        changed[ 0 ] = true;
+    
+    const Meshes::Grid< 1, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+    __shared__ Real h;
+    if( thri == 1 )
+    {
+        h = mesh.getSpaceSteps().x();
+    }
+    
+    __shared__ volatile Real sArray[ 18 ];
+    sArray[thri] = std::numeric_limits<  Real >::max();
+    
+    //filling sArray edges
+    int dimX = mesh.getDimensions().x(); 
+    __shared__ volatile int numOfBlockx;
+    __shared__ int xkolik;
+    if( thri == 0 )
+    {
+        xkolik = blockDim.x + 1;
+        numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
+    
+        if( numOfBlockx - 1 == blIdx )
+            xkolik = dimX - (blIdx)*blockDim.x+1;
+    }
+    __syncthreads();
+    
+    if( thri == 0 )
+    {        
+        if( dimX > (blIdx+1) * blockDim.x )
+            sArray[xkolik] = aux[ blIdx*blockDim.x - 1 + xkolik ];
+        else
+            sArray[xkolik] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 1 )
+    {
+        if( blIdx != 0 )
+            sArray[0] = aux[ blIdx*blockDim.x - 1 ];
+        else
+            sArray[0] = std::numeric_limits< Real >::max();
+    }
+    
+        
+    if( i < mesh.getDimensions().x() )
+    {    
+        sArray[thri+1] = aux[ i ];
+    }
+    __syncthreads();  
 
+    while( changed[ 0 ] )
+    {
+        __syncthreads();
+        
+        changed[ thri ] = false;
+        
+    //calculation of update cell
+        if( i < mesh.getDimensions().x() )
+        {
+            if( ! interfaceMap[ i ] )
+            {
+                changed[ thri ] = ptr.updateCell( sArray, thri+1, h );
+            }
+        }
+        __syncthreads();
+        
+        
+        
+    //pyramid reduction
+        if( thri < 8 ) changed[ thri ] = changed[ thri ] || changed[ thri + 8 ];
+        if( thri < 4 ) changed[ thri ] = changed[ thri ] || changed[ thri + 4 ];
+        if( thri < 2 ) changed[ thri ] = changed[ thri ] || changed[ thri + 2 ];
+        if( thri < 1 ) changed[ thri ] = changed[ thri ] || changed[ thri + 1 ];
+        __syncthreads();
+
+        if( changed[ 0 ] && thri == 0 )
+            BlockIterDevice[ blIdx ] = true;
+        __syncthreads();
+    }
+  
+    if( i < mesh.getDimensions().x()  && (!interfaceMap[ i ]) )
+        aux[ i ] = sArray[ thri + 1 ];
 }
+#endif
+
 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 13e5c824912ef387c47ddaca1c5eb98acfe30fe6..6703843c179ea0215c40cf2d21b9cd66fde99de6 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -14,6 +14,11 @@
 #pragma once
 
 #include "tnlFastSweepingMethod.h"
+#include <TNL/Devices/Cuda.h>
+
+
+#include <iostream>
+#include <fstream>
 
 template< typename Real,
           typename Device,
@@ -21,7 +26,7 @@ template< typename Real,
           typename Anisotropy >
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 FastSweepingMethod()
-: maxIterations( 1 )
+: maxIterations( 100 )
 {
    
 }
@@ -55,87 +60,466 @@ template< typename Real,
 void
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 solve( const MeshPointer& mesh,
-       const AnisotropyType& anisotropy,
-       MeshFunctionType& u )
+       const AnisotropyPointer& anisotropy,
+       MeshFunctionPointer& u )
 {
-   MeshFunctionType aux;
-   InterfaceMapType interfaceMap;
-   aux.setMesh( mesh );
-   interfaceMap.setMesh( mesh );
+   /*MeshFunctionType v;
+   v.setMesh(mesh);
+   double A[320][320];
+    for (int i = 0; i < 320; i++)
+        for (int j = 0; j < 320; j++)
+            A[i][j] = 0;
+    
+    std::ifstream file("/home/maty/Downloads/mapa2.txt");
+
+    for (int i = 0; i < 320; i++)
+        for (int j = 0; j < 320; j++)
+            file >> A[i][j];
+    file.close();
+    for (int i = 0; i < 320; i++)
+        for (int j = 0; j < 320; j++)
+            v[i*320 + j] = A[i][j];
+   v.save("mapa.tnl");*/
+   
+       
+   MeshFunctionPointer auxPtr;
+   InterfaceMapPointer interfaceMapPtr;
+   auxPtr->setMesh( mesh );
+   interfaceMapPtr->setMesh( mesh );
    std::cout << "Initiating the interface cells ..." << std::endl;
-   BaseType::initInterface( u, aux, interfaceMap );
-   //aux.save( "aux-ini.tnl" );
+   BaseType::initInterface( u, auxPtr, interfaceMapPtr );
+        
+   auxPtr->save( "aux-ini.tnl" );
 
    typename MeshType::Cell cell( *mesh );
    
    IndexType iteration( 0 );
+   InterfaceMapType interfaceMap = *interfaceMapPtr;
+   MeshFunctionType aux = *auxPtr;
    while( iteration < this->maxIterations )
    {
-
-      for( cell.getCoordinates().y() = 0;
-           cell.getCoordinates().y() < mesh->getDimensions().y();
-           cell.getCoordinates().y()++ )
+      if( std::is_same< DeviceType, Devices::Host >::value )
       {
-         for( cell.getCoordinates().x() = 0;
-              cell.getCoordinates().x() < mesh->getDimensions().x();
-              cell.getCoordinates().x()++ )
+         for( cell.getCoordinates().y() = 0;
+              cell.getCoordinates().y() < mesh->getDimensions().y();
+              cell.getCoordinates().y()++ )
+         {
+            for( cell.getCoordinates().x() = 0;
+                 cell.getCoordinates().x() < mesh->getDimensions().x();
+                 cell.getCoordinates().x()++ )
+               {
+                  cell.refresh();
+                  if( ! interfaceMap( cell ) )
+                     this->updateCell( aux, cell );
+               }
+         }
+
+         //aux.save( "aux-1.tnl" );
+
+         for( cell.getCoordinates().y() = 0;
+              cell.getCoordinates().y() < mesh->getDimensions().y();
+              cell.getCoordinates().y()++ )
+         {
+            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                 cell.getCoordinates().x() >= 0 ;
+                 cell.getCoordinates().x()-- )		
+               {
+                  //std::cerr << "2 -> ";
+                  cell.refresh();
+                  if( ! interfaceMap( cell ) )            
+                     this->updateCell( aux, cell );
+               }
+         }
+
+         //aux.save( "aux-2.tnl" );
+
+         for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+              cell.getCoordinates().y() >= 0 ;
+              cell.getCoordinates().y()-- )
             {
-               cell.refresh();
-               if( ! interfaceMap( cell ) )
-                  this->updateCell( aux, cell );
+            for( cell.getCoordinates().x() = 0;
+                 cell.getCoordinates().x() < mesh->getDimensions().x();
+                 cell.getCoordinates().x()++ )
+               {
+                  //std::cerr << "3 -> ";
+                  cell.refresh();
+                  if( ! interfaceMap( cell ) )            
+                     this->updateCell( aux, cell );
+               }
             }
-      }
-      //aux.save( "aux-1.tnl" );
 
-      for( cell.getCoordinates().y() = 0;
-           cell.getCoordinates().y() < mesh->getDimensions().y();
-           cell.getCoordinates().y()++ )
-      {
-         for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-              cell.getCoordinates().x() >= 0 ;
-              cell.getCoordinates().x()-- )		
+         //aux.save( "aux-3.tnl" );
+
+         for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+              cell.getCoordinates().y() >= 0;
+              cell.getCoordinates().y()-- )
             {
-               //std::cerr << "2 -> ";
-               cell.refresh();
-               if( ! interfaceMap( cell ) )            
-                  this->updateCell( aux, cell );
+            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                 cell.getCoordinates().x() >= 0 ;
+                 cell.getCoordinates().x()-- )		
+               {
+                  //std::cerr << "4 -> ";
+                  cell.refresh();
+                  if( ! interfaceMap( cell ) )            
+                     this->updateCell( aux, cell );
+               }
             }
-      }
-      //aux.save( "aux-2.tnl" );
 
-      for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-           cell.getCoordinates().y() >= 0 ;
-           cell.getCoordinates().y()-- )
+         //aux.save( "aux-4.tnl" );
+
+         /*for( cell.getCoordinates().x() = 0;
+              cell.getCoordinates().x() < mesh->getDimensions().y();
+              cell.getCoordinates().x()++ )
          {
+            for( cell.getCoordinates().y() = 0;
+                 cell.getCoordinates().y() < mesh->getDimensions().x();
+                 cell.getCoordinates().y()++ )
+               {
+                  cell.refresh();
+                  if( ! interfaceMap( cell ) )
+                     this->updateCell( aux, cell );
+               }
+         }     
+
+
+         aux.save( "aux-5.tnl" );
+
          for( cell.getCoordinates().x() = 0;
-              cell.getCoordinates().x() < mesh->getDimensions().x();
+              cell.getCoordinates().x() < mesh->getDimensions().y();
               cell.getCoordinates().x()++ )
-            {
-               //std::cerr << "3 -> ";
-               cell.refresh();
-               if( ! interfaceMap( cell ) )            
-                  this->updateCell( aux, cell );
-            }
+         {
+            for( cell.getCoordinates().y() = mesh->getDimensions().x() - 1;
+                 cell.getCoordinates().y() >= 0 ;
+                 cell.getCoordinates().y()-- )		
+               {
+                  //std::cerr << "2 -> ";
+                  cell.refresh();
+                  if( ! interfaceMap( cell ) )            
+                     this->updateCell( aux, cell );
+               }
          }
-      //aux.save( "aux-3.tnl" );
-
+         aux.save( "aux-6.tnl" );
 
-      for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-           cell.getCoordinates().y() >= 0;
-           cell.getCoordinates().y()-- )
-         {
-         for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+         for( cell.getCoordinates().x() = mesh->getDimensions().y() - 1;
               cell.getCoordinates().x() >= 0 ;
-              cell.getCoordinates().x()-- )		
+              cell.getCoordinates().x()-- )
             {
-               //std::cerr << "4 -> ";
-               cell.refresh();
-               if( ! interfaceMap( cell ) )            
-                  this->updateCell( aux, cell );
+            for( cell.getCoordinates().y() = 0;
+                 cell.getCoordinates().y() < mesh->getDimensions().x();
+                 cell.getCoordinates().y()++ )
+               {
+                  //std::cerr << "3 -> ";
+                  cell.refresh();
+                  if( ! interfaceMap( cell ) )            
+                     this->updateCell( aux, cell );
+               }
             }
-         }   
-      //aux.save( "aux-4.tnl" );
+         aux.save( "aux-7.tnl" );
+
+         for( cell.getCoordinates().x() = mesh->getDimensions().y() - 1;
+              cell.getCoordinates().x() >= 0;
+              cell.getCoordinates().x()-- )
+            {
+            for( cell.getCoordinates().y() = mesh->getDimensions().x() - 1;
+                 cell.getCoordinates().y() >= 0 ;
+                 cell.getCoordinates().y()-- )		
+               {
+                  //std::cerr << "4 -> ";
+                  cell.refresh();
+                  if( ! interfaceMap( cell ) )            
+                     this->updateCell( aux, cell );
+               }
+            }*/
+      }
+      if( std::is_same< DeviceType, Devices::Cuda >::value )
+      {
+         // TODO: CUDA code
+#ifdef HAVE_CUDA
+          
+          Real *dAux;
+          cudaMalloc(&dAux, ( mesh->getDimensions().x() * mesh->getDimensions().y() ) * sizeof( Real ) );
+          
+          
+          
+          
+          const int cudaBlockSize( 16 );
+          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+          int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
+          dim3 blockSize( cudaBlockSize, cudaBlockSize );
+          dim3 gridSize( numBlocksX, numBlocksY );
+          
+          tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
+          
+          aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 );
+          
+          //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) );
+
+          int *BlockIterDevice;
+          int BlockIterD = 1;
+          
+          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );
+          int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
+          int *dBlock;
+          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
+          
+          while( BlockIterD )
+          {
+           /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+                BlockIter[ i ] = false;*/
+                       
+            CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
+                                                             interfaceMapPtr.template getData< Device >(),
+                                                             dAux,
+                                                             BlockIterDevice );
+            
+            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+            
+            cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+                                   
+            /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
+                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
+            
+          }
+          aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 );
+          cudaFree( dAux );
+          cudaFree( BlockIterDevice );
+          cudaFree( dBlock );
+          cudaDeviceSynchronize();
+          
+          TNL_CHECK_CUDA_DEVICE;
+              
+          //aux = *auxPtr;
+          //interfaceMap = *interfaceMapPtr;
+#endif
+      }
       iteration++;
    }
+   aux.save("aux-final.tnl");
 }
 
+#ifdef HAVE_CUDA
+template < typename Real, typename Device, typename Index >
+__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a )
+{
+    int i = threadIdx.x + blockDim.x*blockIdx.x;
+    int j = blockDim.y*blockIdx.y + threadIdx.y;
+    const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 1 )
+    {    
+        dAux[ j*mesh.getDimensions().x() + i ] = aux[ j*mesh.getDimensions().x() + i ];
+    }
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 0 )
+    {    
+        aux[ j*mesh.getDimensions().x() + i ] = dAux[ j*mesh.getDimensions().x() + i ];
+    }
+    
+}
+
+__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks )
+{
+    int i = threadIdx.x;
+    int blId = blockIdx.x;
+    __shared__ volatile int sArray[ 512 ];
+    sArray[ i ] = false;
+    if(blId * 1024 + i < nBlocks )
+        sArray[ i ] = BlockIterDevice[ blId * 1024 + i ];
+    
+    if (blockDim.x * blockDim.y == 1024) {
+        if (i < 512)
+            sArray[ i ] += sArray[ i ];
+    }
+    __syncthreads();
+    if (blockDim.x * blockDim.y >= 512) {
+        if (i < 256) {
+            sArray[ i ] += sArray[ i ];
+        }
+    }
+    if (blockDim.x * blockDim.y >= 256) {
+        if (i < 128) {
+            sArray[ i ] += sArray[ i + 128 ];
+        }
+    }
+    __syncthreads();
+    if (blockDim.x * blockDim.y >= 128) {
+        if (i < 64) {
+            sArray[ i ] += sArray[ i + 64 ];
+        }
+    }
+    __syncthreads();
+    if (i < 32 )
+    {
+        if(  blockDim.x * blockDim.y >= 64 ) sArray[ i ] += sArray[ i + 32 ];
+        if(  blockDim.x * blockDim.y >= 32 )  sArray[ i ] += sArray[ i + 16 ];
+        if(  blockDim.x * blockDim.y >= 16 )  sArray[ i ] += sArray[ i + 8 ];
+        if(  blockDim.x * blockDim.y >= 8 )  sArray[ i ] += sArray[ i + 4 ];
+        if(  blockDim.x * blockDim.y >= 4 )  sArray[ i ] += sArray[ i + 2 ];
+        if(  blockDim.x * blockDim.y >= 2 )  sArray[ i ] += sArray[ i + 1 ];
+    }
+    
+    if( i == 0 )
+        dBlock[ blId ] = sArray[ 0 ];
+}
+
+
+
+template < typename Real, typename Device, typename Index >
+__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
+                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+                                      Real *aux,
+                                      int *BlockIterDevice )
+{
+    int thri = threadIdx.x; int thrj = threadIdx.y;
+    int blIdx = blockIdx.x; int blIdy = blockIdx.y;
+    int i = thri + blockDim.x*blIdx;
+    int j = blockDim.y*blIdy + thrj;
+    int currentIndex = thrj * blockDim.x + thri;
+    
+    //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
+    __shared__ volatile bool changed[16*16];
+    changed[ currentIndex ] = false;
+    
+    if( thrj == 0 && thri == 0 )
+        changed[ 0 ] = true;
+    
+    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+    __shared__ Real hx;
+    __shared__ Real hy;
+    if( thrj == 1 && thri == 1 )
+    {
+        hx = mesh.getSpaceSteps().x();
+        hy = mesh.getSpaceSteps().y();
+    }
+    
+    //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
+    __shared__ volatile Real sArray[18][18];
+    sArray[thrj][thri] = std::numeric_limits< Real >::max();
+    
+    //filling sArray edges
+    int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+    __shared__ volatile int numOfBlockx;
+    __shared__ volatile int numOfBlocky;
+    __shared__ int xkolik;
+    __shared__ int ykolik;
+    if( thri == 0 && thrj == 0 )
+    {
+        xkolik = blockDim.x + 1;
+        ykolik = blockDim.y + 1;
+        numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
+        numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
+    
+        if( numOfBlockx - 1 == blIdx )
+            xkolik = dimX - (blIdx)*blockDim.x+1;
+
+        if( numOfBlocky -1 == blIdy )
+            ykolik = dimY - (blIdy)*blockDim.y+1;
+        BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
+    }
+    __syncthreads();
+    
+    if( thri == 0 )
+    {        
+        if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik )
+            sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
+        else
+            sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 1 )
+    {
+        if( blIdx != 0 && thrj+1 < ykolik )
+            sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ];
+        else
+            sArray[thrj+1][0] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 2 )
+    {
+        if( dimY > (blIdy+1) * blockDim.y  && thri+1 < xkolik )
+            sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
+        else
+           sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 3 )
+    {
+        if( blIdy != 0 && thrj+1 < xkolik )
+            sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ];
+        else
+            sArray[0][thrj+1] = std::numeric_limits< Real >::max();
+    }
+    
+        
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+    {    
+        sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
+    }
+    __syncthreads();  
+
+    while( changed[ 0 ] )
+    {
+        __syncthreads();
+        
+        changed[ currentIndex] = false;
+        
+    //calculation of update cell
+        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+        {
+            if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
+            {
+                changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
+            }
+        }
+        __syncthreads();
+        
+    //pyramid reduction
+        if( blockDim.x*blockDim.y == 1024 )
+        {
+            if( currentIndex < 512 )
+            {
+                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
+            }
+        }
+        __syncthreads();
+        if( blockDim.x*blockDim.y >= 512 )
+        {
+            if( currentIndex < 256 )
+            {
+                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+            }
+        }
+        __syncthreads();
+        if( blockDim.x*blockDim.y >= 256 )
+        {
+            if( currentIndex < 128 )
+            {
+                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
+            }
+        }
+        __syncthreads();
+        if( blockDim.x*blockDim.y >= 128 )
+        {
+            if( currentIndex < 64 )
+            {
+                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+            }
+        }
+        __syncthreads();
+        if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
+        {
+            if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+            if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+            if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+            if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+            if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+            if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+        }
+        if( changed[ 0 ] && thri == 0 && thrj == 0 )
+            BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
+        __syncthreads();
+    }
+  
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
+        aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
+}
+#endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 6fad87c7ea63f50a13e7b9e38c65994998e2b441..b024979cc6247022bd3ee156c877c2b47f71f99a 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -55,15 +55,457 @@ template< typename Real,
 void
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
 solve( const MeshPointer& mesh,
-       const AnisotropyType& anisotropy,
-       MeshFunctionType& u )
+       const AnisotropyPointer& anisotropy,
+       MeshFunctionPointer& u )
 {
-   MeshFunctionType aux;
-   InterfaceMapType interfaceMap;
-   aux.setMesh( mesh );
-   interfaceMap.setMesh( mesh );
+   MeshFunctionPointer auxPtr;
+   InterfaceMapPointer interfaceMapPtr;
+   auxPtr->setMesh( mesh );
+   interfaceMapPtr->setMesh( mesh );
    std::cout << "Initiating the interface cells ..." << std::endl;
-   BaseType::initInterface( u, aux, interfaceMap );
-   aux.save( "aux-ini.tnl" );   
+   BaseType::initInterface( u, auxPtr, interfaceMapPtr );
+#ifdef HAVE_CUDA
+   cudaDeviceSynchronize();
+#endif
+   auxPtr->save( "aux-ini.tnl" );   
+   
+   typename MeshType::Cell cell( *mesh );
+   
+   IndexType iteration( 0 );
+   MeshFunctionType aux = *auxPtr;
+   InterfaceMapType interfaceMap = * interfaceMapPtr;
+    while( iteration < this->maxIterations )
+    {
+        if( std::is_same< DeviceType, Devices::Host >::value )
+        {
+           for( cell.getCoordinates().z() = 0;
+                cell.getCoordinates().z() < mesh->getDimensions().z();
+                cell.getCoordinates().z()++ )
+           {
+              for( cell.getCoordinates().y() = 0;
+                   cell.getCoordinates().y() < mesh->getDimensions().y();
+                   cell.getCoordinates().y()++ )
+              {
+                 for( cell.getCoordinates().x() = 0;
+                      cell.getCoordinates().x() < mesh->getDimensions().x();
+                      cell.getCoordinates().x()++ )
+                 {
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }
+           //aux.save( "aux-1.tnl" );
+
+           for( cell.getCoordinates().z() = 0;
+                cell.getCoordinates().z() < mesh->getDimensions().z();
+                cell.getCoordinates().z()++ )
+           {
+              for( cell.getCoordinates().y() = 0;
+                   cell.getCoordinates().y() < mesh->getDimensions().y();
+                   cell.getCoordinates().y()++ )
+              {
+                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                      cell.getCoordinates().x() >= 0 ;
+                      cell.getCoordinates().x()-- )		
+                 {
+                    //std::cerr << "2 -> ";
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )            
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }
+           //aux.save( "aux-2.tnl" );
+           for( cell.getCoordinates().z() = 0;
+                cell.getCoordinates().z() < mesh->getDimensions().z();
+                cell.getCoordinates().z()++ )
+           {
+              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                   cell.getCoordinates().y() >= 0 ;
+                   cell.getCoordinates().y()-- )
+              {
+                 for( cell.getCoordinates().x() = 0;
+                      cell.getCoordinates().x() < mesh->getDimensions().x();
+                      cell.getCoordinates().x()++ )
+                 {
+                    //std::cerr << "3 -> ";
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )            
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }
+           //aux.save( "aux-3.tnl" );
+
+           for( cell.getCoordinates().z() = 0;
+                cell.getCoordinates().z() < mesh->getDimensions().z();
+                cell.getCoordinates().z()++ )
+           {
+              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                   cell.getCoordinates().y() >= 0;
+                   cell.getCoordinates().y()-- )
+              {
+                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                      cell.getCoordinates().x() >= 0 ;
+                      cell.getCoordinates().x()-- )		
+                 {
+                    //std::cerr << "4 -> ";
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )            
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }     
+           //aux.save( "aux-4.tnl" );
+
+           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+                cell.getCoordinates().z() >= 0;
+                cell.getCoordinates().z()-- )
+           {
+              for( cell.getCoordinates().y() = 0;
+                   cell.getCoordinates().y() < mesh->getDimensions().y();
+                   cell.getCoordinates().y()++ )
+              {
+                 for( cell.getCoordinates().x() = 0;
+                      cell.getCoordinates().x() < mesh->getDimensions().x();
+                      cell.getCoordinates().x()++ )
+                 {
+                    //std::cerr << "5 -> ";
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }
+           //aux.save( "aux-5.tnl" );
+
+           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+                cell.getCoordinates().z() >= 0;
+                cell.getCoordinates().z()-- )
+           {
+              for( cell.getCoordinates().y() = 0;
+                   cell.getCoordinates().y() < mesh->getDimensions().y();
+                   cell.getCoordinates().y()++ )
+              {
+                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                      cell.getCoordinates().x() >= 0 ;
+                      cell.getCoordinates().x()-- )		
+                 {
+                    //std::cerr << "6 -> ";
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )            
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }
+           //aux.save( "aux-6.tnl" );
+
+           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+                cell.getCoordinates().z() >= 0;
+                cell.getCoordinates().z()-- )
+           {
+              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                   cell.getCoordinates().y() >= 0 ;
+                   cell.getCoordinates().y()-- )
+              {
+                 for( cell.getCoordinates().x() = 0;
+                      cell.getCoordinates().x() < mesh->getDimensions().x();
+                      cell.getCoordinates().x()++ )
+                 {
+                    //std::cerr << "7 -> ";
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )            
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }
+           //aux.save( "aux-7.tnl" );
+
+           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+                cell.getCoordinates().z() >= 0;
+                cell.getCoordinates().z()-- )
+           {
+              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                   cell.getCoordinates().y() >= 0;
+                   cell.getCoordinates().y()-- )
+              {
+                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                      cell.getCoordinates().x() >= 0 ;
+                      cell.getCoordinates().x()-- )		
+                 {
+                    //std::cerr << "8 -> ";
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )            
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }
+      }
+      if( std::is_same< DeviceType, Devices::Cuda >::value )
+      {
+         // TODO: CUDA code
+#ifdef HAVE_CUDA
+          const int cudaBlockSize( 8 );
+          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+          int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
+          int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); 
+          if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
+              std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
+          dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+          dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
+                 
+          tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
+          
+          int *BlockIterDevice;
+          int BlockIterD = 1;
+          
+          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );
+          int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
+          int *dBlock;
+          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
+          
+          while( BlockIterD )
+          {
+             CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
+                                                              interfaceMapPtr.template getData< Device >(),
+                                                              auxPtr.template modifyData< Device>(),
+                                                              BlockIterDevice );
+            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
+            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+            
+            cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+                                   
+            /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
+                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
+            
+          }
+          cudaFree( BlockIterDevice );
+          cudaFree( dBlock );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          aux = *auxPtr;
+          interfaceMap = *interfaceMapPtr;
+#endif
+      }
+        
+      //aux.save( "aux-8.tnl" );
+      iteration++;
+      
+   }
+   aux.save("aux-final.tnl");
 }
 
+#ifdef HAVE_CUDA
+template < typename Real, typename Device, typename Index >
+__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
+                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+                                      int *BlockIterDevice )
+{
+    int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
+    int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
+    int i = threadIdx.x + blockDim.x*blockIdx.x;
+    int j = blockDim.y*blockIdx.y + threadIdx.y;
+    int k = blockDim.z*blockIdx.z + threadIdx.z;
+    int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
+    
+    __shared__ volatile bool changed[8*8*8];
+    changed[ currentIndex ] = false;
+    
+    if( thrj == 0 && thri == 0 && thrk == 0 )
+        changed[ 0 ] = true;
+    
+    const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+    __shared__ Real hx;
+    __shared__ Real hy;
+    __shared__ Real hz;
+    if( thrj == 1 && thri == 1 && thrk == 1 )
+    {
+        hx = mesh.getSpaceSteps().x();
+        hy = mesh.getSpaceSteps().y();
+        hz = mesh.getSpaceSteps().z();
+    }
+    __shared__ volatile Real sArray[10][10][10];
+    sArray[thrk][thrj][thri] = std::numeric_limits< Real >::max();
+    if(thri == 0 )
+    {
+        sArray[8][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
+        sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
+        sArray[thrk+1][thrj+1][8] = std::numeric_limits< Real >::max();
+        sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max();
+        sArray[thrj+1][8][thrk+1] = std::numeric_limits< Real >::max();
+        sArray[thrj+1][9][thrk+1] = std::numeric_limits< Real >::max();
+    }
+            
+    //filling sArray edges
+    int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+    int dimZ = mesh.getDimensions().z();
+    __shared__ volatile int numOfBlockx;
+    __shared__ volatile int numOfBlocky;
+    __shared__ volatile int numOfBlockz;
+    __shared__ int xkolik;
+    __shared__ int ykolik;
+    __shared__ int zkolik;
+    if( thri == 0 && thrj == 0 && thrk == 0 )
+    {
+        xkolik = blockDim.x + 1;
+        ykolik = blockDim.y + 1;
+        zkolik = blockDim.z + 1;
+        numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
+        numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
+        numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0);
+        
+        if( numOfBlockx - 1 == blIdx )
+            xkolik = dimX - (blIdx)*blockDim.x+1;
+
+        if( numOfBlocky -1 == blIdy )
+            ykolik = dimY - (blIdy)*blockDim.y+1;
+        if( numOfBlockz-1 == blIdz )
+            zkolik = dimZ - (blIdz)*blockDim.z+1;
+        
+        BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0;
+    }
+    __syncthreads();
+    
+    if( thri == 0 )
+    {        
+        if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
+            sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
+        else
+            sArray[thrk+1][thrj+1][0] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 1 )
+    {
+        if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
+            sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
+        else
+            sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max();
+    }
+    if( thri == 2 )
+    {        
+        if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
+            sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
+        else
+            sArray[thrk+1][0][thrj+1] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 3 )
+    {
+        if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
+            sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
+        else
+            sArray[thrk+1][9][thrj+1] = std::numeric_limits< Real >::max();
+    }
+    if( thri == 4 )
+    {        
+        if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
+            sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
+        else
+            sArray[0][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 5 )
+    {
+        if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
+            sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
+        else
+            sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
+    }
+    
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
+    {
+        sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
+    }
+    __shared__ volatile int loopcounter;
+    loopcounter = 0;
+    __syncthreads(); 
+    while( changed[ 0 ] )
+    {
+        __syncthreads();
+        
+        changed[ currentIndex ] = false;
+        
+    //calculation of update cell
+        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ )
+        {
+            if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] )
+            {
+                changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
+            }
+        }
+        __syncthreads();
+        
+    //pyramid reduction
+        if( blockDim.x*blockDim.y*blockDim.z == 1024 )
+        {
+            if( currentIndex < 512 )
+            {
+                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
+            }
+        }
+        __syncthreads();
+        if( blockDim.x*blockDim.y*blockDim.z >= 512 )
+        {
+            if( currentIndex < 256 )
+            {
+                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+            }
+        }
+        __syncthreads();
+        if( blockDim.x*blockDim.y*blockDim.z >= 256 )
+        {
+            if( currentIndex < 128 )
+            {
+                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
+            }
+        }
+        __syncthreads();
+        if( blockDim.x*blockDim.y*blockDim.z >= 128 )
+        {
+            if( currentIndex < 64 )
+            {
+                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+            }
+        }
+        __syncthreads();
+        if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
+        {
+            if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+            if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+            if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+            if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+            if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+            if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+        }
+        __syncthreads();
+        
+        /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0)
+        {
+            for(int m = 0; m < 8; m++){
+                for(int n = 0; n<8; n++){
+                    for(int b=0; b<8; b++)
+                        printf(" %i ", changed[m*64 + n*8 + b]);
+                    printf("\n");
+                }
+                printf("\n \n");
+            }
+        }*/
+        if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
+        {
+            //loopcounter++;
+            BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1;
+        }
+        __syncthreads();
+        /*if(thri == 0 && thrj==0 && thrk==0)
+            printf("%i \n",loopcounter);
+        if(loopcounter == 500)
+            break;*/
+    }
+  
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ k*dimX*dimY+j * mesh.getDimensions().x() + i ]) )
+        aux[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
+}   
+#endif
diff --git a/src/TNL/Math.h b/src/TNL/Math.h
index 1f754bcecaa88ffc973387b56aa9d692aaca344c..67e9f5e082af0b238e1d045bc516bf578efe82a0 100644
--- a/src/TNL/Math.h
+++ b/src/TNL/Math.h
@@ -72,7 +72,12 @@ template< class T >
 __cuda_callable__ inline
 T abs( const T& n )
 {
-#if defined(__MIC__)
+#if defined(__CUDA_ARCH__)
+   if( std::is_integral< T >::value )
+      return ::abs( n );
+   else
+      return ::fabs( n );
+#elif defined(__MIC__)
    if( n < ( T ) 0 )
       return -n;
    return n;
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp
index 7d34ea0ccd210a2b86605ec12df4a1b99b3c32b0..01cf6e58e76dcdc3ffbe286dd39bf1b3f27523c6 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp
@@ -389,7 +389,7 @@ setupNeighbors()
       if( this->isThereNeighbor( direction ) )
          this->neighbors[ i ] = this->getRankOfProcCoord( coordinates );
       else
-         this->neighbors[ i ] =- 1;
+         this->neighbors[ i ] = -1;
       
       // Handling periodic neighbors
       for( int d = 0; d < Dimension; d++ )
diff --git a/src/Tools/tnl-diff.h b/src/Tools/tnl-diff.h
index 40b7c5fe01e8950a4db38f4b498813a949bd1036..5f4ca182a9fb23783c42f0f47ee13fcdefe9ea8e 100644
--- a/src/Tools/tnl-diff.h
+++ b/src/Tools/tnl-diff.h
@@ -300,6 +300,7 @@ bool computeDifference( const MeshPointer& meshPointer, const String& objectType
    if( objectType == "Containers::Vector" ||
        objectType == "tnlVector" || objectType == "tnlSharedVector" )   // TODO: remove deprecated type name
       return computeDifferenceOfVectors< MeshPointer, Element, Real, Index >( meshPointer, parameters );
+   std::cerr << "Unknown object type " << objectType << "." << std::endl;
    return false;
 }
 
@@ -490,4 +491,4 @@ bool resolveGridRealType( const Containers::List< String >& parsedMeshType,
    return false;
 }
 
-#endif /* TNL_DIFF_H_ */
+#endif /* TNL_DIFF_H_ */
\ No newline at end of file