From 9f44ea1d142af4caa0dd9a4afe405f09da02727d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matou=C5=A1=20Fencl?= <fenclmat@fjfi.cvut.cz>
Date: Fri, 30 Nov 2018 09:02:24 +0100
Subject: [PATCH 01/14] Change of int to IndexType and preparations for
 OpenMPI.

---
 .../hamilton-jacobi/HamiltonJacobiProblem.h   |  2 ++
 .../HamiltonJacobiProblem_impl.h              | 20 +++++++++++++++----
 2 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h
index a41442000..7f1bd4193 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h
@@ -25,6 +25,8 @@
 #include <solvers/pde/tnlLinearSystemAssembler.h>
 #include <functions/tnlMeshFunction.h>
 
+#include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
+
 template< typename Mesh,
 		    typename DifferentialOperator,
 		    typename BoundaryCondition,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h
index 3cc638849..9244b1833 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h
@@ -123,12 +123,24 @@ setInitialCondition( const Config::ParameterContainer& parameters,
 {
   this->bindDofs( mesh, dofs );
   const String& initialConditionFile = parameters.getParameter< String >( "initial-condition" );
-  if( ! this->solution.boundLoad( initialConditionFile ) )
+  if(CommunicatorType::isDistributed())
   {
-    std::cerr << "I am not able to load the initial condition from the file " << initialConditionFile << "." <<std::endl;
-    return false;
+    std::cout<<"Nodes Distribution: " << uPointer->getMesh().getDistributedMesh()->printProcessDistr() << std::endl;
+    if(distributedIOType==Meshes::DistributedMeshes::MpiIO)
+      Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::MpiIO> ::load(initialConditionFile, *uPointer );
+    if(distributedIOType==Meshes::DistributedMeshes::LocalCopy)
+      Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::LocalCopy> ::load(initialConditionFile, *uPointer );
+    uPointer->template synchronize<CommunicatorType>();
   }
-  return true;
+  else
+  {
+    if( ! this->solution.boundLoad( initialConditionFile ) )
+    {
+      std::cerr << "I am not able to load the initial condition from the file " << initialConditionFile << "." <<std::endl;
+      return false;
+    }
+  }
+   return true;
 }
 
 template< typename Mesh,
-- 
GitLab


From 97ee2879c839634934466489fc158f7093f6b411 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matou=C5=A1=20Fencl?= <fenclmat@fjfi.cvut.cz>
Date: Fri, 30 Nov 2018 09:06:24 +0100
Subject: [PATCH 02/14] Changed int to IndexType

---
 .../tnlDirectEikonalMethodsBase_impl.h               | 12 ++++++------
 .../hamilton-jacobi/tnlFastSweepingMethod2D_impl.h   |  8 ++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 26444bcfa..47561768e 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -96,10 +96,10 @@ template< typename Real,
 template< int sizeSArray >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-updateBlocks( const InterfaceMapType& interfaceMap,
-        MeshFunctionType& aux,
-        MeshFunctionType& helpFunc,
-        ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
+updateBlocks( InterfaceMapType interfaceMap,
+        MeshFunctionType aux,
+        MeshFunctionType helpFunc,
+        ArrayContainer BlockIterHost, IndexType numThreadsPerBlock/*, Real **sArray*/ )
 {
 #pragma omp parallel for schedule( dynamic )
   for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
@@ -267,13 +267,13 @@ updateBlocks( const InterfaceMapType& interfaceMap,
 template< typename Real,
         typename Device,
         typename Index >
-template< int sizeSArray >
+template< IndexType sizeSArray >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
 updateBlocks( const InterfaceMapType& interfaceMap,
         const MeshFunctionType& aux,
         MeshFunctionType& helpFunc,
-        ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
+        ArrayContainer BlockIterHost, IndexType numThreadsPerBlock/*, Real **sArray*/ )
 {  
 //#pragma omp parallel for schedule( dynamic )
   for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 89cb60881..f7be7e7de 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -97,11 +97,11 @@ solve( const MeshPointer& mesh,
   if( i == 0 ) {
     printf( "0: mesh x: %d\n", mesh->getDimensions().x() );
     printf( "0: mesh y: %d\n", mesh->getDimensions().y() );
-    //aux.save("aux_proc0.tnl");
-    /*for( int k = 0; k < mesh->getDimensions().x()*mesh->getDimensions().y(); k++ )
+    aux.save("aux_proc0.tnl");
+    for( int k = 0; k < 16*16; k++ )
       aux[ k ] = 10;
-    for( int k = 0; k < mesh->getDimensions().x(); k++ ){
-      for( int l = 0; l < mesh->getDimensions().y(); l++ )
+    for( int k = 0; k < 16; k++ ){
+      for( int l = 0; l < 16; l++ )
         printf("%f.2\t",aux[ k * 16 + l ] );
     printf("\n");
     }*/
-- 
GitLab


From 933cc22bce2c84cd5c880862e94bf457424b6c17 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matou=C5=A1=20Fencl?= <fenclmat@fjfi.cvut.cz>
Date: Fri, 30 Nov 2018 15:16:19 +0100
Subject: [PATCH 03/14] MPI ready in tnlDirectEikonal*

---
 .../hamilton-jacobi/HamiltonJacobiProblem.h   |  2 --
 .../HamiltonJacobiProblem_impl.h              | 20 ++++---------------
 .../tnlDirectEikonalMethodsBase_impl.h        |  6 +++---
 .../tnlFastSweepingMethod2D_impl.h            |  8 ++++----
 4 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h
index 7f1bd4193..a41442000 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem.h
@@ -25,8 +25,6 @@
 #include <solvers/pde/tnlLinearSystemAssembler.h>
 #include <functions/tnlMeshFunction.h>
 
-#include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
-
 template< typename Mesh,
 		    typename DifferentialOperator,
 		    typename BoundaryCondition,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h
index 9244b1833..3cc638849 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/HamiltonJacobiProblem_impl.h
@@ -123,24 +123,12 @@ setInitialCondition( const Config::ParameterContainer& parameters,
 {
   this->bindDofs( mesh, dofs );
   const String& initialConditionFile = parameters.getParameter< String >( "initial-condition" );
-  if(CommunicatorType::isDistributed())
+  if( ! this->solution.boundLoad( initialConditionFile ) )
   {
-    std::cout<<"Nodes Distribution: " << uPointer->getMesh().getDistributedMesh()->printProcessDistr() << std::endl;
-    if(distributedIOType==Meshes::DistributedMeshes::MpiIO)
-      Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::MpiIO> ::load(initialConditionFile, *uPointer );
-    if(distributedIOType==Meshes::DistributedMeshes::LocalCopy)
-      Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::LocalCopy> ::load(initialConditionFile, *uPointer );
-    uPointer->template synchronize<CommunicatorType>();
+    std::cerr << "I am not able to load the initial condition from the file " << initialConditionFile << "." <<std::endl;
+    return false;
   }
-  else
-  {
-    if( ! this->solution.boundLoad( initialConditionFile ) )
-    {
-      std::cerr << "I am not able to load the initial condition from the file " << initialConditionFile << "." <<std::endl;
-      return false;
-    }
-  }
-   return true;
+  return true;
 }
 
 template< typename Mesh,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 47561768e..68e9c5f2b 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -99,7 +99,7 @@ tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 updateBlocks( InterfaceMapType interfaceMap,
         MeshFunctionType aux,
         MeshFunctionType helpFunc,
-        ArrayContainer BlockIterHost, IndexType numThreadsPerBlock/*, Real **sArray*/ )
+        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
 {
 #pragma omp parallel for schedule( dynamic )
   for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
@@ -267,13 +267,13 @@ updateBlocks( InterfaceMapType interfaceMap,
 template< typename Real,
         typename Device,
         typename Index >
-template< IndexType sizeSArray >
+template< int sizeSArray >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
 updateBlocks( const InterfaceMapType& interfaceMap,
         const MeshFunctionType& aux,
         MeshFunctionType& helpFunc,
-        ArrayContainer BlockIterHost, IndexType numThreadsPerBlock/*, Real **sArray*/ )
+        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
 {  
 //#pragma omp parallel for schedule( dynamic )
   for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index f7be7e7de..89cb60881 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -97,11 +97,11 @@ solve( const MeshPointer& mesh,
   if( i == 0 ) {
     printf( "0: mesh x: %d\n", mesh->getDimensions().x() );
     printf( "0: mesh y: %d\n", mesh->getDimensions().y() );
-    aux.save("aux_proc0.tnl");
-    for( int k = 0; k < 16*16; k++ )
+    //aux.save("aux_proc0.tnl");
+    /*for( int k = 0; k < mesh->getDimensions().x()*mesh->getDimensions().y(); k++ )
       aux[ k ] = 10;
-    for( int k = 0; k < 16; k++ ){
-      for( int l = 0; l < 16; l++ )
+    for( int k = 0; k < mesh->getDimensions().x(); k++ ){
+      for( int l = 0; l < mesh->getDimensions().y(); l++ )
         printf("%f.2\t",aux[ k * 16 + l ] );
     printf("\n");
     }*/
-- 
GitLab


From 5c15d04c3ff096fd5b5df51ebf57386b8f218ddc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matou=C5=A1=20Fencl?= <fenclmat@fjfi.cvut.cz>
Date: Tue, 19 Feb 2019 19:04:27 +0100
Subject: [PATCH 04/14] MPI implemented for CPU and GPU in 2D but
 meshFunction.template synchronize< Communicator >(); doesn't copy overlaps.

---
 .../tnl-direct-eikonal-solver.h               |   3 +
 .../tnlDirectEikonalMethodsBase.h             |  12 +-
 .../tnlDirectEikonalMethodsBase_impl.h        | 267 +++--
 .../hamilton-jacobi/tnlDirectEikonalProblem.h |   2 +
 .../tnlDirectEikonalProblem_impl.h            |  66 +-
 .../hamilton-jacobi/tnlFastSweepingMethod.h   |  20 +-
 .../tnlFastSweepingMethod1D_impl.h            |  13 +-
 .../tnlFastSweepingMethod2D_impl.h            | 941 ++++++++++++------
 .../tnlFastSweepingMethod3D_impl.h            |  13 +-
 9 files changed, 833 insertions(+), 504 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h
index 1b46ecb3d..82411c939 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h
@@ -36,6 +36,9 @@ class DirectEikonalSolverConfig
       {
          config.addDelimiter( "Direct eikonal equation solver settings:" );
          config.addRequiredEntry< String >( "input-file", "Input file." );
+         config.addEntry< String >( "distributed-grid-io-type", "Choose Distributed Grid IO Type", "LocalCopy");
+            config.addEntryEnum< String >( "LocalCopy" );
+            config.addEntryEnum< String >( "MpiIO" );
       };
 };
 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index 24a388554..3a78d0f54 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -70,7 +70,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
             InterfaceMapPointer& interfaceMap );
     
     template< typename MeshEntity >
-    __cuda_callable__ void updateCell( MeshFunctionType& u,
+    __cuda_callable__ bool updateCell( MeshFunctionType& u,
             const MeshEntity& cell,
             const RealType velocity = 1.0 );
     
@@ -147,7 +147,12 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
         Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock =0);
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+        Containers::StaticVector< 2, Index > vLower, Containers::StaticVector< 2, Index > vUpper, int k,int oddEvenBlock =0);
+
+template< typename Real, typename Device, typename Index >
+__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc );
 
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
@@ -160,7 +165,8 @@ __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, I
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
         Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap );
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+        Containers::StaticVector< 2, Index > vLower, Containers::StaticVector< 2, Index > vUpper );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 68e9c5f2b..3f5b6eed2 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -722,6 +722,10 @@ initInterface( const MeshFunctionPointer& _input,
   {
 #ifdef HAVE_CUDA
     const MeshType& mesh = _input->getMesh();
+    Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh.getDistributedMesh();
+    
+    Containers::StaticVector< 2, Index > vLower = meshPom->getLowerOverlap();
+    Containers::StaticVector< 2, Index > vUpper = meshPom->getUpperOverlap();
     
     const int cudaBlockSize( 16 );
     int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
@@ -731,30 +735,31 @@ initInterface( const MeshFunctionPointer& _input,
     Devices::Cuda::synchronizeDevice();
     CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
             _output.template modifyData< Device >(),
-            _interfaceMap.template modifyData< Device >() );
+            _interfaceMap.template modifyData< Device >(),
+            vLower, vUpper);
     cudaDeviceSynchronize();
     TNL_CHECK_CUDA_DEVICE;
 #endif
   }
   if( std::is_same< Device, Devices::Host >::value )
   {
-    MeshFunctionType input = _input.getData();
-    
-    /*double A[320][320];
-     std::ifstream fileInit("/home/maty/Downloads/initData.txt");
-     
-     for (int i = 0; i < 320; i++)
-     for (int j = 0; j < 320; j++)
-     fileInit >> A[j];
-     fileInit.close();
-     for (int i = 0; i < 320; i++)
-     for (int j = 0; j < 320; j++)
-     input[i*320 + j] = A[j];*/
-    
-    
+    MeshFunctionType input = _input.getData();    
     MeshFunctionType& output = _output.modifyData();
     InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
     const MeshType& mesh = input.getMesh();
+/*#ifdef HAVE_MPI
+    int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup );
+    if( i == 0 )
+    {
+      printf( "0: mesh x: %d\n", mesh.getDimensions().x() );
+      printf( "0: mesh y: %d\n", mesh.getDimensions().y() );
+      for( int k = 0; k < mesh.getDimensions().y(); k++ ){
+        for( int l = 0; l < mesh.getDimensions().x(); l++ )
+          printf( "%.2f\t", input[ k * 16 + l ] );
+        printf("\n");
+      }
+    }
+#endif*/
     typedef typename MeshType::Cell Cell;
     Cell cell( mesh );
     for( cell.getCoordinates().y() = 0;
@@ -766,8 +771,8 @@ initInterface( const MeshFunctionPointer& _input,
       {
         cell.refresh();
         output[ cell.getIndex() ] =
-                input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
-                  - std::numeric_limits< RealType >::max();
+                input( cell ) >= 0 ? 10://std::numeric_limits< RealType >::max() :
+                  -10;//- std::numeric_limits< RealType >::max();
         interfaceMap[ cell.getIndex() ] = false;
       }
     
@@ -850,6 +855,19 @@ initInterface( const MeshFunctionPointer& _input,
           }
         }
       }
+#ifdef HAVE_MPI
+    //int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup );
+    /*if( i == 0 )
+    {
+      printf( "0: mesh x: %d\n", mesh.getDimensions().x() );
+      printf( "0: mesh y: %d\n", mesh.getDimensions().y() );
+      for( int k = 0; k < mesh.getDimensions().y(); k++ ){
+        for( int l = 0; l < mesh.getDimensions().x(); l++ )
+          printf("%.2f\t",output[ k * 16 + l ] );
+        printf("\n");
+      }
+    }*/
+#endif
   }
 }
 
@@ -858,7 +876,7 @@ template< typename Real,
         typename Index >
 template< typename MeshEntity >
 __cuda_callable__
-void
+bool
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 updateCell( MeshFunctionType& u,
         const MeshEntity& cell,   
@@ -890,47 +908,39 @@ updateCell( MeshFunctionType& u,
     b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0,  -1 >() ],
             u[ neighborEntities.template getEntityIndex< 0,   1 >() ] );
   }
-  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-          fabs( b ) == std::numeric_limits< RealType >::max() )
-    return;
-  /*if( fabs( a ) == TypeInfo< Real >::getMaxValue() ||
-   fabs( b ) == TypeInfo< Real >::getMaxValue() ||
-   fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) )
-   {
-   tmp = 
-   fabs( a ) >= fabs( b ) ? b + TNL::sign( value ) * hy :
-   a + TNL::sign( value ) * hx;
-   }*/
-  /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( b ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( a - b ) < TNL::sqrt( (hx * hx + hy * hy)/v ) )
-   {
-   tmp = ( hx * hx * b + hy * hy * a + 
-   sign( value ) * hx * hy * TNL::sqrt( ( hx * hx + hy * hy )/v - 
-   ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy );
-   u[ cell.getIndex() ] =  tmp;
-   }
-   else
-   {
-   tmp = 
-   fabs( a ) > fabs( b ) ? b + TNL::sign( value ) * hy/v :
-   a + TNL::sign( value ) * hx/v;
-   u[ cell.getIndex() ] = argAbsMin( value, tmp );
-   //tmp = TypeInfo< RealType >::getMaxValue();
-   }*/
+  if( fabs( a ) == 10&&//std::numeric_limits< RealType >::max() && 
+          fabs( b ) == 10)//std::numeric_limits< RealType >::max() )
+    return false;
+  
   RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
   sortMinims( pom );
   tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
   
-  
   if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-    u[ cell.getIndex() ] = argAbsMin( value, tmp );
-  else
   {
+    u[ cell.getIndex() ] = argAbsMin( value, tmp );
+    tmp = value - u[ cell.getIndex() ];
+    if ( fabs( tmp ) >  0.001*hx ){
+      //printf( "Vracime true!\n");
+      return true;
+    }else{
+      //printf( "Vracime false2!\n");
+      return false;
+    }
+  }
+  else {
     tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
             TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
             ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
     u[ cell.getIndex() ] = argAbsMin( value, tmp );
+    tmp = value - u[ cell.getIndex() ];
+    if ( fabs( tmp ) > 0.001*hx ){
+      //printf( "Vracime true3!\n");
+      return true;
+    }else{
+      //printf( "Vracime false!\n");
+      return false;
+    }
   }
 }
 
@@ -984,8 +994,8 @@ initInterface( const MeshFunctionPointer& _input,
         {
           cell.refresh();
           output[ cell.getIndex() ] =
-                  input( cell ) > 0 ? 10://std::numeric_limits< RealType >::max() :
-                    -10;//- std::numeric_limits< RealType >::max();
+                  input( cell ) > 0 ? std::numeric_limits< RealType >::max() :
+                    - std::numeric_limits< RealType >::max();
           interfaceMap[ cell.getIndex() ] = false;
         }
     
@@ -1011,31 +1021,7 @@ initInterface( const MeshFunctionPointer& _input,
             const IndexType e = neighbors.template getEntityIndex<  1,  0,  0 >();
             const IndexType n = neighbors.template getEntityIndex<  0,  1,  0 >();
             const IndexType t = neighbors.template getEntityIndex<  0,  0,  1 >();
-            //Try exact initiation
-            /*const IndexType w = neighbors.template getEntityIndex< -1,  0,  0 >();
-             const IndexType s = neighbors.template getEntityIndex<  0, -1,  0 >();
-             const IndexType b = neighbors.template getEntityIndex<  0,  0, -1 >();
-             if( c * input[ e ] <= 0 )
-             {
-             output[ cell.getIndex() ] = c;
-             output[ e ] = input[ e ];
-             interfaceMap[ e ] = true;   
-             interfaceMap[ cell.getIndex() ] = true;
-             }
-             else if( c * input[ n ] <= 0 )
-             {
-             output[ cell.getIndex() ] = c;
-             output[ n ] = input[ n ];
-             interfaceMap[ n ] = true;   
-             interfaceMap[ cell.getIndex() ] = true;
-             }
-             else if( c * input[ t ] <= 0 )
-             {
-             output[ cell.getIndex() ] = c;
-             output[ t ] = input[ t ];
-             interfaceMap[ t ] = true;   
-             interfaceMap[ cell.getIndex() ] = true;
-             }*/
+           
             if( c * input[ n ] <= 0 )
             {
               if( c >= 0 )
@@ -1172,31 +1158,6 @@ updateCell( MeshFunctionType& u,
           fabs( c ) == std::numeric_limits< RealType >::max() )
     return;
   
-  
-  /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( b ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) )
-   {
-   tmp = ( hx * hx * a + hy * hy * b + 
-   sign( value ) * hx * hy * sqrt( ( hx * hx + hy * hy )/v - 
-   ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy );
-   }
-   if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( c ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( a - c ) >= TNL::sqrt( (hx * hx + hz * hz)/v ) )
-   {
-   tmp = ( hx * hx * a + hz * hz * c + 
-   sign( value ) * hx * hz * sqrt( ( hx * hx + hz * hz )/v - 
-   ( a - c ) * ( a - c ) ) )/( hx * hx + hz * hz );
-   }
-   if( fabs( b ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( c ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( b - c ) >= TNL::sqrt( (hy * hy + hz * hz)/v ) )
-   {
-   tmp = ( hy * hy * b + hz * hz * c + 
-   sign( value ) * hy * hz * sqrt( ( hy * hy + hz * hz )/v - 
-   ( b - c ) * ( b - c ) ) )/( hy * hy + hz * hz );
-   }*/
   RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
   sortMinims( pom );   
   tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
@@ -1279,8 +1240,8 @@ updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real
   a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ],
           sArray[ thrj * sizeSArray + thri-1 ] );
   
-  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-          fabs( b ) == std::numeric_limits< RealType >::max() )
+  if( fabs( a ) == 10&&//std::numeric_limits< RealType >::max() && 
+          fabs( b ) == 10)//std::numeric_limits< RealType >::max() )
     return false;
   
   RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
@@ -1338,9 +1299,9 @@ updateCell3D( volatile Real *sArray, int thri, int thrj, int thrk,
   /*if( thrk == 8 )
     printf("Calculating a = %f, b = %f, c = %f\n" , a, b, c );*/
   
-  if( fabs( a ) == 10&& //std::numeric_limits< RealType >::max() && 
-          fabs( b ) == 10&&//std::numeric_limits< RealType >::max() &&
-          fabs( c ) == 10)//std::numeric_limits< RealType >::max() )
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() &&
+          fabs( c ) == std::numeric_limits< RealType >::max() )
     return false;
   
   RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
@@ -1393,7 +1354,7 @@ updateCell3D( volatile Real *sArray, int thri, int thrj, int thrk,
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
         Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
-        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap  )
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap )
 {
   int i = threadIdx.x + blockDim.x*blockIdx.x;
   const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
@@ -1444,7 +1405,8 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1,
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
         Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ) 
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+        Containers::StaticVector< 2, Index > vLower, Containers::StaticVector< 2, Index > vUpper ) 
 {
   int i = threadIdx.x + blockDim.x*blockIdx.x;
   int j = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1460,54 +1422,57 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
     
     
     output[ cind ] =
-            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
-              - std::numeric_limits< Real >::max();
+            input( cell ) >= 0 ? 10://std::numeric_limits< Real >::max() :
+              - 10;//- std::numeric_limits< Real >::max();
     interfaceMap[ cind ] = false; 
     
-    const Real& hx = mesh.getSpaceSteps().x();
-    const Real& hy = mesh.getSpaceSteps().y();
-    cell.refresh();
-    const Real& c = input( cell );
-    if( ! cell.isBoundaryEntity()  )
+    if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] && i>vLower[0] && j> vLower[0] )
     {
-      auto neighbors = cell.getNeighborEntities();
-      Real pom = 0;
-      const Index e = neighbors.template getEntityIndex<  1,  0 >();
-      const Index w = neighbors.template getEntityIndex<  -1,  0 >();
-      const Index n = neighbors.template getEntityIndex<  0,  1 >();
-      const Index s = neighbors.template getEntityIndex<  0,  -1 >();
-      
-      if( c * input[ n ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cell.getIndex() ] = true;
-      }
-      if( c * input[ e ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
-          output[ cind ] = pom;                       
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ w ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ s ] <= 0 )
+      const Real& hx = mesh.getSpaceSteps().x();
+      const Real& hy = mesh.getSpaceSteps().y();
+      cell.refresh();
+      const Real& c = input( cell );
+      if( ! cell.isBoundaryEntity()  )
       {
-        pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
+        auto neighbors = cell.getNeighborEntities();
+        Real pom = 0;
+        const Index e = neighbors.template getEntityIndex<  1,  0 >();
+        const Index w = neighbors.template getEntityIndex<  -1,  0 >();
+        const Index n = neighbors.template getEntityIndex<  0,  1 >();
+        const Index s = neighbors.template getEntityIndex<  0,  -1 >();
         
-        interfaceMap[ cind ] = true;
+        if( c * input[ n ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+            output[ cind ] = pom;
+          
+          interfaceMap[ cell.getIndex() ] = true;
+        }
+        if( c * input[ e ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+            output[ cind ] = pom;                       
+          
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ w ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+            output[ cind ] = pom;
+          
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ s ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+            output[ cind ] = pom;
+          
+          interfaceMap[ cind ] = true;
+        }
       }
     }
   }
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h
index 61465bee9..41aea10a0 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h
@@ -71,6 +71,8 @@ class tnlDirectEikonalProblem
       
       bool setInitialCondition( const Config::ParameterContainer& parameters,
                                 DofVectorPointer& dofs );
+      
+      bool makeSnapshot( );
 
       bool solve( DofVectorPointer& dosf );
 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
index 7437803e2..0aecde5db 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
@@ -12,6 +12,9 @@
  */
 
 #pragma once
+#include <TNL/FileName.h>
+
+#include "tnlDirectEikonalProblem.h"
 
 template< typename Mesh,
           typename Communicator,
@@ -76,6 +79,11 @@ tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >::
 setup( const Config::ParameterContainer& parameters,
        const String& prefix )
 {
+  String param=parameters.getParameter< String >( "distributed-grid-io-type" );
+   if(param=="MpiIO")
+        distributedIOType=Meshes::DistributedMeshes::MpiIO;
+   if(param=="LocalCopy")
+        distributedIOType=Meshes::DistributedMeshes::LocalCopy;
    return true;
 }
 
@@ -117,14 +125,14 @@ setInitialCondition( const Config::ParameterContainer& parameters,
   String inputFile = parameters.getParameter< String >( "input-file" );
   this->initialData->setMesh( this->getMesh() );
   std::cout<<"setInitialCondition" <<std::endl; 
-  if(CommunicatorType::isDistributed())
+  if( CommunicatorType::isDistributed() )
   {
-    std::cout<<"Nodes Distribution: " << u->getMesh().getDistributedMesh()->printProcessDistr() << std::endl;
+    std::cout<<"Nodes Distribution: " << initialData->getMesh().getDistributedMesh()->printProcessDistr() << std::endl;
     if(distributedIOType==Meshes::DistributedMeshes::MpiIO)
-      Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::MpiIO> ::load(inputFile, *u );
+      Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::MpiIO> ::load(inputFile, *initialData );
     if(distributedIOType==Meshes::DistributedMeshes::LocalCopy)
-      Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::LocalCopy> ::load(inputFile, *u );
-    u->template synchronize<CommunicatorType>();
+      Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::LocalCopy> ::load(inputFile, *initialData );
+    initialData->template synchronize<CommunicatorType>();
   }
   else
   {
@@ -141,6 +149,38 @@ setInitialCondition( const Config::ParameterContainer& parameters,
   return true;
 }
 
+/** Saves the mesh function u into the file produced by FileName from the base
+ *  "u-" and the extension "tnl". Only a failed non-distributed save yields
+ *  false; the distributed writers' results are not inspected. */
+template< typename Mesh, typename Communicator, typename Anisotropy,
+          typename Real, typename Index >
+bool
+tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >::
+makeSnapshot(  )
+{
+   namespace DM = Meshes::DistributedMeshes;
+   using MpiWriter = DM::DistributedGridIO< MeshFunctionType, DM::MpiIO >;
+   using CopyWriter = DM::DistributedGridIO< MeshFunctionType, DM::LocalCopy >;
+
+   std::cout << std::endl << "Writing output." << std::endl;
+
+   FileName snapshotName;
+   snapshotName.setFileNameBase( "u-" );
+   snapshotName.setExtension( "tnl" );
+
+   const bool distributed = CommunicatorType::isDistributed();
+   // Serial run: the mesh function writes itself and reports failure.
+   if( ! distributed && ! this->u->save( snapshotName.getFileName() ) )
+      return false;
+
+   // Distributed run: pick the writer matching the configured I/O type.
+   if( distributed && distributedIOType == DM::MpiIO )
+      MpiWriter::save( snapshotName.getFileName(), *u );
+   if( distributed && distributedIOType == DM::LocalCopy )
+      CopyWriter::save( snapshotName.getFileName(), *u );
+   return true;
+}
+
 
 template< typename Mesh,
           typename Communicator,
@@ -151,7 +191,19 @@ bool
 tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >::
 solve( DofVectorPointer& dofs )
 {
-   FastSweepingMethod< MeshType, AnisotropyType > fsm;
-   fsm.solve( this->getMesh(), anisotropy, initialData );
+   // The problem's u is handed to the solver as its Aux argument and
+   // initialData as the solver's last (u) argument.
+   // NOTE(review): presumably the solver leaves the final solution in u, which
+   // is what makeSnapshot() writes out -- confirm against FastSweepingMethod.
+   FastSweepingMethod< MeshType, Communicator,AnisotropyType > fsm;
+   fsm.solve( this->getMesh(), u, anisotropy, initialData );
+
+   // A failed snapshot means the result was not stored, so report it to the
+   // caller instead of silently returning success.
+   if( ! makeSnapshot() )
+   {
+      std::cerr << "I am not able to write the solution snapshot." << std::endl;
+      return false;
+   }
    return true;
 }
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
index 57b1886e8..51b3faceb 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
@@ -17,6 +17,7 @@
 
 
 template< typename Mesh,
+        typename Communicator,
         typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > >
 class FastSweepingMethod
 {   
@@ -25,8 +26,9 @@ class FastSweepingMethod
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
-class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >
+class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >
 : public tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >
 {
   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
@@ -47,7 +49,7 @@ class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >
     using typename BaseType::MeshFunctionType;
     using typename BaseType::InterfaceMapPointer;
     using typename BaseType::MeshFunctionPointer;
-    
+   
     
     FastSweepingMethod();
     
@@ -56,6 +58,7 @@ class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >
     void setMaxIterations( const IndexType& maxIterations );
     
     void solve( const MeshPointer& mesh,
+            MeshFunctionPointer& Aux,
             const AnisotropyPointer& anisotropy,
             MeshFunctionPointer& u );
     
@@ -68,8 +71,9 @@ class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
-class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
+class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >
 : public tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
 {
   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
@@ -84,13 +88,14 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
     typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType;
     using MeshPointer = Pointers::SharedPointer<  MeshType >;
     using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
+    using MPI = Communicators::MpiCommunicator;
     
     using typename BaseType::InterfaceMapType;
     using typename BaseType::MeshFunctionType;
     using typename BaseType::InterfaceMapPointer;
     using typename BaseType::MeshFunctionPointer;
     using typename BaseType::ArrayContainer;
-    
+        
     FastSweepingMethod();
     
     const IndexType& getMaxIterations() const;
@@ -98,8 +103,9 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
     void setMaxIterations( const IndexType& maxIterations );
     
     void solve( const MeshPointer& mesh,
+            MeshFunctionPointer& Aux,
             const AnisotropyPointer& anisotropy,
-            MeshFunctionPointer& u );
+            const MeshFunctionPointer& u );
     
     protected:
       
@@ -109,8 +115,9 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
-class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
+class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >
 : public tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
 {
   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
@@ -140,6 +147,7 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
     void setMaxIterations( const IndexType& maxIterations );
     
     void solve( const MeshPointer& mesh,
+            MeshFunctionPointer& Aux,
             const AnisotropyPointer& anisotropy,
             MeshFunctionPointer& u );
     
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
index 890c6cb4c..662a5b79c 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
@@ -18,8 +18,9 @@
 template< typename Real,
           typename Device,
           typename Index,
+        typename Communicator,
           typename Anisotropy >
-FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >::
 FastSweepingMethod()
 : maxIterations( 1 )
 {
@@ -29,9 +30,10 @@ FastSweepingMethod()
 template< typename Real,
           typename Device,
           typename Index,
+        typename Communicator,
           typename Anisotropy >
 const Index&
-FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >::
 getMaxIterations() const
 {
    
@@ -40,9 +42,10 @@ getMaxIterations() const
 template< typename Real,
           typename Device,
           typename Index,
+        typename Communicator,
           typename Anisotropy >
 void
-FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >::
 setMaxIterations( const IndexType& maxIterations )
 {
    
@@ -51,10 +54,12 @@ setMaxIterations( const IndexType& maxIterations )
 template< typename Real,
           typename Device,
           typename Index,
+        typename Communicator,
           typename Anisotropy >
 void
-FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >::
 solve( const MeshPointer& mesh,
+        MeshFunctionPointer& Aux,
        const AnisotropyPointer& anisotropy,
        MeshFunctionPointer& u )
 {   
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 89cb60881..f28202a18 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -14,8 +14,10 @@
 #pragma once
 
 #include "tnlFastSweepingMethod.h"
+#include "tnlDirectEikonalProblem.h"
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Communicators/MpiDefs.h>
+#include "tnlDirectEikonalProblem.h"
 
 
 
@@ -27,8 +29,9 @@
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
-FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 FastSweepingMethod()
 : maxIterations( 1 )
 {
@@ -38,9 +41,10 @@ FastSweepingMethod()
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
 const Index&
-FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 getMaxIterations() const
 {
   
@@ -49,9 +53,10 @@ getMaxIterations() const
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
 void
-FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 setMaxIterations( const IndexType& maxIterations )
 {
   
@@ -60,12 +65,14 @@ setMaxIterations( const IndexType& maxIterations )
 template< typename Real,
         typename Device,
         typename Index,
-        typename Anisotropy >
+        typename Communicator,
+        typename Anisotropy > 
 void
-FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 solve( const MeshPointer& mesh,
+        MeshFunctionPointer& Aux,
         const AnisotropyPointer& anisotropy,
-        MeshFunctionPointer& u )
+        const MeshFunctionPointer& u )
 {  
   MeshFunctionPointer auxPtr;
   InterfaceMapPointer interfaceMapPtr;
@@ -81,6 +88,7 @@ solve( const MeshPointer& mesh,
   IndexType iteration( 0 );
   InterfaceMapType interfaceMap = *interfaceMapPtr;
   MeshFunctionType aux = *auxPtr;
+  aux.template synchronize< Communicator >();
   
   
 #ifdef HAVE_MPI
@@ -116,263 +124,260 @@ solve( const MeshPointer& mesh,
   
   while( iteration < this->maxIterations )
   {
-    if( std::is_same< DeviceType, Devices::Host >::value )
+#ifdef HAVE_MPI
+    int i = MPI::GetRank( MPI::AllGroup );
+    
+    /*if( i == 0 )
     {
-      int numThreadsPerBlock = -1;
-      
-      numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0));
-      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
-      if( numThreadsPerBlock <= 16 )
-        numThreadsPerBlock = 16;
-      else if(numThreadsPerBlock <= 32 )
-        numThreadsPerBlock = 32;
-      else if(numThreadsPerBlock <= 64 )
-        numThreadsPerBlock = 64;
-      else if(numThreadsPerBlock <= 128 )
-        numThreadsPerBlock = 128;
-      else if(numThreadsPerBlock <= 256 )
-        numThreadsPerBlock = 256;
-      else if(numThreadsPerBlock <= 512 )
-        numThreadsPerBlock = 512;
-      else
-        numThreadsPerBlock = 1024;
-      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
-      
-      if( numThreadsPerBlock == -1 ){
-        printf("Fail in setting numThreadsPerBlock.\n");
-        break;
+      for( int k = 0; k < mesh->getDimensions().y(); k++ ){
+        for( int l = 0; l < mesh->getDimensions().x(); l++ )
+          printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] );
+        printf("\n");
       }
+    }*/
+    aux.template synchronize< Communicator >();
+    Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh();
+        
+    const int *neigh = meshPom->getNeighbors();
+    MPI::Request *req;
+    req = new MPI::Request[meshPom->getNeighborsCount()];
+    int WhileCount = 0;
+#endif
+    
+    Containers::StaticVector< 2, IndexType > vLower = meshPom->getLowerOverlap();
+    Containers::StaticVector< 2, IndexType > vUpper = meshPom->getUpperOverlap();
+    printf( "%d: meshDimensions are (x,y) = (%d,%d).\n",i, mesh->getDimensions().x(), mesh->getDimensions().y() );
+    printf( "%d: owerlaps are ([x1,x2],[y1,y2]) = ([%d,%d],[%d,%d]).\n",i, vLower[0], vUpper[0], vLower[1], vUpper[1] );
+    int calculated = 1;
+    int calculate = 1;
+    
+    while( calculated )
+    {
+      calculated = 0;
+      WhileCount++;
       
-      
-      
-      int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
-      int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
-      
-      //std::cout << "numBlocksX = " << numBlocksX << std::endl;
-      
-      /*Real **sArray = new Real*[numBlocksX*numBlocksY];
-       for( int i = 0; i < numBlocksX * numBlocksY; i++ )
-       sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];*/
-      
-      ArrayContainer BlockIterHost;
-      BlockIterHost.setSize( numBlocksX * numBlocksY );
-      BlockIterHost.setValue( 1 );
-      int IsCalculationDone = 1;
-      
-      MeshFunctionPointer helpFunc( mesh );
-      MeshFunctionPointer helpFunc1( mesh );
-      helpFunc1 = auxPtr;
-      auxPtr = helpFunc;
-      helpFunc = helpFunc1;
-      //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl;
-      /*for( int k = numBlocksX-1; k >-1; k-- ){
-       for( int l = 0; l < numBlocksY; l++ ){
-       std::cout<< BlockIterHost[ l*numBlocksX  + k ];
-       }
-       std::cout<<std::endl;
-       }
-       std::cout<<std::endl;*/
-      unsigned int numWhile = 0;
-      while( IsCalculationDone )
-      {      
-        IsCalculationDone = 0;
-        helpFunc1 = auxPtr;
-        auxPtr = helpFunc;
-        helpFunc = helpFunc1;
-        switch ( numThreadsPerBlock ){
-          case 16:
-            this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
-          case 32:
-            this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
-          case 64:
-            this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
-          case 128:
-            this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
-          case 256:
-            this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
-          case 512:
-            this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
-          default:
-            this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+      if( std::is_same< DeviceType, Devices::Host >::value && calculate )
+      {
+        calculate = 0;
+        
+        /**--HERE-IS-PARALLEL-OMP-CODE--!!!WITHOUT MPI!!!--------------------**/
+        /*
+         int numThreadsPerBlock = -1;
+         
+         numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0));
+         //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+         if( numThreadsPerBlock <= 16 )
+         numThreadsPerBlock = 16;
+         else if(numThreadsPerBlock <= 32 )
+         numThreadsPerBlock = 32;
+         else if(numThreadsPerBlock <= 64 )
+         numThreadsPerBlock = 64;
+         else if(numThreadsPerBlock <= 128 )
+         numThreadsPerBlock = 128;
+         else if(numThreadsPerBlock <= 256 )
+         numThreadsPerBlock = 256;
+         else if(numThreadsPerBlock <= 512 )
+         numThreadsPerBlock = 512;
+         else
+         numThreadsPerBlock = 1024;
+         //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+         
+         if( numThreadsPerBlock == -1 ){
+         printf("Fail in setting numThreadsPerBlock.\n");
+         break;
+         }
+         
+         
+         
+         int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
+         int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
+         
+         //std::cout << "numBlocksX = " << numBlocksX << std::endl;
+         
+         //Real **sArray = new Real*[numBlocksX*numBlocksY];
+         //for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+         // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];
+         
+         ArrayContainer BlockIterHost;
+         BlockIterHost.setSize( numBlocksX * numBlocksY );
+         BlockIterHost.setValue( 1 );
+         int IsCalculationDone = 1;
+         
+         MeshFunctionPointer helpFunc( mesh );
+         MeshFunctionPointer helpFunc1( mesh );
+         helpFunc1 = auxPtr;
+         auxPtr = helpFunc;
+         helpFunc = helpFunc1;
+         //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl;
+         //for( int k = numBlocksX-1; k >-1; k-- ){
+         // for( int l = 0; l < numBlocksY; l++ ){
+         // std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+         // }
+         // std::cout<<std::endl;
+         // }
+         // std::cout<<std::endl;
+         unsigned int numWhile = 0;
+         while( IsCalculationDone )
+         {      
+         IsCalculationDone = 0;
+         helpFunc1 = auxPtr;
+         auxPtr = helpFunc;
+         helpFunc = helpFunc1;
+         switch ( numThreadsPerBlock ){
+         case 16:
+         this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 32:
+         this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 64:
+         this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 128:
+         this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 256:
+         this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 512:
+         this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         default:
+         this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         }
+         
+         
+         //Reduction      
+         for( int i = 0; i < BlockIterHost.getSize(); i++ ){
+         if( IsCalculationDone == 0 ){
+         IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
+         //break;
+         }
+         }
+         numWhile++;
+         //std::cout <<"numWhile = "<< numWhile <<std::endl;
+         
+         // for( int j = numBlocksY-1; j>-1; j-- ){
+         // for( int i = 0; i < numBlocksX; i++ )
+         // std::cout << BlockIterHost[ j * numBlocksX + i ];
+         // std::cout << std::endl;
+         // }
+         // std::cout << std::endl;
+         
+         this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY );
+         
+         //std::cout<<std::endl;
+         //String s( "aux-"+ std::to_string(numWhile) + ".tnl");
+         //aux.save( s );
+         }
+         if( numWhile == 1 ){
+         auxPtr = helpFunc;
+         }
+         */
+        /**-END-OF-OMP-PARALLEL------------------------------------------------**/
+        
+        
+        /*if( i == 1 )
+         {
+         for( int k = 0; k < mesh->getDimensions().y(); k++ ){
+         for( int l = 0; l < mesh->getDimensions().x(); l++ )
+         printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] );
+         printf("\n");
+         }
+         }*/
+        
+        for( cell.getCoordinates().y() = 0 + vLower[1];
+                cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
+                cell.getCoordinates().y()++ )
+        {
+          for( cell.getCoordinates().x() = 0 + vLower[0];
+                  cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                  cell.getCoordinates().x()++ )
+          {
+            cell.refresh();
+            if( ! interfaceMap( cell ) )
+              calculated = this->updateCell( aux, cell ) || calculated;
+          }
         }
+        //if( i == 0 )
+        //{
+        //  for( int k = 0; k < mesh->getDimensions().y(); k++ ){
+        //    for( int l = 0; l < mesh->getDimensions().x(); l++ )
+        //      printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] );
+        //    printf("\n");
+        //  }
+        //}
         
+        //aux.save( "aux-1.tnl" );
         
-        //Reduction      
-        for( int i = 0; i < BlockIterHost.getSize(); i++ ){
-          if( IsCalculationDone == 0 ){
-            IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
-            //break;
+        for( cell.getCoordinates().y() = 0 + vLower[1];
+                cell.getCoordinates().y() < mesh->getDimensions().y()-vUpper[1];
+                cell.getCoordinates().y()++ )
+        {
+          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0];
+                  cell.getCoordinates().x() >= 0 + vLower[0];
+                  cell.getCoordinates().x()-- )		
+          {
+            //std::cerr << "2 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
           }
         }
-        numWhile++;
-        /*std::cout <<"numWhile = "<< numWhile <<std::endl;
         
-        for( int j = numBlocksY-1; j>-1; j-- ){
-          for( int i = 0; i < numBlocksX; i++ )
-            std::cout << BlockIterHost[ j * numBlocksX + i ];
-          std::cout << std::endl;
+        //aux.save( "aux-2.tnl" );
+        
+        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 -vUpper[1];
+                cell.getCoordinates().y() >= 0 + vLower[1] ;
+                cell.getCoordinates().y()-- )
+        {
+          for( cell.getCoordinates().x() = 0 + vLower[0];
+                  cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                  cell.getCoordinates().x()++ )
+          {
+            //std::cerr << "3 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
+          }
         }
-        std::cout << std::endl;*/
         
-        this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY );
+        //aux.save( "aux-3.tnl" );
         
-        /*for( int j = numBlocksY-1; j>-1; j-- ){
-         for( int i = 0; i < numBlocksX; i++ )
-         std::cout << "BlockIterHost = "<< j*numBlocksX + i<< " ," << BlockIterHost[ j * numBlocksX + i ];
-         std::cout << std::endl;
-         }
-         std::cout << std::endl;*/
+        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1];
+                cell.getCoordinates().y() >= 0 + vLower[1];
+                cell.getCoordinates().y()-- )
+        {
+          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0];
+                  cell.getCoordinates().x() >= 0 + vLower[0];
+                  cell.getCoordinates().x()-- )		
+          {
+            //std::cerr << "4 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
+          }
+        }
         
-        //std::cout<<std::endl;
-        //string s( "aux-"+ std::to_string(numWhile) + ".tnl");
-        //aux.save( s );
       }
-      if( numWhile == 1 ){
-        auxPtr = helpFunc;
-      }
-      /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
-       delete []sArray[i];*/
-      
-      
-      /*for( cell.getCoordinates().y() = 0;
-       cell.getCoordinates().y() < mesh->getDimensions().y();
-       cell.getCoordinates().y()++ )
-       {
-       for( cell.getCoordinates().x() = 0;
-       cell.getCoordinates().x() < mesh->getDimensions().x();
-       cell.getCoordinates().x()++ )
-       {
-       cell.refresh();
-       if( ! interfaceMap( cell ) )
-       this->updateCell( aux, cell );
-       }
-       } 
-       
-       //aux.save( "aux-1.tnl" );
-       
-       for( cell.getCoordinates().y() = 0;
-       cell.getCoordinates().y() < mesh->getDimensions().y();
-       cell.getCoordinates().y()++ )
-       {
-       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-       cell.getCoordinates().x() >= 0 ;
-       cell.getCoordinates().x()-- )		
-       {
-       //std::cerr << "2 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       
-       //aux.save( "aux-2.tnl" );
-       
-       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-       cell.getCoordinates().y() >= 0 ;
-       cell.getCoordinates().y()-- )
-       {
-       for( cell.getCoordinates().x() = 0;
-       cell.getCoordinates().x() < mesh->getDimensions().x();
-       cell.getCoordinates().x()++ )
-       {
-       //std::cerr << "3 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       
-       //aux.save( "aux-3.tnl" );
-       
-       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-       cell.getCoordinates().y() >= 0;
-       cell.getCoordinates().y()-- )
-       {
-       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-       cell.getCoordinates().x() >= 0 ;
-       cell.getCoordinates().x()-- )		
-       {
-       //std::cerr << "4 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       
-       for( int j = 0;
-       j < mesh->getDimensions().y();
-       j++ )
-       {
-       for( int i = 0;
-       i < mesh->getDimensions().x();
-       i++ )
-       {
-       std::cout << aux[ i * mesh->getDimensions().y() + j ] << " ";
-       }
-       std::cout << std::endl;
-       }*/
-      
-    }
-    if( std::is_same< DeviceType, Devices::Cuda >::value )
-    {
-      // TODO: CUDA code
-#ifdef HAVE_CUDA
-      TNL_CHECK_CUDA_DEVICE;
-      // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel.
-      const int cudaBlockSize( 16 );
-      
-      int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
-      int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
-      dim3 blockSize( cudaBlockSize, cudaBlockSize );
-      dim3 gridSize( numBlocksX, numBlocksY );
-      
-      tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
-      
-      int BlockIterD = 1;
-      
-      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
-      BlockIterDevice.setSize( numBlocksX * numBlocksY );
-      BlockIterDevice.setValue( 1 );
-      TNL_CHECK_CUDA_DEVICE;
-      
-      
-      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
-      BlockIterPom.setSize( numBlocksX * numBlocksY  );
-      BlockIterPom.setValue( 0 );
-      /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
-       BlockIterPom1.setSize( numBlocksX * numBlocksY  );
-       BlockIterPom1.setValue( 0 );*/
-      /*int *BlockIterDevice;
-       cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
-      int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
-      //std::cout << "nBlocksNeigh = " << nBlocksNeigh << std::endl;
-      //free( BlockIter );
-      /*int *BlockIterPom;
-       cudaMalloc((void**) &BlockIterPom, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
-      
-      int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
-      
-      TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
-      dBlock.setSize( nBlocks );
-      TNL_CHECK_CUDA_DEVICE;
-      /*int *dBlock;
-       cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/
-      
-      
-      MeshFunctionPointer helpFunc1( mesh );      
-      MeshFunctionPointer helpFunc( mesh );
-      
-      helpFunc1 = auxPtr;
-      auxPtr = helpFunc;
-      helpFunc = helpFunc1;
-      
-      int numIter = 0;
-      
-      //int oddEvenBlock = 0;
-      while( BlockIterD )
+      if( std::is_same< DeviceType, Devices::Cuda >::value && calculate )
       {
-        /** HERE IS CHESS METHOD **/
+        // TODO: CUDA code
+        
+        calculate = 0;
+        //if( i == 0 )
+        //  printf("%d: We are in Cuda code start.\n", i);
+#ifdef HAVE_CUDA
+        
+        TNL_CHECK_CUDA_DEVICE;
+        // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel.
+        const int cudaBlockSize( 16 );
+        
+        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vLower[0] - vUpper[0], cudaBlockSize );
+        int numBlocksXbez = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vLower[1] - vUpper[1], cudaBlockSize );
+        int numBlocksYbez = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
+        dim3 blockSize( cudaBlockSize, cudaBlockSize );
+        dim3 gridSizeBez( numBlocksXbez, numBlocksYbez );
+        dim3 gridSize( numBlocksX, numBlocksY );
         
+        tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
+        
+        int BlockIterD = 1;
         /*auxPtr = helpFunc;
          
          CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
@@ -408,15 +413,44 @@ solve( const MeshPointer& mesh,
          
          BlockIterD = dBlock.getElement( 0 );*/
         
-        /**------------------------------------------------------------------------------------------------*/
+        TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
+        BlockIterDevice.setSize( numBlocksX * numBlocksY );
+        BlockIterDevice.setValue( 1 );
+        TNL_CHECK_CUDA_DEVICE;
+        
         
+        TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
+        BlockIterPom.setSize( numBlocksX * numBlocksY  );
+        BlockIterPom.setValue( 0 );
+        TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
+        BlockIterPom1.setSize( numBlocksX * numBlocksY  );
+        BlockIterPom1.setValue( 0 );
+        /*int *BlockIterDevice;
+         cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
+        int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
+        //std::cout << "nBlocksNeigh = " << nBlocksNeigh << std::endl;
+        //free( BlockIter );
+        /*int *BlockIterPom;
+         cudaMalloc((void**) &BlockIterPom, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
         
-        /** HERE IS FIM **/
+        int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
+        
+        TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
+        dBlock.setSize( nBlocks );
+        TNL_CHECK_CUDA_DEVICE;
+        /*int *dBlock;
+         cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/
+        
+        
+        MeshFunctionPointer helpFunc1( mesh );      
+        MeshFunctionPointer helpFunc( mesh );
+        //helpFunc->bind( auxPtr->getData() );
+        DeepCopy<<< gridSizeBez, blockSize >>>( auxPtr.template getData< Device>(),
+           helpFunc.template modifyData< Device>() );
         
         helpFunc1 = auxPtr;
         auxPtr = helpFunc;
         helpFunc = helpFunc1;
-        TNL_CHECK_CUDA_DEVICE;
         
         //int pocBloku = 0;
         Devices::Cuda::synchronizeDevice();
@@ -428,67 +462,231 @@ solve( const MeshPointer& mesh,
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         
-        //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl;
-        //BlockIterPom1 = BlockIterDevice;
-        ///for( int i =0; i< numBlocksX; i++ ){
-        //  for( int j = 0; j < numBlocksY; j++ )
-        //  {
-        //    std::cout << BlockIterPom1[j*numBlocksX + i];
-        //  }
-        //  std::cout << std::endl;
-        //}
-        //std::cout << std::endl;
+        //int oddEvenBlock = 0;
+        //int numberWhile = 0;
+        while( BlockIterD /*numberWhile < 10*/)
+        {
+          //numberWhile++;
+          /** HERE IS CHESS METHOD **/
+          
+          /*auxPtr = helpFunc;
+           
+           CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+           interfaceMapPtr.template getData< Device >(),
+           auxPtr.template getData< Device>(),
+           helpFunc.template modifyData< Device>(),
+           BlockIterDevice,
+           oddEvenBlock );
+           cudaDeviceSynchronize();
+           TNL_CHECK_CUDA_DEVICE;
+           auxPtr = helpFunc;
+           
+           oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+           
+           CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+           interfaceMapPtr.template getData< Device >(),
+           auxPtr.template getData< Device>(),
+           helpFunc.template modifyData< Device>(),
+           BlockIterDevice, vLower, vUpper, 
+           oddEvenBlock );
+           cudaDeviceSynchronize();
+           TNL_CHECK_CUDA_DEVICE;
+           auxPtr = helpFunc;
+           
+           oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+           
+           CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+           cudaDeviceSynchronize();
+           TNL_CHECK_CUDA_DEVICE;
+           CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+           cudaDeviceSynchronize();
+           TNL_CHECK_CUDA_DEVICE;
+           
+           BlockIterD = dBlock.getElement( 0 );*/
+          
+          /**------------------------------------------------------------------------------------------------*/
+          
+          
+          /** HERE IS FIM **/
+          
+          helpFunc1 = auxPtr;
+          auxPtr = helpFunc;
+          helpFunc = helpFunc1;
+          TNL_CHECK_CUDA_DEVICE;
+          
+          //int pocBloku = 0;
+          Devices::Cuda::synchronizeDevice();
+          CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+                  interfaceMapPtr.template getData< Device >(),
+                  auxPtr.template modifyData< Device>(),
+                  helpFunc.template modifyData< Device>(),
+                  BlockIterDevice, vLower, vUpper, i );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
         
-        GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
-        BlockIterDevice = BlockIterPom;
         
-        //std::cout<< "Probehlo" << std::endl;
         
-        //TNL::swap( auxPtr, helpFunc );
+        aux = *auxPtr;
+        interfaceMap = *interfaceMapPtr;
+#endif
+      }
+
+      
+/**----------------------MPI-TO-DO---------------------------------------------**/
         
+#ifdef HAVE_MPI
+        //int i = MPI::GetRank( MPI::AllGroup );
+        //TNL::Meshes::DistributedMeshes::DistributedMesh< MeshType > Mesh;
+        int neighCount = 0; // should this thread calculate again?
+        int calculpom[4] = {0,0,0,0};
         
-        CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) );
-        TNL_CHECK_CUDA_DEVICE;
+          if( i == 0 ){
+            BlockIterPom1 = BlockIterDevice;
+            for( int i =0; i< numBlocksX; i++ ){
+              for( int j = 0; j < numBlocksY; j++ )
+              {
+                std::cout << BlockIterPom1[j*numBlocksX + i];
+              }
+              std::cout << std::endl;
+            }
+            std::cout << std::endl;
+          }
+          GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          BlockIterDevice = BlockIterPom;
+          
+          if( i == 0 ){
+            BlockIterPom1 = BlockIterDevice;
+            for( int i =0; i< numBlocksX; i++ ){
+              for( int j = 0; j < numBlocksY; j++ )
+              {
+                std::cout << BlockIterPom1[j*numBlocksX + i];
+              }
+              std::cout << std::endl;
+            }
+            std::cout << std::endl;
+          }
+          //std::cout<< "Probehlo" << std::endl;
+          
+          //TNL::swap( auxPtr, helpFunc );
+          
+          CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+          TNL_CHECK_CUDA_DEVICE;
+          
+          CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+          TNL_CHECK_CUDA_DEVICE;
+          
+          
+          //if( i == 0 )
+          //  printf("%d: We did parallel reduction.\n", i);
+          BlockIterD = dBlock.getElement( 0 );
+          
+          //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          //if( i == 0 )
+          //  printf("%d: BlockIterD = %d.\n", i, BlockIterD);
+          
+#ifdef HAVE_MPI
+          if( BlockIterD ){
+            calculated = 1;
+            //printf( "calculated = %d\n",calculated );
+          }
+#endif
+          /**-----------------------------------------------------------------------------------------------------------*/
+          /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
+           BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
+          numIter ++;
+        }
+        if( numIter%2  == 1 ){
+          auxPtr = helpFunc;
+        }
+        /*cudaFree( BlockIterDevice );
+         cudaFree( dBlock );
+         delete BlockIter;*/
         
-        CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
-        TNL_CHECK_CUDA_DEVICE;
+        if( neigh[1] != -1 )
+        {
+          req[neighCount] = MPI::ISend( &calculated, 1, neigh[1], 0, MPI::AllGroup ); 
+          neighCount++;
+          
+          
+          req[neighCount] = MPI::IRecv( &calculpom[1], 1, neigh[1], 0, MPI::AllGroup );
+          neighCount++;
+        }
         
+        if( neigh[2] != -1 )
+        {
+          req[neighCount] = MPI::ISend( &calculated, 1, neigh[2], 0, MPI::AllGroup );
+          neighCount++;
+          
+          req[neighCount] = MPI::IRecv( &calculpom[2], 1, neigh[2], 0, MPI::AllGroup  );
+          neighCount++;
+        }
         
-        BlockIterD = dBlock.getElement( 0 );
-        //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+        if( neigh[5] != -1 )
+        {
+          req[neighCount] = MPI::ISend( &calculated, 1, neigh[5], 0, MPI::AllGroup );
+          neighCount++;
+          
+          req[neighCount] = MPI::IRecv( &calculpom[3], 1, neigh[5], 0, MPI::AllGroup );
+          neighCount++;
+        }
         
+        MPI::WaitAll(req,neighCount);
+        MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR,  MPI::AllGroup );
+        aux.template synchronize< Communicator >();
+        calculate = calculpom[0] || calculpom[1] || calculpom[2] || calculpom[3];
+        aux.template synchronize< Communicator >();
         
-        /**-----------------------------------------------------------------------------------------------------------*/
-        /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
-         BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
-        numIter ++;
-      }
-      if( numIter == 1 ){
-        auxPtr = helpFunc;
+        //printf( "%d: Receved reduced info about Calculated = %d.\n", i,calculated);
+        
+        
+        if( i == 0 )
+          printf("WhileCount = %d\n",WhileCount);
+        //calculated = 0; /// DEBUG;
       }
-      /*cudaFree( BlockIterDevice );
-       cudaFree( dBlock );
-       delete BlockIter;*/
-      cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
       
-      aux = *auxPtr;
-      interfaceMap = *interfaceMapPtr;
+      String s( "aux-" + std::to_string( i ) + ".tnl" );
+      aux.save( s );   
+      Aux=auxPtr;
+      
+      /*if( i == 0 )
+      {
+        for( int k = 0; k < mesh->getDimensions().y(); k++ ){
+          for( int l = 0; l < mesh->getDimensions().x(); l++ )
+            printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] );
+          printf("\n");
+        }
+          printf("\n");
+        for( int k = 0; k < mesh->getDimensions().y(); k++ ){
+          for( int l = 0; l < mesh->getDimensions().x(); l++ )
+            printf("%.2f\t",(*Aux)[ k * mesh->getDimensions().x() + l ] );
+          printf("\n");
+        }
+      }*/
 #endif
-    }
-    iteration++;
+      iteration++;
   }
-  //#endif
   aux.save("aux-final.tnl");
 }
 
 
 #ifdef HAVE_CUDA
-
+// DeepCopy nebo pracne kopirovat kraje v zavislosti na vLower,vUpper z sArray do helpFunc.
+template< typename Real, typename Device, typename Index >
+__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc )
+{
+  int i = threadIdx.x + blockDim.x*blockIdx.x;
+  int j = blockDim.y*blockIdx.y + threadIdx.y;
+  const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
+  if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+    helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ];
+}
 
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
@@ -590,11 +788,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
         Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock )
+        CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) );
+        TNL_CHECK_CUDA_DEVICE;
+        
+        CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
+        TNL_CHECK_CUDA_DEVICE;
 {
   int thri = threadIdx.x; int thrj = threadIdx.y;
-  int i = threadIdx.x + blockDim.x*blockIdx.x;
-  int j = blockDim.y*blockIdx.y + threadIdx.y;
+  int i = threadIdx.x + blockDim.x*blockIdx.x + vLower[0];
+  int j = blockDim.y*blockIdx.y + threadIdx.y + vLower[1];
+  const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
   /** FOR CHESS METHOD */
   //if( (blockIdx.y%2  + blockIdx.x) % 2 == oddEvenBlock )
   //{
@@ -606,8 +809,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
   if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] )
   { 
     __syncthreads();
+    
     /**-----------------------------------------*/
-    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
     __shared__ int dimX;
     __shared__ int dimY;
     __shared__ Real hx;
@@ -628,16 +831,22 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     
     xkolik = blockDim.x + 1;
     ykolik = blockDim.y + 1;
-    numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
-    numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
+    numOfBlocky = (dimY-vUpper[1]-vLower[1])/blockDim.y + (((dimY-vUpper[1]-vLower[1])%blockDim.y != 0) ? 1:0);
+    numOfBlockx = (dimX-vUpper[0]-vLower[0])/blockDim.x + (((dimX-vUpper[0]-vLower[0])%blockDim.x != 0) ? 1:0);
     
     if( numOfBlockx - 1 == blockIdx.x )
-      xkolik = dimX - (blockIdx.x)*blockDim.x+1;
+      xkolik = (dimX-vUpper[0]-vLower[0]) - (blockIdx.x)*blockDim.x+1;
     
     if( numOfBlocky -1 == blockIdx.y )
-      ykolik = dimY - (blockIdx.y)*blockDim.y+1;
+      ykolik = (dimY-vUpper[1]-vLower[1]) - (blockIdx.y)*blockDim.y+1;
     __syncthreads();
     
+    /*if( thri==0 && thrj == 0 )
+    {
+      printf("%d: DimX = %d, DimY = %d, xKolik = %d, yKolik = %d, numOfBlockX = %d, numOfBlockY = %d, blockIdx.x = %d, blockIdx.y = %d.\n",
+              k, dimX, dimY, xkolik, ykolik, numOfBlockx, numOfBlocky, blockIdx.x, blockIdx.y);
+    }*/
+    
     int currentIndex = thrj * blockDim.x + thri;
     //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
     __shared__ volatile bool changed[ (sizeSArray-2)*(sizeSArray-2)];
@@ -648,47 +857,98 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     
     //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
     __shared__ volatile Real sArray[ sizeSArray * sizeSArray ];
-    sArray[ thrj * sizeSArray + thri ] = std::numeric_limits< Real >::max();
+    sArray[ (thrj+1) * sizeSArray + thri +1 ] = 10;//std::numeric_limits< Real >::max();
     
-    //filling sArray edges
+    /*if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
+    {
+      printf( "Kraje: \n");
+      for( int k = sizeSArray-1; k>-1; k-- ){
+        for( int l = 0; l < sizeSArray; l++ )
+          printf( "%.4f ", sArray[k * sizeSArray + l]);
+        printf( "\n");
+      }
+      printf( "\n");
+    }
+    __syncthreads();*/
+    
+        //filling sArray edges
     if( thri == 0 )
     {      
-      if( dimX > (blockIdx.x+1) * blockDim.x  && thrj+1 < ykolik )
-        sArray[(thrj+1)*sizeSArray + xkolik] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
+      if( dimX - vLower[ 0 ] > (blockIdx.x+1) * blockDim.x  && thrj+1 < ykolik )
+        sArray[(thrj+1)*sizeSArray + xkolik] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + xkolik + vLower[0] ];
       else
-        sArray[(thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max();
+        sArray[(thrj+1)*sizeSArray + xkolik] = 10;//std::numeric_limits< Real >::max();
     }
-    
+        
     if( thri == 1 )
-    {
-      if( blockIdx.x != 0 && thrj+1 < ykolik )
-        sArray[(thrj+1)*sizeSArray + 0] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX ];
+    { 
+      if( ( blockIdx.x != 0 || vLower[0] != 0 ) && thrj+1 < ykolik )
+        sArray[(thrj+1)*sizeSArray + 0] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX  + vLower[0] ];
       else
-        sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
+        sArray[(thrj+1)*sizeSArray + 0] = 10;//std::numeric_limits< Real >::max();
     }
     
     if( thri == 2 )
     {
-      if( dimY > (blockIdx.y+1) * blockDim.y  && thrj+1 < xkolik )
-        sArray[ ykolik*sizeSArray + thrj+1 ] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
+      if( dimY - vLower[ 1 ] > (blockIdx.y+1) * blockDim.y  && thrj+1 < xkolik )
+        sArray[ ykolik*sizeSArray + thrj+1 ] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + ykolik*dimX + thrj+1 + vLower[0] ];
       else
-        sArray[ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
+        sArray[ ykolik*sizeSArray + thrj+1 ] = 10;//std::numeric_limits< Real >::max();
       
     }
-    
+        
     if( thri == 3 )
     {
-      if( blockIdx.y != 0 && thrj+1 < xkolik )
-        sArray[0*sizeSArray + thrj+1] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + thrj+1 ];
+      if( ( blockIdx.y != 0 || vLower[1] != 0 ) && thrj+1 < xkolik )
+        sArray[0*sizeSArray + thrj+1] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + thrj+1 + vLower[0] ];
       else
-        sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
+        sArray[0*sizeSArray + thrj+1] = 10;//std::numeric_limits< Real >::max();
     }
+    /*__syncthreads();
+    if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 1 )
+    {
+      printf( "Kraje: \n");
+      for( int k = sizeSArray-1; k>-1; k-- ){
+        for( int l = 0; l < sizeSArray; l++ )
+          printf( "%.4f ", sArray[k * sizeSArray + l]);
+        printf( "\n");
+      }
+      printf( "\n");
+    }
+    __syncthreads();*/
+    
     
-    if( i < dimX && j < dimY )
-    {    
+    if( i < dimX && j < dimY && thri+1 < xkolik && thrj+1 < ykolik )
+    {  
+      /*if( k == 3 && blockIdx.x == 0 && blockIdx.y == 0 )
+        printf("at index = %d\n", j*dimX + i);*/
       sArray[(thrj+1)*sizeSArray + thri+1] = aux[ j*dimX + i ];
     }
     __syncthreads();  
+    if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
+    {
+      printf( "všechno před výpočtem: \n");
+      for( int k = sizeSArray-1; k>-1; k-- ){
+        for( int l = 0; l < sizeSArray; l++ )
+          printf( "%.4f ", sArray[k * sizeSArray + l]);
+        printf( "\n");
+      }
+      printf( "\n");
+    }
+    
+    if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
+    {
+      for( int k = mesh.getDimensions().y()-1; k>-1; k-- ){
+        for( int l = 0; l < 17; l++ )
+          printf( "%.2f ", aux[ k * mesh.getDimensions().x() + l ]);
+        printf( "\n");
+      }
+      printf( "\n");
+    }
+    
+    //main while cycle
+    //if( i == 0 && j == 0 )
+    //  printf("Overlaps [x1,y1],[x2,y2] = [%d,%d],[%d,%d]",vLower[0], vLower[1], vUpper[0], vUpper[1] );
     
     while( changed[ 0 ] )
     {
@@ -697,10 +957,12 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       changed[ currentIndex] = false;
       
       //calculation of update cell
-      if( i < dimX && j < dimY )
+      if( i < dimX - vUpper[0] && j < dimY - vUpper[1] /*&& i > vLower[0]-1 && j > vLower[1]-1*/ )
       {
         if( ! interfaceMap[ j * dimX + i ] )
         {
+          /*if( k == 1 && blockIdx.x == 1 && blockIdx.y == 0 )
+            printf( "thri = %d, thrj = %d \n", thri, thrj );*/
           changed[ currentIndex ] = ptr.updateCell<sizeSArray>( sArray, thri+1, thrj+1, hx,hy);
         }
       }
@@ -751,20 +1013,41 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       if( thri == 0 && thrj == 0 && changed[ 0 ] ){
         BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 1;
       }
-      /*if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 )
-       {
-       for( int k = 15; k>-1; k-- ){
-       for( int l = 0; l < 16; l++ )
-       printf( "%f\t", sArray[k * 16 + l]);
-       printf( "\n");
-       }
-       printf( "\n");
-       }*/
       __syncthreads();
     }
-    if( i < dimX && j < dimY )
+    
+    
+      
+    if( i < dimX && j < dimY && thri+1 < xkolik && thrj+1 < ykolik )
       helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ];
+    __syncthreads();
+    /*if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 1 )
+    {
+      printf( "všechno po výpočtu: \n");
+      for( int k = sizeSArray-1; k>-1; k-- ){
+        for( int l = 0; l < sizeSArray; l++ )
+          printf( "%.4f ", sArray[k * sizeSArray + l]);
+        printf( "\n");
+      }
+      printf( "\n");
+    }*/
     
-  } 
+    /*if( thri==0 && thrj == 0 && blockIdx.x == 1 && blockIdx.y == 1 && k == 1 )
+    {
+      printf( "8: \n");
+      for( int k = mesh.getDimensions().y()-1; k>-1; k-- ){
+        for( int l = 0; l < mesh.getDimensions().x(); l++ )
+          printf( "%.2f\t", helpFunc[ k * mesh.getDimensions().x() + l ]);
+        printf("\n");
+      }
+      printf( "\n");
+    }*/
+  }
+  else
+  {
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+      helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ];
+  }
 }
 #endif
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock )
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 2a1183bc2..9c5471beb 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -18,8 +18,9 @@
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
-FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 FastSweepingMethod()
 : maxIterations( 1 )
 {
@@ -29,9 +30,10 @@ FastSweepingMethod()
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
 const Index&
-FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 getMaxIterations() const
 {
   
@@ -40,9 +42,10 @@ getMaxIterations() const
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
 void
-FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 setMaxIterations( const IndexType& maxIterations )
 {
   
@@ -51,10 +54,12 @@ setMaxIterations( const IndexType& maxIterations )
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
 void
-FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 solve( const MeshPointer& mesh,
+        MeshFunctionPointer& Aux,
         const AnisotropyPointer& anisotropy,
         MeshFunctionPointer& u )
 {
-- 
GitLab


From be284ee4ae11e0a53e2fc3c559bf2528cea91ebb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matou=C5=A1=20Fencl?= <fenclmat@fjfi.cvut.cz>
Date: Sun, 3 Mar 2019 08:26:21 +0100
Subject: [PATCH 05/14] 2D MPI cuda repaired

---
 .../tnl-direct-eikonal-solver.h               |   2 +-
 .../tnlDirectEikonalMethodsBase.h             |   2 +-
 .../tnlDirectEikonalMethodsBase_impl.h        | 103 ++---
 .../tnlDirectEikonalProblem_impl.h            |   1 +
 .../hamilton-jacobi/tnlFastSweepingMethod.h   |   2 +
 .../tnlFastSweepingMethod2D_impl.h            | 357 ++++++++++--------
 6 files changed, 232 insertions(+), 235 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h
index 82411c939..b2cfc65dc 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h
@@ -36,7 +36,7 @@ class DirectEikonalSolverConfig
       {
          config.addDelimiter( "Direct eikonal equation solver settings:" );
          config.addRequiredEntry< String >( "input-file", "Input file." );
-         config.addEntry< String >( "distributed-grid-io-type", "Choose Distributed Grid IO Type", "LocalCopy");
+         config.addEntry< String >( "distributed-grid-io-type", "Choose Distributed Grid IO Type", "MpiIO");
             config.addEntryEnum< String >( "LocalCopy" );
             config.addEntryEnum< String >( "MpiIO" );
       };
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index 3a78d0f54..d933f1df3 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -152,7 +152,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
 
 template< typename Real, typename Device, typename Index >
 __global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc );
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, int copy, int k );
 
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 3f5b6eed2..d7da1117e 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -771,8 +771,8 @@ initInterface( const MeshFunctionPointer& _input,
       {
         cell.refresh();
         output[ cell.getIndex() ] =
-                input( cell ) >= 0 ? 10://std::numeric_limits< RealType >::max() :
-                  -10;//- std::numeric_limits< RealType >::max();
+                input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
+                  - std::numeric_limits< RealType >::max();
         interfaceMap[ cell.getIndex() ] = false;
       }
     
@@ -908,8 +908,8 @@ updateCell( MeshFunctionType& u,
     b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0,  -1 >() ],
             u[ neighborEntities.template getEntityIndex< 0,   1 >() ] );
   }
-  if( fabs( a ) == 10&&//std::numeric_limits< RealType >::max() && 
-          fabs( b ) == 10)//std::numeric_limits< RealType >::max() )
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() )
     return false;
   
   RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
@@ -1021,81 +1021,46 @@ initInterface( const MeshFunctionPointer& _input,
             const IndexType e = neighbors.template getEntityIndex<  1,  0,  0 >();
             const IndexType n = neighbors.template getEntityIndex<  0,  1,  0 >();
             const IndexType t = neighbors.template getEntityIndex<  0,  0,  1 >();
-           
+            
+            
             if( c * input[ n ] <= 0 )
             {
-              if( c >= 0 )
-              {
-                pom = ( hy * c )/( c - input[ n ]);
-                if( output[ cell.getIndex() ] > pom ) 
-                  output[ cell.getIndex() ] = pom;
-                
-                if ( output[ n ] < pom - hy)
-                  output[ n ] = pom - hy; // ( hy * c )/( c - input[ n ]) - hy;
-                
-              }else
-              {
-                pom = - ( hy * c )/( c - input[ n ]);
-                if( output[ cell.getIndex() ] < pom )
-                  output[ cell.getIndex() ] = pom;
-                if( output[ n ] > hy + pom )
-                  output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
-                
-              }
+              pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+              if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
+                output[ cell.getIndex() ] = pom;
+              pom = pom - TNL::sign( c )*hy;
+              if( TNL::abs( output[ n ] ) > TNL::abs( pom ) )
+                output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy;
+              
               interfaceMap[ cell.getIndex() ] = true;
               interfaceMap[ n ] = true;
             }
+            
             if( c * input[ e ] <= 0 )
             {
-              if( c >= 0 )
-              {
-                pom = ( hx * c )/( c - input[ e ]);
-                if( output[ cell.getIndex() ] > pom )
-                  output[ cell.getIndex() ] = pom;
-                
-                pom = pom - hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-                if( output[ e ] < pom )
-                  output[ e ] = pom;      
-                
-              }else
-              {
-                pom = - (hx * c)/( c - input[ e ]);
-                if( output[ cell.getIndex() ] < pom )
-                  output[ cell.getIndex() ] = pom;
-                
-                pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
-                if( output[ e ] > pom )
-                  output[ e ] = pom;
-              }
+              pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+              if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
+                output[ cell.getIndex() ] = pom;
+              pom = pom - TNL::sign( c )*hx;
+              if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
+                output[ e ] = pom; //( hy * c )/( c - input[ n ]) - hy;
+              
               interfaceMap[ cell.getIndex() ] = true;
               interfaceMap[ e ] = true;
             }
+            
             if( c * input[ t ] <= 0 )
             {
-              if( c >= 0 )
-              {
-                pom = ( hz * c )/( c - input[ t ]);
-                if( output[ cell.getIndex() ] > pom )
-                  output[ cell.getIndex() ] = pom;
-                
-                pom = pom - hz; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-                if( output[ t ] < pom )
-                  output[ t ] = pom; 
-                
-              }else
-              {
-                pom = - (hz * c)/( c - input[ t ]);
-                if( output[ cell.getIndex() ] < pom )
-                  output[ cell.getIndex() ] = pom;
-                
-                pom = pom + hz; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
-                if( output[ t ] > pom )
-                  output[ t ] = pom;
-                
-              }
+              pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
+              if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
+                output[ cell.getIndex() ] = pom;
+              pom = pom - TNL::sign( c )*hz;
+              if( TNL::abs( output[ t ] ) > TNL::abs( pom ) )
+                output[ t ] = pom; //( hy * c )/( c - input[ n ]) - hy;
+              
               interfaceMap[ cell.getIndex() ] = true;
               interfaceMap[ t ] = true;
-            }    
+            }  
           }
           /*output[ cell.getIndex() ] =
            c > 0 ? TypeInfo< RealType >::getMaxValue() :
@@ -1240,8 +1205,8 @@ updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real
   a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ],
           sArray[ thrj * sizeSArray + thri-1 ] );
   
-  if( fabs( a ) == 10&&//std::numeric_limits< RealType >::max() && 
-          fabs( b ) == 10)//std::numeric_limits< RealType >::max() )
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() )
     return false;
   
   RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
@@ -1422,8 +1387,8 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
     
     
     output[ cind ] =
-            input( cell ) >= 0 ? 10://std::numeric_limits< Real >::max() :
-              - 10;//- std::numeric_limits< Real >::max();
+            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+              - std::numeric_limits< Real >::max();
     interfaceMap[ cind ] = false; 
     
     if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] && i>vLower[0] && j> vLower[0] )
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
index 0aecde5db..56fa9496f 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
@@ -191,6 +191,7 @@ bool
 tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >::
 solve( DofVectorPointer& dofs )
 {
+   std::cout << "We are in solve()." << std::endl;
    FastSweepingMethod< MeshType, Communicator,AnisotropyType > fsm;
    fsm.solve( this->getMesh(), u, anisotropy, initialData );
    
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
index 51b3faceb..8e1e6a72b 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
@@ -86,6 +86,8 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator,
     typedef Index IndexType;
     typedef Anisotropy AnisotropyType;
     typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType;
+    typedef Communicator CommunicatorType;
+    
     using MeshPointer = Pointers::SharedPointer<  MeshType >;
     using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
     using MPI = Communicators::MpiCommunicator;
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index f28202a18..bc1a97b43 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -19,6 +19,7 @@
 #include <TNL/Communicators/MpiDefs.h>
 #include "tnlDirectEikonalProblem.h"
 
+#define ForDebug false // false <=> off
 
 
 
@@ -124,39 +125,61 @@ solve( const MeshPointer& mesh,
   
   while( iteration < this->maxIterations )
   {
-#ifdef HAVE_MPI
-    int i = MPI::GetRank( MPI::AllGroup );
-    
-    /*if( i == 0 )
-    {
-      for( int k = 0; k < mesh->getDimensions().y(); k++ ){
-        for( int l = 0; l < mesh->getDimensions().x(); l++ )
-          printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] );
-        printf("\n");
-      }
-    }*/
-    aux.template synchronize< Communicator >();
     Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh();
-        
-    const int *neigh = meshPom->getNeighbors();
-    MPI::Request *req;
-    req = new MPI::Request[meshPom->getNeighborsCount()];
-    int WhileCount = 0;
-#endif
+
+    
+    int i = MPI::GetRank( MPI::AllGroup ); // number that identifies rank
     
-    Containers::StaticVector< 2, IndexType > vLower = meshPom->getLowerOverlap();
+    // getting overlaps ( WITHOUT MPI SHOULD BE 0 )
+    Containers::StaticVector< 2, IndexType > vLower = meshPom->getLowerOverlap(); 
     Containers::StaticVector< 2, IndexType > vUpper = meshPom->getUpperOverlap();
+    
+#if  ForDebug 
+    int WhileCount = 0; // number of passages of while cycle with condition calculated
     printf( "%d: meshDimensions are (x,y) = (%d,%d).\n",i, mesh->getDimensions().x(), mesh->getDimensions().y() );
     printf( "%d: owerlaps are ([x1,x2],[y1,y2]) = ([%d,%d],[%d,%d]).\n",i, vLower[0], vUpper[0], vLower[1], vUpper[1] );
-    int calculated = 1;
-    int calculate = 1;
+    if( std::is_same< DeviceType, Devices::Host >::value && i == 0 )
+    {
+      for( int j = mesh->getDimensions().y()-1; j>-1; j-- ){
+        for( int i = 0; i < mesh->getDimensions().x(); i++ )
+          std::cout << aux[ j * mesh->getDimensions().x() + i ] << " ";
+        std::cout << std::endl;
+      }
+      std::cout << std::endl;
+    }
+    
+    // TO SEE CUDA OVERLAPS
+    /*const int cudaBlockSize( 16 );
+    int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+    int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
+    dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps );
+    dim3 blockSize( cudaBlockSize, cudaBlockSize );
+    MeshFunctionPointer helpFunc( mesh );
+    DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( helpFunc.template getData< Device>(),
+            auxPtr.template modifyData< Device>(), 1, i ); */
+    
+#endif
+    
+    int calculated = 1; // indicates weather we calculated in the last passage of the while cycle 
+    // calculated is same for all ranks 
+    // without MPI should be FALSE at the end of while cycle body
+    int calculate = 1; // indicates if the thread should calculate again in upcoming passage of cycle
+    // calculate is a value that can differ in every rank
+    // without MPI should be FALSE at the end of while cycle body
     
     while( calculated )
     {
       calculated = 0;
+#if ForDebug
       WhileCount++;
+      /*if( std::is_same< DeviceType, Devices::Cuda >::value )
+      {
+        DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(),
+                helpFunc.template modifyData< Device>(), 0, i );
+      }*/
+#endif
       
-      if( std::is_same< DeviceType, Devices::Host >::value && calculate )
+      if( std::is_same< DeviceType, Devices::Host >::value && calculate ) // should we calculate in Host?
       {
         calculate = 0;
         
@@ -280,6 +303,7 @@ solve( const MeshPointer& mesh,
          }
          }*/
         
+  // FSM FOR MPI and WITHOUT MPI
         for( cell.getCoordinates().y() = 0 + vLower[1];
                 cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
                 cell.getCoordinates().y()++ )
@@ -293,16 +317,6 @@ solve( const MeshPointer& mesh,
               calculated = this->updateCell( aux, cell ) || calculated;
           }
         }
-        //if( i == 0 )
-        //{
-        //  for( int k = 0; k < mesh->getDimensions().y(); k++ ){
-        //    for( int l = 0; l < mesh->getDimensions().x(); l++ )
-        //      printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] );
-        //    printf("\n");
-        //  }
-        //}
-        
-        //aux.save( "aux-1.tnl" );
         
         for( cell.getCoordinates().y() = 0 + vLower[1];
                 cell.getCoordinates().y() < mesh->getDimensions().y()-vUpper[1];
@@ -352,30 +366,29 @@ solve( const MeshPointer& mesh,
               this->updateCell( aux, cell );
           }
         }
-        
       }
-      if( std::is_same< DeviceType, Devices::Cuda >::value && calculate )
+      
+      if( std::is_same< DeviceType, Devices::Cuda >::value && calculate ) // should we calculate on CUDA?
       {
-        // TODO: CUDA code
-        
         calculate = 0;
-        //if( i == 0 )
-        //  printf("%d: We are in Cuda code start.\n", i);
-#ifdef HAVE_CUDA
         
+#if ForDebug 
+        printf("%d: We are in Cuda code start.\n", i);
+#endif
+          
+#ifdef HAVE_CUDA
         TNL_CHECK_CUDA_DEVICE;
         // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel.
         const int cudaBlockSize( 16 );
         
+        // Setting number of threads and blocks for kernel
         int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vLower[0] - vUpper[0], cudaBlockSize );
-        int numBlocksXbez = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
         int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vLower[1] - vUpper[1], cudaBlockSize );
-        int numBlocksYbez = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
         dim3 blockSize( cudaBlockSize, cudaBlockSize );
-        dim3 gridSizeBez( numBlocksXbez, numBlocksYbez );
         dim3 gridSize( numBlocksX, numBlocksY );
         
-        tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
+        // Need for calling functions from kernel
+        BaseType ptr;
         
         int BlockIterD = 1;
         /*auxPtr = helpFunc;
@@ -413,44 +426,50 @@ solve( const MeshPointer& mesh,
          
          BlockIterD = dBlock.getElement( 0 );*/
         
+        // Array that identifies which blocks should be calculated.
         TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
         BlockIterDevice.setSize( numBlocksX * numBlocksY );
         BlockIterDevice.setValue( 1 );
         TNL_CHECK_CUDA_DEVICE;
         
-        
+        // Array into which we identify the neighbours and then copy it into BlockIterDevice
         TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
         BlockIterPom.setSize( numBlocksX * numBlocksY  );
         BlockIterPom.setValue( 0 );
+        
+#if ForDebug // For printf of BlockIterDevice
+        TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
+        BlockIterPom1.setSize( numBlocksX * numBlocksY  );
+        BlockIterPom1.setValue( 0 );
+#endif   
         TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
         BlockIterPom1.setSize( numBlocksX * numBlocksY  );
         BlockIterPom1.setValue( 0 );
-        /*int *BlockIterDevice;
-         cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
         int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
-        //std::cout << "nBlocksNeigh = " << nBlocksNeigh << std::endl;
-        //free( BlockIter );
-        /*int *BlockIterPom;
-         cudaMalloc((void**) &BlockIterPom, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
-        
         int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
         
         TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
         dBlock.setSize( nBlocks );
+        TNL::Containers::Array< int, Devices::Host, IndexType > dBlock1;
+        dBlock1.setSize( nBlocks );
         TNL_CHECK_CUDA_DEVICE;
-        /*int *dBlock;
-         cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/
-        
         
-        MeshFunctionPointer helpFunc1( mesh );      
+        // Helping meshFunction that switches with AuxPtr in every calculation of CudaUpdateCellCaller<<<>>>()
         MeshFunctionPointer helpFunc( mesh );
-        //helpFunc->bind( auxPtr->getData() );
-        DeepCopy<<< gridSizeBez, blockSize >>>( auxPtr.template getData< Device>(),
-           helpFunc.template modifyData< Device>() );
+        MeshFunctionPointer helpFunc1( mesh );
         
-        helpFunc1 = auxPtr;
-        auxPtr = helpFunc;
-        helpFunc = helpFunc1;
+        // Setting number of threads and blocks in grid for DeepCopy of meshFunction
+        int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+        int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
+        dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps );
+        
+        DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(),
+                helpFunc.template modifyData< Device>(), 1, i );
+        
+#if ForDebug
+        DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(),
+                helpFunc.template modifyData< Device>(), 0, i );
+#endif
         
         //int pocBloku = 0;
         Devices::Cuda::synchronizeDevice();
@@ -464,13 +483,12 @@ solve( const MeshPointer& mesh,
         
         //int oddEvenBlock = 0;
         //int numberWhile = 0;
-        while( BlockIterD /*numberWhile < 10*/)
+        while( BlockIterD )
         {
           //numberWhile++;
           /** HERE IS CHESS METHOD **/
           
-          /*auxPtr = helpFunc;
-           
+          /*
            CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
            interfaceMapPtr.template getData< Device >(),
            auxPtr.template getData< Device>(),
@@ -479,19 +497,17 @@ solve( const MeshPointer& mesh,
            oddEvenBlock );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
-           auxPtr = helpFunc;
            
            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
            
            CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
            interfaceMapPtr.template getData< Device >(),
-           auxPtr.template getData< Device>(),
-           helpFunc.template modifyData< Device>(),
+           helpFunc.template getData< Device>(),
+           auxPtr.template modifyData< Device>(),
            BlockIterDevice, vLower, vUpper, 
            oddEvenBlock );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
-           auxPtr = helpFunc;
            
            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
            
@@ -507,14 +523,7 @@ solve( const MeshPointer& mesh,
           /**------------------------------------------------------------------------------------------------*/
           
           
-          /** HERE IS FIM **/
-          
-          helpFunc1 = auxPtr;
-          auxPtr = helpFunc;
-          helpFunc = helpFunc1;
-          TNL_CHECK_CUDA_DEVICE;
-          
-          //int pocBloku = 0;
+     /** HERE IS FIM **/
           Devices::Cuda::synchronizeDevice();
           CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
                   interfaceMapPtr.template getData< Device >(),
@@ -553,12 +562,17 @@ solve( const MeshPointer& mesh,
             }
             std::cout << std::endl;
           }
+#endif
+          
+          // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now. 
           GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY );
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
           BlockIterDevice = BlockIterPom;
-          
-          if( i == 0 ){
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+#if ForDebug          
+          if( i == 1 ){
             BlockIterPom1 = BlockIterDevice;
             for( int i =0; i< numBlocksX; i++ ){
               for( int j = 0; j < numBlocksY; j++ )
@@ -569,36 +583,39 @@ solve( const MeshPointer& mesh,
             }
             std::cout << std::endl;
           }
-          //std::cout<< "Probehlo" << std::endl;
-          
-          //TNL::swap( auxPtr, helpFunc );
-          
+#endif
+          // Parallel reduction to see if we should calculate again BlockIterD
           CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+          cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
           
-          CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+          // Parallel reduction on dBlock because of too large number of blocks (more than maximum number of threads)
+          CudaParallelReduc<<< 1, 1024 >>>( dBlock, dBlock, nBlocks );
+          cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
-          
-          
-          //if( i == 0 )
-          //  printf("%d: We did parallel reduction.\n", i);
+#if ForDebug          
+          if( i == 0 ){
+            dBlock1 = dBlock;
+            printf("nBlocks = %d\n",nBlocks);
+            for( int m =0; m< nBlocks; m++ ){
+              std::cout << dBlock1[m] << " ";
+            }
+            std::cout << std::endl;
+          }
+#endif          
+          // Copy of the first element which is result of parallel reduction
           BlockIterD = dBlock.getElement( 0 );
-          
-          //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
-          //if( i == 0 )
-          //  printf("%d: BlockIterD = %d.\n", i, BlockIterD);
           
-#ifdef HAVE_MPI
+          // When we change something then we should caclucate again in the next passage of MPI ( calculated = true )
+          
           if( BlockIterD ){
             calculated = 1;
-            //printf( "calculated = %d\n",calculated );
           }
-#endif
+          
           /**-----------------------------------------------------------------------------------------------------------*/
-          /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
-           BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
+       
           numIter ++;
         }
         if( numIter%2  == 1 ){
@@ -637,37 +654,28 @@ solve( const MeshPointer& mesh,
         }
         
         MPI::WaitAll(req,neighCount);
+#if ForDebug
+        printf( "%d: Sending Calculated = %d.\n", i, calculated );
+#endif        
         MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR,  MPI::AllGroup );
         aux.template synchronize< Communicator >();
         calculate = calculpom[0] || calculpom[1] || calculpom[2] || calculpom[3];
+#if ForDebug
+        printf( "%d: Receved Calculated = %d.\n%d: Calculate = %d\n", i, calculated, i, calculate);
+#endif
         aux.template synchronize< Communicator >();
+      }
         
-        //printf( "%d: Receved reduced info about Calculated = %d.\n", i,calculated);
-        
-        
-        if( i == 0 )
+#if ForDebug 
+        if( i == 1 )
           printf("WhileCount = %d\n",WhileCount);
-        //calculated = 0; /// DEBUG;
+        //calculated = 0; // DEBUG;
+#endif
       }
       
       String s( "aux-" + std::to_string( i ) + ".tnl" );
       aux.save( s );   
       Aux=auxPtr;
-      
-      /*if( i == 0 )
-      {
-        for( int k = 0; k < mesh->getDimensions().y(); k++ ){
-          for( int l = 0; l < mesh->getDimensions().x(); l++ )
-            printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] );
-          printf("\n");
-        }
-          printf("\n");
-        for( int k = 0; k < mesh->getDimensions().y(); k++ ){
-          for( int l = 0; l < mesh->getDimensions().x(); l++ )
-            printf("%.2f\t",(*Aux)[ k * mesh->getDimensions().x() + l ] );
-          printf("\n");
-        }
-      }*/
 #endif
       iteration++;
   }
@@ -679,13 +687,28 @@ solve( const MeshPointer& mesh,
 // DeepCopy nebo pracne kopirovat kraje v zavislosti na vLower,vUpper z sArray do helpFunc.
 template< typename Real, typename Device, typename Index >
 __global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc )
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, int copy, int k )
 {
   int i = threadIdx.x + blockDim.x*blockIdx.x;
   int j = blockDim.y*blockIdx.y + threadIdx.y;
   const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
-  if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
-    helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ];
+  if( copy ){
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+      helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ];
+  }
+  else
+  {
+    if( i==0 && j == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
+    {
+      for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){
+        for( int l = 0; l < 17; l++ ){
+          printf( "%.2f ", aux[ m * mesh.getDimensions().x() + l ]);
+        }
+        printf( "\n");
+      }
+      printf( "\n");
+    }
+  }
 }
 
 template < typename Index >
@@ -722,16 +745,22 @@ __global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cud
   int blId = blockIdx.x;
   int blockSize = blockDim.x;
   /*if ( i == 0 && blId == 0 ){
-   printf( "nBlocks = %d \n", nBlocks );
-   for( int j = nBlocks-1; j > -1 ; j--){
-   printf( "cislo = %d \n", BlockIterDevice[ j ] );
-   }
-   }*/
+    printf( "nBlocks = %d\n", nBlocks );
+    for( int j = nBlocks-1; j > -1 ; j--){
+      printf( "%d: cislo = %d \n", j, BlockIterDevice[ j ] );
+    }
+  }*/
   __shared__ int sArray[ 1024 ];
   sArray[ i ] = 0;
   if( blId * 1024 + i < nBlocks )
     sArray[ i ] = BlockIterDevice[ blId * 1024 + i ];
   __syncthreads();
+  /*if ( i == 0 && blId == 0 ){
+   printf( "nBlocks = %d\n", nBlocks );
+   for( int j = 4; j > -1 ; j--){
+   printf( "%d: cislo = %d \n", j, sArray[ j ] );
+   }
+  }*/
   /*extern __shared__ volatile int sArray[];
    unsigned int i = threadIdx.x;
    unsigned int gid = blockIdx.x * blockSize * 2 + threadIdx.x;
@@ -769,13 +798,19 @@ __global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cud
   __syncthreads();
   if (i < 32 )
   {
-    if(  blockSize >= 64 ) sArray[ i ] += sArray[ i + 32 ];
-    if(  blockSize >= 32 )  sArray[ i ] += sArray[ i + 16 ];
-    if(  blockSize >= 16 )  sArray[ i ] += sArray[ i + 8 ];
-    if(  blockSize >= 8 )  sArray[ i ] += sArray[ i + 4 ];
-    if(  blockSize >= 4 )  sArray[ i ] += sArray[ i + 2 ];
-    if(  blockSize >= 2 )  sArray[ i ] += sArray[ i + 1 ];
+    if(  blockSize >= 64 ){ sArray[ i ] += sArray[ i + 32 ];}
+  __syncthreads();
+    if(  blockSize >= 32 ){  sArray[ i ] += sArray[ i + 16 ];}
+  __syncthreads();
+    if(  blockSize >= 16 ){  sArray[ i ] += sArray[ i + 8 ];}
+    if(  blockSize >= 8 ){  sArray[ i ] += sArray[ i + 4 ];}
+  __syncthreads();
+    if(  blockSize >= 4 ){  sArray[ i ] += sArray[ i + 2 ];}
+  __syncthreads();
+    if(  blockSize >= 2 ){  sArray[ i ] += sArray[ i + 1 ];}
+  __syncthreads();
   }
+  __syncthreads();
   
   if( i == 0 )
     dBlock[ blId ] = sArray[ 0 ];
@@ -841,11 +876,13 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       ykolik = (dimY-vUpper[1]-vLower[1]) - (blockIdx.y)*blockDim.y+1;
     __syncthreads();
     
+#if ForDebug
     /*if( thri==0 && thrj == 0 )
     {
       printf("%d: DimX = %d, DimY = %d, xKolik = %d, yKolik = %d, numOfBlockX = %d, numOfBlockY = %d, blockIdx.x = %d, blockIdx.y = %d.\n",
               k, dimX, dimY, xkolik, ykolik, numOfBlockx, numOfBlocky, blockIdx.x, blockIdx.y);
     }*/
+#endif
     
     int currentIndex = thrj * blockDim.x + thri;
     //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
@@ -857,27 +894,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     
     //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
     __shared__ volatile Real sArray[ sizeSArray * sizeSArray ];
-    sArray[ (thrj+1) * sizeSArray + thri +1 ] = 10;//std::numeric_limits< Real >::max();
-    
-    /*if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
-    {
-      printf( "Kraje: \n");
-      for( int k = sizeSArray-1; k>-1; k-- ){
-        for( int l = 0; l < sizeSArray; l++ )
-          printf( "%.4f ", sArray[k * sizeSArray + l]);
-        printf( "\n");
-      }
-      printf( "\n");
-    }
-    __syncthreads();*/
+    sArray[ (thrj+1) * sizeSArray + thri +1 ] = std::numeric_limits< Real >::max();
     
+       
         //filling sArray edges
     if( thri == 0 )
     {      
       if( dimX - vLower[ 0 ] > (blockIdx.x+1) * blockDim.x  && thrj+1 < ykolik )
         sArray[(thrj+1)*sizeSArray + xkolik] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + xkolik + vLower[0] ];
       else
-        sArray[(thrj+1)*sizeSArray + xkolik] = 10;//std::numeric_limits< Real >::max();
+        sArray[(thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max();
     }
         
     if( thri == 1 )
@@ -885,7 +911,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       if( ( blockIdx.x != 0 || vLower[0] != 0 ) && thrj+1 < ykolik )
         sArray[(thrj+1)*sizeSArray + 0] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX  + vLower[0] ];
       else
-        sArray[(thrj+1)*sizeSArray + 0] = 10;//std::numeric_limits< Real >::max();
+        sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 2 )
@@ -893,7 +919,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       if( dimY - vLower[ 1 ] > (blockIdx.y+1) * blockDim.y  && thrj+1 < xkolik )
         sArray[ ykolik*sizeSArray + thrj+1 ] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + ykolik*dimX + thrj+1 + vLower[0] ];
       else
-        sArray[ ykolik*sizeSArray + thrj+1 ] = 10;//std::numeric_limits< Real >::max();
+        sArray[ ykolik*sizeSArray + thrj+1 ] = std::numeric_limits< Real >::max();
       
     }
         
@@ -902,7 +928,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       if( ( blockIdx.y != 0 || vLower[1] != 0 ) && thrj+1 < xkolik )
         sArray[0*sizeSArray + thrj+1] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + thrj+1 + vLower[0] ];
       else
-        sArray[0*sizeSArray + thrj+1] = 10;//std::numeric_limits< Real >::max();
+        sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
     }
     /*__syncthreads();
     if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 1 )
@@ -918,19 +944,20 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     __syncthreads();*/
     
     
-    if( i < dimX && j < dimY && thri+1 < xkolik && thrj+1 < ykolik )
+    if( i-vLower[0] < dimX && j-vLower[1] < dimY && thri+1 < xkolik + vUpper[0] && thrj+1 < ykolik + vUpper[1] )
     {  
-      /*if( k == 3 && blockIdx.x == 0 && blockIdx.y == 0 )
+      /*if( k == 1 && blockIdx.x == 0 && blockIdx.y == 0 )
         printf("at index = %d\n", j*dimX + i);*/
-      sArray[(thrj+1)*sizeSArray + thri+1] = aux[ j*dimX + i ];
+      sArray[(thrj+1)*sizeSArray + thri+1] = aux[ (j)*dimX + i ];
     }
     __syncthreads();  
+#if ForDebug    
     if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
     {
       printf( "všechno před výpočtem: \n");
-      for( int k = sizeSArray-1; k>-1; k-- ){
+      for( int m = sizeSArray-1; m>-1; m-- ){
         for( int l = 0; l < sizeSArray; l++ )
-          printf( "%.4f ", sArray[k * sizeSArray + l]);
+          printf( "%.2f ", sArray[m * sizeSArray + l]);
         printf( "\n");
       }
       printf( "\n");
@@ -938,14 +965,14 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     
     if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
     {
-      for( int k = mesh.getDimensions().y()-1; k>-1; k-- ){
+      for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){
         for( int l = 0; l < 17; l++ )
-          printf( "%.2f ", aux[ k * mesh.getDimensions().x() + l ]);
+          printf( "%.2f ", aux[ m * mesh.getDimensions().x() + l ]);
         printf( "\n");
       }
       printf( "\n");
     }
-    
+#endif 
     //main while cycle
     //if( i == 0 && j == 0 )
     //  printf("Overlaps [x1,y1],[x2,y2] = [%d,%d],[%d,%d]",vLower[0], vLower[1], vUpper[0], vUpper[1] );
@@ -1021,31 +1048,33 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     if( i < dimX && j < dimY && thri+1 < xkolik && thrj+1 < ykolik )
       helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ];
     __syncthreads();
-    /*if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 1 )
+#if ForDebug
+    if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
     {
       printf( "všechno po výpočtu: \n");
-      for( int k = sizeSArray-1; k>-1; k-- ){
+      for( int m = sizeSArray-1; m>-1; m-- ){
         for( int l = 0; l < sizeSArray; l++ )
-          printf( "%.4f ", sArray[k * sizeSArray + l]);
+          printf( "%.2f ", sArray[m * sizeSArray + l]);
         printf( "\n");
       }
       printf( "\n");
-    }*/
+    }
     
-    /*if( thri==0 && thrj == 0 && blockIdx.x == 1 && blockIdx.y == 1 && k == 1 )
+    if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
     {
       printf( "8: \n");
-      for( int k = mesh.getDimensions().y()-1; k>-1; k-- ){
+      for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){
         for( int l = 0; l < mesh.getDimensions().x(); l++ )
-          printf( "%.2f\t", helpFunc[ k * mesh.getDimensions().x() + l ]);
+          printf( "%.2f ", helpFunc[ m * mesh.getDimensions().x() + l ]);
         printf("\n");
       }
       printf( "\n");
-    }*/
+    }
+#endif
   }
   else
   {
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+    if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] )
       helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ];
   }
 }
-- 
GitLab


From df7abcac9456f6c8410d3e28f791fd11c5c8c382 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matou=C5=A1=20Fencl?= <fenclmat@fjfi.cvut.cz>
Date: Sun, 3 Mar 2019 09:54:03 +0100
Subject: [PATCH 06/14] First try to repair the installation error in 2D

---
 .../Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index bc1a97b43..586d37ba5 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -664,19 +664,19 @@ solve( const MeshPointer& mesh,
         printf( "%d: Receved Calculated = %d.\n%d: Calculate = %d\n", i, calculated, i, calculate);
 #endif
         aux.template synchronize< Communicator >();
-      }
         
 #if ForDebug 
         if( i == 1 )
           printf("WhileCount = %d\n",WhileCount);
         //calculated = 0; // DEBUG;
+#endif
+        }
 #endif
       }
-      
       String s( "aux-" + std::to_string( i ) + ".tnl" );
       aux.save( s );   
       Aux=auxPtr;
-#endif
+      
       iteration++;
   }
   aux.save("aux-final.tnl");
-- 
GitLab


From 204f7f1da254bd7ee89e9d3e896cfae0f2298559 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matou=C5=A1=20Fencl?= <fenclmat@fjfi.cvut.cz>
Date: Mon, 11 Mar 2019 19:22:07 +0100
Subject: [PATCH 07/14] 2D and 3D solvers extended with MPI (3D has issue on
 biggest mesh)

---
 .../hamilton-jacobi/tnl-run-fsm-eoc-test      |  127 +-
 .../tnlDirectEikonalMethodsBase.h             |   18 +-
 .../tnlDirectEikonalMethodsBase_impl.h        |  204 +--
 .../tnlDirectEikonalProblem_impl.h            |   14 +-
 .../hamilton-jacobi/tnlFastSweepingMethod.h   |    4 +
 .../tnlFastSweepingMethod1D_impl.h            |    2 +-
 .../tnlFastSweepingMethod2D_impl.h            |  130 +-
 .../tnlFastSweepingMethod3D_impl.h            | 1175 +++++++++++------
 8 files changed, 1088 insertions(+), 586 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test
index 0dc50246f..24b782a82 100755
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test
@@ -1,19 +1,33 @@
 #!/bin/bash
 
 device="host"
-dimensions="2D 3D"
-#dimensions="3D"
+#dimensions="2D 3D"
+dimensions="2D"
 sizes1D="16 32 64 128 256 512 1024 2048 4096"
 #sizes1D="256"
-sizes2D="16 32 64 128 256 512 1024"
-#sizes2D="8"
-sizes3D="16 32 64 128 256"
+sizes2D="16 32 64 128 256 512 1024 2048 4096"
+#sizes2D="16"
+sizes3D="8 16 32 64 128 256"
+#sizes3D="128 256"
 testFunctions="paraboloid"
+#testFunctions="sin-wave-sdf"
+#testFunctions="sin-bumps-sdf"
 snapshotPeriod=0.1
 finalTime=1.5
-solverName="tnl-direct-eikonal-solver"
-#solverName="gdb --args tnl-direct-eikonal-solver-dbg --catch-exceptions no"
-#
+realType="double"
+#mpiRun="mpirun -np 4 -oversubscribe "
+mpiRun=""
+
+## CAREFUL: If you use the LocalCopy mode of MPI, you have to start even tnl-init with mpiRun.
+## 	     This is not a problem with MpiIO.
+## CAREFUL: For the error to be calculated smoothly, you have to choose the right output function,
+##           which is different for MpiIO and LocalCopy.
+
+solverName=${mpiRun}"tnl-direct-eikonal-solver"
+#solverName="gdb --args "${mpiRun}"tnl-direct-eikonal-solver-dbg --catch-exceptions no --mpi-gdb-debug false"
+#scale=2.0
+#finalSdf="aux-0.tnl aux-1.tnl"
+finalSdf="aux-final.tnl"
 
 setupTestFunction()
 {
@@ -22,16 +36,17 @@ setupTestFunction()
 #   then
       origin=-1.0
       proportions=2.0
+#      origin=-1.0
+#      proportions=2.0
       amplitude=1.0
-      waveLength=0.2
-      waveLengthX=0.2
-      waveLengthY=0.2
-      waveLengthZ=0.2
-      wavesNumber=3.0
-      wavesNumberX=0.5
-      wavesNumberY=2.0
+      waveLength=0.4
+      waveLengthX=0.5
+      waveLengthY=0.5
+      waveLengthZ=0.5
+      wavesNumber=1.25
+      wavesNumberX=0.5      wavesNumberY=2.0
       wavesNumberZ=3.0
-      phase=0.1
+      phase=-1.5
       phaseX=0.0
       phaseY=0.0
       phaseZ=0.0
@@ -44,6 +59,7 @@ setupGrid()
 {
    dimensions=$1
    gridSize=$2
+   #scale=$3
    tnl-grid-setup --dimensions ${dimensions} \
                   --origin-x ${origin} \
                   --origin-y ${origin} \
@@ -53,47 +69,51 @@ setupGrid()
                   --proportions-z ${proportions} \
                   --size-x ${gridSize} \
                   --size-y ${gridSize} \
-                  --size-z ${gridSize} 
+	    	  --real-type ${realType} \
+                  --size-z ${gridSize}
+#$((2*${gridSize})) 
 }
 
 setInitialCondition()
 {
    testFunction=$1
    tnl-init --test-function ${testFunction} \
-            --output-file initial-u.tnl \
-            --amplitude ${amplitude} \
-            --wave-length ${waveLength} \
-            --wave-length-x ${waveLengthX} \
-            --wave-length-y ${waveLengthY} \
-            --wave-length-z ${waveLengthZ} \
-            --waves-number ${wavesNumber} \
-            --waves-number-x ${wavesNumberX} \
-            --waves-number-y ${wavesNumberY} \
-            --waves-number-z ${wavesNumberZ} \
-            --phase ${phase} \
-            --phase-x ${phaseX} \
-            --phase-y ${phaseY} \
-            --phase-z ${phaseZ} \
-            --sigma ${sigma} \
-            --radius ${radius}
+	    		 --real-type ${realType} \
+            		 --output-file initial-u.tnl \
+            		 --amplitude ${amplitude} \
+            		 --wave-length ${waveLength} \
+            		 --wave-length-x ${waveLengthX} \
+            		 --wave-length-y ${waveLengthY} \
+            		 --wave-length-z ${waveLengthZ} \
+            		 --waves-number ${wavesNumber} \
+            		 --waves-number-x ${wavesNumberX} \
+            		 --waves-number-y ${wavesNumberY} \
+            		 --waves-number-z ${wavesNumberZ} \
+            		 --phase ${phase} \
+            		 --phase-x ${phaseX} \
+            		 --phase-y ${phaseY} \
+            		 --phase-z ${phaseZ} \
+            		 --sigma ${sigma} \
+            		 --radius ${radius}
    
-    tnl-init --test-function ${testFunction}-sdf \
-            --output-file exact-u.tnl \
-            --amplitude ${amplitude} \
-            --wave-length ${waveLength} \
-            --wave-length-x ${waveLengthX} \
-            --wave-length-y ${waveLengthY} \
-            --wave-length-z ${waveLengthZ} \
-            --waves-number ${wavesNumber} \
-            --waves-number-x ${wavesNumberX} \
-            --waves-number-y ${wavesNumberY} \
-            --waves-number-z ${wavesNumberZ} \
-            --phase ${phase} \
-            --phase-x ${phaseX} \
-            --phase-y ${phaseY} \
-            --phase-z ${phaseZ} \
-            --sigma ${sigma} \
-            --radius ${radius}
+   tnl-init --test-function ${testFunction}-sdf \
+	    		 --real-type ${realType} \
+            		 --output-file exact-u.tnl \
+            		 --amplitude ${amplitude} \
+            		 --wave-length ${waveLength} \
+            		 --wave-length-x ${waveLengthX} \
+            		 --wave-length-y ${waveLengthY} \
+            		 --wave-length-z ${waveLengthZ} \
+            		 --waves-number ${wavesNumber} \
+            		 --waves-number-x ${wavesNumberX} \
+            		 --waves-number-y ${wavesNumberY} \
+            		 --waves-number-z ${wavesNumberZ} \
+            		 --phase ${phase} \
+            		 --phase-x ${phaseZ} \
+            		 --phase-y ${phaseZ} \
+            		 --phase-z ${phaseZ} \
+            		 --sigma ${sigma} \
+            		 --radius ${radius} \
 
 }
 
@@ -111,17 +131,22 @@ solve()
                  --min-iterations 20 \
                  --convergence-residue 1.0e-12 \
                  --snapshot-period ${snapshotPeriod} \
+		 --real-type ${realType} \
                  --final-time ${finalTime}
 }
                
 computeError()
 {
+for sweep in ${finalSdf}
+do
    tnl-diff --mesh mesh.tnl \
-            --input-files aux-final.tnl exact-u.tnl \
+	    --input-files exact-u.tnl u-00000.tnl \
             --mode sequence \
             --snapshot-period ${snapshotPeriod} \
             --output-file errors.txt \
             --write-difference yes
+#aux-final.tnl \
+done
 }
 
 runTest()
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index d933f1df3..c6a522d8f 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -62,12 +62,15 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
     typedef Functions::MeshFunction< MeshType > MeshFunctionType;
     typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
     typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+    typedef Containers::StaticVector< 2, Index > StaticVector;
     using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
     using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
     
+    
     void initInterface( const MeshFunctionPointer& input,
             MeshFunctionPointer& output,
-            InterfaceMapPointer& interfaceMap );
+            InterfaceMapPointer& interfaceMap,
+            StaticVector vLower, StaticVector vUpper );
     
     template< typename MeshEntity >
     __cuda_callable__ bool updateCell( MeshFunctionType& u,
@@ -101,15 +104,17 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
     typedef Functions::MeshFunction< MeshType > MeshFunctionType;
     typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType;
     typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+    typedef Containers::StaticVector< 3, Index > StaticVector;
     using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
     using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
     
     void initInterface( const MeshFunctionPointer& input,
             MeshFunctionPointer& output,
-            InterfaceMapPointer& interfaceMap );
+            InterfaceMapPointer& interfaceMap,
+            StaticVector vLower, StaticVector vUpper );
     
     template< typename MeshEntity >
-    __cuda_callable__ void updateCell( MeshFunctionType& u,
+    __cuda_callable__ bool updateCell( MeshFunctionType& u,
             const MeshEntity& cell,
             const RealType velocity = 1.0);
     
@@ -154,6 +159,10 @@ template< typename Real, typename Device, typename Index >
 __global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
         Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, int copy, int k );
 
+template< typename Real, typename Device, typename Index >
+__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, int copy, int k );
+
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
         TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks );
@@ -171,7 +180,8 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
         Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
-        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+        Containers::StaticVector< 3, Index > vLower, Containers::StaticVector< 3, Index > vUpper );
 
 template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index d7da1117e..a5d3d81df 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -715,17 +715,14 @@ void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 initInterface( const MeshFunctionPointer& _input,
         MeshFunctionPointer& _output,
-        InterfaceMapPointer& _interfaceMap )
+        InterfaceMapPointer& _interfaceMap, 
+        StaticVector vLower, StaticVector vUpper )
 {
   
   if( std::is_same< Device, Devices::Cuda >::value )
   {
 #ifdef HAVE_CUDA
     const MeshType& mesh = _input->getMesh();
-    Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh.getDistributedMesh();
-    
-    Containers::StaticVector< 2, Index > vLower = meshPom->getLowerOverlap();
-    Containers::StaticVector< 2, Index > vUpper = meshPom->getUpperOverlap();
     
     const int cudaBlockSize( 16 );
     int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
@@ -748,7 +745,7 @@ initInterface( const MeshFunctionPointer& _input,
     InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
     const MeshType& mesh = input.getMesh();
 /*#ifdef HAVE_MPI
-    int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup );
+    int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup );
     if( i == 0 )
     {
       printf( "0: mesh x: %d\n", mesh.getDimensions().x() );
@@ -778,11 +775,11 @@ initInterface( const MeshFunctionPointer& _input,
     
     const RealType& hx = mesh.getSpaceSteps().x();
     const RealType& hy = mesh.getSpaceSteps().y();     
-    for( cell.getCoordinates().y() = 0;
-            cell.getCoordinates().y() < mesh.getDimensions().y();
+    for( cell.getCoordinates().y() = 0 + vLower[1];
+            cell.getCoordinates().y() < mesh.getDimensions().y() - vUpper[1];
             cell.getCoordinates().y() ++ )
-      for( cell.getCoordinates().x() = 0;
-              cell.getCoordinates().x() < mesh.getDimensions().x();
+      for( cell.getCoordinates().x() = 0 + vLower[0];
+              cell.getCoordinates().x() < mesh.getDimensions().x() - vUpper[0];
               cell.getCoordinates().x() ++ )
       {
         cell.refresh();
@@ -856,7 +853,7 @@ initInterface( const MeshFunctionPointer& _input,
         }
       }
 #ifdef HAVE_MPI
-    //int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup );
+    //int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup );
     /*if( i == 0 )
     {
       printf( "0: mesh x: %d\n", mesh.getDimensions().x() );
@@ -951,13 +948,14 @@ void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
 initInterface( const MeshFunctionPointer& _input,
         MeshFunctionPointer& _output,
-        InterfaceMapPointer& _interfaceMap  )
+        InterfaceMapPointer& _interfaceMap, 
+        StaticVector vLower, StaticVector vUpper )
 {
   if( std::is_same< Device, Devices::Cuda >::value )
   {
 #ifdef HAVE_CUDA
     const MeshType& mesh = _input->getMesh();
-    
+   
     const int cudaBlockSize( 8 );
     int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
     int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
@@ -969,7 +967,7 @@ initInterface( const MeshFunctionPointer& _input,
     Devices::Cuda::synchronizeDevice();
     CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
             _output.template modifyData< Device >(),
-            _interfaceMap.template modifyData< Device >() );
+            _interfaceMap.template modifyData< Device >(), vLower, vUpper );
     cudaDeviceSynchronize();
     TNL_CHECK_CUDA_DEVICE;
 #endif
@@ -979,8 +977,10 @@ initInterface( const MeshFunctionPointer& _input,
     const MeshFunctionType& input =  _input.getData();
     MeshFunctionType& output =  _output.modifyData();
     InterfaceMapType& interfaceMap =  _interfaceMap.modifyData();
+    
     const MeshType& mesh = input.getMesh();
     typedef typename MeshType::Cell Cell;
+    
     Cell cell( mesh );
     for( cell.getCoordinates().z() = 0;
             cell.getCoordinates().z() < mesh.getDimensions().z();
@@ -1002,14 +1002,14 @@ initInterface( const MeshFunctionPointer& _input,
     const RealType& hx = mesh.getSpaceSteps().x();
     const RealType& hy = mesh.getSpaceSteps().y();
     const RealType& hz = mesh.getSpaceSteps().z();
-    for( cell.getCoordinates().z() = 0;
-            cell.getCoordinates().z() < mesh.getDimensions().z();
+    for( cell.getCoordinates().z() = 0 + vLower[2];
+            cell.getCoordinates().z() < mesh.getDimensions().z() - vUpper[2];
             cell.getCoordinates().z() ++ )   
-      for( cell.getCoordinates().y() = 0;
-              cell.getCoordinates().y() < mesh.getDimensions().y();
+      for( cell.getCoordinates().y() = 0 + vLower[1];
+              cell.getCoordinates().y() < mesh.getDimensions().y() - vUpper[1];
               cell.getCoordinates().y() ++ )
-        for( cell.getCoordinates().x() = 0;
-                cell.getCoordinates().x() < mesh.getDimensions().x();
+        for( cell.getCoordinates().x() = 0 + vLower[0];
+                cell.getCoordinates().x() < mesh.getDimensions().x() - vUpper[0];
                 cell.getCoordinates().x() ++ )
         {
           cell.refresh();
@@ -1062,10 +1062,6 @@ initInterface( const MeshFunctionPointer& _input,
               interfaceMap[ t ] = true;
             }  
           }
-          /*output[ cell.getIndex() ] =
-           c > 0 ? TypeInfo< RealType >::getMaxValue() :
-           -TypeInfo< RealType >::getMaxValue();
-           interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245
         }
   }
 }
@@ -1075,7 +1071,7 @@ template< typename Real,
         typename Index >
 template< typename MeshEntity >
 __cuda_callable__
-void
+bool
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
 updateCell( MeshFunctionType& u,
         const MeshEntity& cell, 
@@ -1101,6 +1097,7 @@ updateCell( MeshFunctionType& u,
     a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ],
             u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] );
   }
+  
   if( cell.getCoordinates().y() == 0 )
     b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ];
   else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
@@ -1109,7 +1106,9 @@ updateCell( MeshFunctionType& u,
   {
     b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ],
             u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] );
-  }if( cell.getCoordinates().z() == 0 )
+  }
+  
+  if( cell.getCoordinates().z() == 0 )
     c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ];
   else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 )
     c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ];
@@ -1118,17 +1117,25 @@ updateCell( MeshFunctionType& u,
     c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ],
             u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] );
   }
+  
   if( fabs( a ) == std::numeric_limits< RealType >::max() && 
           fabs( b ) == std::numeric_limits< RealType >::max() &&
           fabs( c ) == std::numeric_limits< RealType >::max() )
-    return;
+    return false;
   
   RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
   sortMinims( pom );   
   tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
+  
   if( fabs( tmp ) < fabs( pom[ 1 ] ) )
   {
-    u[ cell.getIndex() ] = argAbsMin( value, tmp ); 
+    u[ cell.getIndex() ] = argAbsMin( value, tmp );
+    tmp = value - u[ cell.getIndex() ];
+    if ( fabs( tmp ) > 0.001*hx ){
+      return true;
+    }else{
+      return false;
+    }
   }
   else
   {
@@ -1138,6 +1145,12 @@ updateCell( MeshFunctionType& u,
     if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
     {
       u[ cell.getIndex() ] = argAbsMin( value, tmp );
+      tmp = value - u[ cell.getIndex() ];
+      if ( fabs( tmp ) > 0.001*hx ){
+        return true;
+      }else{
+        return false;
+      }
     }
     else
     {
@@ -1146,6 +1159,12 @@ updateCell( MeshFunctionType& u,
               hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
               hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
       u[ cell.getIndex() ] = argAbsMin( value, tmp );
+      tmp = value - u[ cell.getIndex() ];
+      if ( fabs( tmp ) > 0.001*hx ){
+        return true;
+      }else{
+        return false;
+      }
     }
   }
 }
@@ -1391,7 +1410,7 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
               - std::numeric_limits< Real >::max();
     interfaceMap[ cind ] = false; 
     
-    if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] && i>vLower[0] && j> vLower[0] )
+    if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] && i>vLower[0] -1 && j> vLower[0]-1 )
     {
       const Real& hx = mesh.getSpaceSteps().x();
       const Real& hy = mesh.getSpaceSteps().y();
@@ -1446,7 +1465,8 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
         Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
-        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap )
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+        Containers::StaticVector< 3, Index > vLower, Containers::StaticVector< 3, Index > vUpper )
 {
   int i = threadIdx.x + blockDim.x*blockIdx.x;
   int j = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1468,76 +1488,78 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
     interfaceMap[ cind ] = false; 
     cell.refresh();
     
-    const Real& hx = mesh.getSpaceSteps().x();
-    const Real& hy = mesh.getSpaceSteps().y();
-    const Real& hz = mesh.getSpaceSteps().z();
-    const Real& c = input( cell );
-    if( ! cell.isBoundaryEntity()  )
+    if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] &&
+            k < mesh.getDimensions().y() - vUpper[2] && i>vLower[0]-1 && j> vLower[1]-1 && k>vLower[2]-1 )
     {
-      auto neighbors = cell.getNeighborEntities();
-      Real pom = 0;
-      const Index e = neighbors.template getEntityIndex<  1, 0, 0 >();
-      const Index w = neighbors.template getEntityIndex<  -1, 0, 0 >();
-      const Index n = neighbors.template getEntityIndex<  0, 1, 0 >();
-      const Index s = neighbors.template getEntityIndex<  0, -1, 0 >();
-      const Index t = neighbors.template getEntityIndex<  0, 0, 1 >();
-      const Index b = neighbors.template getEntityIndex<  0, 0, -1 >();
-      
-      if( c * input[ n ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ e ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
-          output[ cind ] = pom;                       
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ w ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ s ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ b ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hz * c )/( c - input[ b ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ t ] <= 0 )
+      const Real& hx = mesh.getSpaceSteps().x();
+      const Real& hy = mesh.getSpaceSteps().y();
+      const Real& hz = mesh.getSpaceSteps().z();
+      const Real& c = input( cell );
+      if( ! cell.isBoundaryEntity()  )
       {
-        pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
+        auto neighbors = cell.getNeighborEntities();
+        Real pom = 0;
+        const Index e = neighbors.template getEntityIndex<  1, 0, 0 >();
+        const Index w = neighbors.template getEntityIndex<  -1, 0, 0 >();
+        const Index n = neighbors.template getEntityIndex<  0, 1, 0 >();
+        const Index s = neighbors.template getEntityIndex<  0, -1, 0 >();
+        const Index t = neighbors.template getEntityIndex<  0, 0, 1 >();
+        const Index b = neighbors.template getEntityIndex<  0, 0, -1 >();
         
-        interfaceMap[ cind ] = true;
+        if( c * input[ n ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+            output[ cind ] = pom;
+          
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ e ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+            output[ cind ] = pom;                       
+          
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ w ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+            output[ cind ] = pom;
+          
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ s ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+            output[ cind ] = pom;
+          
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ b ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hz * c )/( c - input[ b ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+            output[ cind ] = pom;
+          
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ t ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+            output[ cind ] = pom;
+          
+          interfaceMap[ cind ] = true;
+        }
       }
     }
   }
 }
 
 
-
-
 template< typename Real,
         typename Device,
         typename Index >
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
index 56fa9496f..105a068d3 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
@@ -123,8 +123,7 @@ setInitialCondition( const Config::ParameterContainer& parameters,
 {
   this->bindDofs( dofs );
   String inputFile = parameters.getParameter< String >( "input-file" );
-  this->initialData->setMesh( this->getMesh() );
-  std::cout<<"setInitialCondition" <<std::endl; 
+  this->initialData->setMesh( this->getMesh() ); 
   if( CommunicatorType::isDistributed() )
   {
     std::cout<<"Nodes Distribution: " << initialData->getMesh().getDistributedMesh()->printProcessDistr() << std::endl;
@@ -191,20 +190,9 @@ bool
 tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >::
 solve( DofVectorPointer& dofs )
 {
-   std::cout << "We are in solve()." << std::endl;
    FastSweepingMethod< MeshType, Communicator,AnisotropyType > fsm;
    fsm.solve( this->getMesh(), u, anisotropy, initialData );
    
-   /*int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup );
-   const MeshPointer msh = this->getMesh();
-   if( i == 0 &&  msh->getMeshDimension() == 2 )
-   {
-     for( int k = 0; k < 9; k++ ){
-       for( int l = 0; l < msh->getDimensions().x(); l++ )
-         printf("%.2f\t",(*initialData)[ k * msh->getDimensions().x() + l ] );
-       printf("\n");
-     }
-   }*/
    makeSnapshot();
    return true;
 }
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
index 8e1e6a72b..a57ef1491 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
@@ -14,6 +14,7 @@
 #include <TNL/Functions/Analytic/Constant.h>
 #include <TNL/Pointers/SharedPointer.h>
 #include "tnlDirectEikonalMethodsBase.h"
+#define ForDebug false // false <=> off
 
 
 template< typename Mesh,
@@ -132,8 +133,11 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator,
     typedef Index IndexType;
     typedef Anisotropy AnisotropyType;
     typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > BaseType;
+    typedef Communicator CommunicatorType;
+    
     using MeshPointer = Pointers::SharedPointer<  MeshType >;
     using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
+    using MPI = Communicators::MpiCommunicator;
     
     using typename BaseType::InterfaceMapType;
     using typename BaseType::MeshFunctionType;
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
index 662a5b79c..f2f033ccb 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
@@ -109,7 +109,7 @@ solve( const MeshPointer& mesh,
           dim3 blockSize( cudaBlockSize );
           dim3 gridSize( numBlocksX );
           
-          tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > ptr;
+          BaseType ptr;
           
           
           
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 586d37ba5..5eac5232b 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -19,7 +19,6 @@
 #include <TNL/Communicators/MpiDefs.h>
 #include "tnlDirectEikonalProblem.h"
 
-#define ForDebug false // false <=> off
 
 
 
@@ -79,8 +78,27 @@ solve( const MeshPointer& mesh,
   InterfaceMapPointer interfaceMapPtr;
   auxPtr->setMesh( mesh );
   interfaceMapPtr->setMesh( mesh );
+  
+  // Distributed mesh used for the MPI overlaps (null pointer when running without MPI)
+  Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh();
+  
+  int i = MPI::GetRank( MPI::AllGroup ); // number that identifies rank
+  
+  // getting overlaps ( WITHOUT MPI SHOULD BE 0 )
+  Containers::StaticVector< 2, IndexType > vLower;
+  vLower[0] = 0; vLower[1] = 0;
+  Containers::StaticVector< 2, IndexType > vUpper;
+  vUpper[0] = 0; vUpper[1] = 0;
+#ifdef HAVE_MPI
+  if( CommunicatorType::isDistributed() ) //If we started solver with MPI
+  {
+    vLower = meshPom->getLowerOverlap();
+    vUpper = meshPom->getUpperOverlap();
+  }
+#endif
+  
   std::cout << "Initiating the interface cells ..." << std::endl;
-  BaseType::initInterface( u, auxPtr, interfaceMapPtr );
+  BaseType::initInterface( u, auxPtr, interfaceMapPtr, vLower, vUpper );
   
   auxPtr->save( "aux-ini.tnl" );
   
@@ -124,29 +142,20 @@ solve( const MeshPointer& mesh,
 #endif
   
   while( iteration < this->maxIterations )
-  {
-    Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh();
-
-    
-    int i = MPI::GetRank( MPI::AllGroup ); // number that identifies rank
-    
-    // getting overlaps ( WITHOUT MPI SHOULD BE 0 )
-    Containers::StaticVector< 2, IndexType > vLower = meshPom->getLowerOverlap(); 
-    Containers::StaticVector< 2, IndexType > vUpper = meshPom->getUpperOverlap();
-    
+  {    
 #if  ForDebug 
     int WhileCount = 0; // number of passages of while cycle with condition calculated
     printf( "%d: meshDimensions are (x,y) = (%d,%d).\n",i, mesh->getDimensions().x(), mesh->getDimensions().y() );
     printf( "%d: owerlaps are ([x1,x2],[y1,y2]) = ([%d,%d],[%d,%d]).\n",i, vLower[0], vUpper[0], vLower[1], vUpper[1] );
-    if( std::is_same< DeviceType, Devices::Host >::value && i == 0 )
+    /*if( std::is_same< DeviceType, Devices::Host >::value && i == 0 )
     {
       for( int j = mesh->getDimensions().y()-1; j>-1; j-- ){
-        for( int i = 0; i < mesh->getDimensions().x(); i++ )
-          std::cout << aux[ j * mesh->getDimensions().x() + i ] << " ";
+        for( int m = 0; m < mesh->getDimensions().x(); m++ )
+          std::cout << aux[ j * mesh->getDimensions().x() + m ] << " ";
         std::cout << std::endl;
       }
       std::cout << std::endl;
-    }
+    }*/
     
     // TO SEE CUDA OVERLAPS
     /*const int cudaBlockSize( 16 );
@@ -314,7 +323,9 @@ solve( const MeshPointer& mesh,
           {
             cell.refresh();
             if( ! interfaceMap( cell ) )
+            {
               calculated = this->updateCell( aux, cell ) || calculated;
+            }
           }
         }
         
@@ -379,6 +390,7 @@ solve( const MeshPointer& mesh,
 #ifdef HAVE_CUDA
         TNL_CHECK_CUDA_DEVICE;
         // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel.
+        // IF YOU CHANGE THIS, YOU ALSO NEED TO CHANGE THE TEMPLATE PARAMETER OF CudaUpdateCellCaller (cudaBlockSize + 2)
         const int cudaBlockSize( 16 );
         
         // Setting number of threads and blocks for kernel
@@ -442,27 +454,24 @@ solve( const MeshPointer& mesh,
         BlockIterPom1.setSize( numBlocksX * numBlocksY  );
         BlockIterPom1.setValue( 0 );
 #endif   
-        TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
-        BlockIterPom1.setSize( numBlocksX * numBlocksY  );
-        BlockIterPom1.setValue( 0 );
         int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
-        int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
-        
-        TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
-        dBlock.setSize( nBlocks );
-        TNL::Containers::Array< int, Devices::Host, IndexType > dBlock1;
-        dBlock1.setSize( nBlocks );
-        TNL_CHECK_CUDA_DEVICE;
+        // for CudaParallelReduc (replaced with .containsValue(1))
+        //int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
+        //TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
+        //dBlock.setSize( nBlocks );
+        //TNL::Containers::Array< int, Devices::Host, IndexType > dBlock1;
+        //dBlock1.setSize( nBlocks );
+        //TNL_CHECK_CUDA_DEVICE;
         
         // Helping meshFunction that switches with AuxPtr in every calculation of CudaUpdateCellCaller<<<>>>()
         MeshFunctionPointer helpFunc( mesh );
-        MeshFunctionPointer helpFunc1( mesh );
+        //MeshFunctionPointer helpFunc1( mesh );
         
         // Setting number of threads and blocks in grid for DeepCopy of meshFunction
         int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
         int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
         dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps );
-        
+                
         DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(),
                 helpFunc.template modifyData< Device>(), 1, i );
         
@@ -486,7 +495,7 @@ solve( const MeshPointer& mesh,
         while( BlockIterD )
         {
           //numberWhile++;
-          /** HERE IS CHESS METHOD **/
+          /** HERE IS CHESS METHOD (NO MPI) **/
           
           /*
            CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
@@ -523,7 +532,7 @@ solve( const MeshPointer& mesh,
           /**------------------------------------------------------------------------------------------------*/
           
           
-     /** HERE IS FIM **/
+     /** HERE IS FIM FOR MPI AND WITHOUT MPI **/
           Devices::Cuda::synchronizeDevice();
           CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
                   interfaceMapPtr.template getData< Device >(),
@@ -564,7 +573,8 @@ solve( const MeshPointer& mesh,
           }
 #endif
           
-          // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now. 
+          // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now.
+          Devices::Cuda::synchronizeDevice(); 
           GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY );
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
@@ -584,31 +594,27 @@ solve( const MeshPointer& mesh,
             std::cout << std::endl;
           }
 #endif
-          // Parallel reduction to see if we should calculate again BlockIterD
+          // "Parallel reduction" to see if we should calculate again BlockIterD
+          BlockIterD = BlockIterDevice.containsValue(1);
+          /*Devices::Cuda::synchronizeDevice();
           CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
           
           // Parallel reduction on dBlock because of too large number of blocks (more than maximum number of threads)
+          Devices::Cuda::synchronizeDevice();
           CudaParallelReduc<<< 1, 1024 >>>( dBlock, dBlock, nBlocks );
           cudaDeviceSynchronize();
-          TNL_CHECK_CUDA_DEVICE;
-#if ForDebug          
-          if( i == 0 ){
-            dBlock1 = dBlock;
-            printf("nBlocks = %d\n",nBlocks);
-            for( int m =0; m< nBlocks; m++ ){
-              std::cout << dBlock1[m] << " ";
-            }
-            std::cout << std::endl;
-          }
-#endif          
+          TNL_CHECK_CUDA_DEVICE;*/
+          
           // Copy of the first element which is result of parallel reduction
+          /*Devices::Cuda::synchronizeDevice();
           BlockIterD = dBlock.getElement( 0 );
           cudaDeviceSynchronize();
-          TNL_CHECK_CUDA_DEVICE;
+          TNL_CHECK_CUDA_DEVICE;*/
           
           // When we change something then we should caclucate again in the next passage of MPI ( calculated = true )
+         
           
           if( BlockIterD ){
             calculated = 1;
@@ -663,22 +669,23 @@ solve( const MeshPointer& mesh,
 #if ForDebug
         printf( "%d: Receved Calculated = %d.\n%d: Calculate = %d\n", i, calculated, i, calculate);
 #endif
-        aux.template synchronize< Communicator >();
         
 #if ForDebug 
         if( i == 1 )
           printf("WhileCount = %d\n",WhileCount);
         //calculated = 0; // DEBUG;
-#endif
-        }
 #endif
       }
-      String s( "aux-" + std::to_string( i ) + ".tnl" );
-      aux.save( s );   
-      Aux=auxPtr;
-      
-      iteration++;
+#endif
+      if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculated 0!
+        calculated = 0;
+    }
+    iteration++;
   }
+  //String s( "aux-" + std::to_string( i ) + ".tnl" );
+  //aux.save( s );   
+  Aux=auxPtr; // copy it for MakeSnapshot
+  
   aux.save("aux-final.tnl");
 }
 
@@ -708,6 +715,16 @@ __global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real,
       }
       printf( "\n");
     }
+    if( i==0 && j == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
+    {
+      for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){
+        for( int l = 0; l < 17; l++ ){
+          printf( "%.2f ", helpFunc[ m * mesh.getDimensions().x() + l ]);
+        }
+        printf( "\n");
+      }
+      printf( "\n");
+    }
   }
 }
 
@@ -733,7 +750,10 @@ __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, I
       pom = 1;//BlockIterPom[ i ] = 1;
     }
     
-    BlockIterPom[ i ] = pom;//BlockIterPom[ i ];
+    if( BlockIterDevice[ i ] != 1 )
+      BlockIterPom[ i ] = pom;//BlockIterPom[ i ];
+    else
+      BlockIterPom[ i ] = 1;
   }
 }
 
@@ -866,8 +886,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     
     xkolik = blockDim.x + 1;
     ykolik = blockDim.y + 1;
-    numOfBlocky = (dimY-vUpper[1]-vLower[1])/blockDim.y + (((dimY-vUpper[1]-vLower[1])%blockDim.y != 0) ? 1:0);
-    numOfBlockx = (dimX-vUpper[0]-vLower[0])/blockDim.x + (((dimX-vUpper[0]-vLower[0])%blockDim.x != 0) ? 1:0);
+    numOfBlocky = gridDim.y;//(dimY-vUpper[1]-vLower[1])/blockDim.y + (((dimY-vUpper[1]-vLower[1])%blockDim.y != 0) ? 1:0);
+    numOfBlockx = gridDim.x;//(dimX-vUpper[0]-vLower[0])/blockDim.x + (((dimX-vUpper[0]-vLower[0])%blockDim.x != 0) ? 1:0);
     
     if( numOfBlockx - 1 == blockIdx.x )
       xkolik = (dimX-vUpper[0]-vLower[0]) - (blockIdx.x)*blockDim.x+1;
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 9c5471beb..40a1efeba 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -15,6 +15,7 @@
 
 #include "tnlFastSweepingMethod.h"
 
+
 template< typename Real,
         typename Device,
         typename Index,
@@ -67,8 +68,25 @@ solve( const MeshPointer& mesh,
   InterfaceMapPointer interfaceMapPtr;
   auxPtr->setMesh( mesh );
   interfaceMapPtr->setMesh( mesh );
+  
+  // Distributed mesh used for the overlaps (null pointer when running without MPI)
+  Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh();
+  
+  // getting overlaps ( WITHOUT MPI SHOULD BE 0 )
+  Containers::StaticVector< 3, IndexType > vLower;
+  vLower[0] = 0; vLower[1] = 0; vLower[2] = 0;
+  Containers::StaticVector< 3, IndexType > vUpper;
+  vUpper[0] = 0; vUpper[1] = 0; vUpper[2] = 0;
+#ifdef HAVE_MPI
+  if( CommunicatorType::isDistributed() )
+  {
+    vLower = meshPom->getLowerOverlap();
+    vUpper = meshPom->getUpperOverlap();
+  }
+#endif
+  
   std::cout << "Initiating the interface cells ..." << std::endl;
-  BaseType::initInterface( u, auxPtr, interfaceMapPtr );
+  BaseType::initInterface( u, auxPtr, interfaceMapPtr, vLower, vUpper );
   auxPtr->save( "aux-ini.tnl" );   
   
   typename MeshType::Cell cell( *mesh );
@@ -76,313 +94,525 @@ solve( const MeshPointer& mesh,
   IndexType iteration( 0 );
   MeshFunctionType aux = *auxPtr;
   InterfaceMapType interfaceMap = * interfaceMapPtr;
+  aux.template synchronize< Communicator >(); //synchronization of intial conditions
+  int i = MPI::GetRank( MPI::AllGroup ); //getting identification of MPI thread
+#if ForDebug
+        if( i == 2 ){
+          aux.save("aux-init2.tnl");
+          mesh->save("mesh-2.tnl");
+        }
+        if( i == 1 ){
+          aux.save("aux-init1.tnl");
+          mesh->save("mesh-1.tnl");
+        }
+        if( i == 3 ){
+          aux.save("aux-init3.tnl");
+          mesh->save("mesh-3.tnl");
+        }
+        if( i == 0 ){
+          aux.save("aux-init0.tnl");
+          mesh->save("mesh-0.tnl");
+        }
+#endif
+  
   while( iteration < this->maxIterations )
-  {
-    if( std::is_same< DeviceType, Devices::Host >::value )
+  {    
+#if  ForDebug 
+    int WhileCount = 0; // number of passages of while cycle with condition calculated
+    printf( "%d: meshDimensions are (x,y,z) = (%d,%d,%d).\n",i, mesh->getDimensions().x(), mesh->getDimensions().y(), mesh->getDimensions().z() );
+    printf( "%d: owerlaps are ([x1,x2],[y1,y2],[z1,z2]) = ([%d,%d],[%d,%d],[%d,%d]).\n",i, vLower[0], vUpper[0], vLower[1], vUpper[1], vUpper[2], vLower[2] );
+    /*if( std::is_same< DeviceType, Devices::Host >::value && i == 2 )
     {
-      int numThreadsPerBlock = 64;
-      
-      
-      int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
-      int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
-      int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 1:0);
-      //std::cout << "numBlocksX = " << numBlocksX << std::endl;
-      
-      /*Real **sArray = new Real*[numBlocksX*numBlocksY];
-       for( int i = 0; i < numBlocksX * numBlocksY; i++ )
-       sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];*/
-      
-      ArrayContainer BlockIterHost;
-      BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ );
-      BlockIterHost.setValue( 1 );
-      int IsCalculationDone = 1;
-      
-      MeshFunctionPointer helpFunc( mesh );
-      MeshFunctionPointer helpFunc1( mesh );
-      helpFunc1 = auxPtr;
-      auxPtr = helpFunc;
-      helpFunc = helpFunc1;
-      //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl;
-      /*for( int k = numBlocksX-1; k >-1; k-- ){
-       for( int l = 0; l < numBlocksY; l++ ){
-       std::cout<< BlockIterHost[ l*numBlocksX  + k ];
-       }
-       std::cout<<std::endl;
-       }
-       std::cout<<std::endl;*/
-      unsigned int numWhile = 0;
-      while( IsCalculationDone  )
-      {      
-        IsCalculationDone = 0;
-        helpFunc1 = auxPtr;
-        auxPtr = helpFunc;
-        helpFunc = helpFunc1;
-        this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+      for( int j = mesh->getDimensions().y()-1; j>-1; j-- ){
+        for( int m = 0; m < mesh->getDimensions().x(); m++ )
+          printf( "%.2f " , aux[ j*mesh->getDimensions().x() + m ]);
+        printf("\n");
+      }
+      printf("\n");
+    }*/    
+#endif
+    
+    int calculated = 1; // indicates weather we calculated in the last passage of the while cycle 
+    // calculated is same for all ranks 
+    // without MPI should be FALSE at the end of while cycle body
+    int calculate = 1; // indicates if the thread should calculate again in upcoming passage of cycle
+    // calculate is a value that can differ in every rank
+    // without MPI should be FALSE at the end of while cycle body
+    
+    while( calculated )
+    {
+      calculated = 0;
+#if ForDebug
+      WhileCount++;
+#endif
+      if( std::is_same< DeviceType, Devices::Host >::value && calculate ) // should we calculate in Host?
+      {
+        calculate = 0;
+        
+/** HERE IS FSM FOR OPENMP (NO MPI) - isn't worth it */
+        /*int numThreadsPerBlock = 64;
+         
+         
+         int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
+         int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
+         int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 1:0);
+         //std::cout << "numBlocksX = " << numBlocksX << std::endl;
+         
+         //Real **sArray = new Real*[numBlocksX*numBlocksY];
+         // for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+         // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];
+         
+         ArrayContainer BlockIterHost;
+         BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ );
+         BlockIterHost.setValue( 1 );
+         int IsCalculationDone = 1;
+         
+         MeshFunctionPointer helpFunc( mesh );
+         MeshFunctionPointer helpFunc1( mesh );
+         helpFunc1 = auxPtr;
+         auxPtr = helpFunc;
+         helpFunc = helpFunc1;
+         //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl;
+         //for( int k = numBlocksX-1; k >-1; k-- ){
+         //for( int l = 0; l < numBlocksY; l++ ){
+         // std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+         // }
+         // std::cout<<std::endl;
+         // }
+         // std::cout<<std::endl;
+         unsigned int numWhile = 0;
+         while( IsCalculationDone  )
+         {      
+         IsCalculationDone = 0;
+         helpFunc1 = auxPtr;
+         auxPtr = helpFunc;
+         helpFunc = helpFunc1;
+         this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         
+         //Reduction      
+         for( int i = 0; i < BlockIterHost.getSize(); i++ ){
+         if( IsCalculationDone == 0 ){
+         IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
+         //break;
+         }
+         }
+         numWhile++;
+         
+         
+         this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ );
+         
+         //string s( "aux-"+ std::to_string(numWhile) + ".tnl");
+         //aux.save( s );
+         }
+         if( numWhile == 1 ){
+         auxPtr = helpFunc;
+         }
+         aux = *auxPtr;*/
+/**------------------------------------------------------------------------------*/
+        
+        
+/** HERE IS FSM WITH MPI AND WITHOUT MPI */
+        
+#if ForDebug
+        if( i == 1 ){
+          aux.save("aux-final10.tnl");
+        }
+#endif
+        for( cell.getCoordinates().z() = 0 + vLower[2];
+                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
+                cell.getCoordinates().z()++ )
+        {
+          for( cell.getCoordinates().y() = 0 + vLower[1];
+                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
+                  cell.getCoordinates().y()++ )
+          {
+            for( cell.getCoordinates().x() = 0 + vLower[0];
+                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                    cell.getCoordinates().x()++ )
+            {
+              cell.refresh();
+              if( ! interfaceMap( cell ) )
+              {
+                // getting information whether we calculated in this passage
+                calculated = this->updateCell( aux, cell ) || calculated;
+              }
+            }
+          }
+        }
+#if ForDebug
+        if( i == 1 ){
+          aux.save("aux-final11.tnl");
+        }
+        int pocNull = 0;
+        for( cell.getCoordinates().z() = 0 + vLower[2];
+                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
+                cell.getCoordinates().z()++ )
+        {
+          for( cell.getCoordinates().y() = 0 + vLower[1];
+                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
+                  cell.getCoordinates().y()++ )
+          {
+            for( cell.getCoordinates().x() = 0 + vLower[0];
+                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                    cell.getCoordinates().x()++ )
+            {
+              cell.refresh();
+              if( fabs( aux(cell) ) < 0.002 )
+                pocNull++;
+            }
+          }
+        }
+        printf("%d: 1. pocNull = %d\n", i , pocNull);
+#endif        
+        for( cell.getCoordinates().z() = 0 + vLower[2];
+                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
+                cell.getCoordinates().z()++ )
+        {
+          for( cell.getCoordinates().y() = 0 + vLower[1];
+                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
+                  cell.getCoordinates().y()++ )
+          {
+            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0];
+                    cell.getCoordinates().x() >= 0 + vLower[0];
+                    cell.getCoordinates().x()-- )		
+            {
+              //std::cerr << "2 -> ";
+              cell.refresh();
+              if( ! interfaceMap( cell ) )            
+                this->updateCell( aux, cell );
+            }
+          }
+        }
+#if ForDebug
+        if( i == 1 ){
+          aux.save("aux-final12.tnl");
+        }
+        pocNull = 0;
+        for( cell.getCoordinates().z() = 0 + vLower[2];
+                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
+                cell.getCoordinates().z()++ )
+        {
+          for( cell.getCoordinates().y() = 0 + vLower[1];
+                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
+                  cell.getCoordinates().y()++ )
+          {
+            for( cell.getCoordinates().x() = 0 + vLower[0];
+                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                    cell.getCoordinates().x()++ )
+            {
+              cell.refresh();
+              if( fabs( aux(cell) ) < 0.002 )
+                pocNull++;
+            }
+          }
+        }
+        printf("%d: 2. pocNull = %d\n", i , pocNull);
+#endif        
+        //aux.save( "aux-2.tnl" );
+        for( cell.getCoordinates().z() = 0 + vLower[2];
+                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
+                cell.getCoordinates().z()++ )
+        {
+          for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1];
+                  cell.getCoordinates().y() >= 0 + vLower[1];
+                  cell.getCoordinates().y()-- )
+          {
+            for( cell.getCoordinates().x() = 0 + vLower[0];
+                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                    cell.getCoordinates().x()++ )
+            {
+              //std::cerr << "3 -> ";
+              cell.refresh();
+              if( ! interfaceMap( cell ) )            
+                this->updateCell( aux, cell );
+            }
+          }
+        }
+#if ForDebug
+        if( i == 1 ){
+          aux.save("aux-final13.tnl");
+        }
+        pocNull = 0;
+        for( cell.getCoordinates().z() = 0 + vLower[2];
+                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
+                cell.getCoordinates().z()++ )
+        {
+          for( cell.getCoordinates().y() = 0 + vLower[1];
+                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
+                  cell.getCoordinates().y()++ )
+          {
+            for( cell.getCoordinates().x() = 0 + vLower[0];
+                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                    cell.getCoordinates().x()++ )
+            {
+              cell.refresh();
+              if( fabs( aux(cell) ) < 0.002 )
+                pocNull++;
+            }
+          }
+        }
+        printf("%d: 3. pocNull = %d\n", i , pocNull);
+#endif        
+        //aux.save( "aux-3.tnl" );
         
-        //Reduction      
-        for( int i = 0; i < BlockIterHost.getSize(); i++ ){
-          if( IsCalculationDone == 0 ){
-            IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
-            //break;
+        for( cell.getCoordinates().z() = 0 + vLower[2];
+                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
+                cell.getCoordinates().z()++ )
+        {
+          for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1];
+                  cell.getCoordinates().y() >= 0 + vLower[1];
+                  cell.getCoordinates().y()-- )
+          {
+            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0];
+                    cell.getCoordinates().x() >= 0 + vLower[0];
+                    cell.getCoordinates().x()-- )		
+            {
+              //std::cerr << "4 -> ";
+              cell.refresh();
+              if( ! interfaceMap( cell ) )            
+                this->updateCell( aux, cell );
+            }
+          }
+        }  
+#if ForDebug
+        if( i == 1 ){
+          aux.save("aux-final14.tnl");
+        }
+        pocNull = 0;
+        for( cell.getCoordinates().z() = 0 + vLower[2];
+                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
+                cell.getCoordinates().z()++ )
+        {
+          for( cell.getCoordinates().y() = 0 + vLower[1];
+                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
+                  cell.getCoordinates().y()++ )
+          {
+            for( cell.getCoordinates().x() = 0 + vLower[0];
+                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                    cell.getCoordinates().x()++ )
+            {
+              cell.refresh();
+              if( fabs( aux(cell) ) < 0.002 )
+                pocNull++;
+            }
           }
         }
-        numWhile++;
-        std::cout <<"numWhile = "<< numWhile <<std::endl;
-        /*for( int k = 0; k < numBlocksZ; k++ ){
-          for( int j = numBlocksY-1; j>-1; j-- ){
-            for( int i = 0; i < numBlocksX; i++ ){
-              //std::cout << (*auxPtr)[ k * numBlocksX * numBlocksY + j * numBlocksX + i ] << " ";
-              std::cout << BlockIterHost[ k * numBlocksX * numBlocksY + j * numBlocksX + i ];
+        printf("%d: 4. pocNull = %d\n", i , pocNull);
+#endif        
+        //aux.save( "aux-4.tnl" );
+        
+        for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2];
+                cell.getCoordinates().z() >= 0 + vLower[2];
+                cell.getCoordinates().z()-- )
+        {
+          for( cell.getCoordinates().y() = 0 + vLower[1];
+                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
+                  cell.getCoordinates().y()++ )
+          {
+            for( cell.getCoordinates().x() = 0 + vLower[0];
+                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                    cell.getCoordinates().x()++ )
+            {
+              //std::cerr << "5 -> ";
+              cell.refresh();
+              if( ! interfaceMap( cell ) )
+                this->updateCell( aux, cell );
+            }
+          }
+        }
+#if ForDebug
+        if( i == 1 ){
+          aux.save("aux-final15.tnl");
+        }
+        pocNull = 0;
+        for( cell.getCoordinates().z() = 0 + vLower[2];
+                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
+                cell.getCoordinates().z()++ )
+        {
+          for( cell.getCoordinates().y() = 0 + vLower[1];
+                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
+                  cell.getCoordinates().y()++ )
+          {
+            for( cell.getCoordinates().x() = 0 + vLower[0];
+                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                    cell.getCoordinates().x()++ )
+            {
+              cell.refresh();
+              if( fabs( aux(cell) ) < 0.002 )
+                pocNull++;
             }
-            std::cout << std::endl;
           }
-          std::cout << std::endl;
         }
-        std::cout << std::endl;*/
+        printf("%d: 5. pocNull = %d\n", i , pocNull);
+ #endif       
+        //aux.save( "aux-5.tnl" );
         
-        this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ );
+        for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2];
+                cell.getCoordinates().z() >= 0 + vLower[2];
+                cell.getCoordinates().z()-- )
+        {
+          for( cell.getCoordinates().y() = 0 + vLower[1];
+                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
+                  cell.getCoordinates().y()++ )
+          {
+            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0];
+                    cell.getCoordinates().x() >= 0 + vLower[0];
+                    cell.getCoordinates().x()-- )		
+            {
+              //std::cerr << "6 -> ";
+              cell.refresh();
+              if( ! interfaceMap( cell ) )            
+                this->updateCell( aux, cell );
+            }
+          }
+        }
+#if ForDebug
+        if( i == 1 ){
+          aux.save("aux-final16.tnl");
+        }
+        pocNull = 0;
+        for( cell.getCoordinates().z() = 0 + vLower[2];
+                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
+                cell.getCoordinates().z()++ )
+        {
+          for( cell.getCoordinates().y() = 0 + vLower[1];
+                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
+                  cell.getCoordinates().y()++ )
+          {
+            for( cell.getCoordinates().x() = 0 + vLower[0];
+                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                    cell.getCoordinates().x()++ )
+            {
+              cell.refresh();
+              if( fabs( aux(cell) ) < 0.002 )
+                pocNull++;
+            }
+          }
+        }
+        printf("%d: 6. pocNull = %d\n", i , pocNull);
+#endif        
+        //aux.save( "aux-6.tnl" );
         
-        /*for( int k = 0; k < numBlocksZ; k++ ){
-          for( int j = numBlocksY-1; j>-1; j-- ){
-            for( int i = 0; i < numBlocksX; i++ ){
-              //std::cout << (*auxPtr)[ k * numBlocksX * numBlocksY + j * numBlocksX + i ] << " ";
-              std::cout << BlockIterHost[ k * numBlocksX * numBlocksY + j * numBlocksX + i ];
+        for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2];
+                cell.getCoordinates().z() >= 0 + vLower[2];
+                cell.getCoordinates().z()-- )
+        {
+          for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1];
+                  cell.getCoordinates().y() >= 0 + vLower[1];
+                  cell.getCoordinates().y()-- )
+          {
+            for( cell.getCoordinates().x() = 0 + vLower[0];
+                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                    cell.getCoordinates().x()++ )
+            {
+              //std::cerr << "7 -> ";
+              cell.refresh();
+              if( ! interfaceMap( cell ) )            
+                this->updateCell( aux, cell );
             }
-            std::cout << std::endl;
           }
-          std::cout << std::endl;
-        }*/
+        }
         
-        /*for( int j = numBlocksY-1; j>-1; j-- ){
-         for( int i = 0; i < numBlocksX; i++ )
-         std::cout << "BlockIterHost = "<< j*numBlocksX + i<< " ," << BlockIterHost[ j * numBlocksX + i ];
-         std::cout << std::endl;
-         }
-         std::cout << std::endl;*/
+#if ForDebug
+        if( i == 1 ){
+          aux.save("aux-final17.tnl");
+        }
+        pocNull = 0;
+        for( cell.getCoordinates().z() = 0 + vLower[2];
+                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
+                cell.getCoordinates().z()++ )
+        {
+          for( cell.getCoordinates().y() = 0 + vLower[1];
+                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
+                  cell.getCoordinates().y()++ )
+          {
+            for( cell.getCoordinates().x() = 0 + vLower[0];
+                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                    cell.getCoordinates().x()++ )
+            {
+              cell.refresh();
+              if( fabs( aux(cell) ) < 0.002 )
+                pocNull++;
+            }
+          }
+        }
+        printf("%d: 7. pocNull = %d\n", i , pocNull);
+#endif        
+        //aux.save( "aux-7.tnl" );
         
-        //std::cout<<std::endl;
-        //string s( "aux-"+ std::to_string(numWhile) + ".tnl");
-        //aux.save( s );
-      }
-      if( numWhile == 1 ){
-        auxPtr = helpFunc;
+        for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2];
+                cell.getCoordinates().z() >= 0 + vLower[2];
+                cell.getCoordinates().z()-- )
+        {
+          for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1];
+                  cell.getCoordinates().y() >= 0 + vLower[1];
+                  cell.getCoordinates().y()-- )
+          {
+            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0];
+                    cell.getCoordinates().x() >= 0 + vLower[0];
+                    cell.getCoordinates().x()-- )		
+            {
+              //std::cerr << "8 -> ";
+              cell.refresh();
+              if( ! interfaceMap( cell ) )            
+                this->updateCell( aux, cell );
+            }
+          }
+        }
+#if ForDebug
+        if( i == 1 ){
+          aux.save("aux-final18.tnl");
+        }
+        pocNull = 0;
+        for( cell.getCoordinates().z() = 0 + vLower[2];
+                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
+                cell.getCoordinates().z()++ )
+        {
+          for( cell.getCoordinates().y() = 0 + vLower[1];
+                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
+                  cell.getCoordinates().y()++ )
+          {
+            for( cell.getCoordinates().x() = 0 + vLower[0];
+                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                    cell.getCoordinates().x()++ )
+            {
+              cell.refresh();
+              if( fabs( aux(cell) ) < 0.002 )
+                pocNull++;
+            }
+          }
+        }
+        printf("%d: 8. pocNull = %d\n", i , pocNull);
+        for( cell.getCoordinates().z() = 0 + vLower[2];
+                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
+                cell.getCoordinates().z()++ )
+        {
+          for( cell.getCoordinates().y() = 0 + vLower[1];
+                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
+                  cell.getCoordinates().y()++ )
+          {
+            for( cell.getCoordinates().x() = 0 + vLower[0];
+                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
+                    cell.getCoordinates().x()++ )
+            {
+              cell.refresh();
+              printf("%.2f ", aux(cell));
+            }
+            printf("\n");
+          }
+          printf("\n");
+        }
+#endif
+        
+        /**----------------------------------------------------------------------------------*/
       }
-      aux = *auxPtr;
-      
-      /*for( cell.getCoordinates().z() = 0;
-       cell.getCoordinates().z() < mesh->getDimensions().z();
-       cell.getCoordinates().z()++ )
-       {
-       for( cell.getCoordinates().y() = 0;
-       cell.getCoordinates().y() < mesh->getDimensions().y();
-       cell.getCoordinates().y()++ )
-       {
-       for( cell.getCoordinates().x() = 0;
-       cell.getCoordinates().x() < mesh->getDimensions().x();
-       cell.getCoordinates().x()++ )
-       {
-       cell.refresh();
-       if( ! interfaceMap( cell ) )
-       this->updateCell( aux, cell );
-       }
-       }
-       }
-       //aux.save( "aux-1.tnl" );
-       
-       for( cell.getCoordinates().z() = 0;
-       cell.getCoordinates().z() < mesh->getDimensions().z();
-       cell.getCoordinates().z()++ )
-       {
-       for( cell.getCoordinates().y() = 0;
-       cell.getCoordinates().y() < mesh->getDimensions().y();
-       cell.getCoordinates().y()++ )
-       {
-       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-       cell.getCoordinates().x() >= 0 ;
-       cell.getCoordinates().x()-- )		
-       {
-       //std::cerr << "2 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       }
-       //aux.save( "aux-2.tnl" );
-       for( cell.getCoordinates().z() = 0;
-       cell.getCoordinates().z() < mesh->getDimensions().z();
-       cell.getCoordinates().z()++ )
-       {
-       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-       cell.getCoordinates().y() >= 0 ;
-       cell.getCoordinates().y()-- )
-       {
-       for( cell.getCoordinates().x() = 0;
-       cell.getCoordinates().x() < mesh->getDimensions().x();
-       cell.getCoordinates().x()++ )
-       {
-       //std::cerr << "3 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       }
-       //aux.save( "aux-3.tnl" );
-       
-       for( cell.getCoordinates().z() = 0;
-       cell.getCoordinates().z() < mesh->getDimensions().z();
-       cell.getCoordinates().z()++ )
-       {
-       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-       cell.getCoordinates().y() >= 0;
-       cell.getCoordinates().y()-- )
-       {
-       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-       cell.getCoordinates().x() >= 0 ;
-       cell.getCoordinates().x()-- )		
-       {
-       //std::cerr << "4 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       }     
-       //aux.save( "aux-4.tnl" );
-       
-       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-       cell.getCoordinates().z() >= 0;
-       cell.getCoordinates().z()-- )
-       {
-       for( cell.getCoordinates().y() = 0;
-       cell.getCoordinates().y() < mesh->getDimensions().y();
-       cell.getCoordinates().y()++ )
-       {
-       for( cell.getCoordinates().x() = 0;
-       cell.getCoordinates().x() < mesh->getDimensions().x();
-       cell.getCoordinates().x()++ )
-       {
-       //std::cerr << "5 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )
-       this->updateCell( aux, cell );
-       }
-       }
-       }
-       //aux.save( "aux-5.tnl" );
-       
-       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-       cell.getCoordinates().z() >= 0;
-       cell.getCoordinates().z()-- )
-       {
-       for( cell.getCoordinates().y() = 0;
-       cell.getCoordinates().y() < mesh->getDimensions().y();
-       cell.getCoordinates().y()++ )
-       {
-       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-       cell.getCoordinates().x() >= 0 ;
-       cell.getCoordinates().x()-- )		
-       {
-       //std::cerr << "6 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       }
-       //aux.save( "aux-6.tnl" );
-       
-       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-       cell.getCoordinates().z() >= 0;
-       cell.getCoordinates().z()-- )
-       {
-       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-       cell.getCoordinates().y() >= 0 ;
-       cell.getCoordinates().y()-- )
-       {
-       for( cell.getCoordinates().x() = 0;
-       cell.getCoordinates().x() < mesh->getDimensions().x();
-       cell.getCoordinates().x()++ )
-       {
-       //std::cerr << "7 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       }
-       //aux.save( "aux-7.tnl" );
-       
-       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-       cell.getCoordinates().z() >= 0;
-       cell.getCoordinates().z()-- )
-       {
-       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-       cell.getCoordinates().y() >= 0;
-       cell.getCoordinates().y()-- )
-       {
-       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-       cell.getCoordinates().x() >= 0 ;
-       cell.getCoordinates().x()-- )		
-       {
-       //std::cerr << "8 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       }*/
-    }
-    if( std::is_same< DeviceType, Devices::Cuda >::value )
-    {
-      // TODO: CUDA code
-#ifdef HAVE_CUDA
-      const int cudaBlockSize( 8 );
-      int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
-      int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
-      int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); 
-      if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
-        std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
-      dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
-      dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
-      
-      tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
-      
-      
-      int BlockIterD = 1;
-      
-      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
-      BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ );
-      BlockIterDevice.setValue( 1 );
-      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
-      BlockIterPom.setSize( numBlocksX * numBlocksY * numBlocksZ );
-      BlockIterPom.setValue( 0 );
-      /*int *BlockIterDevice;
-       cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/
-      int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
-      
-      TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
-      dBlock.setSize( nBlocks );
-      dBlock.setValue( 0 );
-      
-      int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0);
-      /*int *dBlock;
-       cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
-      MeshFunctionPointer helpFunc1( mesh );      
-      MeshFunctionPointer helpFunc( mesh );
-      
-      helpFunc1 = auxPtr;
-      auxPtr = helpFunc;
-      helpFunc = helpFunc1;
-      int numIter = 0;
-      
-      while( BlockIterD )
+      if( std::is_same< DeviceType, Devices::Cuda >::value && calculate )
       {
-        helpFunc1 = auxPtr;
-        auxPtr = helpFunc;
-        helpFunc = helpFunc1;
-        TNL_CHECK_CUDA_DEVICE;
+#ifdef HAVE_CUDA
+        // cudaBlockSize is the edge length of a CUDA block, so one block holds cudaBlockSize^3 threads.
+        // cudaBlockSize^3 must not exceed 1024 (the maximum number of threads per block).
+        // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2)
+        const int cudaBlockSize( 8 );
         
         CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
                 interfaceMapPtr.template getData< Device >(),
@@ -404,32 +634,192 @@ solve( const MeshPointer& mesh,
         CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
-        cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
-        numIter++;
-        /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
-         BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
-        
+        aux = *auxPtr;
+        interfaceMap = *interfaceMapPtr;
+#endif
       }
-      if( numIter == 1 ){
-        auxPtr = helpFunc;
+      
+#ifdef HAVE_MPI
+      if( CommunicatorType::isDistributed() ){
+        
+        const int *neigh = meshPom->getNeighbors(); // Getting neighbours of the distributed mesh
+        MPI::Request *req;
+        req = new MPI::Request[meshPom->getNeighborsCount()];  
+        
+        int neighCount = 0; // the number is known only at runtime and it can differ for every MPI thread
+        // Getting information whether each of the six neighbours (top, bottom, right, left, ahead, behind) calculated
+        int calculpom[6] = {0,0,0,0,0,0}; 
+        
+        
+        if( neigh[0] != -1 ) // if you have west neighbour
+        {
+          // if we have this neighbour, we send calculated, one number, to him, ...
+          req[neighCount] = MPI::ISend( &calculated, 1, neigh[0], 0, MPI::AllGroup );
+          neighCount++;
+          // and we receive the same information from him
+          req[neighCount] = MPI::IRecv( &calculpom[0], 1, neigh[0], 0, MPI::AllGroup );
+          neighCount++;
+        }
+        
+        if( neigh[1] != -1 ) // east
+        {
+          req[neighCount] = MPI::ISend( &calculated, 1, neigh[1], 0, MPI::AllGroup ); 
+          neighCount++;
+          
+          
+          req[neighCount] = MPI::IRecv( &calculpom[1], 1, neigh[1], 0, MPI::AllGroup );
+          neighCount++;
+        }
+        
+        if( neigh[2] != -1 ) // north
+        {
+          req[neighCount] = MPI::ISend( &calculated, 1, neigh[2], 0, MPI::AllGroup );
+          neighCount++;
+          
+          req[neighCount] = MPI::IRecv( &calculpom[2], 1, neigh[2], 0, MPI::AllGroup  );
+          neighCount++;
+        }
+        
+        if( neigh[5] != -1 ) //south
+        {
+          req[neighCount] = MPI::ISend( &calculated, 1, neigh[5], 0, MPI::AllGroup );
+          neighCount++;
+          
+          req[neighCount] = MPI::IRecv( &calculpom[3], 1, neigh[5], 0, MPI::AllGroup );
+          neighCount++;
+        }
+        
+        if( neigh[8] != -1 ) // top 
+        {
+          req[neighCount] = MPI::ISend( &calculated, 1, neigh[8], 0, MPI::AllGroup );
+          neighCount++;
+          
+          req[neighCount] = MPI::IRecv( &calculpom[4], 1, neigh[8], 0, MPI::AllGroup );
+          neighCount++;
+        }
+        
+        if( neigh[17] != -1 ) //bottom
+        {
+          req[neighCount] = MPI::ISend( &calculated, 1, neigh[17], 0, MPI::AllGroup );
+          neighCount++;
+          
+          req[neighCount] = MPI::IRecv( &calculpom[5], 1, neigh[17], 0, MPI::AllGroup );
+          neighCount++;
+        }
+        
+        MPI::WaitAll(req,neighCount); //waiting for all to have all the information
+#if ForDebug
+        printf( "%d: Sending Calculated = %d.\n", i, calculated );
+        printf( "%d: calculpom[0] = %d, calculpom[1] = %d, calculpom[2] = %d, calculpom[3] = %d, calculpom[4] = %d,"
+                "calculpom[5] = %d", i ,calculpom[0],calculpom[1],calculpom[2],calculpom[3],calculpom[4],calculpom[5]);
+#endif        
+        // if any of the MPI threads had calculated = 1, then all get 1. Otherwise all get 0
+        MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR,  MPI::AllGroup ); 
+        // synchronize the overlaps
+        aux.template synchronize< Communicator >();
+        // if any of my neighbours had calculated = 1, then I should calculate again (but all of us have to go through while(calculated))
+        calculate = calculpom[0] || calculpom[1] || calculpom[2] ||
+                    calculpom[3] || calculpom[4] || calculpom[5];
+#if ForDebug
+        printf( "%d: Receved Calculated = %d.\n%d: Calculate = %d\n", i, calculated, i, calculate);
+#endif
+        
+#if ForDebug 
+        if( i == 1 )
+          printf("WhileCount = %d\n",WhileCount);
+        if( i == 2 ){
+          aux.save("aux-final2.tnl");
+          mesh->save("mesh-2.tnl");
+        }
+        if( i == 1 ){
+          aux.save("aux-final1.tnl");
+          mesh->save("mesh-1.tnl");
+        }
+        if( i == 3 ){
+          aux.save("aux-final3.tnl");
+          mesh->save("mesh-3.tnl");
+        }
+        if( i == 0 ){
+          aux.save("aux-final0.tnl");
+          mesh->save("mesh-0.tnl");
+        }
+        //calculated = 0; // DEBUG;
+#endif
       }
-      //cudaFree( BlockIterDevice );
-      //cudaFree( dBlock );
-      cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
-      aux = *auxPtr;
-      interfaceMap = *interfaceMapPtr;
 #endif
+      if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculated 0!
+        calculated = 0; //otherwise we would go through the FSM code and CUDA FSM code again uselessly
     }
-    
     //aux.save( "aux-8.tnl" );
     iteration++;
     
   }
+  // Saving the results into Aux for MakeSnapshot function.
+  Aux = auxPtr; 
   aux.save("aux-final.tnl");
 }
 
 #ifdef HAVE_CUDA
+// Either use DeepCopy, or laboriously copy the edges (depending on vLower, vUpper) from sArray into helpFunc.
+template< typename Real, typename Device, typename Index >
+__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, int copy, int k )
+{
+  int thri = threadIdx.x + blockDim.x*blockIdx.x;
+  int thrj = blockDim.y*blockIdx.y + threadIdx.y;
+  int thrk = blockDim.z*blockIdx.z + threadIdx.z;
+  
+  const Meshes::Grid< 3, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
+  if( copy ){
+    if( thri < mesh.getDimensions().x() && thrj < mesh.getDimensions().y() && thrk < mesh.getDimensions().z() )
+    {
+      helpFunc[ thrk * mesh.getDimensions().x() * mesh.getDimensions().y() + thrj * mesh.getDimensions().x() + thri ] =
+              aux[ thrk * mesh.getDimensions().x() * mesh.getDimensions().y() + thrj * mesh.getDimensions().x() + thri ];
+    }
+  }
+  else // for debug, values can be printed only from cuda kernel
+  {
+    if( thrk == 0 && thri==0 && thrj == 0 && blockIdx.z == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 0 )
+    {
+      printf("%d: DimX = %d, DimY = %d, DimZ = %d\n", k,mesh.getDimensions().x(),mesh.getDimensions().y(),mesh.getDimensions().z() );
+      for( int z = mesh.getDimensions().z()-1; z > mesh.getDimensions().z()-2; z-- )
+      {
+        for( int y = 0; y < mesh.getDimensions().y(); y++ )
+        {
+          for( int x = 0; x < mesh.getDimensions().x(); x++ )
+          {
+            printf("%.2f ", helpFunc[ z *mesh.getDimensions().y()*mesh.getDimensions().x() + y*mesh.getDimensions().x() + x ]);
+          }
+          printf("\n");
+        }
+        printf("\n");
+      }
+      printf("\n");
+    }
+    if( thrk == 0 && thri==0 && thrj == 0 && blockIdx.z == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 1 )
+    {
+      printf("%d: DimX = %d, DimY = %d, DimZ = %d\n", k,mesh.getDimensions().x(),mesh.getDimensions().y(),mesh.getDimensions().z() );
+      
+      if( k == 1 )
+      {
+        for( int z = 1; z < 2; z++ )
+        {
+          for( int y = 0; y < mesh.getDimensions().y(); y++ )
+          {
+            for( int x = 0; x < mesh.getDimensions().x(); x++ )
+            {
+              printf("%.2f ", aux[ z *mesh.getDimensions().y()*mesh.getDimensions().x() + y*mesh.getDimensions().x() + x ]);
+            }
+            printf("\n");
+          }
+          printf("\n");
+        }
+        printf("\n");
+      }
+    }
+  }
+}
+
 template < typename Index >
 __global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
         TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
@@ -444,21 +834,24 @@ __global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda,
     l = i/( numBlockX * numBlockY );
     k = (i-l*numBlockX * numBlockY )/(numBlockX );
     m = (i-l*numBlockX * numBlockY )%( numBlockX );
-    if( m > 0 && BlockIterDevice[ i - 1 ] ){
+    if( m > 0 && BlockIterDevice[ i - 1 ] ){ // left neighbour
       pom = 1;//BlockIterPom[ i ] = 1;
-    }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){
+    }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ // right neighbour
       pom = 1;//BlockIterPom[ i ] = 1;
-    }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){
+    }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ // bottom neighbour
       pom = 1;// BlockIterPom[ i ] = 1;
-    }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){
+    }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ // top neighbour
       pom = 1;//BlockIterPom[ i ] = 1;
-    }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){
+    }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){ // neighbour behind 
       pom = 1;
-    }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){
+    }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){ // neighbour in front
       pom = 1;
     }
     
-    BlockIterPom[ i ] = pom;//BlockIterPom[ i ];
+    if( !BlockIterDevice[ i ] ) // only in CudaUpdateCellCaller can BlockIterDevice gain 0
+      BlockIterPom[ i ] = pom;
+    else
+      BlockIterPom[ i ] = 1;
   }
 }
 
@@ -471,23 +864,25 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
 {
   int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
   int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
-  int i = threadIdx.x + blockDim.x*blockIdx.x;
-  int j = blockDim.y*blockIdx.y + threadIdx.y;
-  int k = blockDim.z*blockIdx.z + threadIdx.z;
+  int i = threadIdx.x + blockDim.x*blockIdx.x + vLower[0]; // WITH OVERLAPS!!! i,j,k aren't coordinates of all values
+  int j = blockDim.y*blockIdx.y + threadIdx.y + vLower[1];
+  int k = blockDim.z*blockIdx.z + threadIdx.z + vLower[2];
   int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
+  const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
   
-  if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] )
+  // should this block calculate?
+  if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] ) 
   {
     __syncthreads();
     
-    __shared__ volatile bool changed[ 8*8*8/*(sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)*/];
-    
+    // Array indicates whether some threads calculated (for parallel reduction)
+    __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2) ];
     changed[ currentIndex ] = false;
+    
     if( thrj == 0 && thri == 0 && thrk == 0 )
-      changed[ 0 ] = true;
+      changed[ 0 ] = true; // first element indicates whether we should calculate again (principle of parallel reduction)
     
-    const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
-    __shared__ Real hx; __shared__ int dimX;
+    __shared__ Real hx; __shared__ int dimX; //getting steps and size of mesh
     __shared__ Real hy; __shared__ int dimY;
     __shared__ Real hz; __shared__ int dimZ;
     
@@ -500,16 +895,19 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       dimX = mesh.getDimensions().x();
       dimY = mesh.getDimensions().y();
       dimZ = mesh.getDimensions().z();
+      // we don't know yet whether we will calculate in here; more info further down in the code
       BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0;
     }
-    __shared__ volatile Real sArray[ 10*10*10/*sizeSArray * sizeSArray * sizeSArray*/ ];
+    
+    // sArray contains values of one block (copied from aux) and edges (not MPI) of those blocks
+    __shared__ volatile Real sArray[ sizeSArray * sizeSArray * sizeSArray ];
     sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = std::numeric_limits< Real >::max();
     
-    //filling sArray edges
+    // getting some useful information
     int numOfBlockx;
     int numOfBlocky;
     int numOfBlockz;
-    int xkolik;
+    int xkolik; // maximum of threads in x direction (for all blocks different)
     int ykolik;
     int zkolik;
     xkolik = blockDim.x + 1;
@@ -521,65 +919,104 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     __syncthreads();
     
     if( numOfBlockx - 1 == blIdx )
-      xkolik = dimX - (blIdx)*blockDim.x+1;
+      xkolik = (dimX-vUpper[0]-vLower[0]) - (blIdx)*blockDim.x+1;
     if( numOfBlocky -1 == blIdy )
-      ykolik = dimY - (blIdy)*blockDim.y+1;
+      ykolik = (dimY-vUpper[1]-vLower[1]) - (blIdy)*blockDim.y+1;
     if( numOfBlockz-1 == blIdz )
-      zkolik = dimZ - (blIdz)*blockDim.z+1;
+      zkolik = (dimZ-vUpper[2]-vLower[2]) - (blIdz)*blockDim.z+1;
     __syncthreads();
     
-    if( thri == 0 )
+     //filling sArray edges
+    if( thri == 0 ) //x bottom
     {        
-      if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
-        sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
-      else
+      if( (blIdx != 0 || vLower[0] !=0) && thrj+1 < ykolik && thrk+1 < zkolik )
+        sArray[ (thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0 ] = 
+                aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX 
+                + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY + vLower[0] ];
+    else
         sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
     }
     
-    if( thri == 1 )
+    if( thri == 1 ) //xtop
     {
-      if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
-        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
-      else
+      if( dimX - vLower[ 0 ] > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] =
+                aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX
+                + blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY + vLower[0] ];
+     else
         sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max();
     }
-    if( thri == 2 )
+    if( thri == 2 ) //y bottom
     {        
-      if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
-        sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
+      if( (blIdy != 0 || vLower[1] !=0) && thrj+1 < xkolik && thrk+1 < zkolik )
+        sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] =
+                aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX
+                + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY + vLower[0] ];
       else
         sArray[ (thrk+1) * sizeSArray * sizeSArray + 0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
     }
     
-    if( thri == 3 )
+    if( thri == 3 ) //y top
     {
-      if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
-        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
-      else
+      if( dimY - vLower[ 1 ] > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] =
+                aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + ((blIdy+1) * blockDim.y+vLower[1])*dimX
+                + blIdx*blockDim.x + thrj + thrk*dimX*dimY + vLower[0] ];
+     else
         sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
     }
-    if( thri == 4 )
+    if( thri == 4 ) //z bottom
     {        
-      if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
-        sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
-      else
+      if( (blIdz != 0 || vLower[2] !=0) && thrj+1 < ykolik && thrk+1 < xkolik )
+        sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] =
+                aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX
+                + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk + vLower[0] ];
+     else
         sArray[0 * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
     }
     
-    if( thri == 5 )
+    if( thri == 5 ) //z top
     {
-      if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
-        sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
-      else
+      if( dimZ - vLower[ 2 ] > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
+        sArray[ zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] =
+                aux[ ((blIdz+1)*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX
+                + blIdx*blockDim.x + thrj * dimX + thrk + vLower[0] ];
+     else
         sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
     }
     
-    if( i < dimX && j < dimY && k < dimZ )
+    // Copy all other values that aren't edges
+    if( i - vLower[0] < dimX && j - vLower[1] < dimY && k - vLower[2] < dimZ &&
+        thri+1 < xkolik + vUpper[0] && thrj+1 < ykolik + vUpper[1] && thrk+1 < zkolik + vUpper[2] )
     {
       sArray[(thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
     }
     __syncthreads(); 
     
+#if ForDebug    
+    /*if( thri==0 && thrj == 0 && thrk == 0 && blockIdx.z == 0 && blockIdx.x == 2 && blockIdx.y == 2 && MPIthread == 1 )
+    {
+      printf( "všechno před výpočtem: \n");
+      for( int m = sizeSArray-1; m>-1; m-- ){
+        for( int l = 0; l < sizeSArray; l++ )
+          printf( "%.2f ", sArray[4*sizeSArray * sizeSArray + m * sizeSArray + l]);
+        printf( "\n");
+      }
+      printf( "\n");
+    }
+    
+    if(thri==0 && thrj == 0 && thrk == 0 && blockIdx.z == 0 && blockIdx.x == 2 && blockIdx.y == 2 && MPIthread == 1 )
+    {
+      for( int m = 24; m>14; m-- ){
+        for( int l = 15; l < 25; l++ )  
+          printf("%.2f ", aux[ 4 *mesh.getDimensions().y()*mesh.getDimensions().x() + m*mesh.getDimensions().x() + l ]);
+        printf( "\n");
+      }
+      printf( "\n");
+    }*/
+#endif 
+    
+    //main while loop: each value can get information only from its neighbours, so that information has to spread step by step
     while( changed[ 0 ] )
     {
       __syncthreads();
@@ -587,16 +1024,17 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       changed[ currentIndex ] = false;
       
       //calculation of update cell
-      if( i < dimX && j < dimY && k < dimZ )
+      if( i < dimX - vUpper[0] && j < dimY - vUpper[1] && k < dimZ - vUpper[2] )
       {
         if( ! interfaceMap[ k*dimX*dimY + j * dimX + i ] )
         {
-          changed[ currentIndex ] = ptr.updateCell3D< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
+          // calculate new value depending on neighbours in sArray on (thri+1, thrj+1) coordinates
+          changed[ currentIndex ] = ptr.updateCell3D< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); 
         }
       }
       __syncthreads();
       
-      //pyramid reduction
+      //pyramid reduction (parallel reduction)
       if( blockDim.x*blockDim.y*blockDim.z == 1024 )
       {
         if( currentIndex < 512 )
@@ -640,30 +1078,25 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       }
       __syncthreads();
       
-      /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0)
-       {
-       //for(int m = 0; m < 8; m++){
-       int m = 4;
-       for(int n = 0; n<8; n++){
-       for(int b=0; b<8; b++)
-       printf(" %i ", changed[m*64 + n*8 + b]);
-       printf("\n");
-       }
-       printf("\n \n");
-       }
-       //}*/
-      
+      // if we calculated, then the BlockIterDevice should contain the info about this whole block! (only one number for one block)
       if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
       {
-        //printf( "Setting block calculation. Block = %d.\n",blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x  );
         BlockIterDevice[ blIdz * gridDim.x * gridDim.y + blIdy * gridDim.x + blIdx ] = 1;
       }
       __syncthreads();
     }
     
-    if( i < dimX && j < dimY && k < dimZ )
+    // copy results into helpFunc (not into aux bcs of conflicts)
+    if( i < dimX && j < dimY && k < dimZ && thri+1 < xkolik && thrj+1 < ykolik && thrk+1 < zkolik )
       helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thri+1 ];
     
-  } 
+  }
+  else // if not, then it should at least copy the values from aux to helpFunc.
+  {
+    if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1]
+            && k < mesh.getDimensions().z() - vUpper[2])
+      helpFunc[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ] =
+              aux[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ];
+  }
 }  
 #endif
-- 
GitLab


From 7af752a5290d881a36bbd287235cc5da862b63e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matou=C5=A1=20Fencl?= <fenclmat@fjfi.cvut.cz>
Date: Mon, 11 Mar 2019 20:01:43 +0100
Subject: [PATCH 08/14] DeepCopy removed from CUDA

---
 .../tnlFastSweepingMethod2D_impl.h            | 22 +++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 5eac5232b..f9bef30c3 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -465,17 +465,31 @@ solve( const MeshPointer& mesh,
         
         // Helping meshFunction that switches with AuxPtr in every calculation of CudaUpdateCellCaller<<<>>>()
         MeshFunctionPointer helpFunc( mesh );
+        helpFunc.template modifyData() = auxPtr.template getData();
+        Devices::Cuda::synchronizeDevice(); 
         //MeshFunctionPointer helpFunc1( mesh );
         
         // Setting number of threads and blocks in grid for DeepCopy of meshFunction
-        int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+        /*int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
         int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
         dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps );
-                
+        
+        
+          Devices::Cuda::synchronizeDevice();
         DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(),
                 helpFunc.template modifyData< Device>(), 1, i );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          Devices::Cuda::synchronizeDevice();
+        DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(),
+                helpFunc.template modifyData< Device>(), 0, i );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;*/
         
 #if ForDebug
+        /*int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+        int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
+        dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps );*/
         DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(),
                 helpFunc.template modifyData< Device>(), 0, i );
 #endif
@@ -536,7 +550,7 @@ solve( const MeshPointer& mesh,
           Devices::Cuda::synchronizeDevice();
           CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
                   interfaceMapPtr.template getData< Device >(),
-                  auxPtr.template modifyData< Device>(),
+                  auxPtr.template getData< Device>(),
                   helpFunc.template modifyData< Device>(),
                   BlockIterDevice, vLower, vUpper, i );
           cudaDeviceSynchronize();
@@ -701,7 +715,7 @@ __global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real,
   const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
   if( copy ){
     if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
-      helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ];
+      helpFunc[ j * mesh.getDimensions().x() + i ] = 1;//aux[ j * mesh.getDimensions().x() + i ];
   }
   else
   {
-- 
GitLab


From d4412d3188dfd01f6d38d4a9095f5eaf3f847e6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matou=C5=A1=20Fencl?= <fenclmat@fjfi.cvut.cz>
Date: Sat, 16 Mar 2019 11:09:27 +0100
Subject: [PATCH 09/14] Refactoring

---
 .../tnlDirectEikonalMethodBase1D_impl.h       |  204 +++
 .../tnlDirectEikonalMethodBase2D_impl.h       |  803 ++++++++++++
 .../tnlDirectEikonalMethodBase3D_impl.h       | 1091 +++++++++++++++++
 .../tnlDirectEikonalMethodsBase.h             |   79 +-
 .../hamilton-jacobi/tnlFastSweepingMethod.h   |   27 +-
 .../tnlFastSweepingMethod2D_impl.h            |  707 +++--------
 .../tnlFastSweepingMethod3D_impl.h            | 1032 +++-------------
 7 files changed, 2493 insertions(+), 1450 deletions(-)
 create mode 100644 src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h
 create mode 100644 src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
 create mode 100644 src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h
new file mode 100644
index 000000000..55129c4e1
--- /dev/null
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h
@@ -0,0 +1,204 @@
+/* 
+ * File:   tnlDirectEikonalMethodBase1D_impl.h
+ * Author: Fencl
+ *
+ * Created on March 15, 2019
+ */
+
+#pragma once
+
+template< typename Real,
+        typename Device,
+        typename Index >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
+initInterface( const MeshFunctionPointer& _input,
+        MeshFunctionPointer& _output,
+        InterfaceMapPointer& _interfaceMap  )
+{ // Fills _output and _interfaceMap from the sign changes of _input (CUDA kernel or host loops, chosen by Device).
+  if( std::is_same< Device, Devices::Cuda >::value )
+  {
+#ifdef HAVE_CUDA
+    const MeshType& mesh = _input->getMesh();
+    // 1D launch: 16 threads per block, block count obtained from getNumberOfBlocks for the x dimension
+    const int cudaBlockSize( 16 );
+    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    dim3 blockSize( cudaBlockSize );
+    dim3 gridSize( numBlocksX );
+    Devices::Cuda::synchronizeDevice();
+    CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+            _output.template modifyData< Device >(),
+            _interfaceMap.template modifyData< Device >() );
+    cudaDeviceSynchronize();
+    TNL_CHECK_CUDA_DEVICE;
+#endif
+  }
+  if( std::is_same< Device, Devices::Host >::value )
+  {
+    const MeshType& mesh = _input->getMesh();
+    typedef typename MeshType::Cell Cell;
+    const MeshFunctionType& input = _input.getData();
+    MeshFunctionType& output = _output.modifyData();
+    InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
+    Cell cell( mesh );
+    for( cell.getCoordinates().x() = 0;
+            cell.getCoordinates().x() < mesh.getDimensions().x();
+            cell.getCoordinates().x() ++ )
+    {
+      cell.refresh();
+      output[ cell.getIndex() ] =
+              input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
+                -std::numeric_limits< RealType >::max();
+      interfaceMap[ cell.getIndex() ] = false;
+    }
+    // Above: every cell starts at +max or -max (sign of the input) with a cleared interface flag.
+    // Below: each sign change between a cell and its +x neighbour sets the values of both cells.
+    const RealType& h = mesh.getSpaceSteps().x();
+    for( cell.getCoordinates().x() = 0;
+            cell.getCoordinates().x() < mesh.getDimensions().x() - 1;
+            cell.getCoordinates().x() ++ )
+    {
+      cell.refresh();
+      const RealType& c = input( cell );      
+      if( ! cell.isBoundaryEntity()  )
+      {
+        const auto& neighbors = cell.getNeighborEntities();
+        Real pom = 0;
+        // pom: signed distance from this cell to the zero of the linear interpolant towards neighbour e
+        const IndexType e = neighbors.template getEntityIndex<  1 >();
+        if( c * input[ e ] <= 0 )
+        {
+          pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
+          if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
+            output[ cell.getIndex() ] = pom;
+          // the same zero seen from neighbour e: magnitude h - |pom|, opposite sign
+          pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+          if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
+            output[ e ] = pom; 
+          // both cells next to the sign change are flagged in the interface map
+          interfaceMap[ cell.getIndex() ] = true;
+          interfaceMap[ e ] = true;
+        }
+      }
+    }
+  }
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+template< typename MeshEntity >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
+updateCell( MeshFunctionType& u,
+        const MeshEntity& cell, 
+        const RealType v )
+{ // Recomputes u at `cell` from its x neighbours; v divides the step h in the update.
+  const auto& neighborEntities = cell.template getNeighborEntities< 1 >();
+  const MeshType& mesh = cell.getMesh();
+  const RealType& h = mesh.getSpaceSteps().x();
+  const RealType value = u( cell );
+  RealType a, tmp = std::numeric_limits< RealType >::max();
+  // a: neighbour value; the first/last cell has a single neighbour, otherwise argAbsMin chooses (presumably the smaller magnitude)
+  if( cell.getCoordinates().x() == 0 )
+    a = u[ neighborEntities.template getEntityIndex< 1 >() ];
+  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
+    a = u[ neighborEntities.template getEntityIndex< -1 >() ];
+  else
+  {
+    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ],
+            u[ neighborEntities.template getEntityIndex<  1 >() ] );
+  }
+  // neighbour value still at +/-max: leave the cell untouched
+  if( fabs( a ) == std::numeric_limits< RealType >::max() )
+    return;
+  // candidate: the neighbour value shifted by h/v in the direction given by the sign of the current value
+  tmp = a + TNL::sign( value ) * h/v;
+  // store whichever of the current value and the candidate argAbsMin returns
+  u[ cell.getIndex() ] = argAbsMin( value, tmp );
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+__cuda_callable__
+bool
+tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
+updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
+{ // Updates sArray[ thri ] from its two neighbours; returns true when the value moved by more than 0.001*h.
+  const RealType value = sArray[ thri ];
+  RealType a, tmp = std::numeric_limits< RealType >::max();
+  // reads sArray[ thri-1 ] and sArray[ thri+1 ], so thri needs one valid element on each side
+  a = TNL::argAbsMin( sArray[ thri+1 ],
+          sArray[ thri-1 ] );
+  // chosen neighbour still at +/-max: no update, report "unchanged"
+  if( fabs( a ) == std::numeric_limits< RealType >::max() )
+    return false;
+  // candidate: the neighbour value shifted by h/v in the direction given by the sign of the current value
+  tmp = a + TNL::sign( value ) * h/v;
+  
+  
+  sArray[ thri ] = argAbsMin( value, tmp );
+  // tmp is reused as the difference between the old and the stored value
+  tmp = value - sArray[ thri ];
+  if ( fabs( tmp ) >  0.001*h )
+    return true;
+  else
+    return false;
+}
+
+#ifdef HAVE_CUDA
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap )
+{
+  int i = threadIdx.x + blockDim.x*blockIdx.x;
+  const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  // one thread per cell index i; threads with i past the x dimension do nothing
+  if( i < mesh.getDimensions().x()  )
+  {
+    typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell;
+    Cell cell( mesh );
+    cell.getCoordinates().x() = i;
+    cell.refresh();
+    const Index cind = cell.getIndex();
+    // initial value: +max or -max according to the sign of the input, interface flag cleared;
+    // this thread writes only output[ cind ] and interfaceMap[ cind ], neighbours are only read
+    output[ cind ] =
+            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+              - std::numeric_limits< Real >::max();
+    interfaceMap[ cind ] = false; 
+    // non-boundary cells: test for a sign change towards the +x (e) and the -x (w) neighbour
+    const Real& h = mesh.getSpaceSteps().x();
+    cell.refresh();
+    const Real& c = input( cell );
+    if( ! cell.isBoundaryEntity()  )
+    {
+      auto neighbors = cell.getNeighborEntities();
+      Real pom = 0;
+      const Index e = neighbors.template getEntityIndex< 1 >();
+      const Index w = neighbors.template getEntityIndex< -1 >();
+      if( c * input[ e ] <= 0 )
+      {
+        pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+          output[ cind ] = pom;                       
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ w ] <= 0 )
+      {
+        pom = TNL::sign( c )*( h * c )/( c - input[ w ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
+    }
+  }
+  
+}
+#endif
+
+
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
new file mode 100644
index 000000000..583e22478
--- /dev/null
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
@@ -0,0 +1,803 @@
+/* 
+ * File:   tnlDirectEikonalMethodBase2D_impl.h
+ * Author: Fencl
+ *
+ * Created on March 15, 2019
+ */
+
+#pragma once
+
+template< typename Real,
+        typename Device,
+        typename Index >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+initInterface( const MeshFunctionPointer& _input,
+        MeshFunctionPointer& _output,
+        InterfaceMapPointer& _interfaceMap, 
+        const StaticVector vecLowerOverlaps, 
+        const StaticVector vecUpperOverlaps )
+{
+  // Fills _output and _interfaceMap from the sign changes of _input (CUDA kernel or host loops, chosen by Device).
+  if( std::is_same< Device, Devices::Cuda >::value )
+  {
+#ifdef HAVE_CUDA
+    const MeshType& mesh = _input->getMesh();
+    // 2D launch: 16x16 threads per block, block counts obtained from getNumberOfBlocks per dimension
+    const int cudaBlockSize( 16 );
+    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
+    dim3 blockSize( cudaBlockSize, cudaBlockSize );
+    dim3 gridSize( numBlocksX, numBlocksY );
+    Devices::Cuda::synchronizeDevice();
+    CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+            _output.template modifyData< Device >(),
+            _interfaceMap.template modifyData< Device >(),
+            vecLowerOverlaps, vecUpperOverlaps);
+    cudaDeviceSynchronize();
+    TNL_CHECK_CUDA_DEVICE;
+#endif
+  }
+  if( std::is_same< Device, Devices::Host >::value )
+  {
+    MeshFunctionType input = _input.getData();    // NOTE(review): taken by value here, the 1D version binds a const reference - confirm
+    MeshFunctionType& output = _output.modifyData();
+    InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
+    const MeshType& mesh = input.getMesh();
+    typedef typename MeshType::Cell Cell;
+    Cell cell( mesh );
+    for( cell.getCoordinates().y() = 0;
+            cell.getCoordinates().y() < mesh.getDimensions().y();
+            cell.getCoordinates().y() ++ )
+      for( cell.getCoordinates().x() = 0;
+              cell.getCoordinates().x() < mesh.getDimensions().x();
+              cell.getCoordinates().x() ++ )
+      {
+        cell.refresh();
+        output[ cell.getIndex() ] =
+                input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
+                  - std::numeric_limits< RealType >::max();
+        interfaceMap[ cell.getIndex() ] = false;
+      }
+    // second pass: only cells in [ vecLowerOverlaps, dimensions - vecUpperOverlaps ) are tested against their n and e neighbours
+    const RealType& hx = mesh.getSpaceSteps().x();
+    const RealType& hy = mesh.getSpaceSteps().y();     
+    for( cell.getCoordinates().y() = 0 + vecLowerOverlaps[1];
+            cell.getCoordinates().y() < mesh.getDimensions().y() - vecUpperOverlaps[1];
+            cell.getCoordinates().y() ++ )
+      for( cell.getCoordinates().x() = 0 + vecLowerOverlaps[0];
+              cell.getCoordinates().x() < mesh.getDimensions().x() - vecUpperOverlaps[0];
+              cell.getCoordinates().x() ++ )
+      {
+        cell.refresh();
+        const RealType& c = input( cell );
+        if( ! cell.isBoundaryEntity()  )
+        {
+          auto neighbors = cell.getNeighborEntities();
+          Real pom = 0;
+          const IndexType e = neighbors.template getEntityIndex<  1,  0 >();
+          const IndexType n = neighbors.template getEntityIndex<  0,  1 >();
+          if( c * input[ n ] <= 0 )
+          {
+            pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+            if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
+              output[ cell.getIndex() ] = pom;
+            pom = pom - TNL::sign( c )*hy;
+            if( TNL::abs( output[ n ] ) > TNL::abs( pom ) )
+              output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy;
+            // both cells next to the sign change are flagged in the interface map
+            interfaceMap[ cell.getIndex() ] = true;
+            interfaceMap[ n ] = true;
+          }
+          if( c * input[ e ] <= 0 )
+          {
+            pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+            if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
+              output[ cell.getIndex() ] = pom;
+            // the same zero seen from neighbour e: magnitude hx - |pom|, opposite sign
+            pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+            if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
+              output[ e ] = pom; 
+            
+            interfaceMap[ cell.getIndex() ] = true;
+            interfaceMap[ e ] = true;
+          }
+        }
+      }
+  }
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+template< typename MeshEntity >
+__cuda_callable__
+bool
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+updateCell( MeshFunctionType& u,
+        const MeshEntity& cell,   
+        const RealType v)
+{ // Recomputes u at `cell` from its x and y neighbours; returns true when the value moved by more than 0.001*hx.
+  const auto& neighborEntities = cell.template getNeighborEntities< 2 >();
+  const MeshType& mesh = cell.getMesh();
+  const RealType& hx = mesh.getSpaceSteps().x();
+  const RealType& hy = mesh.getSpaceSteps().y();
+  const RealType value = u( cell );
+  RealType a, b, tmp = std::numeric_limits< RealType >::max();
+  // a: x-direction neighbour value; the first/last column has a single neighbour, otherwise argAbsMin chooses
+  if( cell.getCoordinates().x() == 0 )
+    a = u[ neighborEntities.template getEntityIndex< 1,  0 >() ];
+  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
+    a = u[ neighborEntities.template getEntityIndex< -1,  0 >() ];
+  else
+  {
+    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1,  0 >() ],
+            u[ neighborEntities.template getEntityIndex<  1,  0 >() ] );
+  }
+  // b: the same choice in the y direction
+  if( cell.getCoordinates().y() == 0 )
+    b = u[ neighborEntities.template getEntityIndex< 0,  1 >()];
+  else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
+    b = u[ neighborEntities.template getEntityIndex< 0,  -1 >() ];
+  else
+  {
+    b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0,  -1 >() ],
+            u[ neighborEntities.template getEntityIndex< 0,   1 >() ] );
+  }
+  // both directions still at +/-max: nothing to compute from, report "unchanged"
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() )
+    return false;
+  // layout passed to getNewValue: three values, then three steps; the third value/step pair is filled with max and 0.0
+  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), hx, hy, 0.0 };
+  
+  tmp = getNewValue( pom , value, v );
+  
+  u[ cell.getIndex() ] = tmp;
+  
+  // tmp is reused as the difference between the old and the stored value
+  tmp = value - u[ cell.getIndex() ];
+  
+  if ( fabs( tmp ) >  0.001*hx )
+    return true;
+  else
+    return false;
+  
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+__cuda_callable__
+bool
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy,
+        const Real v )
+{ // Updates element (thrj, thri) of sArray (row length sizeSArray); returns true when it moved by more than 0.001*hx.
+  const RealType value = sArray[ thrj * sizeSArray + thri ];
+  RealType a, b, tmp = std::numeric_limits< RealType >::max();
+  // b: chosen from the rows thrj-1 and thrj+1, so the element needs a valid row on each side
+  b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ],
+          sArray[ (thrj-1) * sizeSArray + thri ] );
+  // a: chosen from the columns thri-1 and thri+1, so the element needs a valid column on each side
+  a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ],
+          sArray[ thrj * sizeSArray + thri-1 ] );
+  // both directions still at +/-max: nothing to compute from, report "unchanged"
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() )
+    return false;
+  // layout passed to getNewValue: three values, then three steps; the third value/step pair is filled with max and 0.0
+  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
+  
+  tmp = getNewValue( pom , value, v );
+  
+  sArray[ thrj * sizeSArray + thri ] = tmp;
+  tmp = value - sArray[ thrj * sizeSArray + thri ];
+  if ( fabs( tmp ) >  0.001*hx )
+    return true;
+  else
+    return false;
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+__cuda_callable__
+Real
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+getNewValue( RealType valuesAndSteps[], const RealType originalValue, const RealType v )
+{ // Computes the new cell value from valuesAndSteps (six entries: three values, then three steps); the array is reordered in place.
+  RealType newValue = std::numeric_limits< RealType >::max();
+  sortMinims( valuesAndSteps );
+  // after sortMinims: [0..2] hold the values by increasing magnitude, [3..5] their steps in the same order
+  // one-sided candidate from the smallest-magnitude value (calculation taken from ZHAO)
+  newValue = valuesAndSteps[ 0 ] + TNL::sign( originalValue ) * valuesAndSteps[ 3 ]/v;
+  if( fabs( newValue ) < fabs( valuesAndSteps[ 1 ] ) ) 
+  {
+    newValue = argAbsMin( originalValue, newValue );
+  }
+  else
+  { // two-sided formula using both values and both steps; NOTE(review): the sqrt argument is not checked for being non-negative
+    newValue = ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] * valuesAndSteps[ 1 ] + 
+            valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] * valuesAndSteps[ 0 ] + 
+            TNL::sign( originalValue ) * valuesAndSteps[ 3 ] * valuesAndSteps[ 4 ] * 
+            TNL::sqrt( ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] +  valuesAndSteps[ 4 ] *  valuesAndSteps[ 4 ] )/( v * v ) - 
+            ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) * 
+            ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) ) )/
+            ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] );
+    newValue = argAbsMin( originalValue, newValue );
+  }
+  // in both branches the candidate was passed through argAbsMin together with originalValue
+  return newValue;
+}
+
+
+template < typename T1 >
+__cuda_callable__ void sortMinims( T1 pom[] )
+{ // Orders pom[0..2] by increasing magnitude and applies the same permutation to pom[3..5]; pom must hold six entries.
+  T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; 
+  if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){
+    tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2];
+    tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5];
+    // already ordered: straight copy
+  }
+  else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){
+    tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1];
+    tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4];
+  }
+  else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){
+    tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2];
+    tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5];
+  }
+  else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){
+    tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0];
+    tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3];
+  }
+  else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){
+    tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1];
+    tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4];
+  }
+  else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){
+    tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0];
+    tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3];
+  }
+  // NOTE(review): if no branch matched (e.g. a NaN in pom[0..2]) tmp is still all zeros and pom is overwritten with zeros
+  for( unsigned int i = 0; i < 6; i++ )
+  {
+    pom[ i ] = tmp[ i ];
+  }   
+}
+
+#ifdef HAVE_CUDA
+// Initializes `output` and `interfaceMap` from the sign changes of `input`, one thread per cell.
+// Every cell first gets +max or -max (sign of the input) and a cleared interface flag. Cells
+// inside [ vecLowerOverlaps, dimensions - vecUpperOverlaps ) are then compared with their four
+// direct neighbours; on a sign change the cell gets its linearly interpolated distance and is flagged.
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+        const Containers::StaticVector< 2, Index > vecLowerOverlaps, 
+        const Containers::StaticVector< 2, Index > vecUpperOverlaps ) 
+{
+  int i = threadIdx.x + blockDim.x*blockIdx.x;
+  int j = blockDim.y*blockIdx.y + threadIdx.y;
+  const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  
+  if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+  {
+    typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
+    Cell cell( mesh );
+    cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
+    cell.refresh();
+    const Index cind = cell.getIndex();
+    
+    output[ cind ] =
+            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+              - std::numeric_limits< Real >::max();
+    interfaceMap[ cind ] = false; 
+    
+    // The lower bound on j uses the y overlap (component 1), matching the host version;
+    // component 0 here would apply the x overlap to the y direction.
+    if( i < mesh.getDimensions().x() - vecUpperOverlaps[ 0 ] &&
+            j < mesh.getDimensions().y() - vecUpperOverlaps[ 1 ] &&
+            i > vecLowerOverlaps[ 0 ] - 1 && j > vecLowerOverlaps[ 1 ] - 1 )
+    {
+      const Real& hx = mesh.getSpaceSteps().x();
+      const Real& hy = mesh.getSpaceSteps().y();
+      cell.refresh();
+      const Real& c = input( cell );
+      if( ! cell.isBoundaryEntity()  )
+      {
+        auto neighbors = cell.getNeighborEntities();
+        Real tmp = 0;
+        const Index e = neighbors.template getEntityIndex<  1,  0 >();
+        const Index w = neighbors.template getEntityIndex<  -1,  0 >();
+        const Index n = neighbors.template getEntityIndex<  0,  1 >();
+        const Index s = neighbors.template getEntityIndex<  0,  -1 >();
+        // c * input[ x ] <= 0: the input changes sign (or is zero) between this cell and neighbour x;
+        // each thread writes only its own cell, neighbours are only read
+        if( c * input[ n ] <= 0 )
+        {
+          tmp = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) )
+            output[ cind ] = tmp;
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ e ] <= 0 )
+        {
+          tmp = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) )
+            output[ cind ] = tmp;
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ w ] <= 0 )
+        {
+          tmp = TNL::sign( c )*( hx * c )/( c - input[ w ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) )
+            output[ cind ] = tmp;
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ s ] <= 0 )
+        {
+          tmp = TNL::sign( c )*( hy * c )/( c - input[ s ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) )
+            output[ cind ] = tmp;
+          interfaceMap[ cind ] = true;
+        }
+      }
+    }
+  }
+}
+
+
+// Spreads the per-block calculation flags to the direct neighbours of each block.
+// blockCalculationIndicator: numBlockX * numBlockY flags in row-major order (only read);
+// blockCalculationIndicatorHelp: output, 1 where the block itself has flag 1 or one of the
+// blocks at i-1, i+1, i-numBlockX, i+numBlockX (inside the grid) has a non-zero flag, else 0.
+// The index computation assumes 1024 threads per CUDA block.
+template < typename Index >
+__global__ void GetNeighbours( const TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
+        TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY )
+{
+  int i = blockIdx.x * 1024 + threadIdx.x;
+  
+  if( i < numBlockX * numBlockY )
+  {
+    const int m = i % numBlockX; // column of block i
+    const int k = i / numBlockX; // row of block i
+    
+    // All four neighbours are read from the input array. The output array is written by
+    // other threads of this same launch, so reading it here would race with those writes.
+    const bool activeNeighbour =
+            ( m > 0 && blockCalculationIndicator[ i - 1 ] ) ||
+            ( m < numBlockX - 1 && blockCalculationIndicator[ i + 1 ] ) ||
+            ( k > 0 && blockCalculationIndicator[ i - numBlockX ] ) ||
+            ( k < numBlockY - 1 && blockCalculationIndicator[ i + numBlockX ] );
+    
+    blockCalculationIndicatorHelp[ i ] =
+            ( blockCalculationIndicator[ i ] == 1 || activeNeighbour ) ? 1 : 0;
+  }
+}
+
+
+
+
+template < int sizeSArray, typename Real, typename Device, typename Index >
+__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
+        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
+        TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
+        const Containers::StaticVector< 2, Index > vecLowerOverlaps, 
+        const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock )
+{
+  // Setting up threads
+  int thri = threadIdx.x; int thrj = threadIdx.y;
+  int i = threadIdx.x + blockDim.x*blockIdx.x + vecLowerOverlaps[0];
+  int j = blockDim.y*blockIdx.y + threadIdx.y + vecLowerOverlaps[1];
+  const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
+  
+/** FOR CHESS METHOD */
+  //if( (blockIdx.y%2  + blockIdx.x) % 2 == oddEvenBlock )
+  //{
+/**------------------------------------------*/
+  
+/** FOR FIM METHOD */
+  if( blockCalculationIndicator[ blockIdx.y * gridDim.x + blockIdx.x ] )
+  { 
+    __syncthreads();
+/**-----------------------------------------*/
+    
+    const int dimX = mesh.getDimensions().x(); const int dimY = mesh.getDimensions().y();
+    const Real hx = mesh.getSpaceSteps().x(); const Real hy = mesh.getSpaceSteps().y();
+    if( thri==0 && thrj == 0)
+    {
+      blockCalculationIndicator[ blockIdx.y * gridDim.x + blockIdx.x ] = 0;
+    }
+    __syncthreads();
+    int maxThreadsInXDirection;
+    int maxThreadsInYDirection;
+    
+    // Maximum threads in each direction can differ
+    // e.g. cudaBlockSize = 16, dimX = 50, then:
+    // blockIdx   maxThreadsInXDirection   calculation [from, to]  sArray [from, to] 
+    //    0                 16                    [ 0,15]             [ 0,16]   //"-1" set to inf
+    //    1                 16                    [16,31]             [15,32]
+    //    2                 16                    [32,47]             [31,48]
+    //    3                  2                    [48,50]             [47,50]   // rest set to inf
+    // same for YDirection because blocks are squared 
+    maxThreadsInXDirection = blockDim.x + 1;
+    maxThreadsInYDirection = blockDim.y + 1;
+    
+    if( gridDim.x - 1 == blockIdx.x ) // care about number of values if we are in last block
+      maxThreadsInXDirection = (dimX-vecUpperOverlaps[0]-vecLowerOverlaps[0]) - (blockIdx.x)*blockDim.x+1;
+    
+    if( gridDim.y - 1 == blockIdx.y ) // care about number of values if we are in last block
+      maxThreadsInYDirection = (dimY-vecUpperOverlaps[1]-vecLowerOverlaps[1]) - (blockIdx.y)*blockDim.y+1;
+    __syncthreads();
+    
+    // Setting changed array that contains info: "Did the value of this thread changed in last passage?"
+    // Will be used in parallel reduction ( inside block level )
+    int currentIndex = thrj * blockDim.x + thri;
+    __shared__ volatile bool changed[ ( sizeSArray - 2 ) * ( sizeSArray - 2 ) ];
+    changed[ currentIndex ] = false;
+    if( thrj == 0 && thri == 0 )
+      changed[ 0 ] = true; // fist must be true to start while cycle
+    
+    
+    //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
+    __shared__ volatile Real sArray[ sizeSArray * sizeSArray ];
+    sArray[ (thrj+1) * sizeSArray + thri +1 ] = std::numeric_limits< Real >::max();
+    
+       
+    //filling sArray edges
+    if( thri == 0 ) // 
+    {      
+      if( dimX - vecLowerOverlaps[ 0 ] > (blockIdx.x+1) * blockDim.x  && thrj+1 < maxThreadsInYDirection )
+        sArray[ (thrj+1)*sizeSArray + maxThreadsInXDirection ] = 
+                aux[ (blockIdx.y*blockDim.y+vecLowerOverlaps[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 // this to get to right possition
+                + (thrj+1)*dimX + maxThreadsInXDirection + vecLowerOverlaps[0] ];                        // rest to get the right sArray overlap
+      else
+        sArray[ (thrj+1)*sizeSArray + maxThreadsInXDirection ] = std::numeric_limits< Real >::max();
+    }
+        
+    if( thri == 1 )
+    { 
+      if( ( blockIdx.x != 0 || vecLowerOverlaps[0] != 0 ) && thrj+1 < maxThreadsInYDirection )
+        sArray[(thrj+1)*sizeSArray + 0] = 
+                aux[ (blockIdx.y*blockDim.y+vecLowerOverlaps[1])*dimX - dimX + blockIdx.x*blockDim.x - 1
+                + (thrj+1)*dimX  + vecLowerOverlaps[0] ];
+      else
+        sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 2 )
+    {
+      if( dimY - vecLowerOverlaps[ 1 ] > (blockIdx.y+1) * blockDim.y  && thrj+1 < maxThreadsInXDirection )
+        sArray[ maxThreadsInYDirection * sizeSArray + thrj+1 ] = 
+                aux[ ( blockIdx.y * blockDim.y + vecLowerOverlaps[ 1 ] ) * dimX - dimX + blockIdx.x * blockDim.x - 1
+                + maxThreadsInYDirection * dimX + thrj + 1 + vecLowerOverlaps[0] ];
+      else
+        sArray[ maxThreadsInYDirection*sizeSArray + thrj+1 ] = std::numeric_limits< Real >::max();
+      
+    }
+        
+    if( thri == 3 )
+    {
+      if( ( blockIdx.y != 0 || vecLowerOverlaps[1] != 0 ) && thrj+1 < maxThreadsInXDirection )
+        sArray[0*sizeSArray + thrj+1] = 
+                aux[ ( blockIdx.y * blockDim.y + vecLowerOverlaps[ 1 ] ) * dimX - dimX + blockIdx.x * blockDim.x - 1
+                + thrj + 1 + vecLowerOverlaps[ 0 ] ];
+      else
+        sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
+    }
+    
+    // Filling sArray inside
+    if( i - vecLowerOverlaps[ 0 ] < dimX && j - vecLowerOverlaps[ 1 ] < dimY &&
+            thri + 1 < maxThreadsInXDirection + vecUpperOverlaps[ 0 ] && 
+            thrj + 1 < maxThreadsInYDirection + vecUpperOverlaps[ 1 ] )
+    {
+      sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ] = aux[ j * dimX + i ];
+    }
+    __syncthreads();  
+
+    //main while cycle ( CALCULATES TILL VALUES ARE CHANGING )
+    while( changed[ 0 ] )
+    {
+      __syncthreads();
+      
+      changed[ currentIndex] = false;
+      
+      //calculation of update cell
+      if( i < dimX - vecUpperOverlaps[ 0 ] && j < dimY - vecUpperOverlaps[ 1 ] )
+      {
+        if( ! interfaceMap[ j * dimX + i ] )
+        {
+          changed[ currentIndex ] = ptr.updateCell<sizeSArray>( sArray, thri + 1, thrj + 1, hx, hy );
+        }
+      }
+      __syncthreads();
+      
+      //pyramid reduction
+      if( blockDim.x * blockDim.y == 1024 )
+      {
+        if( currentIndex < 512 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
+        }
+      }
+      __syncthreads();
+      if( blockDim.x * blockDim.y >= 512 )
+      {
+        if( currentIndex < 256 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+        }
+      }
+      __syncthreads();
+      if( blockDim.x * blockDim.y >= 256 )
+      {
+        if( currentIndex < 128 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
+        }
+      }
+      __syncthreads();
+      if( blockDim.x * blockDim.y >= 128 )
+      {
+        if( currentIndex < 64 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+        }
+      }
+      __syncthreads();
+      if( currentIndex < 32 ) 
+      {
+        if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+        if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+        if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+        if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+        if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+        if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+      }
+      // result of reduction is in changed[ 0 ]
+      
+      // If we calculated in passage, then the blockCalculationIndicator for this block has to be 1
+      // means that we calculated in this block
+      if( thri == 0 && thrj == 0 && changed[ 0 ] ){
+        blockCalculationIndicator[ blockIdx.y * gridDim.x + blockIdx.x ] = 1;
+      }
+      __syncthreads();
+    }
+    
+    
+      
+    if( i < dimX && j < dimY && thri+1 < maxThreadsInXDirection && thrj+1 < maxThreadsInYDirection )
+      helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ];
+    __syncthreads();
+  }
+  else
+  {
+    if( i < mesh.getDimensions().x() - vecUpperOverlaps[0] && j < mesh.getDimensions().y() - vecUpperOverlaps[1] )
+      helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ];
+  }
+}
+#endif
+
+
+
+/// ====================OPEN=MP============================================
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray > // edge length of the square scratch tile (row * sizeSArray + column); the indexing below needs sizeSArray >= numThreadsPerBlock + 2
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+updateBlocks( InterfaceMapType interfaceMap, // cells flagged here are never passed to updateCell
+        MeshFunctionType aux, // source of the values copied into the scratch tile
+        MeshFunctionType helpFunc, // destination of the swept tile interior
+        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) // per-block activity flags; block edge length in cells
+{ // For each flagged block: load it with a one-cell halo from aux, run four ordered sweeps of updateCell, store the result into helpFunc.
+#pragma omp parallel for schedule( dynamic )
+  for( IndexType i = 0; i < BlockIterHost.getSize(); i++ ) // i = linear block index
+  {
+    if( BlockIterHost[ i ] )
+    {
+      MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
+      // Grid size in cells and the number of blocks per direction, rounded up to cover a partial last block.
+      int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+      //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl;
+      int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
+      //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl;
+      int xkolik = numThreadsPerBlock + 1; // tile column of the upper-x halo for a full block (cells in x + 1)
+      int ykolik = numThreadsPerBlock + 1; // tile row of the upper-y halo for a full block (cells in y + 1)
+      // Block coordinates recovered from the linear index; x varies fastest.
+      int blIdx = i%numOfBlockx;
+      int blIdy = i/numOfBlockx;
+      //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl;
+      // The last block in a direction may be partial: use the remaining cell count + 1 instead.
+      if( numOfBlockx - 1 == blIdx )
+        xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
+      
+      if( numOfBlocky -1 == blIdy )
+        ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
+      //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl;
+      
+      
+      /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
+       changed[ 0 ] = 1;*/
+      Real hx = mesh.getSpaceSteps().x();
+      Real hy = mesh.getSpaceSteps().y();
+      // The block flag is cleared here and set again further down only when 'changed' ends up true.
+      bool changed = false;
+      BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
+      
+      // Scratch tile private to this block; every entry starts at the largest Real value.
+      Real *sArray;
+      sArray = new Real[ sizeSArray * sizeSArray ];
+      if( sArray == nullptr ) // NOTE(review): a plain new-expression throws std::bad_alloc rather than returning nullptr, so this branch looks unreachable
+        std::cout << "Error while allocating memory for sArray." << std::endl;
+      
+      for( IndexType thri = 0; thri < sizeSArray; thri++ ){
+        for( IndexType thrj = 0; thrj < sizeSArray; thrj++ )
+          sArray[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
+      }
+      
+      // Halo fill. The shared term blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 is the aux index matching tile entry (0,0).
+      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+      for( IndexType thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
+      {        
+        if( dimX > (blIdx+1) * numThreadsPerBlock  && thrj+1 < ykolik ) // upper-x halo (tile column xkolik), only when cells exist beyond this block in x
+          sArray[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
+        
+        
+        if( blIdx != 0 && thrj+1 < ykolik ) // lower-x halo (tile column 0), skipped for the first block column
+          sArray[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
+        
+        if( dimY > (blIdy+1) * numThreadsPerBlock  && thrj+1 < xkolik ) // upper-y halo (tile row ykolik), only when cells exist beyond this block in y
+          sArray[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
+        
+        if( blIdy != 0 && thrj+1 < xkolik ) // lower-y halo (tile row 0), skipped for the first block row
+          sArray[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
+      }
+      // Interior fill: tile entry (k+1, l+1) receives grid cell (blIdy*numThreadsPerBlock + k, blIdx*numThreadsPerBlock + l) when it lies inside the grid.
+      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
+        for( IndexType l = 0; l < numThreadsPerBlock; l++ )
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
+            sArray[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
+      }
+      // Sweep 1: k ascending, l ascending. This is the only sweep whose updateCell result is folded into 'changed'.
+      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
+        for( IndexType l = 0; l < numThreadsPerBlock; l++ ){
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){
+            //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
+            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
+            {
+              changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy) || changed;
+              
+            }
+          }
+        }
+      }
+      /*aux.save( "aux-1pruch.tnl" );
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
+      // Sweep 2: k ascending, l descending. NOTE(review): the updateCell result is discarded in sweeps 2-4 — confirm that is intended.
+      for( IndexType k = 0; k < numThreadsPerBlock; k++ ) 
+        for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { 
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
+          {
+            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
+            {
+              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
+            }
+          }
+        }
+      /*aux.save( "aux-2pruch.tnl" );
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
+      // Sweep 3: k descending, l ascending (updateCell result discarded).
+      for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ) 
+        for( IndexType l = 0; l < numThreadsPerBlock; l++ ) {
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
+          {
+            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
+            {
+              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
+            }
+          }
+        }
+      /*aux.save( "aux-3pruch.tnl" );
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
+      // Sweep 4: k descending, l descending. Unlike sweeps 1-3 this call passes an explicit trailing 1.0 (presumably the speed argument of updateCell — TODO confirm).
+      for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
+        for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { 
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
+          {
+            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
+            {
+              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx, hy, 1.0);
+            }
+          }
+        }
+      }
+      /*aux.save( "aux-4pruch.tnl" );
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
+      
+      // Re-flag the block when the first sweep reported a change.
+      if( changed ){
+        BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1;
+      }
+      
+      // Write the tile interior back; the result goes to helpFunc, aux itself is only read in this function.
+      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
+        for( IndexType l = 0; l < numThreadsPerBlock; l++ ) {
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )      
+            helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray[ (k + 1)* sizeSArray + l + 1 ];
+          //std::cout<< sArray[k+1][l+1];
+        }
+        //std::cout<<std::endl;
+      }
+      delete []sArray;
+    }
+  }
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+void 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY ) // NOTE(review): BlockIterHost is taken by value; the writes below reach the caller only if ArrayContainer copies share their data — confirm
+{ // Additionally flags every block that has at least one flagged direct neighbour in the numBlockX x numBlockY row-major block layout.
+  int* BlockIterPom; 
+  BlockIterPom = new int [numBlockX * numBlockY];
+  // Pass 1: compute the neighbour-derived flags into a temporary buffer, reading only the unmodified input flags.
+  for(int i = 0; i < numBlockX * numBlockY; i++)
+  {
+    BlockIterPom[ i ] = 0;// default: no flagged neighbour found
+    int m=0, k=0; // m = block column, k = block row of linear index i
+    m = i%numBlockX;
+    k = i/numBlockX;
+    if( m > 0 && BlockIterHost[ i - 1 ] ){ // neighbour at column m-1
+      BlockIterPom[ i ] = 1;
+    }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){ // neighbour at column m+1
+      BlockIterPom[ i ] = 1;
+    }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){ // neighbour at row k-1
+      BlockIterPom[ i ] = 1;
+    }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){ // neighbour at row k+1
+      BlockIterPom[ i ] = 1;
+    }
+  }
+  // Pass 2: merge into BlockIterHost; blocks that were already flagged keep their value.
+  for(int i = 0; i < numBlockX * numBlockY; i++)
+  {
+    if( !BlockIterHost[ i ] )
+      BlockIterHost[ i ] = BlockIterPom[ i ];
+  }
+  delete[] BlockIterPom;
+}
+
+
+
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
new file mode 100644
index 000000000..91f9a0efe
--- /dev/null
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
@@ -0,0 +1,1091 @@
+/* 
+ * File:   tnlDirectEikonalMethodBase3D_impl.h
+ * Author: Fencl
+ *
+ * Created on March 15, 2019
+ */
+
+#pragma once
+
+template< typename Real,
+        typename Device,
+        typename Index >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+initInterface( const MeshFunctionPointer& _input, // function whose sign changes define the interface
+        MeshFunctionPointer& _output, // receives +/- max(Real) or a distance estimate next to a sign change
+        InterfaceMapPointer& _interfaceMap, // set to true on cells next to a sign change
+        StaticVector vLower, StaticVector vUpper ) // number of cells excluded from the sign-change search at the lower / upper end of each axis
+{ // Initialises _output to +/- max(Real) by the sign of _input, then seeds distances and sets _interfaceMap on cells next to a sign change.
+  if( std::is_same< Device, Devices::Cuda >::value ) // device path: the work is delegated to the CudaInitCaller3d kernel
+  {
+#ifdef HAVE_CUDA
+    const MeshType& mesh = _input->getMesh();
+    // 8 x 8 x 8 threads per block; the number of blocks per axis is derived from the mesh dimensions.
+    const int cudaBlockSize( 8 );
+    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
+    int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize );
+    if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
+      std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; // NOTE(review): only reports; the kernel below is launched regardless
+    dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+    dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
+    Devices::Cuda::synchronizeDevice();
+    CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+            _output.template modifyData< Device >(),
+            _interfaceMap.template modifyData< Device >(), vLower, vUpper );
+    cudaDeviceSynchronize();
+    TNL_CHECK_CUDA_DEVICE;
+#endif
+  }
+  if( std::is_same< Device, Devices::Host >::value ) // host path: serial loops over the cells
+  {
+    const MeshFunctionType& input =  _input.getData();
+    MeshFunctionType& output =  _output.modifyData();
+    InterfaceMapType& interfaceMap =  _interfaceMap.modifyData();
+    
+    const MeshType& mesh = input.getMesh();
+    typedef typename MeshType::Cell Cell;
+    // Pass 1 (all cells): output = +max where input > 0, -max otherwise; interface map cleared. NOTE(review): CudaInitCaller3d tests `>= 0` here — confirm which is intended.
+    Cell cell( mesh );
+    for( cell.getCoordinates().z() = 0;
+            cell.getCoordinates().z() < mesh.getDimensions().z();
+            cell.getCoordinates().z() ++ )
+      for( cell.getCoordinates().y() = 0;
+              cell.getCoordinates().y() < mesh.getDimensions().y();
+              cell.getCoordinates().y() ++ )
+        for( cell.getCoordinates().x() = 0;
+                cell.getCoordinates().x() < mesh.getDimensions().x();
+                cell.getCoordinates().x() ++ )
+        {
+          cell.refresh();
+          output[ cell.getIndex() ] =
+                  input( cell ) > 0 ? std::numeric_limits< RealType >::max() :
+                    - std::numeric_limits< RealType >::max();
+          interfaceMap[ cell.getIndex() ] = false;
+        }
+    // Pass 2 (cells in [vLower, dimensions - vUpper) per axis, mesh-boundary cells skipped): look for sign changes towards the +x (e), +y (n) and +z (t) neighbours.
+    const RealType& hx = mesh.getSpaceSteps().x();
+    const RealType& hy = mesh.getSpaceSteps().y();
+    const RealType& hz = mesh.getSpaceSteps().z();
+    for( cell.getCoordinates().z() = 0 + vLower[2];
+            cell.getCoordinates().z() < mesh.getDimensions().z() - vUpper[2];
+            cell.getCoordinates().z() ++ )   
+      for( cell.getCoordinates().y() = 0 + vLower[1];
+              cell.getCoordinates().y() < mesh.getDimensions().y() - vUpper[1];
+              cell.getCoordinates().y() ++ )
+        for( cell.getCoordinates().x() = 0 + vLower[0];
+                cell.getCoordinates().x() < mesh.getDimensions().x() - vUpper[0];
+                cell.getCoordinates().x() ++ )
+        {
+          cell.refresh();
+          const RealType& c = input( cell );
+          if( ! cell.isBoundaryEntity() )
+          {
+            auto neighbors = cell.getNeighborEntities();
+            Real pom = 0; // scratch value reused for each candidate distance
+            const IndexType e = neighbors.template getEntityIndex<  1,  0,  0 >();
+            const IndexType n = neighbors.template getEntityIndex<  0,  1,  0 >();
+            const IndexType t = neighbors.template getEntityIndex<  0,  0,  1 >();
+            // c * neighbour <= 0 holds when the signs differ or either value is zero.
+            // NOTE(review): if both values are exactly zero, c - input[...] is zero in the divisions below — confirm this cannot happen.
+            if( c * input[ n ] <= 0 )
+            {
+              pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); // distance from this cell to the linearly interpolated zero crossing along y, carrying the sign of c
+              if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) // keep the candidate only if it is smaller in magnitude
+                output[ cell.getIndex() ] = pom;
+              pom = pom - TNL::sign( c )*hy; // the same crossing seen from the neighbour: remaining distance, opposite sign
+              if( TNL::abs( output[ n ] ) > TNL::abs( pom ) )
+                output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy;
+              
+              interfaceMap[ cell.getIndex() ] = true;
+              interfaceMap[ n ] = true;
+            }
+            
+            if( c * input[ e ] <= 0 )
+            {
+              pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); // as above, along x
+              if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
+                output[ cell.getIndex() ] = pom;
+              pom = pom - TNL::sign( c )*hx;
+              if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
+                output[ e ] = pom; //( hx * c )/( c - input[ e ]) - hx;
+              
+              interfaceMap[ cell.getIndex() ] = true;
+              interfaceMap[ e ] = true;
+            }
+            
+            if( c * input[ t ] <= 0 )
+            {
+              pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); // as above, along z
+              if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
+                output[ cell.getIndex() ] = pom;
+              pom = pom - TNL::sign( c )*hz;
+              if( TNL::abs( output[ t ] ) > TNL::abs( pom ) )
+                output[ t ] = pom; //( hz * c )/( c - input[ t ]) - hz;
+              
+              interfaceMap[ cell.getIndex() ] = true;
+              interfaceMap[ t ] = true;
+            }  
+          }
+        }
+  }
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+template< typename MeshEntity >
+__cuda_callable__
+bool
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateCell( MeshFunctionType& u, // values updated in place
+        const MeshEntity& cell, // cell whose value is recomputed
+        const RealType v ) // forwarded unchanged to getNewValue
+{ // Recomputes u at `cell` from its axis neighbours via getNewValue; returns true when the value moved by more than 0.001*hx.
+  const auto& neighborEntities = cell.template getNeighborEntities< 3 >();
+  const MeshType& mesh = cell.getMesh();
+  
+  const RealType& hx = mesh.getSpaceSteps().x();
+  const RealType& hy = mesh.getSpaceSteps().y();
+  const RealType& hz = mesh.getSpaceSteps().z();
+  const RealType value = u( cell ); // value before the update, used for the change test at the end
+  //std::cout << value << std::endl;
+  RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); // only tmp is initialised here; a, b, c are assigned below
+  
+  // Per axis: on the first/last cell only the single inward neighbour is read; otherwise TNL::argAbsMin picks between both neighbours (presumably the one smaller in magnitude — TODO confirm).
+  if( cell.getCoordinates().x() == 0 )
+    a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ];
+  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
+    a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ];
+  else
+  {
+    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ],
+            u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] );
+  }
+  
+  if( cell.getCoordinates().y() == 0 )
+    b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ];
+  else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
+    b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ];
+  else
+  {
+    b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ],
+            u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] );
+  }
+  
+  if( cell.getCoordinates().z() == 0 )
+    c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ];
+  else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 )
+    c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ];
+  else
+  {
+    c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ],
+            u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] );
+  }
+  // All three selected neighbours still hold +/- max(Real): leave the cell untouched and report no change.
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() &&
+          fabs( c ) == std::numeric_limits< RealType >::max() )
+    return false;
+  
+  RealType pom[6] = { a, b, c, hx, hy, hz}; // layout consumed by getNewValue: three values followed by the three steps
+  tmp = getNewValue( pom , value, v );
+  
+  u[ cell.getIndex() ] = tmp;
+  tmp = value - u[ cell.getIndex() ]; // tmp now holds the change made by this update
+  if ( fabs( tmp ) >  0.001*hx ) // the change threshold is scaled by hx only
+    return true;
+  else
+    return false;
+  /*sortMinims( pom );   
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
+  
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) )
+  {
+    u[ cell.getIndex() ] = argAbsMin( value, tmp );
+    tmp = value - u[ cell.getIndex() ];
+    if ( fabs( tmp ) > 0.001*hx ){
+      return true;
+    }else{
+      return false;
+    }
+  }
+  else
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
+    {
+      u[ cell.getIndex() ] = argAbsMin( value, tmp );
+      tmp = value - u[ cell.getIndex() ];
+      if ( fabs( tmp ) > 0.001*hx ){
+        return true;
+      }else{
+        return false;
+      }
+    }
+    else
+    {
+      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
+              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
+              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
+      u[ cell.getIndex() ] = argAbsMin( value, tmp );
+      tmp = value - u[ cell.getIndex() ];
+      if ( fabs( tmp ) > 0.001*hx ){
+        return true;
+      }else{
+        return false;
+      }
+    }
+  }*/
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray > // edge length of the cubic tile; entries are addressed as thrk*sizeSArray*sizeSArray + thrj*sizeSArray + thri
+__cuda_callable__ 
+bool 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateCell( volatile Real *sArray, int thri, int thrj, int thrk, // tile and the x/y/z position of the entry to update; positions +/- 1 are read, so each index needs a valid neighbour on both sides
+        const Real hx, const Real hy, const Real hz, const Real v ) // space steps; v is forwarded unchanged to getNewValue
+{ // Tile-based variant: recomputes one tile entry from its six axis neighbours; returns true when the value moved by more than 0.001*hx.
+  const RealType value = sArray[thrk *sizeSArray * sizeSArray + thrj * sizeSArray + thri]; // value before the update
+  
+  RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); // only tmp is initialised here
+  // Per axis, TNL::argAbsMin picks between the two neighbours (presumably the one smaller in magnitude — TODO confirm): c along z, b along y, a along x.
+  c = TNL::argAbsMin( sArray[ (thrk+1)* sizeSArray*sizeSArray + thrj * sizeSArray + thri ],
+          sArray[ (thrk-1) * sizeSArray *sizeSArray + thrj* sizeSArray + thri ] );
+  
+  b = TNL::argAbsMin( sArray[ thrk* sizeSArray*sizeSArray + (thrj+1) * sizeSArray + thri ],
+          sArray[ thrk* sizeSArray * sizeSArray + (thrj-1)* sizeSArray +thri ] );
+  
+  a = TNL::argAbsMin( sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri+1 ],
+          sArray[ thrk* sizeSArray * sizeSArray + thrj* sizeSArray +thri-1 ] );
+  // All three selected neighbours still hold +/- max(Real): leave the entry untouched and report no change.
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() &&
+          fabs( c ) == std::numeric_limits< RealType >::max() )
+    return false;
+  
+  RealType pom[6] = { a, b, c, hx, hy, hz}; // layout consumed by getNewValue: three values followed by the three steps
+  
+  tmp = getNewValue( pom , value, v );
+  
+  sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ] = tmp;
+  tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ]; // tmp now holds the change made by this update
+  if ( fabs( tmp ) >  0.001*hx ) // the change threshold is scaled by hx only
+    return true;
+  else
+    return false;
+  /*sortMinims( pom );
+  
+  // calculation of real value taken from ZHAO
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+  {
+    sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
+    tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
+    if ( fabs( tmp ) >  0.001*hx )
+      return true;
+    else
+      return false;
+  }
+  else
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
+    {
+      sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
+      tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
+      if ( fabs( tmp ) > 0.001*hx )
+        return true;
+      else
+        return false;
+    }
+    else
+    {
+      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
+              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
+              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
+      sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
+      tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
+      if ( fabs( tmp ) > 0.001*hx )
+        return true;
+      else
+        return false;
+    }
+  }*/
+  
+}
+
+
+template< typename Real,
+        typename Device,
+        typename Index >
+__cuda_callable__ 
+Real
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+getNewValue( RealType valuesAndSteps[], const RealType originalValue, const RealType v ) // valuesAndSteps: entries 0-2 are neighbour values, entries 3-5 the space steps (as filled by the updateCell callers); modified in place by sortMinims
+{ // Tries a one-value, then a two-value, then a three-value candidate and returns argAbsMin( originalValue, candidate ) for the first one used.
+  RealType newValue = std::numeric_limits< RealType >::max(); // placeholder, overwritten right after sorting
+  sortMinims( valuesAndSteps ); // presumably orders the values by increasing magnitude and permutes the steps alongside — TODO confirm against sortMinims
+  // Candidate 1: first value plus sign( originalValue ) * step / v. NOTE(review): v is used as a divisor throughout — confirm callers never pass 0.
+  // calculation of real value taken from ZHAO
+  newValue = valuesAndSteps[ 0 ] + TNL::sign( originalValue ) * valuesAndSteps[ 3 ]/v;
+  if( fabs( newValue ) < fabs( valuesAndSteps[ 1 ] ) ) // used only while it stays below the second value in magnitude
+  {
+    newValue = argAbsMin( originalValue, newValue );
+  }
+  else
+  { // Candidate 2: closed-form expression in the first two values and their steps (entries 0, 1, 3, 4).
+    newValue = ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] * valuesAndSteps[ 1 ] + 
+            valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] * valuesAndSteps[ 0 ] + 
+            TNL::sign( originalValue ) * valuesAndSteps[ 3 ] * valuesAndSteps[ 4 ] * 
+            TNL::sqrt( ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] +  valuesAndSteps[ 4 ] *  valuesAndSteps[ 4 ] )/( v * v ) - 
+            ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) * 
+            ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) ) )/
+            ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] );
+    if( fabs( newValue ) < fabs( valuesAndSteps[ 2 ]) ) // used only while it stays below the third value in magnitude
+    {
+      newValue = argAbsMin( originalValue, newValue );
+    }
+    else
+    {
+      // Candidate 3: expression in all three values and steps (value from the algorithm by Zhao)
+      newValue = ( valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[5] * valuesAndSteps[5] * valuesAndSteps[0] +
+              valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[5] * valuesAndSteps[5] * valuesAndSteps[1] +
+              valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[2] +
+              TNL::sign(originalValue) * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[5] * TNL::sqrt(
+              (valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[5] * valuesAndSteps[5] +
+              valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[5] * valuesAndSteps[5] +
+              valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[4]) / (v * v) -
+              valuesAndSteps[5] * valuesAndSteps[5] * (valuesAndSteps[0] - valuesAndSteps[1]) * (valuesAndSteps[0] - valuesAndSteps[1]) -
+              valuesAndSteps[4] * valuesAndSteps[4] * (valuesAndSteps[0] - valuesAndSteps[2]) * (valuesAndSteps[0] - valuesAndSteps[2]) -
+              valuesAndSteps[3] * valuesAndSteps[3] * (valuesAndSteps[1] - valuesAndSteps[2]) * (valuesAndSteps[1] - valuesAndSteps[2]))) / (
+              valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[4] +
+              valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[5] * valuesAndSteps[5] +
+              valuesAndSteps[5] * valuesAndSteps[5] * valuesAndSteps[3] * valuesAndSteps[3]);
+      newValue = argAbsMin( originalValue, newValue );
+    }
+  }
+  
+  return newValue;
+}
+
+
+#ifdef HAVE_CUDA
+// CUDA kernel initializing the 3D interface: one thread handles the cell ( i, j, k ).
+// Every cell of the mesh gets output = +/- numeric_limits< Real >::max() according to
+// the sign of input and interfaceMap = false. Non-boundary cells with
+// vLower <= ( i, j, k ) < dimensions - vUpper whose face neighbour has the opposite
+// sign (or a zero product) get output = sign( c ) * h * c / ( c - neighbour ), keeping
+// the candidate of the smallest magnitude, and interfaceMap = true.
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+        Containers::StaticVector< 3, Index > vLower, Containers::StaticVector< 3, Index > vUpper )
+{
+  int i = threadIdx.x + blockDim.x*blockIdx.x;
+  int j = blockDim.y*blockIdx.y + threadIdx.y;
+  int k = blockDim.z*blockIdx.z + threadIdx.z;
+  const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+
+  if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
+  {
+    typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
+    Cell cell( mesh );
+    cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
+    cell.refresh();
+    const Index cind = cell.getIndex();
+
+    // Default value: largest representable magnitude, sign taken from the input.
+    output[ cind ] =
+            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+              - std::numeric_limits< Real >::max();
+    interfaceMap[ cind ] = false;
+    cell.refresh();
+    // The z extent has to be checked against the z dimension (it used to read y()).
+    if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] &&
+            k < mesh.getDimensions().z() - vUpper[2] && i>vLower[0]-1 && j> vLower[1]-1 && k>vLower[2]-1 )
+    {
+      const Real& hx = mesh.getSpaceSteps().x();
+      const Real& hy = mesh.getSpaceSteps().y();
+      const Real& hz = mesh.getSpaceSteps().z();
+      const Real& c = input( cell );
+      if( ! cell.isBoundaryEntity()  )
+      {
+        auto neighbors = cell.getNeighborEntities();
+        Real pom = 0;
+        const Index e = neighbors.template getEntityIndex<  1, 0, 0 >();
+        const Index w = neighbors.template getEntityIndex<  -1, 0, 0 >();
+        const Index n = neighbors.template getEntityIndex<  0, 1, 0 >();
+        const Index s = neighbors.template getEntityIndex<  0, -1, 0 >();
+        const Index t = neighbors.template getEntityIndex<  0, 0, 1 >();
+        const Index b = neighbors.template getEntityIndex<  0, 0, -1 >();
+        // For each of the six face neighbours: a non-positive product means a sign change (or a zero).
+        if( c * input[ n ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+            output[ cind ] = pom;
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ e ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+            output[ cind ] = pom;
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ w ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+            output[ cind ] = pom;
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ s ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+            output[ cind ] = pom;
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ b ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hz * c )/( c - input[ b ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+            output[ cind ] = pom;
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ t ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+            output[ cind ] = pom;
+          interfaceMap[ cind ] = true;
+        }
+      }
+    }
+  }
+}
+
+
+// CUDA kernel: for every block index i < numBlockX * numBlockY * numBlockZ writes
+// BlockIterPom[ i ] = 1 when BlockIterDevice[ i ] is non-zero or one of its six face-adjacent
+// blocks is non-zero, otherwise 0. The flat index is derived from blockDim.x (was a fixed 1024).
+template < typename Index >
+__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,
+        int numBlockX, int numBlockY, int numBlockZ )
+{
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if( i < numBlockX * numBlockY * numBlockZ )
+  {
+    int pom = 0;
+    // Decompose the flat index into block coordinates: m along x, k along y, l along z.
+    int l = i/( numBlockX * numBlockY );
+    int k = (i-l*numBlockX * numBlockY )/(numBlockX );
+    int m = (i-l*numBlockX * numBlockY )%( numBlockX );
+    if( m > 0 && BlockIterDevice[ i - 1 ] ){ // left neighbour
+      pom = 1;
+    }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ // right neighbour
+      pom = 1;
+    }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ // bottom neighbour
+      pom = 1;
+    }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ // top neighbour
+      pom = 1;
+    }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){ // neighbour behind
+      pom = 1;
+    }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){ // neighbour in front
+      pom = 1;
+    }
+    // An already active block stays active; only CudaUpdateCellCaller resets BlockIterDevice to 0.
+    BlockIterPom[ i ] = BlockIterDevice[ i ] ? 1 : pom;
+  }
+}
+// CUDA kernel: each thread block iteratively updates one sub-block of the 3D mesh in shared memory.
+// Blocks flagged in BlockIterDevice compute and write helpFunc; all other blocks only copy aux into helpFunc.
+template < int sizeSArray, typename Real, typename Device, typename Index >
+__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
+        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+        Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps )
+{
+  int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
+  int i = threadIdx.x + blockDim.x*blockIdx.x + vecLowerOverlaps[0]; // mesh coordinates shifted by the lower overlaps, so i,j,k never address lower-overlap cells
+  int j = blockDim.y*blockIdx.y + threadIdx.y + vecLowerOverlaps[1];
+  int k = blockDim.z*blockIdx.z + threadIdx.z + vecLowerOverlaps[2];
+  int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
+  const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+  
+  // should this block calculate?
+  if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] ) 
+  {
+    __syncthreads();
+    
+    // Array indicates whether some threads calculated (for parallel reduction)
+    __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2) ];
+    changed[ currentIndex ] = false;
+    
+    if( thrj == 0 && thri == 0 && thrk == 0 )
+      changed[ 0 ] = true; // first element indicates whether we should calculate again (principle of parallel reduction)
+    
+    //getting steps and size of mesh
+    const Real hx = mesh.getSpaceSteps().x(); const int dimX = mesh.getDimensions().x(); 
+    const Real hy = mesh.getSpaceSteps().y(); const int dimY = mesh.getDimensions().y();
+    const Real hz = mesh.getSpaceSteps().z(); const int dimZ  = mesh.getDimensions().z();
+    
+    if( thrj == 1 && thri == 1 && thrk == 1 )
+    {
+      // we don't know yet whether this block will calculate; the flag is set back to 1 inside the while loop below
+      BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0;
+    }
+    
+    // sArray contains values of one block (copied from aux) and edges (not MPI) of those blocks
+    __shared__ volatile Real sArray[ sizeSArray * sizeSArray * sizeSArray ];
+    sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = std::numeric_limits< Real >::max();
+    
+    // xkolik, ykolik, zkolik: index of the upper edge layer of sArray in each direction
+    int xkolik = blockDim.x + 1;// default for a full block; reduced below for the last block in x
+    int ykolik = blockDim.y + 1;
+    int zkolik = blockDim.z + 1;
+    __syncthreads();
+    // the last block in a direction covers only the cells left after removing both overlaps
+    if( gridDim.x - 1 == blockIdx.x )
+      xkolik = (dimX-vecUpperOverlaps[0]-vecLowerOverlaps[0]) -  blockIdx.x*blockDim.x+1;
+    if( gridDim.y -1 == blockIdx.y )
+      ykolik = (dimY-vecUpperOverlaps[1]-vecLowerOverlaps[1]) - (blockIdx.y)*blockDim.y+1;
+    if( gridDim.z-1 == blockIdx.z )
+      zkolik = (dimZ-vecUpperOverlaps[2]-vecLowerOverlaps[2]) - (blockIdx.z)*blockDim.z+1;
+    __syncthreads();
+    // NOTE(review): the six faces are loaded by threads with thri == 0..5, so blockDim.x must be at least 6
+     //filling sArray edges
+    if( thri == 0 ) //x bottom
+    {        
+      if(  (blockIdx.x != 0 || vecLowerOverlaps[0] !=0) && thrj+1 < ykolik && thrk+1 < zkolik )
+        sArray[ (thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0 ] = 
+                aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX 
+                + blockIdx.x*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY + vecLowerOverlaps[0] ];
+    else
+        sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 1 ) //xtop
+    {
+      if( dimX - vecLowerOverlaps[ 0 ] >  (blockIdx.x+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] =
+                aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX
+                + blockIdx.x*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY + vecLowerOverlaps[0] ];
+     else
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max();
+    }
+    if( thri == 2 ) //y bottom
+    {        
+      if( (blockIdx.y != 0 || vecLowerOverlaps[1] !=0) && thrj+1 < xkolik && thrk+1 < zkolik )
+        sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] =
+                aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX
+                + blockIdx.x*blockDim.x - dimX + thrj + thrk*dimX*dimY + vecLowerOverlaps[0] ];
+      else
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + 0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 3 ) //y top
+    {
+      if( dimY - vecLowerOverlaps[ 1 ] > (blockIdx.y+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] =
+                aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + ((blockIdx.y+1) * blockDim.y+vecLowerOverlaps[1])*dimX
+                + blockIdx.x*blockDim.x + thrj + thrk*dimX*dimY + vecLowerOverlaps[0] ];
+     else
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
+    }
+    if( thri == 4 ) //z bottom
+    {        
+      if( (blockIdx.z != 0 || vecLowerOverlaps[2] !=0) && thrj+1 < ykolik && thrk+1 < xkolik )
+        sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] =
+                aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX
+                + blockIdx.x*blockDim.x - dimX * dimY + thrj * dimX + thrk + vecLowerOverlaps[0] ];
+     else
+        sArray[0 * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 5 ) //z top
+    {
+      if( dimZ - vecLowerOverlaps[ 2 ] > (blockIdx.z+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
+        sArray[ zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] =
+                aux[ ((blockIdx.z+1)*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX
+                + blockIdx.x*blockDim.x + thrj * dimX + thrk + vecLowerOverlaps[0] ];
+     else
+        sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
+    }
+    
+    // Copy all other values that aren't edges
+    if( i - vecLowerOverlaps[0] < dimX && j - vecLowerOverlaps[1] < dimY && k - vecLowerOverlaps[2] < dimZ &&
+        thri+1 < xkolik + vecUpperOverlaps[0] && thrj+1 < ykolik + vecUpperOverlaps[1] && thrk+1 < zkolik + vecUpperOverlaps[2] )
+    {
+      sArray[(thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
+    }
+    __syncthreads(); 
+    
+    //main while cycle. each value can get information only from its neighbours, so the information has to spread step by step
+    while( changed[ 0 ] )
+    {
+      __syncthreads();
+      // every thread clears its own flag; the barrier above separates this write from the read of changed[ 0 ] in the loop condition
+      changed[ currentIndex ] = false;
+      
+      //calculation of update cell
+      if( i < dimX - vecUpperOverlaps[0] && j < dimY - vecUpperOverlaps[1] && k < dimZ - vecUpperOverlaps[2] )
+      {
+        if( ! interfaceMap[ k*dimX*dimY + j * dimX + i ] )
+        {
+          // calculate new value depending on neighbours in sArray on (thri+1, thrj+1) coordinates
+          changed[ currentIndex ] = ptr.updateCell< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); 
+        }
+      }
+      __syncthreads();
+      // NOTE(review): the last stage reads changed[ currentIndex + 32 ] unconditionally, so changed needs at least 64 entries
+      //pyramid reduction (parallel reduction)
+      if( blockDim.x*blockDim.y*blockDim.z == 1024 )
+      {
+        if( currentIndex < 512 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
+        }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 512 )
+      {
+        if( currentIndex < 256 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+        }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 256 )
+      {
+        if( currentIndex < 128 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
+        }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 128 )
+      {
+        if( currentIndex < 64 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+        }
+      }
+      __syncthreads();
+      if( currentIndex < 32 )
+      {
+        if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+        if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+        if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+        if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+        if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+        if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+      }
+      __syncthreads();
+      
+      // if we calculated, then the BlockIterDevice should contain the info about this whole block! (only one number for one block)
+      if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
+      {
+        BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 1;
+      }
+      __syncthreads();
+    }
+    
+    // copy results into helpFunc (not into aux because of conflicts)
+    if( i < dimX && j < dimY && k < dimZ && thri+1 < xkolik && thrj+1 < ykolik && thrk+1 < zkolik )
+      helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thri+1 ];
+    
+  }
+  else // if not, then it should at least copy the values from aux to helpFunc.
+  {
+    if( i < mesh.getDimensions().x() - vecUpperOverlaps[0] && j < mesh.getDimensions().y() - vecUpperOverlaps[1]
+            && k < mesh.getDimensions().z() - vecUpperOverlaps[2])
+      helpFunc[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ] =
+              aux[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ];
+  }
+}  
+#endif
+
+/// ==========================OPEN=MP=================================
+/// Host version: for every flagged block in BlockIterHost runs eight sweeps of updateCell over a local copy (sArray) and stores the result in helpFunc.
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateBlocks( const InterfaceMapType interfaceMap,
+        const MeshFunctionType aux,
+        MeshFunctionType& helpFunc,
+        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
+{  
+  //#pragma omp parallel for schedule( dynamic )
+  for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
+  {
+    if( BlockIterHost[ i ] )
+    {
+      MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
+      // NOTE(review): the mesh object above is copy-initialized again for every processed block
+      int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+      int dimZ = mesh.getDimensions().z();
+      //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl;
+      int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlockz = dimZ/numThreadsPerBlock + ((dimZ%numThreadsPerBlock != 0) ? 1:0);
+      //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl;
+      int xkolik = numThreadsPerBlock + 1;
+      int ykolik = numThreadsPerBlock + 1;
+      int zkolik = numThreadsPerBlock + 1;
+      // xkolik, ykolik, zkolik: index of the upper edge layer of sArray; reduced below for the last block in a direction
+      // decompose the flat block index i into block coordinates (x fastest, then y, then z)
+      int blIdz = i/( numOfBlockx * numOfBlocky );
+      int blIdy = (i-blIdz*numOfBlockx * numOfBlocky )/(numOfBlockx );
+      int blIdx = (i-blIdz*numOfBlockx * numOfBlocky )%( numOfBlockx );
+      //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl;
+      
+      if( numOfBlockx - 1 == blIdx )
+        xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
+      if( numOfBlocky -1 == blIdy )
+        ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
+      if( numOfBlockz-1 == blIdz )
+        zkolik = dimZ - (blIdz)*numThreadsPerBlock+1;
+      //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl;
+      
+      
+      /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
+       changed[ 0 ] = 1;*/
+      Real hx = mesh.getSpaceSteps().x();
+      Real hy = mesh.getSpaceSteps().y();
+      Real hz = mesh.getSpaceSteps().z();
+      // the block is deactivated here and re-activated at the end if the first sweep changed any cell
+      bool changed = false;
+      BlockIterHost[ i ] = 0;
+      
+      // sArray: local copy of the block with one extra layer on every side, sizeSArray cells per direction
+      Real *sArray;
+      sArray = new Real[ sizeSArray * sizeSArray * sizeSArray ];
+      if( sArray == nullptr )
+        std::cout << "Error while allocating memory for sArray." << std::endl;
+      // NOTE(review): plain new[] reports failure by throwing std::bad_alloc, so the nullptr check above cannot fire
+      for( IndexType k = 0; k < sizeSArray; k++ )
+        for( IndexType l = 0; l < sizeSArray; l++ )
+          for( IndexType m = 0; m < sizeSArray; m++ ){
+            sArray[ m * sizeSArray * sizeSArray + k * sizeSArray + l ] = std::numeric_limits< Real >::max();
+          }
+      
+      // load the six edge faces from aux where a neighbouring cell exists; missing faces keep numeric_limits< Real >::max()
+      for( IndexType thrk = 0; thrk < numThreadsPerBlock; thrk++ )
+        for( IndexType thrj = 0; thrj < numThreadsPerBlock; thrj++ )
+        {
+          if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
+            sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX -1 + thrk*dimX*dimY ];
+          
+          if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy *numThreadsPerBlock*dimX+ blIdx*numThreadsPerBlock + numThreadsPerBlock + thrj * dimX + thrk*dimX*dimY ];
+          
+          if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX + thrj + thrk*dimX*dimY ];
+          
+          if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + (blIdy+1) * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj + thrk*dimX*dimY ];
+          
+          if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
+            sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX * dimY + thrj * dimX + thrk ];
+          
+          if( dimZ > (blIdz+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < xkolik )
+            sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = 
+                    aux[ (blIdz+1)*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX + thrk ];
+        }
+      // copy the cells of the block itself (m along z, k along y, l along x) into the interior of sArray
+      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
+        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
+          for( IndexType l = 0; l < numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+              sArray[(m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1] = 
+                      aux[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ];
+          }
+        }
+      }
+      /*string s;
+       int numWhile = 0;
+       for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      // sweep 1: z ascending, y ascending, x ascending; the only sweep whose updateCell result is recorded in changed
+      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
+        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
+          for( IndexType l = 0; l < numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){
+              //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                //printf("In with point m  = %d, k = %d, l = %d\n", m, k, l);
+                changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz) || changed;
+                
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      // sweep 2: z descending, y ascending, x ascending. NOTE(review): the "> -1" loops rely on IndexType being signed
+      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
+          for( IndexType l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      // sweep 3: z ascending, y ascending, x descending (sweep 4, further below, is z descending, y ascending, x descending)
+      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
+        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
+          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );
+       */
+      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
+          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell< sizeSArray >(  sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      // sweep 5: z ascending, y descending, x ascending
+      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
+        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( IndexType l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      // sweep 6: z descending, y descending, x ascending
+      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( IndexType l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      // sweep 7: z ascending, y descending, x descending
+      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
+        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      
+      // sweep 8: z descending, y descending, x descending
+      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      // re-activate the block when the first sweep changed at least one cell
+      if( changed ){
+        BlockIterHost[ i ] = 1;
+      }
+      
+      // write the interior of sArray back into helpFunc
+      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
+        for( IndexType l = 0; l < numThreadsPerBlock; l++ ) {
+          for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){      
+              helpFunc[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] = 
+                      sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+              //std::cout << helpFunc[ m*dimX*dimY + k*dimX + l ] << " ";
+            }
+          }
+          //std::cout << std::endl;
+        }
+        //std::cout << std::endl;
+      }
+      //helpFunc.save( "helpF.tnl");
+      delete []sArray;
+    }
+  }
+}
+
+// Host variant: recomputes the block activity flags from the flags of the neighbours.
+// BlockIterHost[ i ] becomes 1 if at least one of the six face-adjacent blocks was
+// non-zero before the call and 0 otherwise; blocks are indexed x fastest, then y, then z.
+// NOTE(review): unlike the CUDA kernel GetNeighbours, an active block without an active neighbour is reset to 0.
+template< typename Real,
+        typename Device,
+        typename Index >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
+{
+  // Scratch buffer so that every block is evaluated against the flags from before the call.
+  int* BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ];
+
+  for( int i = 0; i< BlockIterHost.getSize(); i++)
+  {
+    BlockIterPom[ i ] = 0;
+    // Block coordinates of i: m along x, k along y, l along z.
+    const int l = i/( numBlockX * numBlockY );
+    const int k = (i-l*numBlockX * numBlockY )/(numBlockX );
+    const int m = (i-l*numBlockX * numBlockY )%( numBlockX );
+    if( ( m > 0 && BlockIterHost[ i - 1 ] ) ||
+        ( m < numBlockX -1 && BlockIterHost[ i + 1 ] ) ||
+        ( k > 0 && BlockIterHost[ i - numBlockX ] ) ||
+        ( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ) ||
+        ( l > 0 && BlockIterHost[ i - numBlockX*numBlockY ] ) ||
+        ( l < numBlockZ-1 && BlockIterHost[ i + numBlockX*numBlockY ] ) )
+    {
+      BlockIterPom[ i ] = 1;
+    }
+  }
+  for( int i = 0; i< BlockIterHost.getSize(); i++)
+  {
+    BlockIterHost[ i ] = BlockIterPom[ i ];
+  }
+  // Release the scratch buffer; it used to be leaked on every call.
+  delete[] BlockIterPom;
+}
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index c6a522d8f..e0ece04bf 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -7,9 +7,9 @@
 
 #pragma once 
 
-#include <TNL/Meshes/Grid.h>
-#include <TNL/Functions/MeshFunction.h>
-#include <TNL/Devices/Cuda.h>
+//#include <TNL/Meshes/Grid.h>
+//#include <TNL/Functions/MeshFunction.h>
+//#include <TNL/Devices/Cuda.h>
 
 using namespace TNL;
 
@@ -63,25 +63,32 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
     typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
     typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
     typedef Containers::StaticVector< 2, Index > StaticVector;
+    
+    using MeshPointer = Pointers::SharedPointer<  MeshType >;
     using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
     using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
     
-    
+    // CALLER FOR HOST AND CUDA
     void initInterface( const MeshFunctionPointer& input,
             MeshFunctionPointer& output,
             InterfaceMapPointer& interfaceMap,
             StaticVector vLower, StaticVector vUpper );
-    
+        
+    // FOR HOST
     template< typename MeshEntity >
     __cuda_callable__ bool updateCell( MeshFunctionType& u,
             const MeshEntity& cell,
             const RealType velocity = 1.0 );
     
+    // FOR CUDA
     template< int sizeSArray >
-    __cuda_callable__ bool updateCell( volatile Real *sArray,
-            int thri, int thrj, const Real hx, const Real hy,
-            const Real velocity = 1.0 );
-    
+    __cuda_callable__ bool updateCell( volatile RealType *sArray,
+            int thri, int thrj, const RealType hx, const RealType hy,
+            const RealType velocity = 1.0 );
+        
+// FOR OPENMP WILL BE REMOVED
+    void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
+        
     template< int sizeSArray >
     void updateBlocks( const InterfaceMapType& interfaceMap,
             MeshFunctionType& aux,
@@ -108,16 +115,27 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
     using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
     using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
     
+     // CALLER FOR HOST AND CUDA
     void initInterface( const MeshFunctionPointer& input,
             MeshFunctionPointer& output,
             InterfaceMapPointer& interfaceMap,
             StaticVector vLower, StaticVector vUpper );
     
+    // FOR HOST
     template< typename MeshEntity >
     __cuda_callable__ bool updateCell( MeshFunctionType& u,
             const MeshEntity& cell,
             const RealType velocity = 1.0);
     
+    // FOR CUDA
+    template< int sizeSArray >
+    __cuda_callable__ bool updateCell( volatile Real *sArray,
+            int thri, int thrj, int thrk, const RealType hx, const RealType hy, const RealType hz,
+            const RealType velocity = 1.0 );
+    
+    // OPENMP WILL BE REMOVED
+    void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
+    
     template< int sizeSArray >
     void updateBlocks( const InterfaceMapType& interfaceMap,
             const MeshFunctionType& aux,
@@ -126,16 +144,15 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
     
     void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
     
-    template< int sizeSArray >
-    __cuda_callable__ bool updateCell3D( volatile Real *sArray,
-            int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz,
-            const Real velocity = 1.0 );
+    __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[],
+           const RealType originalValue, const RealType v );
 };
 
 template < typename T1 >
 __cuda_callable__ void sortMinims( T1 pom[] );
 
 #ifdef HAVE_CUDA
+// 1D
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
         Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
@@ -147,21 +164,25 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux,
         bool *BlockIterDevice );
 
+
+
+
+// 2D
+// 2D overload of CudaInitCaller; overlap parameter names follow the 2D CudaUpdateCellCaller declaration.
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+        const Containers::StaticVector< 2, Index > vecLowerOverlaps, const Containers::StaticVector< 2, Index > vecUpperOverlaps );
+
 template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
         Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-        Containers::StaticVector< 2, Index > vLower, Containers::StaticVector< 2, Index > vUpper, int k,int oddEvenBlock =0);
-
-template< typename Real, typename Device, typename Index >
-__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, int copy, int k );
-
-template< typename Real, typename Device, typename Index >
-__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, int copy, int k );
+        TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
+        const Containers::StaticVector< 2, Index > vecLowerOverlaps, 
+        const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock =0);
 
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
@@ -171,17 +192,13 @@ template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
         TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
 
-template < typename Real, typename Device, typename Index >
-__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-        Containers::StaticVector< 2, Index > vLower, Containers::StaticVector< 2, Index > vUpper );
 
+// 3D
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
         Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
         Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-        Containers::StaticVector< 3, Index > vLower, Containers::StaticVector< 3, Index > vUpper );
+        Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps );
 
 template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
@@ -196,4 +213,6 @@ __global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda,
         int numBlockX, int numBlockY, int numBlockZ );
 #endif
 
-#include "tnlDirectEikonalMethodsBase_impl.h"
+#include "tnlDirectEikonalMethodBase1D_impl.h"
+#include "tnlDirectEikonalMethodBase2D_impl.h"
+#include "tnlDirectEikonalMethodBase3D_impl.h"
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
index a57ef1491..8c58ee610 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
@@ -10,11 +10,10 @@
 
 #pragma once
 
-#include <TNL/Meshes/Grid.h>
-#include <TNL/Functions/Analytic/Constant.h>
-#include <TNL/Pointers/SharedPointer.h>
+//#include <TNL/Meshes/Grid.h>
+//#include <TNL/Functions/Analytic/Constant.h>
+//#include <TNL/Pointers/SharedPointer.h>
 #include "tnlDirectEikonalMethodsBase.h"
-#define ForDebug false // false <=> off
 
 
 template< typename Mesh,
@@ -88,6 +87,7 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator,
     typedef Anisotropy AnisotropyType;
     typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType;
     typedef Communicator CommunicatorType;
+    typedef Containers::StaticVector< 2, Index > StaticVector;
     
     using MeshPointer = Pointers::SharedPointer<  MeshType >;
     using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
@@ -113,6 +113,15 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator,
     protected:
       
       const IndexType maxIterations;
+    
+      void setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
+              const MeshPointer& mesh);
+      
+      bool goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, 
+              MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
+              const AnisotropyPointer& anisotropy );
+      
+      void getInfoFromNeighbours( int& calculated, int& calculateAgain, const MeshPointer& mesh );
 };
 
 template< typename Real,
@@ -134,6 +143,7 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator,
     typedef Anisotropy AnisotropyType;
     typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > BaseType;
     typedef Communicator CommunicatorType;
+    typedef Containers::StaticVector< 3, Index > StaticVector;
     
     using MeshPointer = Pointers::SharedPointer<  MeshType >;
     using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
@@ -161,6 +171,15 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator,
     protected:
       
       const IndexType maxIterations;
+      
+      void setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
+              const MeshPointer& mesh);
+      
+      bool goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, 
+              MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
+              const AnisotropyPointer& anisotropy );
+      
+      void getInfoFromNeighbours( int& calculated, int& calculateAgain, const MeshPointer& mesh );
 };
 
 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index f9bef30c3..66f9e6cdf 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -13,19 +13,6 @@
 
 #pragma once
 
-#include "tnlFastSweepingMethod.h"
-#include "tnlDirectEikonalProblem.h"
-#include <TNL/Devices/Cuda.h>
-#include <TNL/Communicators/MpiDefs.h>
-#include "tnlDirectEikonalProblem.h"
-
-
-
-
-#include <string.h>
-#include <iostream>
-#include <fstream>
-
 template< typename Real,
         typename Device,
         typename Index,
@@ -79,28 +66,14 @@ solve( const MeshPointer& mesh,
   auxPtr->setMesh( mesh );
   interfaceMapPtr->setMesh( mesh );
   
-  //Distributed mesh for MPI overlaps (without MPI null pointer)
-  Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh();
-  
-  int i = MPI::GetRank( MPI::AllGroup ); // number that identifies rank
-  
-  // getting overlaps ( WITHOUT MPI SHOULD BE 0 )
-  Containers::StaticVector< 2, IndexType > vLower;
-  vLower[0] = 0; vLower[1] = 0;
-  Containers::StaticVector< 2, IndexType > vUpper;
-  vUpper[0] = 0; vUpper[1] = 0;
-#ifdef HAVE_MPI
-  if( CommunicatorType::isDistributed() ) //If we started solver with MPI
-  {
-    vLower = meshPom->getLowerOverlap();
-    vUpper = meshPom->getUpperOverlap();
-  }
-#endif
+  // Setting overlaps ( WITHOUT MPI SHOULD BE 0 )
+  StaticVector vecLowerOverlaps, vecUpperOverlaps;
+  setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh );
   
   std::cout << "Initiating the interface cells ..." << std::endl;
-  BaseType::initInterface( u, auxPtr, interfaceMapPtr, vLower, vUpper );
+  BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps );
   
-  auxPtr->save( "aux-ini.tnl" );
+  //auxPtr->save( "aux-ini.tnl" );
   
   typename MeshType::Cell cell( *mesh );
   
@@ -142,57 +115,26 @@ solve( const MeshPointer& mesh,
 #endif
   
   while( iteration < this->maxIterations )
-  {    
-#if  ForDebug 
-    int WhileCount = 0; // number of passages of while cycle with condition calculated
-    printf( "%d: meshDimensions are (x,y) = (%d,%d).\n",i, mesh->getDimensions().x(), mesh->getDimensions().y() );
-    printf( "%d: owerlaps are ([x1,x2],[y1,y2]) = ([%d,%d],[%d,%d]).\n",i, vLower[0], vUpper[0], vLower[1], vUpper[1] );
-    /*if( std::is_same< DeviceType, Devices::Host >::value && i == 0 )
-    {
-      for( int j = mesh->getDimensions().y()-1; j>-1; j-- ){
-        for( int m = 0; m < mesh->getDimensions().x(); m++ )
-          std::cout << aux[ j * mesh->getDimensions().x() + m ] << " ";
-        std::cout << std::endl;
-      }
-      std::cout << std::endl;
-    }*/
-    
-    // TO SEE CUDA OVERLAPS
-    /*const int cudaBlockSize( 16 );
-    int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
-    int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
-    dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps );
-    dim3 blockSize( cudaBlockSize, cudaBlockSize );
-    MeshFunctionPointer helpFunc( mesh );
-    DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( helpFunc.template getData< Device>(),
-            auxPtr.template modifyData< Device>(), 1, i ); */
-    
-#endif
-    
-    int calculated = 1; // indicates weather we calculated in the last passage of the while cycle 
-    // calculated is same for all ranks 
+  {
+    // calculatedBefore indicates whether we calculated in the last passage of the while cycle 
+    // calculatedBefore is same for all ranks 
     // without MPI should be FALSE at the end of while cycle body
-    int calculate = 1; // indicates if the thread should calculate again in upcoming passage of cycle
-    // calculate is a value that can differ in every rank
+    int calculatedBefore = 1;
+    
+    // calculateMPIAgain indicates if the thread should calculate again in upcoming passage of while cycle
+    // calculateMPIAgain is a value that can differ in every rank
     // without MPI should be FALSE at the end of while cycle body
+    int calculateMPIAgain = 1;  
     
-    while( calculated )
+    while( calculatedBefore )
     {
-      calculated = 0;
-#if ForDebug
-      WhileCount++;
-      /*if( std::is_same< DeviceType, Devices::Cuda >::value )
-      {
-        DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(),
-                helpFunc.template modifyData< Device>(), 0, i );
-      }*/
-#endif
+      calculatedBefore = 0;
       
-      if( std::is_same< DeviceType, Devices::Host >::value && calculate ) // should we calculate in Host?
+      if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host?
       {
-        calculate = 0;
+        calculateMPIAgain = 0;
         
-        /**--HERE-IS-PARALLEL-OMP-CODE--!!!WITHOUT MPI!!!--------------------**/
+  /**--HERE-IS-PARALLEL-OMP-CODE--!!!WITHOUT MPI!!!--------------------**/
         /*
          int numThreadsPerBlock = -1;
          
@@ -300,92 +242,38 @@ solve( const MeshPointer& mesh,
          auxPtr = helpFunc;
          }
          */
-        /**-END-OF-OMP-PARALLEL------------------------------------------------**/
+  /**-END-OF-OMP-PARALLEL------------------------------------------------**/
         
         
-        /*if( i == 1 )
-         {
-         for( int k = 0; k < mesh->getDimensions().y(); k++ ){
-         for( int l = 0; l < mesh->getDimensions().x(); l++ )
-         printf("%.2f\t",aux[ k * mesh->getDimensions().x() + l ] );
-         printf("\n");
-         }
-         }*/
-        
   // FSM FOR MPI and WITHOUT MPI
-        for( cell.getCoordinates().y() = 0 + vLower[1];
-                cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
-                cell.getCoordinates().y()++ )
-        {
-          for( cell.getCoordinates().x() = 0 + vLower[0];
-                  cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                  cell.getCoordinates().x()++ )
-          {
-            cell.refresh();
-            if( ! interfaceMap( cell ) )
-            {
-              calculated = this->updateCell( aux, cell ) || calculated;
-            }
-          }
-        }
-        
-        for( cell.getCoordinates().y() = 0 + vLower[1];
-                cell.getCoordinates().y() < mesh->getDimensions().y()-vUpper[1];
-                cell.getCoordinates().y()++ )
-        {
-          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0];
-                  cell.getCoordinates().x() >= 0 + vLower[0];
-                  cell.getCoordinates().x()-- )		
-          {
-            //std::cerr << "2 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
-        
+        StaticVector boundsFrom; StaticVector boundsTo;
+    // UP and RIGHT
+        boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
+        boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
+        calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+        //aux.save("aux-1.tnl");
+        
+    // UP and LEFT
+        boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = -1 + vecLowerOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         //aux.save( "aux-2.tnl" );
         
-        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 -vUpper[1];
-                cell.getCoordinates().y() >= 0 + vLower[1] ;
-                cell.getCoordinates().y()-- )
-        {
-          for( cell.getCoordinates().x() = 0 + vLower[0];
-                  cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                  cell.getCoordinates().x()++ )
-          {
-            //std::cerr << "3 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
-        
+    // DOWN and RIGHT
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
+        boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         //aux.save( "aux-3.tnl" );
         
-        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1];
-                cell.getCoordinates().y() >= 0 + vLower[1];
-                cell.getCoordinates().y()-- )
-        {
-          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0];
-                  cell.getCoordinates().x() >= 0 + vLower[0];
-                  cell.getCoordinates().x()-- )		
-          {
-            //std::cerr << "4 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
+    // DOWN and LEFT
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+        
       }
-      
-      if( std::is_same< DeviceType, Devices::Cuda >::value && calculate ) // should we calculate on CUDA?
+      if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain ) // should we calculate on CUDA?
       {
-        calculate = 0;
-        
-#if ForDebug 
-        printf("%d: We are in Cuda code start.\n", i);
-#endif
+        calculateMPIAgain = 0;
           
 #ifdef HAVE_CUDA
         TNL_CHECK_CUDA_DEVICE;
@@ -394,8 +282,8 @@ solve( const MeshPointer& mesh,
         const int cudaBlockSize( 16 );
         
         // Setting number of threads and blocks for kernel
-        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vLower[0] - vUpper[0], cudaBlockSize );
-        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vLower[1] - vUpper[1], cudaBlockSize );
+        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
+        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
         dim3 blockSize( cudaBlockSize, cudaBlockSize );
         dim3 gridSize( numBlocksX, numBlocksY );
         
@@ -439,60 +327,21 @@ solve( const MeshPointer& mesh,
          BlockIterD = dBlock.getElement( 0 );*/
         
         // Array that identifies which blocks should be calculated.
-        TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
-        BlockIterDevice.setSize( numBlocksX * numBlocksY );
-        BlockIterDevice.setValue( 1 );
+        // All blocks should calculate in first passage ( setValue(1) )
+        TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicator( numBlocksX * numBlocksY );
+        blockCalculationIndicator.setValue( 1 );
         TNL_CHECK_CUDA_DEVICE;
         
-        // Array into which we identify the neighbours and then copy it into BlockIterDevice
-        TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
-        BlockIterPom.setSize( numBlocksX * numBlocksY  );
-        BlockIterPom.setValue( 0 );
+        // Array into which we identify the neighbours and then copy it into blockCalculationIndicator
+        TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicatorHelp(numBlocksX * numBlocksY );
+        blockCalculationIndicatorHelp.setValue( 0 );
         
-#if ForDebug // For printf of BlockIterDevice
-        TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
-        BlockIterPom1.setSize( numBlocksX * numBlocksY  );
-        BlockIterPom1.setValue( 0 );
-#endif   
+        // number of Blocks for kernel that calculates neighbours.
         int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
-        // for CudaPrallelReduc (replaced with .containsValue(1))
-        //int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
-        //TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
-        //dBlock.setSize( nBlocks );
-        //TNL::Containers::Array< int, Devices::Host, IndexType > dBlock1;
-        //dBlock1.setSize( nBlocks );
-        //TNL_CHECK_CUDA_DEVICE;
         
         // Helping meshFunction that switches with AuxPtr in every calculation of CudaUpdateCellCaller<<<>>>()
         MeshFunctionPointer helpFunc( mesh );
-        helpFunc.template modifyData() = auxPtr.template getData();
-        Devices::Cuda::synchronizeDevice(); 
-        //MeshFunctionPointer helpFunc1( mesh );
-        
-        // Setting number of threads and blocks in grid for DeepCopy of meshFunction
-        /*int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
-        int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
-        dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps );
-        
-        
-          Devices::Cuda::synchronizeDevice();
-        DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(),
-                helpFunc.template modifyData< Device>(), 1, i );
-          cudaDeviceSynchronize();
-          TNL_CHECK_CUDA_DEVICE;
-          Devices::Cuda::synchronizeDevice();
-        DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(),
-                helpFunc.template modifyData< Device>(), 0, i );
-          cudaDeviceSynchronize();
-          TNL_CHECK_CUDA_DEVICE;*/
-        
-#if ForDebug
-        /*int numBlocksXWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
-        int numBlocksYWithoutOverlaps = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
-        dim3 gridSizeWithoutOverlaps( numBlocksXWithoutOverlaps, numBlocksYWithoutOverlaps );*/
-        DeepCopy<<< gridSizeWithoutOverlaps, blockSize >>>( auxPtr.template getData< Device>(),
-                helpFunc.template modifyData< Device>(), 0, i );
-#endif
+        helpFunc.template modifyData() = auxPtr.template getData(); 
         
         //int pocBloku = 0;
         Devices::Cuda::synchronizeDevice();
@@ -505,18 +354,16 @@ solve( const MeshPointer& mesh,
         TNL_CHECK_CUDA_DEVICE;
         
         //int oddEvenBlock = 0;
-        //int numberWhile = 0;
-        while( BlockIterD )
+        while( calculateCudaBlocksAgain )
         {
-          //numberWhile++;
-          /** HERE IS CHESS METHOD (NO MPI) **/
+  /** HERE IS CHESS METHOD (NO MPI) **/
           
           /*
            CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
            interfaceMapPtr.template getData< Device >(),
            auxPtr.template getData< Device>(),
            helpFunc.template modifyData< Device>(),
-           BlockIterDevice,
+           blockCalculationIndicator,
            oddEvenBlock );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
@@ -527,14 +374,14 @@ solve( const MeshPointer& mesh,
            interfaceMapPtr.template getData< Device >(),
            helpFunc.template getData< Device>(),
            auxPtr.template modifyData< Device>(),
-           BlockIterDevice, vLower, vUpper, 
+           blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps, 
            oddEvenBlock );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
            
            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
            
-           CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+           CudaParallelReduc<<< nBlocks , 1024 >>>( blockCalculationIndicator, dBlock, ( numBlocksX * numBlocksY ) );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
@@ -543,16 +390,14 @@ solve( const MeshPointer& mesh,
            
            BlockIterD = dBlock.getElement( 0 );*/
           
-          /**------------------------------------------------------------------------------------------------*/
+  /**------------------------------------------------------------------------------------------------*/
           
           
-     /** HERE IS FIM FOR MPI AND WITHOUT MPI **/
+  /** HERE IS FIM FOR MPI AND WITHOUT MPI **/
           Devices::Cuda::synchronizeDevice();
-          CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-                  interfaceMapPtr.template getData< Device >(),
-                  auxPtr.template getData< Device>(),
-                  helpFunc.template modifyData< Device>(),
-                  BlockIterDevice, vLower, vUpper, i );
+          CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(),
+                  auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(),
+                  blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps );
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
         
@@ -589,53 +434,22 @@ solve( const MeshPointer& mesh,
           
           // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now.
           Devices::Cuda::synchronizeDevice(); 
-          GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY );
-          cudaDeviceSynchronize();
-          TNL_CHECK_CUDA_DEVICE;
-          BlockIterDevice = BlockIterPom;
+          GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator, blockCalculationIndicatorHelp, numBlocksX, numBlocksY );
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
-#if ForDebug          
-          if( i == 1 ){
-            BlockIterPom1 = BlockIterDevice;
-            for( int i =0; i< numBlocksX; i++ ){
-              for( int j = 0; j < numBlocksY; j++ )
-              {
-                std::cout << BlockIterPom1[j*numBlocksX + i];
-              }
-              std::cout << std::endl;
-            }
-            std::cout << std::endl;
-          }
-#endif
-          // "Parallel reduction" to see if we should calculate again BlockIterD
-          BlockIterD = BlockIterDevice.containsValue(1);
-          /*Devices::Cuda::synchronizeDevice();
-          CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+          blockCalculationIndicator = blockCalculationIndicatorHelp;
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
           
-          // Parallel reduction on dBlock because of too large number of blocks (more than maximum number of threads)
-          Devices::Cuda::synchronizeDevice();
-          CudaParallelReduc<<< 1, 1024 >>>( dBlock, dBlock, nBlocks );
-          cudaDeviceSynchronize();
-          TNL_CHECK_CUDA_DEVICE;*/
-          
-          // Copy of the first element which is result of parallel reduction
-          /*Devices::Cuda::synchronizeDevice();
-          BlockIterD = dBlock.getElement( 0 );
-          cudaDeviceSynchronize();
-          TNL_CHECK_CUDA_DEVICE;*/
+          // "Parallel reduction" to see if we should calculate again (sets calculateCudaBlocksAgain)
+          calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1);
           
           // When we change something then we should caclucate again in the next passage of MPI ( calculated = true )
-         
-          
-          if( BlockIterD ){
-            calculated = 1;
+         if( calculateCudaBlocksAgain ){
+            calculatedBefore = 1;
           }
           
-          /**-----------------------------------------------------------------------------------------------------------*/
-       
+/**-----------------------------------------------------------------------------------------------------------*/
           numIter ++;
         }
         if( numIter%2  == 1 ){
@@ -679,96 +493,66 @@ solve( const MeshPointer& mesh,
 #endif        
         MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR,  MPI::AllGroup );
         aux.template synchronize< Communicator >();
-        calculate = calculpom[0] || calculpom[1] || calculpom[2] || calculpom[3];
-#if ForDebug
-        printf( "%d: Receved Calculated = %d.\n%d: Calculate = %d\n", i, calculated, i, calculate);
-#endif
-        
-#if ForDebug 
-        if( i == 1 )
-          printf("WhileCount = %d\n",WhileCount);
-        //calculated = 0; // DEBUG;
-#endif
       }
 #endif
       if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculated 0!
-        calculated = 0;
+        calculatedBefore = 0;
     }
     iteration++;
   }
-  //String s( "aux-" + std::to_string( i ) + ".tnl" );
-  //aux.save( s );   
   Aux=auxPtr; // copy it for MakeSnapshot
-  
-  aux.save("aux-final.tnl");
 }
 
 
-#ifdef HAVE_CUDA
-// DeepCopy nebo pracne kopirovat kraje v zavislosti na vLower,vUpper z sArray do helpFunc.
-template< typename Real, typename Device, typename Index >
-__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, int copy, int k )
+// PROTECTED FUNCTIONS:
+
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+void 
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
+setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
+              const MeshPointer& mesh)
 {
-  int i = threadIdx.x + blockDim.x*blockIdx.x;
-  int j = blockDim.y*blockIdx.y + threadIdx.y;
-  const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
-  if( copy ){
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
-      helpFunc[ j * mesh.getDimensions().x() + i ] = 1;//aux[ j * mesh.getDimensions().x() + i ];
-  }
-  else
+  vecLowerOverlaps[0] = 0; vecLowerOverlaps[1] = 0; vecUpperOverlaps[0] = 0; vecUpperOverlaps[1] = 0;
+#ifdef HAVE_MPI
+  if( CommunicatorType::isDistributed() ) //If we started solver with MPI
   {
-    if( i==0 && j == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
-    {
-      for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){
-        for( int l = 0; l < 17; l++ ){
-          printf( "%.2f ", aux[ m * mesh.getDimensions().x() + l ]);
-        }
-        printf( "\n");
-      }
-      printf( "\n");
-    }
-    if( i==0 && j == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
-    {
-      for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){
-        for( int l = 0; l < 17; l++ ){
-          printf( "%.2f ", helpFunc[ m * mesh.getDimensions().x() + l ]);
-        }
-        printf( "\n");
-      }
-      printf( "\n");
-    }
+    // Distributed mesh that provides the MPI overlaps (a null pointer when running without MPI)
+    const Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh();
+    vecLowerOverlaps = meshPom->getLowerOverlap();
+    vecUpperOverlaps = meshPom->getUpperOverlap();
   }
+#endif
 }
 
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
         TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY )
 {
-  int i = blockIdx.x * 1024 + threadIdx.x;
+  bool calculated = false;
+  const MeshType& mesh = aux.getMesh();
+  const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1;
+  const IndexType stepY = boundsFrom[1] < boundsTo[1]? 1 : -1;
+  
+  typename MeshType::Cell cell( mesh );
+  cell.refresh();
   
-  if( i < numBlockX * numBlockY )
+  for( cell.getCoordinates().y() = boundsFrom[1];
+          TNL::abs( cell.getCoordinates().y() - boundsTo[1] ) > 0;
+          cell.getCoordinates().y() += stepY )
   {
-    int pom = 0;//BlockIterPom[ i ] = 0;
-    int m=0, k=0;
-    m = i%numBlockX;
-    k = i/numBlockX;
-    if( m > 0 && BlockIterDevice[ i - 1 ] ){
-      pom = 1;//BlockIterPom[ i ] = 1;
-    }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){
-      pom = 1;//BlockIterPom[ i ] = 1;
-    }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){
-      pom = 1;// BlockIterPom[ i ] = 1;
-    }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){
-      pom = 1;//BlockIterPom[ i ] = 1;
+    for( cell.getCoordinates().x() = boundsFrom[0];
+           TNL::abs( cell.getCoordinates().x() - boundsTo[0] ) > 0;
+            cell.getCoordinates().x() += stepX )
+    {
+      cell.refresh();
+      if( ! interfaceMap( cell ) )
+      {
+        calculated = this->updateCell( aux, cell ) || calculated;
+      }
     }
-    
-    if( BlockIterDevice[ i ] != 1 )
-      BlockIterPom[ i ] = pom;//BlockIterPom[ i ];
-    else
-      BlockIterPom[ i ] = 1;
   }
+  return calculated;
 }
 
 template < typename Index >
@@ -863,254 +647,51 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
         TNL_CHECK_CUDA_DEVICE;
 {
-  int thri = threadIdx.x; int thrj = threadIdx.y;
-  int i = threadIdx.x + blockDim.x*blockIdx.x + vLower[0];
-  int j = blockDim.y*blockIdx.y + threadIdx.y + vLower[1];
-  const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
-  /** FOR CHESS METHOD */
-  //if( (blockIdx.y%2  + blockIdx.x) % 2 == oddEvenBlock )
-  //{
-  /**------------------------------------------*/
+  Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
   
+  int calculateFromNeighbours[4] = {0,0,0,0};
+  const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh
+  MPI::Request *requestsInformation;
+  requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ];  
   
-  /** FOR FIM METHOD */
+  int neighCount = 0; // number of MPI requests posted so far (also the next free index in requestsInformation)
   
-  if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] )
-  { 
-    __syncthreads();
-    
-    /**-----------------------------------------*/
-    __shared__ int dimX;
-    __shared__ int dimY;
-    __shared__ Real hx;
-    __shared__ Real hy;
-    if( thri==0 && thrj == 0)
-    {
-      dimX = mesh.getDimensions().x();
-      dimY = mesh.getDimensions().y();
-      hx = mesh.getSpaceSteps().x();
-      hy = mesh.getSpaceSteps().y();
-      BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 0;
-    }
-    __syncthreads();
-    int numOfBlockx;
-    int numOfBlocky;
-    int xkolik;
-    int ykolik;
-    
-    xkolik = blockDim.x + 1;
-    ykolik = blockDim.y + 1;
-    numOfBlocky = gridDim.y;//(dimY-vUpper[1]-vLower[1])/blockDim.y + (((dimY-vUpper[1]-vLower[1])%blockDim.y != 0) ? 1:0);
-    numOfBlockx = gridDim.x;//(dimX-vUpper[0]-vLower[0])/blockDim.x + (((dimX-vUpper[0]-vLower[0])%blockDim.x != 0) ? 1:0);
-    
-    if( numOfBlockx - 1 == blockIdx.x )
-      xkolik = (dimX-vUpper[0]-vLower[0]) - (blockIdx.x)*blockDim.x+1;
-    
-    if( numOfBlocky -1 == blockIdx.y )
-      ykolik = (dimY-vUpper[1]-vLower[1]) - (blockIdx.y)*blockDim.y+1;
-    __syncthreads();
-    
-#if ForDebug
-    /*if( thri==0 && thrj == 0 )
-    {
-      printf("%d: DimX = %d, DimY = %d, xKolik = %d, yKolik = %d, numOfBlockX = %d, numOfBlockY = %d, blockIdx.x = %d, blockIdx.y = %d.\n",
-              k, dimX, dimY, xkolik, ykolik, numOfBlockx, numOfBlocky, blockIdx.x, blockIdx.y);
-    }*/
-#endif
-    
-    int currentIndex = thrj * blockDim.x + thri;
-    //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
-    __shared__ volatile bool changed[ (sizeSArray-2)*(sizeSArray-2)];
-    changed[ currentIndex ] = false;
-    if( thrj == 0 && thri == 0 )
-      changed[ 0 ] = true;
-    
-    
-    //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
-    __shared__ volatile Real sArray[ sizeSArray * sizeSArray ];
-    sArray[ (thrj+1) * sizeSArray + thri +1 ] = std::numeric_limits< Real >::max();
-    
-       
-        //filling sArray edges
-    if( thri == 0 )
-    {      
-      if( dimX - vLower[ 0 ] > (blockIdx.x+1) * blockDim.x  && thrj+1 < ykolik )
-        sArray[(thrj+1)*sizeSArray + xkolik] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + xkolik + vLower[0] ];
-      else
-        sArray[(thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max();
-    }
-        
-    if( thri == 1 )
-    { 
-      if( ( blockIdx.x != 0 || vLower[0] != 0 ) && thrj+1 < ykolik )
-        sArray[(thrj+1)*sizeSArray + 0] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX  + vLower[0] ];
-      else
-        sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 2 )
-    {
-      if( dimY - vLower[ 1 ] > (blockIdx.y+1) * blockDim.y  && thrj+1 < xkolik )
-        sArray[ ykolik*sizeSArray + thrj+1 ] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + ykolik*dimX + thrj+1 + vLower[0] ];
-      else
-        sArray[ ykolik*sizeSArray + thrj+1 ] = std::numeric_limits< Real >::max();
-      
-    }
-        
-    if( thri == 3 )
-    {
-      if( ( blockIdx.y != 0 || vLower[1] != 0 ) && thrj+1 < xkolik )
-        sArray[0*sizeSArray + thrj+1] = aux[ (blockIdx.y*blockDim.y+vLower[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 + thrj+1 + vLower[0] ];
-      else
-        sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
-    }
-    /*__syncthreads();
-    if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 1 )
-    {
-      printf( "Kraje: \n");
-      for( int k = sizeSArray-1; k>-1; k-- ){
-        for( int l = 0; l < sizeSArray; l++ )
-          printf( "%.4f ", sArray[k * sizeSArray + l]);
-        printf( "\n");
-      }
-      printf( "\n");
-    }
-    __syncthreads();*/
-    
-    
-    if( i-vLower[0] < dimX && j-vLower[1] < dimY && thri+1 < xkolik + vUpper[0] && thrj+1 < ykolik + vUpper[1] )
-    {  
-      /*if( k == 1 && blockIdx.x == 0 && blockIdx.y == 0 )
-        printf("at index = %d\n", j*dimX + i);*/
-      sArray[(thrj+1)*sizeSArray + thri+1] = aux[ (j)*dimX + i ];
-    }
-    __syncthreads();  
-#if ForDebug    
-    if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
-    {
-      printf( "všechno před výpočtem: \n");
-      for( int m = sizeSArray-1; m>-1; m-- ){
-        for( int l = 0; l < sizeSArray; l++ )
-          printf( "%.2f ", sArray[m * sizeSArray + l]);
-        printf( "\n");
-      }
-      printf( "\n");
-    }
-    
-    if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
-    {
-      for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){
-        for( int l = 0; l < 17; l++ )
-          printf( "%.2f ", aux[ m * mesh.getDimensions().x() + l ]);
-        printf( "\n");
-      }
-      printf( "\n");
-    }
-#endif 
-    //main while cycle
-    //if( i == 0 && j == 0 )
-    //  printf("Overlaps [x1,y1],[x2,y2] = [%d,%d],[%d,%d]",vLower[0], vLower[1], vUpper[0], vUpper[1] );
-    
-    while( changed[ 0 ] )
-    {
-      __syncthreads();
-      
-      changed[ currentIndex] = false;
-      
-      //calculation of update cell
-      if( i < dimX - vUpper[0] && j < dimY - vUpper[1] /*&& i > vLower[0]-1 && j > vLower[1]-1*/ )
-      {
-        if( ! interfaceMap[ j * dimX + i ] )
-        {
-          /*if( k == 1 && blockIdx.x == 1 && blockIdx.y == 0 )
-            printf( "thri = %d, thrj = %d \n", thri, thrj );*/
-          changed[ currentIndex ] = ptr.updateCell<sizeSArray>( sArray, thri+1, thrj+1, hx,hy);
-        }
-      }
-      __syncthreads();
-      
-      //pyramid reduction
-      if( blockDim.x*blockDim.y == 1024 )
-      {
-        if( currentIndex < 512 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
-        }
-      }
-      __syncthreads();
-      if( blockDim.x*blockDim.y >= 512 )
-      {
-        if( currentIndex < 256 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
-        }
-      }
-      __syncthreads();
-      if( blockDim.x*blockDim.y >= 256 )
-      {
-        if( currentIndex < 128 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
-        }
-      }
-      __syncthreads();
-      if( blockDim.x*blockDim.y >= 128 )
-      {
-        if( currentIndex < 64 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
-        }
-      }
-      __syncthreads();
-      if( currentIndex < 32 ) 
-      {
-        if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
-        if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
-        if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
-        if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
-        if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
-        if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
-      }
-      if( thri == 0 && thrj == 0 && changed[ 0 ] ){
-        BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 1;
-      }
-      __syncthreads();
-    }
-    
-    
-      
-    if( i < dimX && j < dimY && thri+1 < xkolik && thrj+1 < ykolik )
-      helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ];
-    __syncthreads();
-#if ForDebug
-    if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
-    {
-      printf( "všechno po výpočtu: \n");
-      for( int m = sizeSArray-1; m>-1; m-- ){
-        for( int l = 0; l < sizeSArray; l++ )
-          printf( "%.2f ", sArray[m * sizeSArray + l]);
-        printf( "\n");
-      }
-      printf( "\n");
-    }
-    
-    if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 3 )
-    {
-      printf( "8: \n");
-      for( int m = mesh.getDimensions().y()-1; m>-1; m-- ){
-        for( int l = 0; l < mesh.getDimensions().x(); l++ )
-          printf( "%.2f ", helpFunc[ m * mesh.getDimensions().x() + l ]);
-        printf("\n");
-      }
-      printf( "\n");
-    }
-#endif
+  if( neighbours[0] != -1 ) // LEFT
+  {
+    requestsInformation[neighCount++] =
+            MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup );
   }
-  else
+  
+  if( neighbours[1] != -1 ) // RIGHT
+  {
+    requestsInformation[neighCount++] =
+            MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); 
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup );
+  }
+  
+  if( neighbours[2] != -1 ) //UP
+  {
+    requestsInformation[neighCount++] = 
+            MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] =
+            MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup  );
+  }
+  
+  if( neighbours[5] != -1 ) //DOWN
   {
-    if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] )
-      helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ];
+    requestsInformation[neighCount++] = 
+            MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup );
   }
+  MPI::WaitAll( requestsInformation, neighCount );
+  
+  MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR,  MPI::AllGroup );
+  calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] ||
+              calculateFromNeighbours[2] || calculateFromNeighbours[3];
 }
 #endif
         TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock )
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 40a1efeba..2d73b174e 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -13,8 +13,6 @@
 
 #pragma once
 
-#include "tnlFastSweepingMethod.h"
-
 
 template< typename Real,
         typename Device,
@@ -69,24 +67,12 @@ solve( const MeshPointer& mesh,
   auxPtr->setMesh( mesh );
   interfaceMapPtr->setMesh( mesh );
   
-  //Distributed mesh for overlaps (without MPI is null pointer)
-  Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh();
-  
   // getting overlaps ( WITHOUT MPI SHOULD BE 0 )
-  Containers::StaticVector< 3, IndexType > vLower;
-  vLower[0] = 0; vLower[1] = 0; vLower[2] = 0;
-  Containers::StaticVector< 3, IndexType > vUpper;
-  vUpper[0] = 0; vUpper[1] = 0; vUpper[2] = 0;
-#ifdef HAVE_MPI
-  if( CommunicatorType::isDistributed() )
-  {
-    vLower = meshPom->getLowerOverlap();
-    vUpper = meshPom->getUpperOverlap();
-  }
-#endif
+  Containers::StaticVector< 3, IndexType > vecLowerOverlaps, vecUpperOverlaps;
+  setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh );
   
   std::cout << "Initiating the interface cells ..." << std::endl;
-  BaseType::initInterface( u, auxPtr, interfaceMapPtr, vLower, vUpper );
+  BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps );
   auxPtr->save( "aux-ini.tnl" );   
   
   typename MeshType::Cell cell( *mesh );
@@ -95,59 +81,26 @@ solve( const MeshPointer& mesh,
   MeshFunctionType aux = *auxPtr;
   InterfaceMapType interfaceMap = * interfaceMapPtr;
   aux.template synchronize< Communicator >(); //synchronization of intial conditions
-  int i = MPI::GetRank( MPI::AllGroup ); //getting identification of MPI thread
-#if ForDebug
-        if( i == 2 ){
-          aux.save("aux-init2.tnl");
-          mesh->save("mesh-2.tnl");
-        }
-        if( i == 1 ){
-          aux.save("aux-init1.tnl");
-          mesh->save("mesh-1.tnl");
-        }
-        if( i == 3 ){
-          aux.save("aux-init3.tnl");
-          mesh->save("mesh-3.tnl");
-        }
-        if( i == 0 ){
-          aux.save("aux-init0.tnl");
-          mesh->save("mesh-0.tnl");
-        }
-#endif
   
   while( iteration < this->maxIterations )
-  {    
-#if  ForDebug 
-    int WhileCount = 0; // number of passages of while cycle with condition calculated
-    printf( "%d: meshDimensions are (x,y,z) = (%d,%d,%d).\n",i, mesh->getDimensions().x(), mesh->getDimensions().y(), mesh->getDimensions().z() );
-    printf( "%d: owerlaps are ([x1,x2],[y1,y2],[z1,z2]) = ([%d,%d],[%d,%d],[%d,%d]).\n",i, vLower[0], vUpper[0], vLower[1], vUpper[1], vUpper[2], vLower[2] );
-    /*if( std::is_same< DeviceType, Devices::Host >::value && i == 2 )
-    {
-      for( int j = mesh->getDimensions().y()-1; j>-1; j-- ){
-        for( int m = 0; m < mesh->getDimensions().x(); m++ )
-          printf( "%.2f " , aux[ j*mesh->getDimensions().x() + m ]);
-        printf("\n");
-      }
-      printf("\n");
-    }*/    
-#endif
-    
-    int calculated = 1; // indicates weather we calculated in the last passage of the while cycle 
-    // calculated is same for all ranks 
+  {
+    // indicates whether we calculated in the last passage of the while cycle
+    // calculatedBefore is the same for all ranks
     // without MPI should be FALSE at the end of while cycle body
-    int calculate = 1; // indicates if the thread should calculate again in upcoming passage of cycle
-    // calculate is a value that can differ in every rank
+    int calculatedBefore = 1; 
+    
+    // indicates if the MPI process should calculate again in the upcoming passage of the cycle
+    // calculateMPIAgain is a value that can differ in every rank
     // without MPI should be FALSE at the end of while cycle body
+    int calculateMPIAgain = 1; 
     
-    while( calculated )
+    while( calculatedBefore )
     {
-      calculated = 0;
-#if ForDebug
-      WhileCount++;
-#endif
-      if( std::is_same< DeviceType, Devices::Host >::value && calculate ) // should we calculate in Host?
+      calculatedBefore = 0;
+      
+      if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host?
       {
-        calculate = 0;
+        calculateMPIAgain = 0;
         
 /** HERE IS FSM FOR OPENMP (NO MPI) - isnt worthy */
         /*int numThreadsPerBlock = 64;
@@ -212,401 +165,60 @@ solve( const MeshPointer& mesh,
         
         
 /** HERE IS FSM WITH MPI AND WITHOUT MPI */
+        StaticVector boundsFrom; StaticVector boundsTo;
         
-#if ForDebug
-        if( i == 1 ){
-          aux.save("aux-final10.tnl");
-        }
-#endif
-        for( cell.getCoordinates().z() = 0 + vLower[2];
-                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
-                cell.getCoordinates().z()++ )
-        {
-          for( cell.getCoordinates().y() = 0 + vLower[1];
-                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
-                  cell.getCoordinates().y()++ )
-          {
-            for( cell.getCoordinates().x() = 0 + vLower[0];
-                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                    cell.getCoordinates().x()++ )
-            {
-              cell.refresh();
-              if( ! interfaceMap( cell ) )
-              {
-                //getting information weather we calculated in this passage
-                calculated = this->updateCell( aux, cell ) || calculated;
-              }
-            }
-          }
-        }
-#if ForDebug
-        if( i == 1 ){
-          aux.save("aux-final11.tnl");
-        }
-        int pocNull = 0;
-        for( cell.getCoordinates().z() = 0 + vLower[2];
-                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
-                cell.getCoordinates().z()++ )
-        {
-          for( cell.getCoordinates().y() = 0 + vLower[1];
-                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
-                  cell.getCoordinates().y()++ )
-          {
-            for( cell.getCoordinates().x() = 0 + vLower[0];
-                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                    cell.getCoordinates().x()++ )
-            {
-              cell.refresh();
-              if( fabs( aux(cell) ) < 0.002 )
-                pocNull++;
-            }
-          }
-        }
-        printf("%d: 1. pocNull = %d\n", i , pocNull);
-#endif        
-        for( cell.getCoordinates().z() = 0 + vLower[2];
-                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
-                cell.getCoordinates().z()++ )
-        {
-          for( cell.getCoordinates().y() = 0 + vLower[1];
-                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
-                  cell.getCoordinates().y()++ )
-          {
-            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0];
-                    cell.getCoordinates().x() >= 0 + vLower[0];
-                    cell.getCoordinates().x()-- )		
-            {
-              //std::cerr << "2 -> ";
-              cell.refresh();
-              if( ! interfaceMap( cell ) )            
-                this->updateCell( aux, cell );
-            }
-          }
-        }
-#if ForDebug
-        if( i == 1 ){
-          aux.save("aux-final12.tnl");
-        }
-        pocNull = 0;
-        for( cell.getCoordinates().z() = 0 + vLower[2];
-                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
-                cell.getCoordinates().z()++ )
-        {
-          for( cell.getCoordinates().y() = 0 + vLower[1];
-                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
-                  cell.getCoordinates().y()++ )
-          {
-            for( cell.getCoordinates().x() = 0 + vLower[0];
-                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                    cell.getCoordinates().x()++ )
-            {
-              cell.refresh();
-              if( fabs( aux(cell) ) < 0.002 )
-                pocNull++;
-            }
-          }
-        }
-        printf("%d: 2. pocNull = %d\n", i , pocNull);
-#endif        
-        //aux.save( "aux-2.tnl" );
-        for( cell.getCoordinates().z() = 0 + vLower[2];
-                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
-                cell.getCoordinates().z()++ )
-        {
-          for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1];
-                  cell.getCoordinates().y() >= 0 + vLower[1];
-                  cell.getCoordinates().y()-- )
-          {
-            for( cell.getCoordinates().x() = 0 + vLower[0];
-                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                    cell.getCoordinates().x()++ )
-            {
-              //std::cerr << "3 -> ";
-              cell.refresh();
-              if( ! interfaceMap( cell ) )            
-                this->updateCell( aux, cell );
-            }
-          }
-        }
-#if ForDebug
-        if( i == 1 ){
-          aux.save("aux-final13.tnl");
-        }
-        pocNull = 0;
-        for( cell.getCoordinates().z() = 0 + vLower[2];
-                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
-                cell.getCoordinates().z()++ )
-        {
-          for( cell.getCoordinates().y() = 0 + vLower[1];
-                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
-                  cell.getCoordinates().y()++ )
-          {
-            for( cell.getCoordinates().x() = 0 + vLower[0];
-                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                    cell.getCoordinates().x()++ )
-            {
-              cell.refresh();
-              if( fabs( aux(cell) ) < 0.002 )
-                pocNull++;
-            }
-          }
-        }
-        printf("%d: 3. pocNull = %d\n", i , pocNull);
-#endif        
-        //aux.save( "aux-3.tnl" );
+    // TOP, NORTH and EAST        
+        boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
+        boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
+        boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
+        calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         
-        for( cell.getCoordinates().z() = 0 + vLower[2];
-                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
-                cell.getCoordinates().z()++ )
-        {
-          for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1];
-                  cell.getCoordinates().y() >= 0 + vLower[1];
-                  cell.getCoordinates().y()-- )
-          {
-            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0];
-                    cell.getCoordinates().x() >= 0 + vLower[0];
-                    cell.getCoordinates().x()-- )		
-            {
-              //std::cerr << "4 -> ";
-              cell.refresh();
-              if( ! interfaceMap( cell ) )            
-                this->updateCell( aux, cell );
-            }
-          }
-        }  
-#if ForDebug
-        if( i == 1 ){
-          aux.save("aux-final14.tnl");
-        }
-        pocNull = 0;
-        for( cell.getCoordinates().z() = 0 + vLower[2];
-                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
-                cell.getCoordinates().z()++ )
-        {
-          for( cell.getCoordinates().y() = 0 + vLower[1];
-                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
-                  cell.getCoordinates().y()++ )
-          {
-            for( cell.getCoordinates().x() = 0 + vLower[0];
-                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                    cell.getCoordinates().x()++ )
-            {
-              cell.refresh();
-              if( fabs( aux(cell) ) < 0.002 )
-                pocNull++;
-            }
-          }
-        }
-        printf("%d: 4. pocNull = %d\n", i , pocNull);
-#endif        
-        //aux.save( "aux-4.tnl" );
+    // TOP, NORTH and WEST        
+        boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
+        boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         
-        for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2];
-                cell.getCoordinates().z() >= 0 + vLower[2];
-                cell.getCoordinates().z()-- )
-        {
-          for( cell.getCoordinates().y() = 0 + vLower[1];
-                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
-                  cell.getCoordinates().y()++ )
-          {
-            for( cell.getCoordinates().x() = 0 + vLower[0];
-                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                    cell.getCoordinates().x()++ )
-            {
-              //std::cerr << "5 -> ";
-              cell.refresh();
-              if( ! interfaceMap( cell ) )
-                this->updateCell( aux, cell );
-            }
-          }
-        }
-#if ForDebug
-        if( i == 1 ){
-          aux.save("aux-final15.tnl");
-        }
-        pocNull = 0;
-        for( cell.getCoordinates().z() = 0 + vLower[2];
-                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
-                cell.getCoordinates().z()++ )
-        {
-          for( cell.getCoordinates().y() = 0 + vLower[1];
-                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
-                  cell.getCoordinates().y()++ )
-          {
-            for( cell.getCoordinates().x() = 0 + vLower[0];
-                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                    cell.getCoordinates().x()++ )
-            {
-              cell.refresh();
-              if( fabs( aux(cell) ) < 0.002 )
-                pocNull++;
-            }
-          }
-        }
-        printf("%d: 5. pocNull = %d\n", i , pocNull);
- #endif       
-        //aux.save( "aux-5.tnl" );
+    // TOP, SOUTH and EAST        
+        boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1];
+        boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         
-        for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2];
-                cell.getCoordinates().z() >= 0 + vLower[2];
-                cell.getCoordinates().z()-- )
-        {
-          for( cell.getCoordinates().y() = 0 + vLower[1];
-                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
-                  cell.getCoordinates().y()++ )
-          {
-            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0];
-                    cell.getCoordinates().x() >= 0 + vLower[0];
-                    cell.getCoordinates().x()-- )		
-            {
-              //std::cerr << "6 -> ";
-              cell.refresh();
-              if( ! interfaceMap( cell ) )            
-                this->updateCell( aux, cell );
-            }
-          }
-        }
-#if ForDebug
-        if( i == 1 ){
-          aux.save("aux-final16.tnl");
-        }
-        pocNull = 0;
-        for( cell.getCoordinates().z() = 0 + vLower[2];
-                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
-                cell.getCoordinates().z()++ )
-        {
-          for( cell.getCoordinates().y() = 0 + vLower[1];
-                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
-                  cell.getCoordinates().y()++ )
-          {
-            for( cell.getCoordinates().x() = 0 + vLower[0];
-                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                    cell.getCoordinates().x()++ )
-            {
-              cell.refresh();
-              if( fabs( aux(cell) ) < 0.002 )
-                pocNull++;
-            }
-          }
-        }
-        printf("%d: 6. pocNull = %d\n", i , pocNull);
-#endif        
-        //aux.save( "aux-6.tnl" );
+    // TOP, SOUTH and WEST (boundsTo is exclusive in goThroughSweep, so the descending y and x axes end at lower overlap - 1)
+        boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1] - 1;
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0] - 1;
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+            
+    // BOTTOM, NORTH and EAST (boundsTo is exclusive in goThroughSweep, so the descending z axis ends at lower overlap - 1)
+        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2] - 1;
+        boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
+        boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         
-        for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2];
-                cell.getCoordinates().z() >= 0 + vLower[2];
-                cell.getCoordinates().z()-- )
-        {
-          for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1];
-                  cell.getCoordinates().y() >= 0 + vLower[1];
-                  cell.getCoordinates().y()-- )
-          {
-            for( cell.getCoordinates().x() = 0 + vLower[0];
-                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                    cell.getCoordinates().x()++ )
-            {
-              //std::cerr << "7 -> ";
-              cell.refresh();
-              if( ! interfaceMap( cell ) )            
-                this->updateCell( aux, cell );
-            }
-          }
-        }
+    // BOTTOM, NORTH and WEST (boundsTo is exclusive in goThroughSweep, so the descending z and x axes end at lower overlap - 1)
+        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2] - 1;
+        boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0] - 1;
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         
-#if ForDebug
-        if( i == 1 ){
-          aux.save("aux-final17.tnl");
-        }
-        pocNull = 0;
-        for( cell.getCoordinates().z() = 0 + vLower[2];
-                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
-                cell.getCoordinates().z()++ )
-        {
-          for( cell.getCoordinates().y() = 0 + vLower[1];
-                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
-                  cell.getCoordinates().y()++ )
-          {
-            for( cell.getCoordinates().x() = 0 + vLower[0];
-                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                    cell.getCoordinates().x()++ )
-            {
-              cell.refresh();
-              if( fabs( aux(cell) ) < 0.002 )
-                pocNull++;
-            }
-          }
-        }
-        printf("%d: 7. pocNull = %d\n", i , pocNull);
-#endif        
-        //aux.save( "aux-7.tnl" );
+    // BOTTOM, SOUTH and EAST (boundsTo is exclusive in goThroughSweep, so the descending z and y axes end at lower overlap - 1)
+        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2] - 1;
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1] - 1;
+        boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+        
+    // BOTTOM, SOUTH and WEST (boundsTo is exclusive in goThroughSweep, so every descending axis ends at lower overlap - 1)
+        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2] - 1;
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1] - 1;
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0] - 1;
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         
-        for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1 - vUpper[2];
-                cell.getCoordinates().z() >= 0 + vLower[2];
-                cell.getCoordinates().z()-- )
-        {
-          for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1 - vUpper[1];
-                  cell.getCoordinates().y() >= 0 + vLower[1];
-                  cell.getCoordinates().y()-- )
-          {
-            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1 - vUpper[0];
-                    cell.getCoordinates().x() >= 0 + vLower[0];
-                    cell.getCoordinates().x()-- )		
-            {
-              //std::cerr << "8 -> ";
-              cell.refresh();
-              if( ! interfaceMap( cell ) )            
-                this->updateCell( aux, cell );
-            }
-          }
-        }
-#if ForDebug
-        if( i == 1 ){
-          aux.save("aux-final18.tnl");
-        }
-        pocNull = 0;
-        for( cell.getCoordinates().z() = 0 + vLower[2];
-                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
-                cell.getCoordinates().z()++ )
-        {
-          for( cell.getCoordinates().y() = 0 + vLower[1];
-                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
-                  cell.getCoordinates().y()++ )
-          {
-            for( cell.getCoordinates().x() = 0 + vLower[0];
-                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                    cell.getCoordinates().x()++ )
-            {
-              cell.refresh();
-              if( fabs( aux(cell) ) < 0.002 )
-                pocNull++;
-            }
-          }
-        }
-        printf("%d: 8. pocNull = %d\n", i , pocNull);
-        for( cell.getCoordinates().z() = 0 + vLower[2];
-                cell.getCoordinates().z() < mesh->getDimensions().z() - vUpper[2];
-                cell.getCoordinates().z()++ )
-        {
-          for( cell.getCoordinates().y() = 0 + vLower[1];
-                  cell.getCoordinates().y() < mesh->getDimensions().y() - vUpper[1];
-                  cell.getCoordinates().y()++ )
-          {
-            for( cell.getCoordinates().x() = 0 + vLower[0];
-                    cell.getCoordinates().x() < mesh->getDimensions().x() - vUpper[0];
-                    cell.getCoordinates().x()++ )
-            {
-              cell.refresh();
-              printf("%.2f ", aux(cell));
-            }
-            printf("\n");
-          }
-          printf("\n");
-        }
-#endif
         
-        /**----------------------------------------------------------------------------------*/
+  /**----------------------------------------------------------------------------------*/
       }
-      if( std::is_same< DeviceType, Devices::Cuda >::value && calculate )
+      if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain )
       {
 #ifdef HAVE_CUDA
         // cudaBlockSize is a size of blocks. It's the number raised to the 3 power.
@@ -640,115 +252,18 @@ solve( const MeshPointer& mesh,
       }
       
 #ifdef HAVE_MPI
-      if( CommunicatorType::isDistributed() ){
-        
-        const int *neigh = meshPom->getNeighbors(); // Getting nembers of distributed mesh
-        MPI::Request *req;
-        req = new MPI::Request[meshPom->getNeighborsCount()];  
-        
-        int neighCount = 0; // we know the number in runtime and it can differ for every MPI thread
-        // Getting information weather some of six neghbours (top, bottom, right, left, ahead, behind) calculated
-        int calculpom[6] = {0,0,0,0,0,0}; 
-        
-        
-        if( neigh[0] != -1 ) // if you have west neighbour
-        {
-          // if we have this neighbour, we send calculated, one number, to him, ...
-          req[neighCount] = MPI::ISend( &calculated, 1, neigh[0], 0, MPI::AllGroup );
-          neighCount++;
-          // and we recive the same information from him
-          req[neighCount] = MPI::IRecv( &calculpom[0], 1, neigh[0], 0, MPI::AllGroup );
-          neighCount++;
-        }
-        
-        if( neigh[1] != -1 ) // east
-        {
-          req[neighCount] = MPI::ISend( &calculated, 1, neigh[1], 0, MPI::AllGroup ); 
-          neighCount++;
-          
-          
-          req[neighCount] = MPI::IRecv( &calculpom[1], 1, neigh[1], 0, MPI::AllGroup );
-          neighCount++;
-        }
-        
-        if( neigh[2] != -1 ) // north
-        {
-          req[neighCount] = MPI::ISend( &calculated, 1, neigh[2], 0, MPI::AllGroup );
-          neighCount++;
-          
-          req[neighCount] = MPI::IRecv( &calculpom[2], 1, neigh[2], 0, MPI::AllGroup  );
-          neighCount++;
-        }
-        
-        if( neigh[5] != -1 ) //south
-        {
-          req[neighCount] = MPI::ISend( &calculated, 1, neigh[5], 0, MPI::AllGroup );
-          neighCount++;
-          
-          req[neighCount] = MPI::IRecv( &calculpom[3], 1, neigh[5], 0, MPI::AllGroup );
-          neighCount++;
-        }
-        
-        if( neigh[8] != -1 ) // top 
-        {
-          req[neighCount] = MPI::ISend( &calculated, 1, neigh[8], 0, MPI::AllGroup );
-          neighCount++;
-          
-          req[neighCount] = MPI::IRecv( &calculpom[4], 1, neigh[8], 0, MPI::AllGroup );
-          neighCount++;
-        }
-        
-        if( neigh[17] != -1 ) //bottom
-        {
-          req[neighCount] = MPI::ISend( &calculated, 1, neigh[17], 0, MPI::AllGroup );
-          neighCount++;
-          
-          req[neighCount] = MPI::IRecv( &calculpom[5], 1, neigh[17], 0, MPI::AllGroup );
-          neighCount++;
-        }
-        
-        MPI::WaitAll(req,neighCount); //waiting for all to have all the information
-#if ForDebug
-        printf( "%d: Sending Calculated = %d.\n", i, calculated );
-        printf( "%d: calculpom[0] = %d, calculpom[1] = %d, calculpom[2] = %d, calculpom[3] = %d, calculpom[4] = %d,"
-                "calculpom[5] = %d", i ,calculpom[0],calculpom[1],calculpom[2],calculpom[3],calculpom[4],calculpom[5]);
-#endif        
-        // if one of the MPI thread had calculated = 1, then all get 1. Otherwise all get 0
-        MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR,  MPI::AllGroup ); 
+      if( CommunicatorType::isDistributed() )
+      {
+        getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh );
+
         // synchronizate the overlaps 
         aux.template synchronize< Communicator >();
-        // if any of my neighbours had calculated = 1, than I should calculate again (but all of us has to go throw while(calculated))
-        calculate = calculpom[0] || calculpom[1] || calculpom[2] ||
-                    calculpom[3] || calculpom[4] || calculpom[5];
-#if ForDebug
-        printf( "%d: Receved Calculated = %d.\n%d: Calculate = %d\n", i, calculated, i, calculate);
-#endif
-        
-#if ForDebug 
-        if( i == 1 )
-          printf("WhileCount = %d\n",WhileCount);
-        if( i == 2 ){
-          aux.save("aux-final2.tnl");
-          mesh->save("mesh-2.tnl");
-        }
-        if( i == 1 ){
-          aux.save("aux-final1.tnl");
-          mesh->save("mesh-1.tnl");
-        }
-        if( i == 3 ){
-          aux.save("aux-final3.tnl");
-          mesh->save("mesh-3.tnl");
-        }
-        if( i == 0 ){
-          aux.save("aux-final0.tnl");
-          mesh->save("mesh-0.tnl");
-        }
-        //calculated = 0; // DEBUG;
-#endif
+
       }
 #endif
-      if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculated 0!
-        calculated = 0; //otherwise we would go throw the FSM code and CUDA FSM code again uselessly
+      
+      if( !CommunicatorType::isDistributed() ) // Without MPI, calculatedBefore must be reset to 0 here;
+        calculatedBefore = 0; // otherwise we would go through the FSM code and the CUDA FSM code again needlessly.
     }
     //aux.save( "aux-8.tnl" );
     iteration++;
@@ -759,65 +274,69 @@ solve( const MeshPointer& mesh,
   aux.save("aux-final.tnl");
 }
 
-#ifdef HAVE_CUDA
-// DeepCopy nebo pracne kopirovat kraje v zavislosti na vLower,vUpper z sArray do helpFunc.
-template< typename Real, typename Device, typename Index >
-__global__ void DeepCopy( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, int copy, int k )
+// PROTECTED FUNCTIONS:
+
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+void 
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
+setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
+              const MeshPointer& mesh)
 {
-  int thri = threadIdx.x + blockDim.x*blockIdx.x;
-  int thrj = blockDim.y*blockIdx.y + threadIdx.y;
-  int thrk = blockDim.z*blockIdx.z + threadIdx.z;
-  
-  const Meshes::Grid< 3, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
-  if( copy ){
-    if( thri < mesh.getDimensions().x() && thrj < mesh.getDimensions().y() && thrk < mesh.getDimensions().z() )
-    {
-      helpFunc[ thrk * mesh.getDimensions().x() * mesh.getDimensions().y() + thrj * mesh.getDimensions().x() + thri ] =
-              aux[ thrk * mesh.getDimensions().x() * mesh.getDimensions().y() + thrj * mesh.getDimensions().x() + thri ];
-    }
+  vecLowerOverlaps[0] = 0; vecLowerOverlaps[1] = 0; vecLowerOverlaps[2] = 0;
+  vecUpperOverlaps[0] = 0; vecUpperOverlaps[1] = 0; vecUpperOverlaps[2] = 0;
+#ifdef HAVE_MPI
+  if( CommunicatorType::isDistributed() ) // the solver was started with MPI
+  {
+    // Distributed mesh that supplies the MPI overlaps (reportedly a null pointer when running without MPI - TODO confirm)
+    const Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh();
+    vecLowerOverlaps = meshPom->getLowerOverlap();
+    vecUpperOverlaps = meshPom->getUpperOverlap();
   }
-  else // for debug, values can be printed only from cuda kernel
+#endif
+}
+
+
+
+
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+bool 
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
+goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, 
+        MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
+        const AnisotropyPointer& anisotropy )
+{
+  bool calculated = false;
+  const MeshType& mesh = aux.getMesh();
+  const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1;
+  const IndexType stepY = boundsFrom[1] < boundsTo[1]? 1 : -1;
+  const IndexType stepZ = boundsFrom[2] < boundsTo[2]? 1 : -1;
+  
+  typename MeshType::Cell cell( mesh );
+  cell.refresh();
+  
+  for( cell.getCoordinates().z() = boundsFrom[2];
+          TNL::abs( cell.getCoordinates().z() - boundsTo[2] ) > 0;
+          cell.getCoordinates().z() += stepZ )
   {
-    if( thrk == 0 && thri==0 && thrj == 0 && blockIdx.z == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 0 )
-    {
-      printf("%d: DimX = %d, DimY = %d, DimZ = %d\n", k,mesh.getDimensions().x(),mesh.getDimensions().y(),mesh.getDimensions().z() );
-      for( int z = mesh.getDimensions().z()-1; z > mesh.getDimensions().z()-2; z-- )
-      {
-        for( int y = 0; y < mesh.getDimensions().y(); y++ )
-        {
-          for( int x = 0; x < mesh.getDimensions().x(); x++ )
-          {
-            printf("%.2f ", helpFunc[ z *mesh.getDimensions().y()*mesh.getDimensions().x() + y*mesh.getDimensions().x() + x ]);
-          }
-          printf("\n");
-        }
-        printf("\n");
-      }
-      printf("\n");
-    }
-    if( thrk == 0 && thri==0 && thrj == 0 && blockIdx.z == 0 && blockIdx.x == 0 && blockIdx.y == 0 && k == 1 )
+    for( cell.getCoordinates().y() = boundsFrom[1];
+            TNL::abs( cell.getCoordinates().y() - boundsTo[1] ) > 0;
+            cell.getCoordinates().y() += stepY )
     {
-      printf("%d: DimX = %d, DimY = %d, DimZ = %d\n", k,mesh.getDimensions().x(),mesh.getDimensions().y(),mesh.getDimensions().z() );
-      
-      if( k == 1 )
+      for( cell.getCoordinates().x() = boundsFrom[0];
+              TNL::abs( cell.getCoordinates().x() - boundsTo[0] ) > 0;
+              cell.getCoordinates().x() += stepX )
       {
-        for( int z = 1; z < 2; z++ )
+        cell.refresh();
+        if( ! interfaceMap( cell ) )
         {
-          for( int y = 0; y < mesh.getDimensions().y(); y++ )
-          {
-            for( int x = 0; x < mesh.getDimensions().x(); x++ )
-            {
-              printf("%.2f ", aux[ z *mesh.getDimensions().y()*mesh.getDimensions().x() + y*mesh.getDimensions().x() + x ]);
-            }
-            printf("\n");
-          }
-          printf("\n");
+          calculated = this->updateCell( aux, cell ) || calculated;
         }
-        printf("\n");
       }
     }
   }
+  return calculated;
 }
 
 template < typename Index >
@@ -825,33 +344,22 @@ __global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda,
         TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
         int numBlockX, int numBlockY, int numBlockZ )
 {
-  int i = blockIdx.x * 1024 + threadIdx.x;
+  Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
   
-  if( i < numBlockX * numBlockY * numBlockZ )
+  int calculateFromNeighbours[6] = {0,0,0,0,0,0};
+        
+  const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh
+  MPI::Request *requestsInformation;
+  requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ];  
+  
+  int neighCount = 0; // number of MPI requests stored in requestsInformation so far (later passed to MPI::WaitAll)
+  
+  if( neighbours[0] != -1 ) // WEST
   {
-    int pom = 0;//BlockIterPom[ i ] = 0;
-    int m=0, l=0, k=0;
-    l = i/( numBlockX * numBlockY );
-    k = (i-l*numBlockX * numBlockY )/(numBlockX );
-    m = (i-l*numBlockX * numBlockY )%( numBlockX );
-    if( m > 0 && BlockIterDevice[ i - 1 ] ){ // left neighbour
-      pom = 1;//BlockIterPom[ i ] = 1;
-    }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ // right neighbour
-      pom = 1;//BlockIterPom[ i ] = 1;
-    }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ // bottom neighbour
-      pom = 1;// BlockIterPom[ i ] = 1;
-    }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ // top neighbour
-      pom = 1;//BlockIterPom[ i ] = 1;
-    }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){ // neighbour behind 
-      pom = 1;
-    }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){ // neighbour in front
-      pom = 1;
-    }
-    
-    if( !BlockIterDevice[ i ] ) // only in CudaUpdateCellCaller can BlockIterDevice gain 0
-      BlockIterPom[ i ] = pom;
-    else
-      BlockIterPom[ i ] = 1;
+    requestsInformation[neighCount++] =
+            MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup );
   }
 }
 
@@ -870,233 +378,51 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
   int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
   const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
   
-  // should this block calculate?
-  if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] ) 
+  if( neighbours[1] != -1 ) // EAST
   {
-    __syncthreads();
-    
-    // Array indicates weather some threads calculated (for parallel reduction)
-    __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2) ];
-    changed[ currentIndex ] = false;
-    
-    if( thrj == 0 && thri == 0 && thrk == 0 )
-      changed[ 0 ] = true; // first indicates weather we should calculate again (princip of parallel reduction)
-    
-    __shared__ Real hx; __shared__ int dimX; //getting stepps and size of mesh
-    __shared__ Real hy; __shared__ int dimY;
-    __shared__ Real hz; __shared__ int dimZ;
-    
-    if( thrj == 1 && thri == 1 && thrk == 1 )
-    {
-      //printf( "We are in the calculation. Block = %d.\n",blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x  );
-      hx = mesh.getSpaceSteps().x();
-      hy = mesh.getSpaceSteps().y();
-      hz = mesh.getSpaceSteps().z();
-      dimX = mesh.getDimensions().x();
-      dimY = mesh.getDimensions().y();
-      dimZ = mesh.getDimensions().z();
-      // we dont know if we will calculate in here, more info down in code
-      BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0;
-    }
-    
-    // sArray contains values of one block (coppied from aux) and edges (not MPI) of those blocks
-    __shared__ volatile Real sArray[ sizeSArray * sizeSArray * sizeSArray ];
-    sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = std::numeric_limits< Real >::max();
-    
-    // getting some usefull information 
-    int numOfBlockx;
-    int numOfBlocky;
-    int numOfBlockz;
-    int xkolik; // maximum of threads in x direction (for all blocks different)
-    int ykolik;
-    int zkolik;
-    xkolik = blockDim.x + 1;
-    ykolik = blockDim.y + 1;
-    zkolik = blockDim.z + 1;
-    numOfBlockx = gridDim.x;
-    numOfBlocky = gridDim.y;
-    numOfBlockz = gridDim.z;
-    __syncthreads();
-    
-    if( numOfBlockx - 1 == blIdx )
-      xkolik = (dimX-vUpper[0]-vLower[0]) - (blIdx)*blockDim.x+1;
-    if( numOfBlocky -1 == blIdy )
-      ykolik = (dimY-vUpper[1]-vLower[1]) - (blIdy)*blockDim.y+1;
-    if( numOfBlockz-1 == blIdz )
-      zkolik = (dimZ-vUpper[2]-vLower[2]) - (blIdz)*blockDim.z+1;
-    __syncthreads();
-    
-     //filling sArray edges
-    if( thri == 0 ) //x bottom
-    {        
-      if( (blIdx != 0 || vLower[0] !=0) && thrj+1 < ykolik && thrk+1 < zkolik )
-        sArray[ (thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0 ] = 
-                aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX 
-                + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY + vLower[0] ];
-    else
-        sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 1 ) //xtop
-    {
-      if( dimX - vLower[ 0 ] > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
-        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] =
-                aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX
-                + blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY + vLower[0] ];
-     else
-        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max();
-    }
-    if( thri == 2 ) //y bottom
-    {        
-      if( (blIdy != 0 || vLower[1] !=0) && thrj+1 < xkolik && thrk+1 < zkolik )
-        sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] =
-                aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX
-                + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY + vLower[0] ];
-      else
-        sArray[ (thrk+1) * sizeSArray * sizeSArray + 0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 3 ) //y top
-    {
-      if( dimY - vLower[ 1 ] > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
-        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] =
-                aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + ((blIdy+1) * blockDim.y+vLower[1])*dimX
-                + blIdx*blockDim.x + thrj + thrk*dimX*dimY + vLower[0] ];
-     else
-        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
-    }
-    if( thri == 4 ) //z bottom
-    {        
-      if( (blIdz != 0 || vLower[2] !=0) && thrj+1 < ykolik && thrk+1 < xkolik )
-        sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] =
-                aux[ (blIdz*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX
-                + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk + vLower[0] ];
-     else
-        sArray[0 * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 5 ) //z top
-    {
-      if( dimZ - vLower[ 2 ] > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
-        sArray[ zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] =
-                aux[ ((blIdz+1)*blockDim.z + vLower[2]) * dimX * dimY + (blIdy * blockDim.y+vLower[1])*dimX
-                + blIdx*blockDim.x + thrj * dimX + thrk + vLower[0] ];
-     else
-        sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
-    }
-    
-    // Copy all other values that aren't edges
-    if( i - vLower[0] < dimX && j - vLower[1] < dimY && k - vLower[2] < dimZ &&
-        thri+1 < xkolik + vUpper[0] && thrj+1 < ykolik + vUpper[1] && thrk+1 < zkolik + vUpper[2] )
-    {
-      sArray[(thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
-    }
-    __syncthreads(); 
-    
-#if ForDebug    
-    /*if( thri==0 && thrj == 0 && thrk == 0 && blockIdx.z == 0 && blockIdx.x == 2 && blockIdx.y == 2 && MPIthread == 1 )
-    {
-      printf( "všechno před výpočtem: \n");
-      for( int m = sizeSArray-1; m>-1; m-- ){
-        for( int l = 0; l < sizeSArray; l++ )
-          printf( "%.2f ", sArray[4*sizeSArray * sizeSArray + m * sizeSArray + l]);
-        printf( "\n");
-      }
-      printf( "\n");
-    }
-    
-    if(thri==0 && thrj == 0 && thrk == 0 && blockIdx.z == 0 && blockIdx.x == 2 && blockIdx.y == 2 && MPIthread == 1 )
-    {
-      for( int m = 24; m>14; m-- ){
-        for( int l = 15; l < 25; l++ )  
-          printf("%.2f ", aux[ 4 *mesh.getDimensions().y()*mesh.getDimensions().x() + m*mesh.getDimensions().x() + l ]);
-        printf( "\n");
-      }
-      printf( "\n");
-    }*/
-#endif 
-    
-    //main while cycle. each value can get information only from neighbour but that information has to spread there
-    while( changed[ 0 ] )
-    {
-      __syncthreads();
-      
-      changed[ currentIndex ] = false;
-      
-      //calculation of update cell
-      if( i < dimX - vUpper[0] && j < dimY - vUpper[1] && k < dimZ - vUpper[2] )
-      {
-        if( ! interfaceMap[ k*dimX*dimY + j * dimX + i ] )
-        {
-          // calculate new value depending on neighbours in sArray on (thri+1, thrj+1) coordinates
-          changed[ currentIndex ] = ptr.updateCell3D< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); 
-        }
-      }
-      __syncthreads();
-      
-      //pyramid reduction (parallel reduction)
-      if( blockDim.x*blockDim.y*blockDim.z == 1024 )
-      {
-        if( currentIndex < 512 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
-        }
-      }
-      __syncthreads();
-      if( blockDim.x*blockDim.y*blockDim.z >= 512 )
-      {
-        if( currentIndex < 256 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
-        }
-      }
-      __syncthreads();
-      if( blockDim.x*blockDim.y*blockDim.z >= 256 )
-      {
-        if( currentIndex < 128 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
-        }
-      }
-      __syncthreads();
-      if( blockDim.x*blockDim.y*blockDim.z >= 128 )
-      {
-        if( currentIndex < 64 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
-        }
-      }
-      __syncthreads();
-      if( currentIndex < 32 )
-      {
-        if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
-        if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
-        if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
-        if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
-        if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
-        if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
-      }
-      __syncthreads();
-      
-      // if we calculated, then the BlockIterDevice should contain the info about this whole block! (only one number for one block)
-      if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
-      {
-        BlockIterDevice[ blIdz * gridDim.x * gridDim.y + blIdy * gridDim.x + blIdx ] = 1;
-      }
-      __syncthreads();
-    }
-    
-    // copy results into helpFunc (not into aux bcs of conflicts)
-    if( i < dimX && j < dimY && k < dimZ && thri+1 < xkolik && thrj+1 < ykolik && thrk+1 < zkolik )
-      helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thri+1 ];
-    
+    requestsInformation[neighCount++] =
+            MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); 
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup );
   }
-  else // if not, then it should at least copy the values from aux to helpFunc.
+  
+  if( neighbours[2] != -1 ) //NORTH
   {
-    if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1]
-            && k < mesh.getDimensions().z() - vUpper[2])
-      helpFunc[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ] =
-              aux[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ];
+    requestsInformation[neighCount++] = 
+            MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] =
+            MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup );
   }
-}  
+  
+  if( neighbours[5] != -1 ) //SOUTH
+  {
+    requestsInformation[neighCount++] = 
+            MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup );
+  }
+  
+  if( neighbours[8] != -1 ) // TOP 
+  {
+    requestsInformation[neighCount++] = 
+            MPI::ISend( &calculatedBefore, 1, neighbours[8], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[4], 1, neighbours[8], 0, MPI::AllGroup );
+  }
+  
+  if( neighbours[17] != -1 ) //BOTTOM
+  {
+    requestsInformation[neighCount++] =
+            MPI::ISend( &calculatedBefore, 1, neighbours[17], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[5], 1, neighbours[17], 0, MPI::AllGroup );
+  }
+  
+  MPI::WaitAll( requestsInformation, neighCount );
+  
+  MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR,  MPI::AllGroup );
+  calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] ||
+                      calculateFromNeighbours[2] || calculateFromNeighbours[3] ||
+                      calculateFromNeighbours[4] || calculateFromNeighbours[5];
+}
 #endif
-- 
GitLab


From b42fa59a58ec27b6266d772620df32926c0117db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matou=C5=A1=20Fencl?= <fenclmat@fjfi.cvut.cz>
Date: Sat, 16 Mar 2019 11:10:46 +0100
Subject: [PATCH 10/14] deleting tnlDirectEikonalMethodsBase_impl.h

---
 .../tnlDirectEikonalMethodsBase_impl.h        | 1591 -----------------
 1 file changed, 1591 deletions(-)
 delete mode 100644 src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
deleted file mode 100644
index a5d3d81df..000000000
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ /dev/null
@@ -1,1591 +0,0 @@
-/* 
- * File:   tnlDirectEikonalMethodsBase_impl.h
- * Author: oberhuber
- *
- * Created on July 14, 2016, 3:22 PM
- */
-
-#pragma once
-
-#include <limits>
-
-#include <iostream>
-#include "tnlFastSweepingMethod.h"
-#include "tnlDirectEikonalMethodsBase.h"
-
-template< typename Real,
-        typename Device,
-        typename Index >
-void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
-initInterface( const MeshFunctionPointer& _input,
-        MeshFunctionPointer& _output,
-        InterfaceMapPointer& _interfaceMap  )
-{
-  if( std::is_same< Device, Devices::Cuda >::value )
-  {
-#ifdef HAVE_CUDA
-    const MeshType& mesh = _input->getMesh();
-    
-    const int cudaBlockSize( 16 );
-    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
-    dim3 blockSize( cudaBlockSize );
-    dim3 gridSize( numBlocksX );
-    Devices::Cuda::synchronizeDevice();
-    CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
-            _output.template modifyData< Device >(),
-            _interfaceMap.template modifyData< Device >() );
-    cudaDeviceSynchronize();
-    TNL_CHECK_CUDA_DEVICE;
-#endif
-  }
-  if( std::is_same< Device, Devices::Host >::value )
-  {
-    const MeshType& mesh = _input->getMesh();
-    typedef typename MeshType::Cell Cell;
-    const MeshFunctionType& input = _input.getData();
-    MeshFunctionType& output = _output.modifyData();
-    InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
-    Cell cell( mesh );
-    for( cell.getCoordinates().x() = 0;
-            cell.getCoordinates().x() < mesh.getDimensions().x();
-            cell.getCoordinates().x() ++ )
-    {
-      cell.refresh();
-      output[ cell.getIndex() ] =
-              input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
-                -std::numeric_limits< RealType >::max();
-      interfaceMap[ cell.getIndex() ] = false;
-    }
-    
-    
-    const RealType& h = mesh.getSpaceSteps().x();
-    for( cell.getCoordinates().x() = 0;
-            cell.getCoordinates().x() < mesh.getDimensions().x() - 1;
-            cell.getCoordinates().x() ++ )
-    {
-      cell.refresh();
-      const RealType& c = input( cell );      
-      if( ! cell.isBoundaryEntity()  )
-      {
-        const auto& neighbors = cell.getNeighborEntities();
-        Real pom = 0;
-        //const IndexType& c = cell.getIndex();
-        const IndexType e = neighbors.template getEntityIndex<  1 >();
-        if( c * input[ e ] <= 0 )
-        {
-          pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
-          if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
-            output[ cell.getIndex() ] = pom;
-          
-          pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-          if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
-            output[ e ] = pom; 
-          
-          interfaceMap[ cell.getIndex() ] = true;
-          interfaceMap[ e ] = true;
-        }
-      }
-    }
-  }
-}
-
-template< typename Real,
-        typename Device,
-        typename Index >
-template< int sizeSArray >
-void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-updateBlocks( InterfaceMapType interfaceMap,
-        MeshFunctionType aux,
-        MeshFunctionType helpFunc,
-        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
-{
-#pragma omp parallel for schedule( dynamic )
-  for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
-  {
-    if( BlockIterHost[ i ] )
-    {
-      MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
-      
-      int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
-      //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl;
-      int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
-      int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
-      //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl;
-      int xkolik = numThreadsPerBlock + 1;
-      int ykolik = numThreadsPerBlock + 1;
-      
-      int blIdx = i%numOfBlockx;
-      int blIdy = i/numOfBlockx;
-      //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl;
-      
-      if( numOfBlockx - 1 == blIdx )
-        xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
-      
-      if( numOfBlocky -1 == blIdy )
-        ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
-      //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl;
-      
-      
-      /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
-       changed[ 0 ] = 1;*/
-      Real hx = mesh.getSpaceSteps().x();
-      Real hy = mesh.getSpaceSteps().y();
-      
-      bool changed = false;
-      BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
-      
-      
-      Real *sArray;
-      sArray = new Real[ sizeSArray * sizeSArray ];
-      if( sArray == nullptr )
-        std::cout << "Error while allocating memory for sArray." << std::endl;
-      
-      for( IndexType thri = 0; thri < sizeSArray; thri++ ){
-        for( IndexType thrj = 0; thrj < sizeSArray; thrj++ )
-          sArray[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
-      }
-      
-      
-      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
-      for( IndexType thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
-      {        
-        if( dimX > (blIdx+1) * numThreadsPerBlock  && thrj+1 < ykolik )
-          sArray[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
-        
-        
-        if( blIdx != 0 && thrj+1 < ykolik )
-          sArray[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
-        
-        if( dimY > (blIdy+1) * numThreadsPerBlock  && thrj+1 < xkolik )
-          sArray[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
-        
-        if( blIdy != 0 && thrj+1 < xkolik )
-          sArray[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
-      }
-      
-      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
-        for( IndexType l = 0; l < numThreadsPerBlock; l++ )
-          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
-            sArray[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
-      }
-      
-      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
-        for( IndexType l = 0; l < numThreadsPerBlock; l++ ){
-          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){
-            //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
-            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
-            {
-              changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy) || changed;
-              
-            }
-          }
-        }
-      }
-      /*aux.save( "aux-1pruch.tnl" );
-       for( int k = 0; k < sizeSArray; k++ ){ 
-       for( int l = 0; l < sizeSArray; l++ ) {
-       std::cout << sArray[ k * sizeSArray + l] << " ";
-       }
-       std::cout << std::endl;
-       }*/
-      
-      for( IndexType k = 0; k < numThreadsPerBlock; k++ ) 
-        for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { 
-          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
-          {
-            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
-            {
-              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
-            }
-          }
-        }
-      /*aux.save( "aux-2pruch.tnl" );
-       for( int k = 0; k < sizeSArray; k++ ){ 
-       for( int l = 0; l < sizeSArray; l++ ) {
-       std::cout << sArray[ k * sizeSArray + l] << " ";
-       }
-       std::cout << std::endl;
-       }*/
-      
-      for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ) 
-        for( IndexType l = 0; l < numThreadsPerBlock; l++ ) {
-          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
-          {
-            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
-            {
-              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
-            }
-          }
-        }
-      /*aux.save( "aux-3pruch.tnl" );
-       for( int k = 0; k < sizeSArray; k++ ){ 
-       for( int l = 0; l < sizeSArray; l++ ) {
-       std::cout << sArray[ k * sizeSArray + l] << " ";
-       }
-       std::cout << std::endl;
-       }*/
-      
-      for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
-        for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { 
-          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
-          {
-            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
-            {
-              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx, hy, 1.0);
-            }
-          }
-        }
-      }
-      /*aux.save( "aux-4pruch.tnl" );
-       for( int k = 0; k < sizeSArray; k++ ){ 
-       for( int l = 0; l < sizeSArray; l++ ) {
-       std::cout << sArray[ k * sizeSArray + l] << " ";
-       }
-       std::cout << std::endl;
-       }*/
-      
-      
-      if( changed ){
-        BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1;
-      }
-      
-      
-      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
-        for( IndexType l = 0; l < numThreadsPerBlock; l++ ) {
-          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )      
-            helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray[ (k + 1)* sizeSArray + l + 1 ];
-          //std::cout<< sArray[k+1][l+1];
-        }
-        //std::cout<<std::endl;
-      }
-      delete []sArray;
-    }
-  }
-}
-template< typename Real,
-        typename Device,
-        typename Index >
-template< int sizeSArray >
-void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-updateBlocks( const InterfaceMapType& interfaceMap,
-        const MeshFunctionType& aux,
-        MeshFunctionType& helpFunc,
-        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
-{  
-//#pragma omp parallel for schedule( dynamic )
-  for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
-  {
-    if( BlockIterHost[ i ] )
-    {
-      MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
-      
-      int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
-      int dimZ = mesh.getDimensions().z();
-      //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl;
-      int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
-      int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
-      int numOfBlockz = dimZ/numThreadsPerBlock + ((dimZ%numThreadsPerBlock != 0) ? 1:0);
-      //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl;
-      int xkolik = numThreadsPerBlock + 1;
-      int ykolik = numThreadsPerBlock + 1;
-      int zkolik = numThreadsPerBlock + 1;
-      
-      
-      int blIdz = i/( numOfBlockx * numOfBlocky );
-      int blIdy = (i-blIdz*numOfBlockx * numOfBlocky )/(numOfBlockx );
-      int blIdx = (i-blIdz*numOfBlockx * numOfBlocky )%( numOfBlockx );
-      //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl;
-      
-      if( numOfBlockx - 1 == blIdx )
-        xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
-      if( numOfBlocky -1 == blIdy )
-        ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
-      if( numOfBlockz-1 == blIdz )
-        zkolik = dimZ - (blIdz)*numThreadsPerBlock+1;
-      //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl;
-      
-      
-      /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
-       changed[ 0 ] = 1;*/
-      Real hx = mesh.getSpaceSteps().x();
-      Real hy = mesh.getSpaceSteps().y();
-      Real hz = mesh.getSpaceSteps().z();
-      
-      bool changed = false;
-      BlockIterHost[ i ] = 0;
-      
-      
-      Real *sArray;
-      sArray = new Real[ sizeSArray * sizeSArray * sizeSArray ];
-      if( sArray == nullptr )
-        std::cout << "Error while allocating memory for sArray." << std::endl;
-      
-      for( IndexType k = 0; k < sizeSArray; k++ )
-        for( IndexType l = 0; l < sizeSArray; l++ )
-          for( IndexType m = 0; m < sizeSArray; m++ ){
-            sArray[ m * sizeSArray * sizeSArray + k * sizeSArray + l ] = std::numeric_limits< Real >::max();
-          }
-      
-      
-      for( IndexType thrk = 0; thrk < numThreadsPerBlock; thrk++ )
-        for( IndexType thrj = 0; thrj < numThreadsPerBlock; thrj++ )
-        {
-          if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
-            sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = 
-                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX -1 + thrk*dimX*dimY ];
-          
-          if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < zkolik )
-            sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = 
-                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy *numThreadsPerBlock*dimX+ blIdx*numThreadsPerBlock + numThreadsPerBlock + thrj * dimX + thrk*dimX*dimY ];
-          
-          if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
-            sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = 
-                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX + thrj + thrk*dimX*dimY ];
-          
-          if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik && thrk+1 < zkolik )
-            sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = 
-                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + (blIdy+1) * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj + thrk*dimX*dimY ];
-          
-          if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
-            sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = 
-                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX * dimY + thrj * dimX + thrk ];
-          
-          if( dimZ > (blIdz+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < xkolik )
-            sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = 
-                    aux[ (blIdz+1)*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX + thrk ];
-        }
-      
-      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
-        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
-          for( IndexType l = 0; l < numThreadsPerBlock; l++ ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-              sArray[(m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1] = 
-                      aux[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ];
-          }
-        }
-      }
-      /*string s;
-      int numWhile = 0;
-      for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
-        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
-          for( IndexType l = 0; l < numThreadsPerBlock; l++ ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){
-              //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
-              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                //printf("In with point m  = %d, k = %d, l = %d\n", m, k, l);
-                changed = this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz) || changed;
-                
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
-        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
-          for( IndexType l = 0; l <numThreadsPerBlock; l++ ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-            {
-              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
-        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
-          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-            {
-              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );
-      */
-      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
-        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
-          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-            {
-              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                this->template updateCell3D< sizeSArray >(  sArray, l+1, k+1, m+1, hx,hy,hz);
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
-        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
-          for( IndexType l = 0; l <numThreadsPerBlock; l++ ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-            {
-              if( ! interfaceMap[blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
-        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
-          for( IndexType l = 0; l <numThreadsPerBlock; l++ ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-            {
-              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
-        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
-          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-            {
-              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      
-      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
-        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
-          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-            {
-              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      if( changed ){
-        BlockIterHost[ i ] = 1;
-      }
-      
-      
-      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
-        for( IndexType l = 0; l < numThreadsPerBlock; l++ ) {
-          for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){      
-              helpFunc[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] = 
-                      sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-              //std::cout << helpFunc[ m*dimX*dimY + k*dimX + l ] << " ";
-            }
-          }
-          //std::cout << std::endl;
-        }
-        //std::cout << std::endl;
-      }
-      //helpFunc.save( "helpF.tnl");
-      delete []sArray;
-    }
-  }
-}
-template< typename Real,
-        typename Device,
-        typename Index >
-void 
-tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
-{
-  int* BlockIterPom; 
-  BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ];
-  
-  for( int i = 0; i< BlockIterHost.getSize(); i++)
-  {
-    BlockIterPom[ i ] = 0;
-    
-    int m=0, l=0, k=0;
-    l = i/( numBlockX * numBlockY );
-    k = (i-l*numBlockX * numBlockY )/(numBlockX );
-    m = (i-l*numBlockX * numBlockY )%( numBlockX );
-    
-    if( m > 0 && BlockIterHost[ i - 1 ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( l > 0 && BlockIterHost[ i - numBlockX*numBlockY ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( l < numBlockZ-1 && BlockIterHost[ i + numBlockX*numBlockY ] ){
-      BlockIterPom[ i ] = 1;
-    }
-  }
-  for( int i = 0; i< BlockIterHost.getSize(); i++)
-  { 
-    BlockIterHost[ i ] = BlockIterPom[ i ];
-  }
-}
-
-
-template< typename Real,
-        typename Device,
-        typename Index >
-void 
-tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY )
-{
-  int* BlockIterPom; 
-  BlockIterPom = new int [numBlockX * numBlockY];
-  
-  for(int i = 0; i < numBlockX * numBlockY; i++)
-  {
-    BlockIterPom[ i ] = 0;//BlockIterPom[ i ] = 0;
-    int m=0, k=0;
-    m = i%numBlockX;
-    k = i/numBlockX;
-    if( m > 0 && BlockIterHost[ i - 1 ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){
-      BlockIterPom[ i ] = 1;
-    }
-    //BlockIterPom[ i ];
-  }
-  
-  for(int i = 0; i < numBlockX * numBlockY; i++)
-  {
-    if( !BlockIterHost[ i ] )
-      BlockIterHost[ i ] = BlockIterPom[ i ];
-  }
-  /*else
-   BlockIter[ i ] = 0;*/
-  /*for( int i = numBlockX-1; i > -1; i-- )
-   {
-   for( int j = 0; j< numBlockY; j++ )
-   std::cout << BlockIterHost[ i*numBlockY + j ];
-   std::cout << std::endl;
-   }
-   std::cout << std::endl;*/
-  delete[] BlockIterPom;
-}
-
-template< typename Real,
-        typename Device,
-        typename Index >
-template< typename MeshEntity >
-void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
-updateCell( MeshFunctionType& u,
-        const MeshEntity& cell, 
-        const RealType v )
-{
-  const auto& neighborEntities = cell.template getNeighborEntities< 1 >();
-  const MeshType& mesh = cell.getMesh();
-  const RealType& h = mesh.getSpaceSteps().x();
-  const RealType value = u( cell );
-  RealType a, tmp = std::numeric_limits< RealType >::max();
-  
-  if( cell.getCoordinates().x() == 0 )
-    a = u[ neighborEntities.template getEntityIndex< 1 >() ];
-  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
-    a = u[ neighborEntities.template getEntityIndex< -1 >() ];
-  else
-  {
-    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ],
-            u[ neighborEntities.template getEntityIndex<  1 >() ] );
-  }
-  
-  if( fabs( a ) == std::numeric_limits< RealType >::max() )
-    return;
-  
-  tmp = a + TNL::sign( value ) * h/v;
-  
-  u[ cell.getIndex() ] = argAbsMin( value, tmp );
-}
-
-template< typename Real,
-        typename Device,
-        typename Index >
-void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-initInterface( const MeshFunctionPointer& _input,
-        MeshFunctionPointer& _output,
-        InterfaceMapPointer& _interfaceMap, 
-        StaticVector vLower, StaticVector vUpper )
-{
-  
-  if( std::is_same< Device, Devices::Cuda >::value )
-  {
-#ifdef HAVE_CUDA
-    const MeshType& mesh = _input->getMesh();
-    
-    const int cudaBlockSize( 16 );
-    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
-    int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
-    dim3 blockSize( cudaBlockSize, cudaBlockSize );
-    dim3 gridSize( numBlocksX, numBlocksY );
-    Devices::Cuda::synchronizeDevice();
-    CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
-            _output.template modifyData< Device >(),
-            _interfaceMap.template modifyData< Device >(),
-            vLower, vUpper);
-    cudaDeviceSynchronize();
-    TNL_CHECK_CUDA_DEVICE;
-#endif
-  }
-  if( std::is_same< Device, Devices::Host >::value )
-  {
-    MeshFunctionType input = _input.getData();    
-    MeshFunctionType& output = _output.modifyData();
-    InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
-    const MeshType& mesh = input.getMesh();
-/*#ifdef HAVE_MPI
-    int i>s::>::GetRan>s::>::AllGroup );
-    if( i == 0 )
-    {
-      printf( "0: mesh x: %d\n", mesh.getDimensions().x() );
-      printf( "0: mesh y: %d\n", mesh.getDimensions().y() );
-      for( int k = 0; k < mesh.getDimensions().y(); k++ ){
-        for( int l = 0; l < mesh.getDimensions().x(); l++ )
-          printf( "%.2f\t", input[ k * 16 + l ] );
-        printf("\n");
-      }
-    }
-#endif*/
-    typedef typename MeshType::Cell Cell;
-    Cell cell( mesh );
-    for( cell.getCoordinates().y() = 0;
-            cell.getCoordinates().y() < mesh.getDimensions().y();
-            cell.getCoordinates().y() ++ )
-      for( cell.getCoordinates().x() = 0;
-              cell.getCoordinates().x() < mesh.getDimensions().x();
-              cell.getCoordinates().x() ++ )
-      {
-        cell.refresh();
-        output[ cell.getIndex() ] =
-                input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
-                  - std::numeric_limits< RealType >::max();
-        interfaceMap[ cell.getIndex() ] = false;
-      }
-    
-    const RealType& hx = mesh.getSpaceSteps().x();
-    const RealType& hy = mesh.getSpaceSteps().y();     
-    for( cell.getCoordinates().y() = 0 + vLower[1];
-            cell.getCoordinates().y() < mesh.getDimensions().y() - vUpper[1];
-            cell.getCoordinates().y() ++ )
-      for( cell.getCoordinates().x() = 0 + vLower[0];
-              cell.getCoordinates().x() < mesh.getDimensions().x() - vUpper[0];
-              cell.getCoordinates().x() ++ )
-      {
-        cell.refresh();
-        const RealType& c = input( cell );
-        if( ! cell.isBoundaryEntity()  )
-        {
-          auto neighbors = cell.getNeighborEntities();
-          Real pom = 0;
-          const IndexType e = neighbors.template getEntityIndex<  1,  0 >();
-          const IndexType n = neighbors.template getEntityIndex<  0,  1 >();
-          //Try init with exact data:
-          /*if( c * input[ n ] <= 0 )
-           {
-           output[ cell.getIndex() ] = c;
-           output[ n ] = input[ n ];
-           interfaceMap[ cell.getIndex() ] = true;
-           interfaceMap[ n ] = true;
-           }   
-           if( c * input[ e ] <= 0 )
-           {   
-           output[ cell.getIndex() ] = c;
-           output[ e ] = input[ e ];
-           interfaceMap[ cell.getIndex() ] = true;
-           interfaceMap[ e ] = true;
-           }*/
-          if( c * input[ n ] <= 0 )
-          {
-            /*if( c >= 0 )
-             {*/
-            pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-            if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
-              output[ cell.getIndex() ] = pom;
-            pom = pom - TNL::sign( c )*hy;
-            if( TNL::abs( output[ n ] ) > TNL::abs( pom ) )
-              output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy;
-            /*}else
-             {
-             pom = - ( hy * c )/( c - input[ n ]);
-             if( output[ cell.getIndex() ] < pom )
-             output[ cell.getIndex() ] = pom;
-             if( output[ n ] > hy + pom )
-             output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
-             }*/
-            interfaceMap[ cell.getIndex() ] = true;
-            interfaceMap[ n ] = true;
-          }
-          if( c * input[ e ] <= 0 )
-          {
-            /*if( c >= 0 )
-             {*/
-            pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-            if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
-              output[ cell.getIndex() ] = pom;
-            
-            pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-            if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
-              output[ e ] = pom; 
-            /*}else
-             {
-             pom = - (hx * c)/( c - input[ e ]);
-             if( output[ cell.getIndex() ] < pom )
-             output[ cell.getIndex() ] = pom;
-             
-             pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
-             if( output[ e ] > pom )
-             output[ e ] = pom;
-             }*/
-            interfaceMap[ cell.getIndex() ] = true;
-            interfaceMap[ e ] = true;
-          }
-        }
-      }
-#ifdef HAVE_MPI
-    //int i>s::>::GetRan>s::>::AllGroup );
-    /*if( i == 0 )
-    {
-      printf( "0: mesh x: %d\n", mesh.getDimensions().x() );
-      printf( "0: mesh y: %d\n", mesh.getDimensions().y() );
-      for( int k = 0; k < mesh.getDimensions().y(); k++ ){
-        for( int l = 0; l < mesh.getDimensions().x(); l++ )
-          printf("%.2f\t",output[ k * 16 + l ] );
-        printf("\n");
-      }
-    }*/
-#endif
-  }
-}
-
-template< typename Real,
-        typename Device,
-        typename Index >
-template< typename MeshEntity >
-__cuda_callable__
-bool
-tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-updateCell( MeshFunctionType& u,
-        const MeshEntity& cell,   
-        const RealType v)
-{
-  const auto& neighborEntities = cell.template getNeighborEntities< 2 >();
-  const MeshType& mesh = cell.getMesh();
-  const RealType& hx = mesh.getSpaceSteps().x();
-  const RealType& hy = mesh.getSpaceSteps().y();
-  const RealType value = u( cell );
-  RealType a, b, tmp = std::numeric_limits< RealType >::max();
-  
-  if( cell.getCoordinates().x() == 0 )
-    a = u[ neighborEntities.template getEntityIndex< 1,  0 >() ];
-  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
-    a = u[ neighborEntities.template getEntityIndex< -1,  0 >() ];
-  else
-  {
-    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1,  0 >() ],
-            u[ neighborEntities.template getEntityIndex<  1,  0 >() ] );
-  }
-  
-  if( cell.getCoordinates().y() == 0 )
-    b = u[ neighborEntities.template getEntityIndex< 0,  1 >()];
-  else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
-    b = u[ neighborEntities.template getEntityIndex< 0,  -1 >() ];
-  else
-  {
-    b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0,  -1 >() ],
-            u[ neighborEntities.template getEntityIndex< 0,   1 >() ] );
-  }
-  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-          fabs( b ) == std::numeric_limits< RealType >::max() )
-    return false;
-  
-  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
-  sortMinims( pom );
-  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
-  
-  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-  {
-    u[ cell.getIndex() ] = argAbsMin( value, tmp );
-    tmp = value - u[ cell.getIndex() ];
-    if ( fabs( tmp ) >  0.001*hx ){
-      //printf( "Vracime true!\n");
-      return true;
-    }else{
-      //printf( "Vracime false2!\n");
-      return false;
-    }
-  }
-  else {
-    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-    u[ cell.getIndex() ] = argAbsMin( value, tmp );
-    tmp = value - u[ cell.getIndex() ];
-    if ( fabs( tmp ) > 0.001*hx ){
-      //printf( "Vracime true3!\n");
-      return true;
-    }else{
-      //printf( "Vracime false!\n");
-      return false;
-    }
-  }
-}
-
-template< typename Real,
-        typename Device,
-        typename Index >
-void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-initInterface( const MeshFunctionPointer& _input,
-        MeshFunctionPointer& _output,
-        InterfaceMapPointer& _interfaceMap, 
-        StaticVector vLower, StaticVector vUpper )
-{
-  if( std::is_same< Device, Devices::Cuda >::value )
-  {
-#ifdef HAVE_CUDA
-    const MeshType& mesh = _input->getMesh();
-   
-    const int cudaBlockSize( 8 );
-    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
-    int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
-    int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize );
-    if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
-      std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
-    dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
-    dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
-    Devices::Cuda::synchronizeDevice();
-    CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
-            _output.template modifyData< Device >(),
-            _interfaceMap.template modifyData< Device >(), vLower, vUpper );
-    cudaDeviceSynchronize();
-    TNL_CHECK_CUDA_DEVICE;
-#endif
-  }
-  if( std::is_same< Device, Devices::Host >::value )
-  {
-    const MeshFunctionType& input =  _input.getData();
-    MeshFunctionType& output =  _output.modifyData();
-    InterfaceMapType& interfaceMap =  _interfaceMap.modifyData();
-    
-    const MeshType& mesh = input.getMesh();
-    typedef typename MeshType::Cell Cell;
-    
-    Cell cell( mesh );
-    for( cell.getCoordinates().z() = 0;
-            cell.getCoordinates().z() < mesh.getDimensions().z();
-            cell.getCoordinates().z() ++ )
-      for( cell.getCoordinates().y() = 0;
-              cell.getCoordinates().y() < mesh.getDimensions().y();
-              cell.getCoordinates().y() ++ )
-        for( cell.getCoordinates().x() = 0;
-                cell.getCoordinates().x() < mesh.getDimensions().x();
-                cell.getCoordinates().x() ++ )
-        {
-          cell.refresh();
-          output[ cell.getIndex() ] =
-                  input( cell ) > 0 ? std::numeric_limits< RealType >::max() :
-                    - std::numeric_limits< RealType >::max();
-          interfaceMap[ cell.getIndex() ] = false;
-        }
-    
-    const RealType& hx = mesh.getSpaceSteps().x();
-    const RealType& hy = mesh.getSpaceSteps().y();
-    const RealType& hz = mesh.getSpaceSteps().z();
-    for( cell.getCoordinates().z() = 0 + vLower[2];
-            cell.getCoordinates().z() < mesh.getDimensions().z() - vUpper[2];
-            cell.getCoordinates().z() ++ )   
-      for( cell.getCoordinates().y() = 0 + vLower[1];
-              cell.getCoordinates().y() < mesh.getDimensions().y() - vUpper[1];
-              cell.getCoordinates().y() ++ )
-        for( cell.getCoordinates().x() = 0 + vLower[0];
-                cell.getCoordinates().x() < mesh.getDimensions().x() - vUpper[0];
-                cell.getCoordinates().x() ++ )
-        {
-          cell.refresh();
-          const RealType& c = input( cell );
-          if( ! cell.isBoundaryEntity() )
-          {
-            auto neighbors = cell.getNeighborEntities();
-            Real pom = 0;
-            const IndexType e = neighbors.template getEntityIndex<  1,  0,  0 >();
-            const IndexType n = neighbors.template getEntityIndex<  0,  1,  0 >();
-            const IndexType t = neighbors.template getEntityIndex<  0,  0,  1 >();
-            
-            
-            if( c * input[ n ] <= 0 )
-            {
-              pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-              if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
-                output[ cell.getIndex() ] = pom;
-              pom = pom - TNL::sign( c )*hy;
-              if( TNL::abs( output[ n ] ) > TNL::abs( pom ) )
-                output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy;
-              
-              interfaceMap[ cell.getIndex() ] = true;
-              interfaceMap[ n ] = true;
-            }
-            
-            if( c * input[ e ] <= 0 )
-            {
-              pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-              if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
-                output[ cell.getIndex() ] = pom;
-              pom = pom - TNL::sign( c )*hx;
-              if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
-                output[ e ] = pom; //( hy * c )/( c - input[ n ]) - hy;
-              
-              interfaceMap[ cell.getIndex() ] = true;
-              interfaceMap[ e ] = true;
-            }
-            
-            if( c * input[ t ] <= 0 )
-            {
-              pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
-              if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
-                output[ cell.getIndex() ] = pom;
-              pom = pom - TNL::sign( c )*hz;
-              if( TNL::abs( output[ t ] ) > TNL::abs( pom ) )
-                output[ t ] = pom; //( hy * c )/( c - input[ n ]) - hy;
-              
-              interfaceMap[ cell.getIndex() ] = true;
-              interfaceMap[ t ] = true;
-            }  
-          }
-        }
-  }
-}
-
-template< typename Real,
-        typename Device,
-        typename Index >
-template< typename MeshEntity >
-__cuda_callable__
-bool
-tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-updateCell( MeshFunctionType& u,
-        const MeshEntity& cell, 
-        const RealType v )
-{
-  const auto& neighborEntities = cell.template getNeighborEntities< 3 >();
-  const MeshType& mesh = cell.getMesh();
-  
-  const RealType& hx = mesh.getSpaceSteps().x();
-  const RealType& hy = mesh.getSpaceSteps().y();
-  const RealType& hz = mesh.getSpaceSteps().z();
-  const RealType value = u( cell );
-  //std::cout << value << std::endl;
-  RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
-  
-  
-  if( cell.getCoordinates().x() == 0 )
-    a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ];
-  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
-    a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ];
-  else
-  {
-    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ],
-            u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] );
-  }
-  
-  if( cell.getCoordinates().y() == 0 )
-    b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ];
-  else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
-    b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ];
-  else
-  {
-    b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ],
-            u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] );
-  }
-  
-  if( cell.getCoordinates().z() == 0 )
-    c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ];
-  else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 )
-    c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ];
-  else
-  {
-    c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ],
-            u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] );
-  }
-  
-  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-          fabs( b ) == std::numeric_limits< RealType >::max() &&
-          fabs( c ) == std::numeric_limits< RealType >::max() )
-    return false;
-  
-  RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
-  sortMinims( pom );   
-  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
-  
-  if( fabs( tmp ) < fabs( pom[ 1 ] ) )
-  {
-    u[ cell.getIndex() ] = argAbsMin( value, tmp );
-    tmp = value - u[ cell.getIndex() ];
-    if ( fabs( tmp ) > 0.001*hx ){
-      return true;
-    }else{
-      return false;
-    }
-  }
-  else
-  {
-    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
-    {
-      u[ cell.getIndex() ] = argAbsMin( value, tmp );
-      tmp = value - u[ cell.getIndex() ];
-      if ( fabs( tmp ) > 0.001*hx ){
-        return true;
-      }else{
-        return false;
-      }
-    }
-    else
-    {
-      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
-              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
-              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
-              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
-      u[ cell.getIndex() ] = argAbsMin( value, tmp );
-      tmp = value - u[ cell.getIndex() ];
-      if ( fabs( tmp ) > 0.001*hx ){
-        return true;
-      }else{
-        return false;
-      }
-    }
-  }
-}
-
-template < typename T1 >
-__cuda_callable__ void sortMinims( T1 pom[] )
-{
-  T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; 
-  if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){
-    tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2];
-    tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5];
-    
-  }
-  else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){
-    tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1];
-    tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4];
-  }
-  else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){
-    tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2];
-    tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5];
-  }
-  else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){
-    tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0];
-    tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3];
-  }
-  else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){
-    tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1];
-    tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4];
-  }
-  else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){
-    tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0];
-    tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3];
-  }
-  
-  for( unsigned int i = 0; i < 6; i++ )
-  {
-    pom[ i ] = tmp[ i ];
-  }   
-}
-
-template< typename Real,
-        typename Device,
-        typename Index >
-template< int sizeSArray >
-__cuda_callable__
-bool
-tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy,
-        const Real v )
-{
-  const RealType value = sArray[ thrj * sizeSArray + thri ];
-  RealType a, b, tmp = std::numeric_limits< RealType >::max();
-  
-  b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ],
-          sArray[ (thrj-1) * sizeSArray + thri ] );
-  
-  a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ],
-          sArray[ thrj * sizeSArray + thri-1 ] );
-  
-  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-          fabs( b ) == std::numeric_limits< RealType >::max() )
-    return false;
-  
-  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
-  sortMinims( pom );
-  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
-  
-  
-  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-  {
-    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
-    tmp = value - sArray[ thrj * sizeSArray + thri ];
-    if ( fabs( tmp ) >  0.001*hx )
-      return true;
-    else
-      return false;
-  }
-  else
-  {
-    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
-    tmp = value - sArray[ thrj * sizeSArray + thri ];
-    if ( fabs( tmp ) > 0.001*hx )
-      return true;
-    else
-      return false;
-  }
-  
-  return false;
-}
-template< typename Real,
-        typename Device,
-        typename Index >
-template< int sizeSArray >
-__cuda_callable__ 
-bool 
-tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-updateCell3D( volatile Real *sArray, int thri, int thrj, int thrk,
-        const Real hx, const Real hy, const Real hz, const Real v )
-{
-  const RealType value = sArray[thrk *sizeSArray * sizeSArray + thrj * sizeSArray + thri];
-  
-  RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
-  
-  c = TNL::argAbsMin( sArray[ (thrk+1)* sizeSArray*sizeSArray + thrj * sizeSArray + thri ],
-          sArray[ (thrk-1) * sizeSArray *sizeSArray + thrj* sizeSArray + thri ] );
-  
-  b = TNL::argAbsMin( sArray[ thrk* sizeSArray*sizeSArray + (thrj+1) * sizeSArray + thri ],
-          sArray[ thrk* sizeSArray * sizeSArray + (thrj-1)* sizeSArray +thri ] );
-  
-  a = TNL::argAbsMin( sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri+1 ],
-          sArray[ thrk* sizeSArray * sizeSArray + thrj* sizeSArray +thri-1 ] );
-  
-  /*if( thrk == 8 )
-    printf("Calculating a = %f, b = %f, c = %f\n" , a, b, c );*/
-  
-  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-          fabs( b ) == std::numeric_limits< RealType >::max() &&
-          fabs( c ) == std::numeric_limits< RealType >::max() )
-    return false;
-  
-  RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
-  
-  sortMinims( pom );
-  
-  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
-  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-  {
-    sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
-    tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
-    if ( fabs( tmp ) >  0.001*hx )
-      return true;
-    else
-      return false;
-  }
-  else
-  {
-    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
-    {
-      sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
-      tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
-      if ( fabs( tmp ) > 0.001*hx )
-        return true;
-      else
-        return false;
-    }
-    else
-    {
-      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
-              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
-              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
-              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
-      sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
-      tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
-      if ( fabs( tmp ) > 0.001*hx )
-        return true;
-      else
-        return false;
-    }
-  }
-  
-  return false;
-}
-
-#ifdef HAVE_CUDA
-template < typename Real, typename Device, typename Index >
-__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
-        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
-        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap )
-{
-  int i = threadIdx.x + blockDim.x*blockIdx.x;
-  const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
-  
-  if( i < mesh.getDimensions().x()  )
-  {
-    typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell;
-    Cell cell( mesh );
-    cell.getCoordinates().x() = i;
-    cell.refresh();
-    const Index cind = cell.getIndex();
-    
-    
-    output[ cind ] =
-            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
-              - std::numeric_limits< Real >::max();
-    interfaceMap[ cind ] = false; 
-    
-    const Real& h = mesh.getSpaceSteps().x();
-    cell.refresh();
-    const Real& c = input( cell );
-    if( ! cell.isBoundaryEntity()  )
-    {
-      auto neighbors = cell.getNeighborEntities();
-      Real pom = 0;
-      const Index e = neighbors.template getEntityIndex< 1 >();
-      const Index w = neighbors.template getEntityIndex< -1 >();
-      if( c * input[ e ] <= 0 )
-      {
-        pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
-          output[ cind ] = pom;                       
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ w ] <= 0 )
-      {
-        pom = TNL::sign( c )*( h * c )/( c - input[ w ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cind ] = true;
-      }
-    }
-  }
-  
-}
-template < typename Real, typename Device, typename Index >
-__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-        Containers::StaticVector< 2, Index > vLower, Containers::StaticVector< 2, Index > vUpper ) 
-{
-  int i = threadIdx.x + blockDim.x*blockIdx.x;
-  int j = blockDim.y*blockIdx.y + threadIdx.y;
-  const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
-  
-  if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
-  {
-    typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
-    Cell cell( mesh );
-    cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
-    cell.refresh();
-    const Index cind = cell.getIndex();
-    
-    
-    output[ cind ] =
-            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
-              - std::numeric_limits< Real >::max();
-    interfaceMap[ cind ] = false; 
-    
-    if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] && i>vLower[0] -1 && j> vLower[0]-1 )
-    {
-      const Real& hx = mesh.getSpaceSteps().x();
-      const Real& hy = mesh.getSpaceSteps().y();
-      cell.refresh();
-      const Real& c = input( cell );
-      if( ! cell.isBoundaryEntity()  )
-      {
-        auto neighbors = cell.getNeighborEntities();
-        Real pom = 0;
-        const Index e = neighbors.template getEntityIndex<  1,  0 >();
-        const Index w = neighbors.template getEntityIndex<  -1,  0 >();
-        const Index n = neighbors.template getEntityIndex<  0,  1 >();
-        const Index s = neighbors.template getEntityIndex<  0,  -1 >();
-        
-        if( c * input[ n ] <= 0 )
-        {
-          pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-            output[ cind ] = pom;
-          
-          interfaceMap[ cell.getIndex() ] = true;
-        }
-        if( c * input[ e ] <= 0 )
-        {
-          pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
-            output[ cind ] = pom;                       
-          
-          interfaceMap[ cind ] = true;
-        }
-        if( c * input[ w ] <= 0 )
-        {
-          pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
-          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-            output[ cind ] = pom;
-          
-          interfaceMap[ cind ] = true;
-        }
-        if( c * input[ s ] <= 0 )
-        {
-          pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
-          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-            output[ cind ] = pom;
-          
-          interfaceMap[ cind ] = true;
-        }
-      }
-    }
-  }
-}
-
-template < typename Real, typename Device, typename Index >
-__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
-        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
-        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-        Containers::StaticVector< 3, Index > vLower, Containers::StaticVector< 3, Index > vUpper )
-{
-  int i = threadIdx.x + blockDim.x*blockIdx.x;
-  int j = blockDim.y*blockIdx.y + threadIdx.y;
-  int k = blockDim.z*blockIdx.z + threadIdx.z;
-  const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
-  
-  if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
-  {
-    typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
-    Cell cell( mesh );
-    cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
-    cell.refresh();
-    const Index cind = cell.getIndex();
-    
-    
-    output[ cind ] =
-            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
-              - std::numeric_limits< Real >::max();
-    interfaceMap[ cind ] = false; 
-    cell.refresh();
-    
-    if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] &&
-            k < mesh.getDimensions().y() - vUpper[2] && i>vLower[0]-1 && j> vLower[1]-1 && k>vLower[2]-1 )
-    {
-      const Real& hx = mesh.getSpaceSteps().x();
-      const Real& hy = mesh.getSpaceSteps().y();
-      const Real& hz = mesh.getSpaceSteps().z();
-      const Real& c = input( cell );
-      if( ! cell.isBoundaryEntity()  )
-      {
-        auto neighbors = cell.getNeighborEntities();
-        Real pom = 0;
-        const Index e = neighbors.template getEntityIndex<  1, 0, 0 >();
-        const Index w = neighbors.template getEntityIndex<  -1, 0, 0 >();
-        const Index n = neighbors.template getEntityIndex<  0, 1, 0 >();
-        const Index s = neighbors.template getEntityIndex<  0, -1, 0 >();
-        const Index t = neighbors.template getEntityIndex<  0, 0, 1 >();
-        const Index b = neighbors.template getEntityIndex<  0, 0, -1 >();
-        
-        if( c * input[ n ] <= 0 )
-        {
-          pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-            output[ cind ] = pom;
-          
-          interfaceMap[ cind ] = true;
-        }
-        if( c * input[ e ] <= 0 )
-        {
-          pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
-            output[ cind ] = pom;                       
-          
-          interfaceMap[ cind ] = true;
-        }
-        if( c * input[ w ] <= 0 )
-        {
-          pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
-          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-            output[ cind ] = pom;
-          
-          interfaceMap[ cind ] = true;
-        }
-        if( c * input[ s ] <= 0 )
-        {
-          pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
-          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-            output[ cind ] = pom;
-          
-          interfaceMap[ cind ] = true;
-        }
-        if( c * input[ b ] <= 0 )
-        {
-          pom = TNL::sign( c )*( hz * c )/( c - input[ b ]);
-          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-            output[ cind ] = pom;
-          
-          interfaceMap[ cind ] = true;
-        }
-        if( c * input[ t ] <= 0 )
-        {
-          pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
-          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-            output[ cind ] = pom;
-          
-          interfaceMap[ cind ] = true;
-        }
-      }
-    }
-  }
-}
-
-
-template< typename Real,
-        typename Device,
-        typename Index >
-__cuda_callable__
-bool
-tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
-updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
-{
-  const RealType value = sArray[ thri ];
-  RealType a, tmp = std::numeric_limits< RealType >::max();
-  
-  a = TNL::argAbsMin( sArray[ thri+1 ],
-          sArray[ thri-1 ] );
-  
-  if( fabs( a ) == std::numeric_limits< RealType >::max() )
-    return false;
-  
-  tmp = a + TNL::sign( value ) * h/v;
-  
-  
-  sArray[ thri ] = argAbsMin( value, tmp );
-  
-  tmp = value - sArray[ thri ];
-  if ( fabs( tmp ) >  0.001*h )
-    return true;
-  else
-    return false;
-}
-#endif
-- 
GitLab


From f2dc45179eed5e340509370dfe9b252817173007 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 11 Apr 2019 14:07:15 +0200
Subject: [PATCH 11/14] Fixed saving with exceptions.

---
 .../Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h   | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
index 105a068d3..c36c4dca9 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
@@ -173,10 +173,7 @@ makeSnapshot(  )
         Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::LocalCopy> ::save(fileName.getFileName(), *u );
    }
    else
-   {
-      if( ! this->u->save( fileName.getFileName() ) )
-         return false;
-   }
+      this->u->save( fileName.getFileName() );
    return true;
 }
 
-- 
GitLab


From 008601adc763ccbbcfdf8db6901a9af11d75b723 Mon Sep 17 00:00:00 2001
From: fencl <fenclmat@fjfi.cvut.cz>
Date: Mon, 23 Sep 2019 22:09:53 +0200
Subject: [PATCH 12/14] Fix 2D GPU neighbours. Version with Chess method and
 OpenMP FSM methods.

---
 .../tnlDirectEikonalMethodBase2D_impl.h       | 10 +--
 .../tnlDirectEikonalMethodBase3D_impl.h       |  6 +-
 .../tnlFastSweepingMethod2D_impl.h            | 15 ++---
 .../tnlFastSweepingMethod3D_impl.h            | 64 +++++++++++++++----
 4 files changed, 62 insertions(+), 33 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
index 583e22478..50ea7bde8 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
@@ -365,13 +365,13 @@ __global__ void GetNeighbours( const TNL::Containers::Array< int, Devices::Cuda,
     m = i%numBlockX;
     k = i/numBlockX;
     if( m > 0 && blockCalculationIndicator[ i - 1 ] ){
-      pom = 1;//BlockIterPom[ i ] = 1;
+      pom = 1;//blockCalculationIndicatorHelp[ i ] = 1;
     }else if( m < numBlockX -1 && blockCalculationIndicator[ i + 1 ] ){
-      pom = 1;//BlockIterPom[ i ] = 1;
-    }else if( k > 0 && blockCalculationIndicatorHelp[ i - numBlockX ] ){
-      pom = 1;// BlockIterPom[ i ] = 1;
+      pom = 1;//blockCalculationIndicatorHelp[ i ] = 1;
+    }else if( k > 0 && blockCalculationIndicator[ i - numBlockX ] ){
+      pom = 1;// blockCalculationIndicatorHelp[ i ] = 1;
     }else if( k < numBlockY -1 && blockCalculationIndicator[ i + numBlockX ] ){
-      pom = 1;//BlockIterPom[ i ] = 1;
+      pom = 1;//blockCalculationIndicatorHelp[ i ] = 1;
     }
     
     if( blockCalculationIndicator[ i ] != 1 )
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
index 91f9a0efe..5b2a4b685 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
@@ -109,7 +109,7 @@ initInterface( const MeshFunctionPointer& _input,
                 output[ cell.getIndex() ] = pom;
               pom = pom - TNL::sign( c )*hx;
               if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
-                output[ e ] = pom; //( hy * c )/( c - input[ n ]) - hy;
+                output[ e ] = pom; 
               
               interfaceMap[ cell.getIndex() ] = true;
               interfaceMap[ e ] = true;
@@ -122,7 +122,7 @@ initInterface( const MeshFunctionPointer& _input,
                 output[ cell.getIndex() ] = pom;
               pom = pom - TNL::sign( c )*hz;
               if( TNL::abs( output[ t ] ) > TNL::abs( pom ) )
-                output[ t ] = pom; //( hy * c )/( c - input[ n ]) - hy;
+                output[ t ] = pom; 
               
               interfaceMap[ cell.getIndex() ] = true;
               interfaceMap[ t ] = true;
@@ -736,7 +736,7 @@ updateBlocks( const InterfaceMapType interfaceMap,
         MeshFunctionType& helpFunc,
         ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
 {  
-  //#pragma omp parallel for schedule( dynamic )
+  #pragma omp parallel for schedule( dynamic )
   for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
   {
     if( BlockIterHost[ i ] )
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 66f9e6cdf..31d3f8b32 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -363,7 +363,7 @@ solve( const MeshPointer& mesh,
            interfaceMapPtr.template getData< Device >(),
            auxPtr.template getData< Device>(),
            helpFunc.template modifyData< Device>(),
-           blockCalculationIndicator,
+           blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps,
            oddEvenBlock );
            cudaDeviceSynchronize();
            TNL_CHECK_CUDA_DEVICE;
@@ -381,15 +381,8 @@ solve( const MeshPointer& mesh,
            
            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
            
-           CudaParallelReduc<<< nBlocks , 1024 >>>( blockCalculationIndicator, dBlock, ( numBlocksX * numBlocksY ) );
-           cudaDeviceSynchronize();
-           TNL_CHECK_CUDA_DEVICE;
-           CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
-           cudaDeviceSynchronize();
-           TNL_CHECK_CUDA_DEVICE;
-           
-           BlockIterD = dBlock.getElement( 0 );*/
-          
+           calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1);
+          */
   /**------------------------------------------------------------------------------------------------*/
           
           
@@ -441,7 +434,7 @@ solve( const MeshPointer& mesh,
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
           
-          // "Parallel reduction" to see if we should calculate again BlockIterD
+          // "Parallel reduction" to see if we should calculate again calculateCudaBlocksAgain
           calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1);
           
           // When we change something then we should caclucate again in the next passage of MPI ( calculated = true )
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 2d73b174e..4895c7693 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -103,8 +103,30 @@ solve( const MeshPointer& mesh,
         calculateMPIAgain = 0;
         
 /** HERE IS FSM FOR OPENMP (NO MPI) - isnt worthy */
-        /*int numThreadsPerBlock = 64;
+        /*int numThreadsPerBlock = -1;
          
+         numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0));
+         //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+         if( numThreadsPerBlock <= 16 )
+         numThreadsPerBlock = 16;
+         else if(numThreadsPerBlock <= 32 )
+         numThreadsPerBlock = 32;
+         else if(numThreadsPerBlock <= 64 )
+         numThreadsPerBlock = 64;
+         else if(numThreadsPerBlock <= 128 )
+         numThreadsPerBlock = 128;
+         else if(numThreadsPerBlock <= 256 )
+         numThreadsPerBlock = 256;
+         else if(numThreadsPerBlock <= 512 )
+         numThreadsPerBlock = 512;
+         else
+         numThreadsPerBlock = 1024;
+         //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+         
+         if( numThreadsPerBlock == -1 ){
+            printf("Fail in setting numThreadsPerBlock.\n");
+         break;
+         }
          
          int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
          int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
@@ -140,8 +162,22 @@ solve( const MeshPointer& mesh,
          helpFunc1 = auxPtr;
          auxPtr = helpFunc;
          helpFunc = helpFunc1;
+         switch ( numThreadsPerBlock ){
+         case 16:
+         this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 32:
+         this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 64:
          this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
-         
+         case 128:
+         this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 256:
+         this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 512:
+         this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         default:
+         this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         }
          //Reduction      
          for( int i = 0; i < BlockIterHost.getSize(); i++ ){
          if( IsCalculationDone == 0 ){
@@ -176,43 +212,43 @@ solve( const MeshPointer& mesh,
     // TOP, NORTH and WEST        
         boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
         boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
-        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0];
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         
     // TOP, SOUTH and EAST        
         boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
-        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1];
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         
     // TOP, SOUTH and WEST        
         boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
-        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1];
-        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0]; 
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; 
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
             
     // BOTTOM, NOTH and EAST        
-        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2];
+        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
         boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); 
         
     // BOTTOM, NOTH and WEST        
-        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2];
+        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
         boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
-        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0]; 
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; 
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         
     // BOTTOM, SOUTH and EAST        
-        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2];
-        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1];
+        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
         boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );    
         
     // BOTTOM, SOUTH and WEST        
-        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = vecLowerOverlaps[2];
-        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = vecLowerOverlaps[1];
-        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = vecLowerOverlaps[0];
+        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
         goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );    
         
         
-- 
GitLab


From ac30546025477c6f06c55c30670808b86df82b34 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Sat, 28 Sep 2019 21:49:31 +0200
Subject: [PATCH 13/14] Fixed passing of Arrays by ArrayView.

---
 .../tnlDirectEikonalMethodBase2D_impl.h       |  10 +-
 .../tnlDirectEikonalMethodBase3D_impl.h       |   8 +-
 .../tnlDirectEikonalMethodsBase.h             |  41 +--
 .../tnlFastSweepingMethod2D_impl.h            | 285 +++---------------
 .../tnlFastSweepingMethod3D_impl.h            | 121 +++++---
 5 files changed, 161 insertions(+), 304 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
index 50ea7bde8..c470a77ef 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
@@ -353,8 +353,8 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
 
 
 template < typename Index >
-__global__ void GetNeighbours( const TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
-        TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY )
+__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY )
 {
   int i = blockIdx.x * 1024 + threadIdx.x;
   
@@ -389,7 +389,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
         Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-        TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
         const Containers::StaticVector< 2, Index > vecLowerOverlaps, 
         const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock )
 {
@@ -598,7 +598,7 @@ tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 updateBlocks( InterfaceMapType interfaceMap,
         MeshFunctionType aux,
         MeshFunctionType helpFunc,
-        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
+        ArrayContainerView BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
 {
 #pragma omp parallel for schedule( dynamic )
   for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
@@ -769,7 +769,7 @@ template< typename Real,
         typename Index >
 void 
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )
+getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY )
 {
   int* BlockIterPom; 
   BlockIterPom = new int [numBlockX * numBlockY];
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
index 5b2a4b685..32548abcf 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
@@ -480,8 +480,8 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
 
 
 template < typename Index >
-__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,
+__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
         int numBlockX, int numBlockY, int numBlockZ )
 {
   int i = blockIdx.x * 1024 + threadIdx.x;
@@ -520,7 +520,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
         const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
         Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
         Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps )
 {
   int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
@@ -1056,7 +1056,7 @@ template< typename Real,
         typename Index >
 void 
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
+getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
 {
   int* BlockIterPom; 
   BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ];
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index e0ece04bf..7cba99f65 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -62,6 +62,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
     typedef Functions::MeshFunction< MeshType > MeshFunctionType;
     typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
     typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+    using ArrayContainerView = typename ArrayContainer::ViewType;
     typedef Containers::StaticVector< 2, Index > StaticVector;
     
     using MeshPointer = Pointers::SharedPointer<  MeshType >;
@@ -87,15 +88,18 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
             const RealType velocity = 1.0 );
         
 // FOR OPENMP WILL BE REMOVED
-    void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
+    void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY  );
         
     template< int sizeSArray >
-    void updateBlocks( const InterfaceMapType& interfaceMap,
-            MeshFunctionType& aux,
-            MeshFunctionType& helpFunc,
-            ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
+    void updateBlocks( InterfaceMapType interfaceMap,
+            MeshFunctionType aux,
+            MeshFunctionType helpFunc,
+            ArrayContainerView BlockIterHost, int numThreadsPerBlock );
+    
+  protected:
     
-    void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY  );
+   __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[],
+           const RealType originalValue, const RealType v );
 };
 
 template< typename Real,
@@ -111,6 +115,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
     typedef Functions::MeshFunction< MeshType > MeshFunctionType;
     typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType;
     typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+    using ArrayContainerView = typename ArrayContainer::ViewType;
     typedef Containers::StaticVector< 3, Index > StaticVector;
     using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
     using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
@@ -134,15 +139,15 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
             const RealType velocity = 1.0 );
     
     // OPENMP WILL BE REMOVED
-    void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
+    void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
     
     template< int sizeSArray >
-    void updateBlocks( const InterfaceMapType& interfaceMap,
-            const MeshFunctionType& aux,
+    void updateBlocks( const InterfaceMapType interfaceMap,
+            const MeshFunctionType aux,
             MeshFunctionType& helpFunc,
-            ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
+            ArrayContainer BlockIterHost, int numThreadsPerBlock );
     
-    void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
+  protected:
     
     __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[],
            const RealType originalValue, const RealType v );
@@ -180,17 +185,14 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
         Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-        TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
         const Containers::StaticVector< 2, Index > vecLowerOverlaps, 
         const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock =0);
 
 template < typename Index >
-__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks );
+__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY );
 
-template < typename Index >
-__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
 
 
 // 3D
@@ -205,10 +207,11 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
         const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
         Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice );
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+        Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps );
 
 template < typename Index >
-__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
         TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
         int numBlockX, int numBlockY, int numBlockZ );
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 31d3f8b32..e5638c11d 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -80,40 +80,9 @@ solve( const MeshPointer& mesh,
   IndexType iteration( 0 );
   InterfaceMapType interfaceMap = *interfaceMapPtr;
   MeshFunctionType aux = *auxPtr;
-  aux.template synchronize< Communicator >();
-  
-  
-#ifdef HAVE_MPI
-  int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup );
-  //printf( "Hello world from rank: %d ", i );
-  //Communicators::MpiCommunicator::Request r = Communicators::MpiCommunicator::ISend( auxPtr, 0, 0, Communicators::MpiCommunicator::AllGroup );
-  if( i == 1 ) {
-    /*for( int k = 0; k < 16*16; k++ )
-      aux[ k ] = 10;*/
-    printf( "1: mesh x: %d\n", mesh->getDimensions().x() );
-    printf( "1: mesh y: %d\n", mesh->getDimensions().y() );
-    //aux.save("aux_proc1.tnl");
-  }
-  if( i == 0 ) {
-    printf( "0: mesh x: %d\n", mesh->getDimensions().x() );
-    printf( "0: mesh y: %d\n", mesh->getDimensions().y() );
-    //aux.save("aux_proc0.tnl");
-    /*for( int k = 0; k < mesh->getDimensions().x()*mesh->getDimensions().y(); k++ )
-      aux[ k ] = 10;
-    for( int k = 0; k < mesh->getDimensions().x(); k++ ){
-      for( int l = 0; l < mesh->getDimensions().y(); l++ )
-        printf("%f.2\t",aux[ k * 16 + l ] );
-    printf("\n");
-    }*/
-  }
-    
-  /*bool a = Communicators::MpiCommunicator::IsInitialized();
-  if( a )
-    printf("Je Init\n");
-  else
-    printf("Neni Init\n");*/
-#endif
+  aux.template synchronize< Communicator >(); //synchronize initialized overlaps
   
+  std::cout << "Calculating the values ..." << std::endl; 
   while( iteration < this->maxIterations )
   {
     // calculatedBefore indicates weather we calculated in the last passage of the while cycle 
@@ -290,41 +259,8 @@ solve( const MeshPointer& mesh,
         // Need for calling functions from kernel
         BaseType ptr;
         
-        int BlockIterD = 1;
-        /*auxPtr = helpFunc;
-         
-         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-         interfaceMapPtr.template getData< Device >(),
-         auxPtr.template getData< Device>(),
-         helpFunc.template modifyData< Device>(),
-         BlockIterDevice,
-         oddEvenBlock.getView() );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         auxPtr = helpFunc;
-         
-         oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-         
-         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-         interfaceMapPtr.template getData< Device >(),
-         auxPtr.template getData< Device>(),
-         helpFunc.template modifyData< Device>(),
-         BlockIterDevice,
-         oddEvenBlock.getView() );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         auxPtr = helpFunc;
-         
-         oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-         
-         CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         
-         BlockIterD = dBlock.getElement( 0 );*/
+        // True if we should calculate again.
+        int calculateCudaBlocksAgain = 1;
         
         // Array that identifies which blocks should be calculated.
         // All blocks should calculate in first passage ( setValue(1) )
@@ -343,16 +279,9 @@ solve( const MeshPointer& mesh,
         MeshFunctionPointer helpFunc( mesh );
         helpFunc.template modifyData() = auxPtr.template getData(); 
         
-        //int pocBloku = 0;
-        Devices::Cuda::synchronizeDevice();
-        CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-                interfaceMapPtr.template getData< Device >(),
-                auxPtr.template modifyData< Device>(),
-                helpFunc.template modifyData< Device>(),
-                BlockIterDevice.getView() );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
-        
+        // number of iterations of while calculateCudaBlocksAgain
+        int numIter = 0;
+               
         //int oddEvenBlock = 0;
         while( calculateCudaBlocksAgain )
         {
@@ -390,44 +319,16 @@ solve( const MeshPointer& mesh,
           Devices::Cuda::synchronizeDevice();
           CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(),
                   auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(),
-                  blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps );
+                  blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps );
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
-        
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
-        
-        
-        aux = *auxPtr;
-        interfaceMap = *interfaceMapPtr;
-#endif
-      }
-
-      
-/**----------------------MPI-TO-DO---------------------------------------------**/
-        
-#ifdef HAVE_MPI
-        //int i = MPI::GetRank( MPI::AllGroup );
-        //TNL::Meshes::DistributedMeshes::DistributedMesh< MeshType > Mesh;
-        int neighCount = 0; // should this thread calculate again?
-        int calculpom[4] = {0,0,0,0};
-        
-          if( i == 0 ){
-            BlockIterPom1 = BlockIterDevice;
-            for( int i =0; i< numBlocksX; i++ ){
-              for( int j = 0; j < numBlocksY; j++ )
-              {
-                std::cout << BlockIterPom1[j*numBlocksX + i];
-              }
-              std::cout << std::endl;
-            }
-            std::cout << std::endl;
-          }
-#endif
+          
+          // Switching helpFunc and auxPtr.
+          auxPtr.swap( helpFunc );
           
           // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now.
           Devices::Cuda::synchronizeDevice(); 
-          GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator, blockCalculationIndicatorHelp, numBlocksX, numBlocksY );
+          GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY );
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
           blockCalculationIndicator = blockCalculationIndicatorHelp;
@@ -445,46 +346,24 @@ solve( const MeshPointer& mesh,
 /**-----------------------------------------------------------------------------------------------------------*/
           numIter ++;
         }
-        if( numIter%2  == 1 ){
-          auxPtr = helpFunc;
-        }
-        /*cudaFree( BlockIterDevice );
-         cudaFree( dBlock );
-         delete BlockIter;*/
-        
-        if( neigh[1] != -1 )
-        {
-          req[neighCount] = MPI::ISend( &calculated, 1, neigh[1], 0, MPI::AllGroup ); 
-          neighCount++;
-          
-          
-          req[neighCount] = MPI::IRecv( &calculpom[1], 1, neigh[1], 0, MPI::AllGroup );
-          neighCount++;
-        }
-        
-        if( neigh[2] != -1 )
-        {
-          req[neighCount] = MPI::ISend( &calculated, 1, neigh[2], 0, MPI::AllGroup );
-          neighCount++;
-          
-          req[neighCount] = MPI::IRecv( &calculpom[2], 1, neigh[2], 0, MPI::AllGroup  );
-          neighCount++;
-        }
-        
-        if( neigh[5] != -1 )
+        if( numIter%2 == 1 ) // Need to check parity for MPI overlaps to synchronize (otherwise it doesn't work)
         {
-          req[neighCount] = MPI::ISend( &calculated, 1, neigh[5], 0, MPI::AllGroup );
-          neighCount++;
-          
-          req[neighCount] = MPI::IRecv( &calculpom[3], 1, neigh[5], 0, MPI::AllGroup );
-          neighCount++;
+          helpFunc.swap( auxPtr );
+          Devices::Cuda::synchronizeDevice();
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
         }
-        
-        MPI::WaitAll(req,neighCount);
-#if ForDebug
-        printf( "%d: Sending Calculated = %d.\n", i, calculated );
-#endif        
-        MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR,  MPI::AllGroup );
+        aux = *auxPtr;
+        interfaceMap = *interfaceMapPtr;
+#endif
+      }
+
+      
+/**----------------------MPI-TO-DO---------------------------------------------**/        
+#ifdef HAVE_MPI
+      if( CommunicatorType::isDistributed() ){
+        getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh );
+       
         aux.template synchronize< Communicator >();
       }
 #endif
@@ -518,9 +397,16 @@ setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
 #endif
 }
 
-template < typename Index >
-__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY )
+
+
+
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+bool 
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
+goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, 
+        MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
+        const AnisotropyPointer& anisotropy )
 {
   bool calculated = false;
   const MeshType& mesh = aux.getMesh();
@@ -548,97 +434,15 @@ __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, I
   return calculated;
 }
 
-template < typename Index >
-__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks )
-{
-  int i = threadIdx.x;
-  int blId = blockIdx.x;
-  int blockSize = blockDim.x;
-  /*if ( i == 0 && blId == 0 ){
-    printf( "nBlocks = %d\n", nBlocks );
-    for( int j = nBlocks-1; j > -1 ; j--){
-      printf( "%d: cislo = %d \n", j, BlockIterDevice[ j ] );
-    }
-  }*/
-  __shared__ int sArray[ 1024 ];
-  sArray[ i ] = 0;
-  if( blId * 1024 + i < nBlocks )
-    sArray[ i ] = BlockIterDevice[ blId * 1024 + i ];
-  __syncthreads();
-  /*if ( i == 0 && blId == 0 ){
-   printf( "nBlocks = %d\n", nBlocks );
-   for( int j = 4; j > -1 ; j--){
-   printf( "%d: cislo = %d \n", j, sArray[ j ] );
-   }
-  }*/
-  /*extern __shared__ volatile int sArray[];
-   unsigned int i = threadIdx.x;
-   unsigned int gid = blockIdx.x * blockSize * 2 + threadIdx.x;
-   unsigned int gridSize = blockSize * 2 * gridDim.x;
-   sArray[ i ] = 0;
-   while( gid < nBlocks )
-   {
-   sArray[ i ] += BlockIterDevice[ gid ] + BlockIterDevice[ gid + blockSize ];
-   gid += gridSize;
-   }
-   __syncthreads();*/
-  
-  if ( blockSize == 1024) {
-    if (i < 512)
-      sArray[ i ] += sArray[ i + 512 ];
-  }
-  __syncthreads();
-  if (blockSize >= 512) {
-    if (i < 256) {
-      sArray[ i ] += sArray[ i + 256 ];
-    }
-  }
-  __syncthreads();
-  if (blockSize >= 256) {
-    if (i < 128) {
-      sArray[ i ] += sArray[ i + 128 ];
-    }
-  }
-  __syncthreads();
-  if (blockSize >= 128) {
-    if (i < 64) {
-      sArray[ i ] += sArray[ i + 64 ];
-    }
-  }
-  __syncthreads();
-  if (i < 32 )
-  {
-    if(  blockSize >= 64 ){ sArray[ i ] += sArray[ i + 32 ];}
-  __syncthreads();
-    if(  blockSize >= 32 ){  sArray[ i ] += sArray[ i + 16 ];}
-  __syncthreads();
-    if(  blockSize >= 16 ){  sArray[ i ] += sArray[ i + 8 ];}
-    if(  blockSize >= 8 ){  sArray[ i ] += sArray[ i + 4 ];}
-  __syncthreads();
-    if(  blockSize >= 4 ){  sArray[ i ] += sArray[ i + 2 ];}
-  __syncthreads();
-    if(  blockSize >= 2 ){  sArray[ i ] += sArray[ i + 1 ];}
-  __syncthreads();
-  }
-  __syncthreads();
-  
-  if( i == 0 )
-    dBlock[ blId ] = sArray[ 0 ];
-}
 
 
 
-template < int sizeSArray, typename Real, typename Device, typename Index >
-__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
-        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-        CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) );
-        TNL_CHECK_CUDA_DEVICE;
-        
-        CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
-        TNL_CHECK_CUDA_DEVICE;
+#ifdef HAVE_MPI
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+void 
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
+getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
 {
   Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
   
@@ -687,4 +491,3 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
               calculateFromNeighbours[2] || calculateFromNeighbours[3];
 }
 #endif
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock )
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 4895c7693..325b626f7 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -262,24 +262,86 @@ solve( const MeshPointer& mesh,
         // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2)
         const int cudaBlockSize( 8 );
         
-        CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
-                interfaceMapPtr.template getData< Device >(),
-                auxPtr.template getData< Device>(),
-                helpFunc.template modifyData< Device>(),
-                BlockIterDevice.getView() );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+        // Getting the number of blocks in grid in each direction (without overlaps, because we don't calculate on overlaps)
+        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
+        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
+        int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); 
+        if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
+          std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
         
-        GetNeighbours3D<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
-        BlockIterDevice = BlockIterPom;
+        // Making the variables for global function CudaUpdateCellCaller.
+        dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+        dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
         
-        CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY * numBlocksZ ) );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+        BaseType ptr; // tnlDirectEikonalMethodBase type for calling of function inside CudaUpdateCellCaller
+        
+        
+        int BlockIterD = 1; // variable that tells us whether we should run the main cuda body again
+        
+        // Array containing information about each block in grid, answering question (Have we calculated in this block?)
+        TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice( numBlocksX * numBlocksY * numBlocksZ );
+        BlockIterDevice.setValue( 1 ); // calculate all in the first passage
+        
+        // Helping Array for GetNeighbours3D
+        TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom( numBlocksX * numBlocksY * numBlocksZ );
+        BlockIterPom.setValue( 0 ); // doesn't matter what number
+        
+        
+        
+        // number of CUDA blocks (1024 threads each) needed to launch GetNeighbours over all grid blocks
+        int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0);
+        
+        
+        //MeshFunctionPointer helpFunc1( mesh );      
+        MeshFunctionPointer helpFunc( mesh );
+        helpFunc.template modifyData() = auxPtr.template getData();
+        Devices::Cuda::synchronizeDevice(); 
+                
+        int numIter = 0; // number of passages of following while cycle
         
-        CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
+        while( BlockIterD ) //main body of cuda code
+        {
+          
+          Devices::Cuda::synchronizeDevice();          
+          // main function that calculates all values in each blocks
+          // calculated values are in helpFunc
+          CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
+                  interfaceMapPtr.template getData< Device >(),
+                  auxPtr.template getData< Device>(),
+                  helpFunc.template modifyData< Device>(),
+                  BlockIterDevice.getView(), vecLowerOverlaps, vecUpperOverlaps );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          // Switching pointers to helpFunc and auxPtr so real results are in memory of helpFunc but here under variable auxPtr
+          auxPtr.swap( helpFunc );
+          
+          Devices::Cuda::synchronizeDevice();
+          // Neighbours of blocks that were calculated in this passage should calculate in the next one!
+          // BlockIterDevice contains blocks that were calculated in this passage and BlockIterPom those that should calculate in the next one (their neighbours)
+          GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          BlockIterDevice = BlockIterPom;
+          Devices::Cuda::synchronizeDevice();
+          
+          // .containsValue(1) is actually parallel reduction implemented in TNL
+          BlockIterD = BlockIterDevice.containsValue(1);
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          
+          numIter++;
+          if( BlockIterD ){ 
+            // if we calculated in this passage, we should send the info via MPI so neighbours should calculate after synchronization
+            calculatedBefore = 1;
+          }
+        }
+        if( numIter%2 == 1 ){
+          
+          // We need auxPtr to point to the memory of the original auxPtr (not to helpFunc)
+          // the last passage of the previous while cycle didn't calculate any number anyway, so switching names doesn't affect the values
+          auxPtr.swap( helpFunc ); 
+          Devices::Cuda::synchronizeDevice();
+        }
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         aux = *auxPtr;
@@ -375,10 +437,15 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
   return calculated;
 }
 
-template < typename Index >
-__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
-        int numBlockX, int numBlockY, int numBlockZ )
+
+
+
+#ifdef HAVE_MPI
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+void 
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
+getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
 {
   Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
   
@@ -397,22 +464,6 @@ __global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda,
     requestsInformation[neighCount++] = 
             MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup );
   }
-}
-
-template < int sizeSArray, typename Real, typename Device, typename Index >
-__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
-        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice )
-{
-  int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
-  int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
-  int i = threadIdx.x + blockDim.x*blockIdx.x + vLower[0]; // WITH OVERLAPS!!! i,j,k aren't coordinates of all values
-  int j = blockDim.y*blockIdx.y + threadIdx.y + vLower[1];
-  int k = blockDim.z*blockIdx.z + threadIdx.z + vLower[2];
-  int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
-  const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
   
   if( neighbours[1] != -1 ) // EAST
   {
-- 
GitLab


From e162a57a89c80ef7d685191b013cb6436d0b8576 Mon Sep 17 00:00:00 2001
From: fencl <fenclmat@fjfi.cvut.cz>
Date: Sat, 5 Oct 2019 10:33:58 +0200
Subject: [PATCH 14/14] 2D MPI GPU method adjusted.

---
 .../Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
index c470a77ef..cddf4f9cb 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
@@ -297,7 +297,7 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
     
     if( i < mesh.getDimensions().x() - vecUpperOverlaps[ 0 ] &&
             j < mesh.getDimensions().y() - vecUpperOverlaps[ 1 ] &&
-            i>vecLowerOverlaps[ 0 ] -1 && j> vecLowerOverlaps[ 0 ]-1 )
+            i>vecLowerOverlaps[ 0 ] -1 && j> vecLowerOverlaps[ 1 ]-1 )
     {
       const Real& hx = mesh.getSpaceSteps().x();
       const Real& hy = mesh.getSpaceSteps().y();
-- 
GitLab