diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h
index 1b46ecb3dbea0438be1065fc43808ecb157c93ab..b2cfc65dcee8b3dc7a4f980c20feba6c5eac8d5d 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-direct-eikonal-solver.h
@@ -36,6 +36,9 @@ class DirectEikonalSolverConfig
       {
          config.addDelimiter( "Direct eikonal equation solver settings:" );
          config.addRequiredEntry< String >( "input-file", "Input file." );
+         config.addEntry< String >( "distributed-grid-io-type", "Choose Distributed Grid IO Type", "MpiIO");
+            config.addEntryEnum< String >( "LocalCopy" );
+            config.addEntryEnum< String >( "MpiIO" );
       };
 };
 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test
index 0dc50246f5ced2ec65616f7eecc65ebf860a2cdb..24b782a820641a8ce62a11b7221da22d2473e735 100755
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnl-run-fsm-eoc-test
@@ -1,19 +1,33 @@
 #!/bin/bash
 
 device="host"
-dimensions="2D 3D"
-#dimensions="3D"
+#dimensions="2D 3D"
+dimensions="2D"
 sizes1D="16 32 64 128 256 512 1024 2048 4096"
 #sizes1D="256"
-sizes2D="16 32 64 128 256 512 1024"
-#sizes2D="8"
-sizes3D="16 32 64 128 256"
+sizes2D="16 32 64 128 256 512 1024 2048 4096"
+#sizes2D="16"
+sizes3D="8 16 32 64 128 256"
+#sizes3D="128 256"
 testFunctions="paraboloid"
+#testFunctions="sin-wave-sdf"
+#testFunctions="sin-bumps-sdf"
 snapshotPeriod=0.1
 finalTime=1.5
-solverName="tnl-direct-eikonal-solver"
-#solverName="gdb --args tnl-direct-eikonal-solver-dbg --catch-exceptions no"
-#
+realType="double"
+#mpiRun="mpirun -np 4 -oversubscribe "
+mpiRun=""
+
+## CAREFULL: If you set LocalCopy of MPI, you have to start with mpiRun even tnl-init
+## 	     This isnt problem with MpiIO.
+## CAREFULL: For smoothly calculated error, you have to choose the right output function which
+##           is for both MpiIO, LocalCopy different.
+
+solverName=${mpiRun}"tnl-direct-eikonal-solver"
+#solverName="gdb --args "${mpiRun}"tnl-direct-eikonal-solver-dbg --catch-exceptions no --mpi-gdb-debug false"
+#scale=2.0
+#finalSdf="aux-0.tnl aux-1.tnl"
+finalSdf="aux-final.tnl"
 
 setupTestFunction()
 {
@@ -22,16 +36,17 @@ setupTestFunction()
 #   then
       origin=-1.0
       proportions=2.0
+#      origin=-1.0
+#      proportions=2.0
       amplitude=1.0
-      waveLength=0.2
-      waveLengthX=0.2
-      waveLengthY=0.2
-      waveLengthZ=0.2
-      wavesNumber=3.0
-      wavesNumberX=0.5
-      wavesNumberY=2.0
+      waveLength=0.4
+      waveLengthX=0.5
+      waveLengthY=0.5
+      waveLengthZ=0.5
+      wavesNumber=1.25
+      wavesNumberX=0.5      wavesNumberY=2.0
       wavesNumberZ=3.0
-      phase=0.1
+      phase=-1.5
       phaseX=0.0
       phaseY=0.0
       phaseZ=0.0
@@ -44,6 +59,7 @@ setupGrid()
 {
    dimensions=$1
    gridSize=$2
+   #scale=$3
    tnl-grid-setup --dimensions ${dimensions} \
                   --origin-x ${origin} \
                   --origin-y ${origin} \
@@ -53,47 +69,51 @@ setupGrid()
                   --proportions-z ${proportions} \
                   --size-x ${gridSize} \
                   --size-y ${gridSize} \
-                  --size-z ${gridSize} 
+	    	  --real-type ${realType} \
+                  --size-z ${gridSize}
+#$((2*${gridSize})) 
 }
 
 setInitialCondition()
 {
    testFunction=$1
    tnl-init --test-function ${testFunction} \
-            --output-file initial-u.tnl \
-            --amplitude ${amplitude} \
-            --wave-length ${waveLength} \
-            --wave-length-x ${waveLengthX} \
-            --wave-length-y ${waveLengthY} \
-            --wave-length-z ${waveLengthZ} \
-            --waves-number ${wavesNumber} \
-            --waves-number-x ${wavesNumberX} \
-            --waves-number-y ${wavesNumberY} \
-            --waves-number-z ${wavesNumberZ} \
-            --phase ${phase} \
-            --phase-x ${phaseX} \
-            --phase-y ${phaseY} \
-            --phase-z ${phaseZ} \
-            --sigma ${sigma} \
-            --radius ${radius}
+	    		 --real-type ${realType} \
+            		 --output-file initial-u.tnl \
+            		 --amplitude ${amplitude} \
+            		 --wave-length ${waveLength} \
+            		 --wave-length-x ${waveLengthX} \
+            		 --wave-length-y ${waveLengthY} \
+            		 --wave-length-z ${waveLengthZ} \
+            		 --waves-number ${wavesNumber} \
+            		 --waves-number-x ${wavesNumberX} \
+            		 --waves-number-y ${wavesNumberY} \
+            		 --waves-number-z ${wavesNumberZ} \
+            		 --phase ${phase} \
+            		 --phase-x ${phaseX} \
+            		 --phase-y ${phaseY} \
+            		 --phase-z ${phaseZ} \
+            		 --sigma ${sigma} \
+            		 --radius ${radius}
    
-    tnl-init --test-function ${testFunction}-sdf \
-            --output-file exact-u.tnl \
-            --amplitude ${amplitude} \
-            --wave-length ${waveLength} \
-            --wave-length-x ${waveLengthX} \
-            --wave-length-y ${waveLengthY} \
-            --wave-length-z ${waveLengthZ} \
-            --waves-number ${wavesNumber} \
-            --waves-number-x ${wavesNumberX} \
-            --waves-number-y ${wavesNumberY} \
-            --waves-number-z ${wavesNumberZ} \
-            --phase ${phase} \
-            --phase-x ${phaseX} \
-            --phase-y ${phaseY} \
-            --phase-z ${phaseZ} \
-            --sigma ${sigma} \
-            --radius ${radius}
+   tnl-init --test-function ${testFunction}-sdf \
+	    		 --real-type ${realType} \
+            		 --output-file exact-u.tnl \
+            		 --amplitude ${amplitude} \
+            		 --wave-length ${waveLength} \
+            		 --wave-length-x ${waveLengthX} \
+            		 --wave-length-y ${waveLengthY} \
+            		 --wave-length-z ${waveLengthZ} \
+            		 --waves-number ${wavesNumber} \
+            		 --waves-number-x ${wavesNumberX} \
+            		 --waves-number-y ${wavesNumberY} \
+            		 --waves-number-z ${wavesNumberZ} \
+            		 --phase ${phase} \
+            		 --phase-x ${phaseZ} \
+            		 --phase-y ${phaseZ} \
+            		 --phase-z ${phaseZ} \
+            		 --sigma ${sigma} \
+            		 --radius ${radius} \
 
 }
 
@@ -111,17 +131,22 @@ solve()
                  --min-iterations 20 \
                  --convergence-residue 1.0e-12 \
                  --snapshot-period ${snapshotPeriod} \
+		 --real-type ${realType} \
                  --final-time ${finalTime}
 }
                
 computeError()
 {
+for sweep in ${finalSdf}
+do
    tnl-diff --mesh mesh.tnl \
-            --input-files aux-final.tnl exact-u.tnl \
+	    --input-files exact-u.tnl u-00000.tnl \
             --mode sequence \
             --snapshot-period ${snapshotPeriod} \
             --output-file errors.txt \
             --write-difference yes
+#aux-final.tnl \
+done
 }
 
 runTest()
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..55129c4e1008e69ef3b3d238acb8fc587cce4076
--- /dev/null
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h
@@ -0,0 +1,204 @@
+/* 
+ * File:   tnlDirectEikonalMethodBase1D_impl.h
+ * Author: Fencl
+ *
+ * Created on March 15, 2019
+ */
+
+#pragma once
+
+template< typename Real,
+        typename Device,
+        typename Index >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
+initInterface( const MeshFunctionPointer& _input,
+        MeshFunctionPointer& _output,
+        InterfaceMapPointer& _interfaceMap  )
+{
+  if( std::is_same< Device, Devices::Cuda >::value )
+  {
+#ifdef HAVE_CUDA
+    const MeshType& mesh = _input->getMesh();
+    
+    const int cudaBlockSize( 16 );
+    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    dim3 blockSize( cudaBlockSize );
+    dim3 gridSize( numBlocksX );
+    Devices::Cuda::synchronizeDevice();
+    CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+            _output.template modifyData< Device >(),
+            _interfaceMap.template modifyData< Device >() );
+    cudaDeviceSynchronize();
+    TNL_CHECK_CUDA_DEVICE;
+#endif
+  }
+  if( std::is_same< Device, Devices::Host >::value )
+  {
+    const MeshType& mesh = _input->getMesh();
+    typedef typename MeshType::Cell Cell;
+    const MeshFunctionType& input = _input.getData();
+    MeshFunctionType& output = _output.modifyData();
+    InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
+    Cell cell( mesh );
+    for( cell.getCoordinates().x() = 0;
+            cell.getCoordinates().x() < mesh.getDimensions().x();
+            cell.getCoordinates().x() ++ )
+    {
+      cell.refresh();
+      output[ cell.getIndex() ] =
+              input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
+                -std::numeric_limits< RealType >::max();
+      interfaceMap[ cell.getIndex() ] = false;
+    }
+    
+    
+    const RealType& h = mesh.getSpaceSteps().x();
+    for( cell.getCoordinates().x() = 0;
+            cell.getCoordinates().x() < mesh.getDimensions().x() - 1;
+            cell.getCoordinates().x() ++ )
+    {
+      cell.refresh();
+      const RealType& c = input( cell );      
+      if( ! cell.isBoundaryEntity()  )
+      {
+        const auto& neighbors = cell.getNeighborEntities();
+        Real pom = 0;
+        //const IndexType& c = cell.getIndex();
+        const IndexType e = neighbors.template getEntityIndex<  1 >();
+        if( c * input[ e ] <= 0 )
+        {
+          pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
+          if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
+            output[ cell.getIndex() ] = pom;
+          
+          pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+          if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
+            output[ e ] = pom; 
+          
+          interfaceMap[ cell.getIndex() ] = true;
+          interfaceMap[ e ] = true;
+        }
+      }
+    }
+  }
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+template< typename MeshEntity >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
+updateCell( MeshFunctionType& u,
+        const MeshEntity& cell, 
+        const RealType v )
+{
+  const auto& neighborEntities = cell.template getNeighborEntities< 1 >();
+  const MeshType& mesh = cell.getMesh();
+  const RealType& h = mesh.getSpaceSteps().x();
+  const RealType value = u( cell );
+  RealType a, tmp = std::numeric_limits< RealType >::max();
+  
+  if( cell.getCoordinates().x() == 0 )
+    a = u[ neighborEntities.template getEntityIndex< 1 >() ];
+  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
+    a = u[ neighborEntities.template getEntityIndex< -1 >() ];
+  else
+  {
+    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ],
+            u[ neighborEntities.template getEntityIndex<  1 >() ] );
+  }
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() )
+    return;
+  
+  tmp = a + TNL::sign( value ) * h/v;
+  
+  u[ cell.getIndex() ] = argAbsMin( value, tmp );
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+__cuda_callable__
+bool
+tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
+updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
+{
+  const RealType value = sArray[ thri ];
+  RealType a, tmp = std::numeric_limits< RealType >::max();
+  
+  a = TNL::argAbsMin( sArray[ thri+1 ],
+          sArray[ thri-1 ] );
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() )
+    return false;
+  
+  tmp = a + TNL::sign( value ) * h/v;
+  
+  
+  sArray[ thri ] = argAbsMin( value, tmp );
+  
+  tmp = value - sArray[ thri ];
+  if ( fabs( tmp ) >  0.001*h )
+    return true;
+  else
+    return false;
+}
+
+#ifdef HAVE_CUDA
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap )
+{
+  int i = threadIdx.x + blockDim.x*blockIdx.x;
+  const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  
+  if( i < mesh.getDimensions().x()  )
+  {
+    typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell;
+    Cell cell( mesh );
+    cell.getCoordinates().x() = i;
+    cell.refresh();
+    const Index cind = cell.getIndex();
+    
+    
+    output[ cind ] =
+            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+              - std::numeric_limits< Real >::max();
+    interfaceMap[ cind ] = false; 
+    
+    const Real& h = mesh.getSpaceSteps().x();
+    cell.refresh();
+    const Real& c = input( cell );
+    if( ! cell.isBoundaryEntity()  )
+    {
+      auto neighbors = cell.getNeighborEntities();
+      Real pom = 0;
+      const Index e = neighbors.template getEntityIndex< 1 >();
+      const Index w = neighbors.template getEntityIndex< -1 >();
+      if( c * input[ e ] <= 0 )
+      {
+        pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+          output[ cind ] = pom;                       
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ w ] <= 0 )
+      {
+        pom = TNL::sign( c )*( h * c )/( c - input[ w ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
+    }
+  }
+  
+}
+#endif
+
+
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..cddf4f9cb7a97f8a74eb94f7377b2cf740db03a5
--- /dev/null
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
@@ -0,0 +1,803 @@
+/* 
+ * File:   tnlDirectEikonalMethodBase2D_impl.h
+ * Author: Fencl
+ *
+ * Created on March 15, 2019
+ */
+
+#pragma once
+
+template< typename Real,
+        typename Device,
+        typename Index >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+initInterface( const MeshFunctionPointer& _input,
+        MeshFunctionPointer& _output,
+        InterfaceMapPointer& _interfaceMap, 
+        const StaticVector vecLowerOverlaps, 
+        const StaticVector vecUpperOverlaps )
+{
+  
+  if( std::is_same< Device, Devices::Cuda >::value )
+  {
+#ifdef HAVE_CUDA
+    const MeshType& mesh = _input->getMesh();
+    
+    const int cudaBlockSize( 16 );
+    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
+    dim3 blockSize( cudaBlockSize, cudaBlockSize );
+    dim3 gridSize( numBlocksX, numBlocksY );
+    Devices::Cuda::synchronizeDevice();
+    CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+            _output.template modifyData< Device >(),
+            _interfaceMap.template modifyData< Device >(),
+            vecLowerOverlaps, vecUpperOverlaps);
+    cudaDeviceSynchronize();
+    TNL_CHECK_CUDA_DEVICE;
+#endif
+  }
+  if( std::is_same< Device, Devices::Host >::value )
+  {
+    MeshFunctionType input = _input.getData();    
+    MeshFunctionType& output = _output.modifyData();
+    InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
+    const MeshType& mesh = input.getMesh();
+    typedef typename MeshType::Cell Cell;
+    Cell cell( mesh );
+    for( cell.getCoordinates().y() = 0;
+            cell.getCoordinates().y() < mesh.getDimensions().y();
+            cell.getCoordinates().y() ++ )
+      for( cell.getCoordinates().x() = 0;
+              cell.getCoordinates().x() < mesh.getDimensions().x();
+              cell.getCoordinates().x() ++ )
+      {
+        cell.refresh();
+        output[ cell.getIndex() ] =
+                input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
+                  - std::numeric_limits< RealType >::max();
+        interfaceMap[ cell.getIndex() ] = false;
+      }
+    
+    const RealType& hx = mesh.getSpaceSteps().x();
+    const RealType& hy = mesh.getSpaceSteps().y();     
+    for( cell.getCoordinates().y() = 0 + vecLowerOverlaps[1];
+            cell.getCoordinates().y() < mesh.getDimensions().y() - vecUpperOverlaps[1];
+            cell.getCoordinates().y() ++ )
+      for( cell.getCoordinates().x() = 0 + vecLowerOverlaps[0];
+              cell.getCoordinates().x() < mesh.getDimensions().x() - vecUpperOverlaps[0];
+              cell.getCoordinates().x() ++ )
+      {
+        cell.refresh();
+        const RealType& c = input( cell );
+        if( ! cell.isBoundaryEntity()  )
+        {
+          auto neighbors = cell.getNeighborEntities();
+          Real pom = 0;
+          const IndexType e = neighbors.template getEntityIndex<  1,  0 >();
+          const IndexType n = neighbors.template getEntityIndex<  0,  1 >();
+          if( c * input[ n ] <= 0 )
+          {
+            pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+            if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
+              output[ cell.getIndex() ] = pom;
+            pom = pom - TNL::sign( c )*hy;
+            if( TNL::abs( output[ n ] ) > TNL::abs( pom ) )
+              output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy;
+            
+            interfaceMap[ cell.getIndex() ] = true;
+            interfaceMap[ n ] = true;
+          }
+          if( c * input[ e ] <= 0 )
+          {
+            pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+            if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
+              output[ cell.getIndex() ] = pom;
+            
+            pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+            if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
+              output[ e ] = pom; 
+            
+            interfaceMap[ cell.getIndex() ] = true;
+            interfaceMap[ e ] = true;
+          }
+        }
+      }
+  }
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+template< typename MeshEntity >
+__cuda_callable__
+bool
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+updateCell( MeshFunctionType& u,
+        const MeshEntity& cell,   
+        const RealType v)
+{
+  const auto& neighborEntities = cell.template getNeighborEntities< 2 >();
+  const MeshType& mesh = cell.getMesh();
+  const RealType& hx = mesh.getSpaceSteps().x();
+  const RealType& hy = mesh.getSpaceSteps().y();
+  const RealType value = u( cell );
+  RealType a, b, tmp = std::numeric_limits< RealType >::max();
+  
+  if( cell.getCoordinates().x() == 0 )
+    a = u[ neighborEntities.template getEntityIndex< 1,  0 >() ];
+  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
+    a = u[ neighborEntities.template getEntityIndex< -1,  0 >() ];
+  else
+  {
+    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1,  0 >() ],
+            u[ neighborEntities.template getEntityIndex<  1,  0 >() ] );
+  }
+  
+  if( cell.getCoordinates().y() == 0 )
+    b = u[ neighborEntities.template getEntityIndex< 0,  1 >()];
+  else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
+    b = u[ neighborEntities.template getEntityIndex< 0,  -1 >() ];
+  else
+  {
+    b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0,  -1 >() ],
+            u[ neighborEntities.template getEntityIndex< 0,   1 >() ] );
+  }
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() )
+    return false;
+  
+  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), hx, hy, 0.0 };
+  
+  tmp = getNewValue( pom , value, v );
+  
+  u[ cell.getIndex() ] = tmp;
+  
+  
+  tmp = value - u[ cell.getIndex() ];
+  
+  if ( fabs( tmp ) >  0.001*hx )
+    return true;
+  else
+    return false;
+  
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+__cuda_callable__
+bool
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy,
+        const Real v )
+{
+  const RealType value = sArray[ thrj * sizeSArray + thri ];
+  RealType a, b, tmp = std::numeric_limits< RealType >::max();
+  
+  b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ],
+          sArray[ (thrj-1) * sizeSArray + thri ] );
+  
+  a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ],
+          sArray[ thrj * sizeSArray + thri-1 ] );
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() )
+    return false;
+  
+  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
+  
+  tmp = getNewValue( pom , value, v );
+  
+  sArray[ thrj * sizeSArray + thri ] = tmp;
+  tmp = value - sArray[ thrj * sizeSArray + thri ];
+  if ( fabs( tmp ) >  0.001*hx )
+    return true;
+  else
+    return false;
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+__cuda_callable__
+Real
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+getNewValue( RealType valuesAndSteps[], const RealType originalValue, const RealType v )
+{
+  RealType newValue = std::numeric_limits< RealType >::max();
+  sortMinims( valuesAndSteps );
+  
+  // calculation of real value taken from ZHAO
+  newValue = valuesAndSteps[ 0 ] + TNL::sign( originalValue ) * valuesAndSteps[ 3 ]/v;
+  if( fabs( newValue ) < fabs( valuesAndSteps[ 1 ] ) ) 
+  {
+    newValue = argAbsMin( originalValue, newValue );
+  }
+  else
+  {
+    newValue = ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] * valuesAndSteps[ 1 ] + 
+            valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] * valuesAndSteps[ 0 ] + 
+            TNL::sign( originalValue ) * valuesAndSteps[ 3 ] * valuesAndSteps[ 4 ] * 
+            TNL::sqrt( ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] +  valuesAndSteps[ 4 ] *  valuesAndSteps[ 4 ] )/( v * v ) - 
+            ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) * 
+            ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) ) )/
+            ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] );
+    newValue = argAbsMin( originalValue, newValue );
+  }
+  
+  return newValue;
+}
+
+
+template < typename T1 >
+__cuda_callable__ void sortMinims( T1 pom[] )
+{
+  T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; 
+  if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){
+    tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2];
+    tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5];
+    
+  }
+  else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){
+    tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1];
+    tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4];
+  }
+  else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){
+    tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2];
+    tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5];
+  }
+  else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){
+    tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0];
+    tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3];
+  }
+  else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){
+    tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1];
+    tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4];
+  }
+  else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){
+    tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0];
+    tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3];
+  }
+  
+  for( unsigned int i = 0; i < 6; i++ )
+  {
+    pom[ i ] = tmp[ i ];
+  }   
+}
+
+#ifdef HAVE_CUDA
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+        const Containers::StaticVector< 2, Index > vecLowerOverlaps, 
+        const Containers::StaticVector< 2, Index > vecUpperOverlaps ) 
+{
+  int i = threadIdx.x + blockDim.x*blockIdx.x;
+  int j = blockDim.y*blockIdx.y + threadIdx.y;
+  const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  
+  if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+  {
+    typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
+    Cell cell( mesh );
+    cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
+    cell.refresh();
+    const Index cind = cell.getIndex();
+    
+    
+    output[ cind ] =
+            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+              - std::numeric_limits< Real >::max();
+    interfaceMap[ cind ] = false; 
+    
+    if( i < mesh.getDimensions().x() - vecUpperOverlaps[ 0 ] &&
+            j < mesh.getDimensions().y() - vecUpperOverlaps[ 1 ] &&
+            i>vecLowerOverlaps[ 0 ] -1 && j> vecLowerOverlaps[ 1 ]-1 )
+    {
+      const Real& hx = mesh.getSpaceSteps().x();
+      const Real& hy = mesh.getSpaceSteps().y();
+      cell.refresh();
+      const Real& c = input( cell );
+      if( ! cell.isBoundaryEntity()  )
+      {
+        auto neighbors = cell.getNeighborEntities();
+        Real tmp = 0;
+        const Index e = neighbors.template getEntityIndex<  1,  0 >();
+        const Index w = neighbors.template getEntityIndex<  -1,  0 >();
+        const Index n = neighbors.template getEntityIndex<  0,  1 >();
+        const Index s = neighbors.template getEntityIndex<  0,  -1 >();
+        
+        if( c * input[ n ] <= 0 )
+        {
+          tmp = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) )
+              output[ cind ] = tmp;
+          
+          interfaceMap[ cell.getIndex() ] = true;
+        }
+        
+        
+        if( c * input[ e ] <= 0 )
+        {
+          tmp = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) )
+            output[ cind ] = tmp;                       
+          
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ w ] <= 0 )
+        {
+          tmp = TNL::sign( c )*( hx * c )/( c - input[ w ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) ) 
+            output[ cind ] = tmp;
+          
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ s ] <= 0 )
+        {
+          tmp = TNL::sign( c )*( hy * c )/( c - input[ s ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( tmp ) ) 
+            output[ cind ] = tmp;
+          
+          interfaceMap[ cind ] = true;
+        }
+      }
+    }
+  }
+}
+
+
+template < typename Index >
+__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY )
+{
+  int i = blockIdx.x * 1024 + threadIdx.x;
+  
+  if( i < numBlockX * numBlockY )
+  {
+    int pom = 0;//BlockIterPom[ i ] = 0;
+    int m=0, k=0;
+    m = i%numBlockX;
+    k = i/numBlockX;
+    if( m > 0 && blockCalculationIndicator[ i - 1 ] ){
+      pom = 1;//blockCalculationIndicatorHelp[ i ] = 1;
+    }else if( m < numBlockX -1 && blockCalculationIndicator[ i + 1 ] ){
+      pom = 1;//blockCalculationIndicatorHelp[ i ] = 1;
+    }else if( k > 0 && blockCalculationIndicator[ i - numBlockX ] ){
+      pom = 1;// blockCalculationIndicatorHelp[ i ] = 1;
+    }else if( k < numBlockY -1 && blockCalculationIndicator[ i + numBlockX ] ){
+      pom = 1;//blockCalculationIndicatorHelp[ i ] = 1;
+    }
+    
+    if( blockCalculationIndicator[ i ] != 1 )
+      blockCalculationIndicatorHelp[ i ] = pom;//BlockIterPom[ i ];
+    else
+      blockCalculationIndicatorHelp[ i ] = 1;
+  }
+}
+
+
+
+
+template < int sizeSArray, typename Real, typename Device, typename Index >
+__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
+        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
+        const Containers::StaticVector< 2, Index > vecLowerOverlaps, 
+        const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock )
+{
+  // Setting up threads
+  int thri = threadIdx.x; int thrj = threadIdx.y;
+  int i = threadIdx.x + blockDim.x*blockIdx.x + vecLowerOverlaps[0];
+  int j = blockDim.y*blockIdx.y + threadIdx.y + vecLowerOverlaps[1];
+  const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
+  
+/** FOR CHESS METHOD */
+  //if( (blockIdx.y%2  + blockIdx.x) % 2 == oddEvenBlock )
+  //{
+/**------------------------------------------*/
+  
+/** FOR FIM METHOD */
+  if( blockCalculationIndicator[ blockIdx.y * gridDim.x + blockIdx.x ] )
+  { 
+    __syncthreads();
+/**-----------------------------------------*/
+    
+    const int dimX = mesh.getDimensions().x(); const int dimY = mesh.getDimensions().y();
+    const Real hx = mesh.getSpaceSteps().x(); const Real hy = mesh.getSpaceSteps().y();
+    if( thri==0 && thrj == 0)
+    {
+      blockCalculationIndicator[ blockIdx.y * gridDim.x + blockIdx.x ] = 0;
+    }
+    __syncthreads();
+    int maxThreadsInXDirection;
+    int maxThreadsInYDirection;
+    
+    // Maximum threads in each direction can differ
+    // e.g. cudaBlockSize = 16, dimX = 50, then:
+    // blockIdx   maxThreadsInXDirection   calculation [from, to]  sArray [from, to] 
+    //    0                 16                    [ 0,15]             [ 0,16]   //"-1" set to inf
+    //    1                 16                    [16,31]             [15,32]
+    //    2                 16                    [32,47]             [31,48]
+    //    3                  2                    [48,50]             [47,50]   // rest set to inf
+    // same for YDirection because blocks are squared 
+    maxThreadsInXDirection = blockDim.x + 1;
+    maxThreadsInYDirection = blockDim.y + 1;
+    
+    if( gridDim.x - 1 == blockIdx.x ) // care about number of values if we are in last block
+      maxThreadsInXDirection = (dimX-vecUpperOverlaps[0]-vecLowerOverlaps[0]) - (blockIdx.x)*blockDim.x+1;
+    
+    if( gridDim.y - 1 == blockIdx.y ) // care about number of values if we are in last block
+      maxThreadsInYDirection = (dimY-vecUpperOverlaps[1]-vecLowerOverlaps[1]) - (blockIdx.y)*blockDim.y+1;
+    __syncthreads();
+    
+    // Setting changed array that contains info: "Did the value of this thread changed in last passage?"
+    // Will be used in parallel reduction ( inside block level )
+    int currentIndex = thrj * blockDim.x + thri;
+    __shared__ volatile bool changed[ ( sizeSArray - 2 ) * ( sizeSArray - 2 ) ];
+    changed[ currentIndex ] = false;
+    if( thrj == 0 && thri == 0 )
+      changed[ 0 ] = true; // fist must be true to start while cycle
+    
+    
+    //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
+    __shared__ volatile Real sArray[ sizeSArray * sizeSArray ];
+    sArray[ (thrj+1) * sizeSArray + thri +1 ] = std::numeric_limits< Real >::max();
+    
+       
+    //filling sArray edges
+    if( thri == 0 ) // 
+    {      
+      if( dimX - vecLowerOverlaps[ 0 ] > (blockIdx.x+1) * blockDim.x  && thrj+1 < maxThreadsInYDirection )
+        sArray[ (thrj+1)*sizeSArray + maxThreadsInXDirection ] = 
+                aux[ (blockIdx.y*blockDim.y+vecLowerOverlaps[1])*dimX - dimX + blockIdx.x*blockDim.x - 1 // this to get to right possition
+                + (thrj+1)*dimX + maxThreadsInXDirection + vecLowerOverlaps[0] ];                        // rest to get the right sArray overlap
+      else
+        sArray[ (thrj+1)*sizeSArray + maxThreadsInXDirection ] = std::numeric_limits< Real >::max();
+    }
+        
+    if( thri == 1 )
+    { 
+      if( ( blockIdx.x != 0 || vecLowerOverlaps[0] != 0 ) && thrj+1 < maxThreadsInYDirection )
+        sArray[(thrj+1)*sizeSArray + 0] = 
+                aux[ (blockIdx.y*blockDim.y+vecLowerOverlaps[1])*dimX - dimX + blockIdx.x*blockDim.x - 1
+                + (thrj+1)*dimX  + vecLowerOverlaps[0] ];
+      else
+        sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 2 )
+    {
+      if( dimY - vecLowerOverlaps[ 1 ] > (blockIdx.y+1) * blockDim.y  && thrj+1 < maxThreadsInXDirection )
+        sArray[ maxThreadsInYDirection * sizeSArray + thrj+1 ] = 
+                aux[ ( blockIdx.y * blockDim.y + vecLowerOverlaps[ 1 ] ) * dimX - dimX + blockIdx.x * blockDim.x - 1
+                + maxThreadsInYDirection * dimX + thrj + 1 + vecLowerOverlaps[0] ];
+      else
+        sArray[ maxThreadsInYDirection*sizeSArray + thrj+1 ] = std::numeric_limits< Real >::max();
+      
+    }
+        
+    if( thri == 3 )
+    {
+      if( ( blockIdx.y != 0 || vecLowerOverlaps[1] != 0 ) && thrj+1 < maxThreadsInXDirection )
+        sArray[0*sizeSArray + thrj+1] = 
+                aux[ ( blockIdx.y * blockDim.y + vecLowerOverlaps[ 1 ] ) * dimX - dimX + blockIdx.x * blockDim.x - 1
+                + thrj + 1 + vecLowerOverlaps[ 0 ] ];
+      else
+        sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
+    }
+    
+    // Filling sArray inside
+    if( i - vecLowerOverlaps[ 0 ] < dimX && j - vecLowerOverlaps[ 1 ] < dimY &&
+            thri + 1 < maxThreadsInXDirection + vecUpperOverlaps[ 0 ] && 
+            thrj + 1 < maxThreadsInYDirection + vecUpperOverlaps[ 1 ] )
+    {
+      sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ] = aux[ j * dimX + i ];
+    }
+    __syncthreads();  
+
+    //main while cycle ( CALCULATES TILL VALUES ARE CHANGING )
+    while( changed[ 0 ] )
+    {
+      __syncthreads();
+      
+      changed[ currentIndex] = false;
+      
+      //calculation of update cell
+      if( i < dimX - vecUpperOverlaps[ 0 ] && j < dimY - vecUpperOverlaps[ 1 ] )
+      {
+        if( ! interfaceMap[ j * dimX + i ] )
+        {
+          changed[ currentIndex ] = ptr.updateCell<sizeSArray>( sArray, thri + 1, thrj + 1, hx, hy );
+        }
+      }
+      __syncthreads();
+      
+      //pyramid reduction
+      if( blockDim.x * blockDim.y == 1024 )
+      {
+        if( currentIndex < 512 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
+        }
+      }
+      __syncthreads();
+      if( blockDim.x * blockDim.y >= 512 )
+      {
+        if( currentIndex < 256 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+        }
+      }
+      __syncthreads();
+      if( blockDim.x * blockDim.y >= 256 )
+      {
+        if( currentIndex < 128 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
+        }
+      }
+      __syncthreads();
+      if( blockDim.x * blockDim.y >= 128 )
+      {
+        if( currentIndex < 64 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+        }
+      }
+      __syncthreads();
+      if( currentIndex < 32 ) 
+      {
+        if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+        if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+        if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+        if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+        if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+        if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+      }
+      // result of reduction is in changed[ 0 ]
+      
+      // If we calculated in passage, then the blockCalculationIndicator for this block has to be 1
+      // means that we calculated in this block
+      if( thri == 0 && thrj == 0 && changed[ 0 ] ){
+        blockCalculationIndicator[ blockIdx.y * gridDim.x + blockIdx.x ] = 1;
+      }
+      __syncthreads();
+    }
+    
+    
+      
+    if( i < dimX && j < dimY && thri+1 < maxThreadsInXDirection && thrj+1 < maxThreadsInYDirection )
+      helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ];
+    __syncthreads();
+  }
+  else
+  {
+    if( i < mesh.getDimensions().x() - vecUpperOverlaps[0] && j < mesh.getDimensions().y() - vecUpperOverlaps[1] )
+      helpFunc[ j * mesh.getDimensions().x() + i ] = aux[ j * mesh.getDimensions().x() + i ];
+  }
+}
+#endif
+
+
+
+/// ====================OPEN=MP============================================
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+updateBlocks( InterfaceMapType interfaceMap,
+        MeshFunctionType aux,
+        MeshFunctionType helpFunc,
+        ArrayContainerView BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
+{
+#pragma omp parallel for schedule( dynamic )
+  for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
+  {
+    if( BlockIterHost[ i ] )
+    {
+      MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
+      
+      int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+      //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl;
+      int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
+      //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl;
+      int xkolik = numThreadsPerBlock + 1;
+      int ykolik = numThreadsPerBlock + 1;
+      
+      int blIdx = i%numOfBlockx;
+      int blIdy = i/numOfBlockx;
+      //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl;
+      
+      if( numOfBlockx - 1 == blIdx )
+        xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
+      
+      if( numOfBlocky -1 == blIdy )
+        ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
+      //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl;
+      
+      
+      /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
+       changed[ 0 ] = 1;*/
+      Real hx = mesh.getSpaceSteps().x();
+      Real hy = mesh.getSpaceSteps().y();
+      
+      bool changed = false;
+      BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
+      
+      
+      Real *sArray;
+      sArray = new Real[ sizeSArray * sizeSArray ];
+      if( sArray == nullptr )
+        std::cout << "Error while allocating memory for sArray." << std::endl;
+      
+      for( IndexType thri = 0; thri < sizeSArray; thri++ ){
+        for( IndexType thrj = 0; thrj < sizeSArray; thrj++ )
+          sArray[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
+      }
+      
+      
+      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+      for( IndexType thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
+      {        
+        if( dimX > (blIdx+1) * numThreadsPerBlock  && thrj+1 < ykolik )
+          sArray[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
+        
+        
+        if( blIdx != 0 && thrj+1 < ykolik )
+          sArray[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
+        
+        if( dimY > (blIdy+1) * numThreadsPerBlock  && thrj+1 < xkolik )
+          sArray[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
+        
+        if( blIdy != 0 && thrj+1 < xkolik )
+          sArray[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
+      }
+      
+      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
+        for( IndexType l = 0; l < numThreadsPerBlock; l++ )
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
+            sArray[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
+      }
+      
+      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
+        for( IndexType l = 0; l < numThreadsPerBlock; l++ ){
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){
+            //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
+            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
+            {
+              changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy) || changed;
+              
+            }
+          }
+        }
+      }
+      /*aux.save( "aux-1pruch.tnl" );
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
+      
+      for( IndexType k = 0; k < numThreadsPerBlock; k++ ) 
+        for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { 
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
+          {
+            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
+            {
+              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
+            }
+          }
+        }
+      /*aux.save( "aux-2pruch.tnl" );
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
+      
+      for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ) 
+        for( IndexType l = 0; l < numThreadsPerBlock; l++ ) {
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
+          {
+            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
+            {
+              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
+            }
+          }
+        }
+      /*aux.save( "aux-3pruch.tnl" );
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
+      
+      for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
+        for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { 
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
+          {
+            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
+            {
+              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx, hy, 1.0);
+            }
+          }
+        }
+      }
+      /*aux.save( "aux-4pruch.tnl" );
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
+      
+      
+      if( changed ){
+        BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1;
+      }
+      
+      
+      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
+        for( IndexType l = 0; l < numThreadsPerBlock; l++ ) {
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )      
+            helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray[ (k + 1)* sizeSArray + l + 1 ];
+          //std::cout<< sArray[k+1][l+1];
+        }
+        //std::cout<<std::endl;
+      }
+      delete []sArray;
+    }
+  }
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+void 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY )
+{
+  int* BlockIterPom; 
+  BlockIterPom = new int [numBlockX * numBlockY];
+  
+  for(int i = 0; i < numBlockX * numBlockY; i++)
+  {
+    BlockIterPom[ i ] = 0;//BlockIterPom[ i ] = 0;
+    int m=0, k=0;
+    m = i%numBlockX;
+    k = i/numBlockX;
+    if( m > 0 && BlockIterHost[ i - 1 ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){
+      BlockIterPom[ i ] = 1;
+    }
+  }
+  
+  for(int i = 0; i < numBlockX * numBlockY; i++)
+  {
+    if( !BlockIterHost[ i ] )
+      BlockIterHost[ i ] = BlockIterPom[ i ];
+  }
+  delete[] BlockIterPom;
+}
+
+
+
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..32548abcfe66affd71a79d3ed5f2f21d67df644e
--- /dev/null
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
@@ -0,0 +1,1091 @@
+/* 
+ * File:   tnlDirectEikonalMethodBase3D_impl.h
+ * Author: Fencl
+ *
+ * Created on March 15, 2019
+ */
+
+#pragma once
+
+template< typename Real,
+        typename Device,
+        typename Index >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+initInterface( const MeshFunctionPointer& _input,
+        MeshFunctionPointer& _output,
+        InterfaceMapPointer& _interfaceMap, 
+        StaticVector vLower, StaticVector vUpper )
+{
+  if( std::is_same< Device, Devices::Cuda >::value )
+  {
+#ifdef HAVE_CUDA
+    const MeshType& mesh = _input->getMesh();
+    
+    const int cudaBlockSize( 8 );
+    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
+    int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize );
+    if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
+      std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
+    dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+    dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
+    Devices::Cuda::synchronizeDevice();
+    CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+            _output.template modifyData< Device >(),
+            _interfaceMap.template modifyData< Device >(), vLower, vUpper );
+    cudaDeviceSynchronize();
+    TNL_CHECK_CUDA_DEVICE;
+#endif
+  }
+  if( std::is_same< Device, Devices::Host >::value )
+  {
+    const MeshFunctionType& input =  _input.getData();
+    MeshFunctionType& output =  _output.modifyData();
+    InterfaceMapType& interfaceMap =  _interfaceMap.modifyData();
+    
+    const MeshType& mesh = input.getMesh();
+    typedef typename MeshType::Cell Cell;
+    
+    Cell cell( mesh );
+    for( cell.getCoordinates().z() = 0;
+            cell.getCoordinates().z() < mesh.getDimensions().z();
+            cell.getCoordinates().z() ++ )
+      for( cell.getCoordinates().y() = 0;
+              cell.getCoordinates().y() < mesh.getDimensions().y();
+              cell.getCoordinates().y() ++ )
+        for( cell.getCoordinates().x() = 0;
+                cell.getCoordinates().x() < mesh.getDimensions().x();
+                cell.getCoordinates().x() ++ )
+        {
+          cell.refresh();
+          output[ cell.getIndex() ] =
+                  input( cell ) > 0 ? std::numeric_limits< RealType >::max() :
+                    - std::numeric_limits< RealType >::max();
+          interfaceMap[ cell.getIndex() ] = false;
+        }
+    
+    const RealType& hx = mesh.getSpaceSteps().x();
+    const RealType& hy = mesh.getSpaceSteps().y();
+    const RealType& hz = mesh.getSpaceSteps().z();
+    for( cell.getCoordinates().z() = 0 + vLower[2];
+            cell.getCoordinates().z() < mesh.getDimensions().z() - vUpper[2];
+            cell.getCoordinates().z() ++ )   
+      for( cell.getCoordinates().y() = 0 + vLower[1];
+              cell.getCoordinates().y() < mesh.getDimensions().y() - vUpper[1];
+              cell.getCoordinates().y() ++ )
+        for( cell.getCoordinates().x() = 0 + vLower[0];
+                cell.getCoordinates().x() < mesh.getDimensions().x() - vUpper[0];
+                cell.getCoordinates().x() ++ )
+        {
+          cell.refresh();
+          const RealType& c = input( cell );
+          if( ! cell.isBoundaryEntity() )
+          {
+            auto neighbors = cell.getNeighborEntities();
+            Real pom = 0;
+            const IndexType e = neighbors.template getEntityIndex<  1,  0,  0 >();
+            const IndexType n = neighbors.template getEntityIndex<  0,  1,  0 >();
+            const IndexType t = neighbors.template getEntityIndex<  0,  0,  1 >();
+            
+            
+            if( c * input[ n ] <= 0 )
+            {
+              pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+              if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
+                output[ cell.getIndex() ] = pom;
+              pom = pom - TNL::sign( c )*hy;
+              if( TNL::abs( output[ n ] ) > TNL::abs( pom ) )
+                output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy;
+              
+              interfaceMap[ cell.getIndex() ] = true;
+              interfaceMap[ n ] = true;
+            }
+            
+            if( c * input[ e ] <= 0 )
+            {
+              pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+              if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
+                output[ cell.getIndex() ] = pom;
+              pom = pom - TNL::sign( c )*hx;
+              if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
+                output[ e ] = pom; 
+              
+              interfaceMap[ cell.getIndex() ] = true;
+              interfaceMap[ e ] = true;
+            }
+            
+            if( c * input[ t ] <= 0 )
+            {
+              pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
+              if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
+                output[ cell.getIndex() ] = pom;
+              pom = pom - TNL::sign( c )*hz;
+              if( TNL::abs( output[ t ] ) > TNL::abs( pom ) )
+                output[ t ] = pom; 
+              
+              interfaceMap[ cell.getIndex() ] = true;
+              interfaceMap[ t ] = true;
+            }  
+          }
+        }
+  }
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+template< typename MeshEntity >
+__cuda_callable__
+bool
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateCell( MeshFunctionType& u,
+        const MeshEntity& cell, 
+        const RealType v )
+{
+  const auto& neighborEntities = cell.template getNeighborEntities< 3 >();
+  const MeshType& mesh = cell.getMesh();
+  
+  const RealType& hx = mesh.getSpaceSteps().x();
+  const RealType& hy = mesh.getSpaceSteps().y();
+  const RealType& hz = mesh.getSpaceSteps().z();
+  const RealType value = u( cell );
+  //std::cout << value << std::endl;
+  RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
+  
+  
+  if( cell.getCoordinates().x() == 0 )
+    a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ];
+  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
+    a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ];
+  else
+  {
+    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ],
+            u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] );
+  }
+  
+  if( cell.getCoordinates().y() == 0 )
+    b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ];
+  else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
+    b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ];
+  else
+  {
+    b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ],
+            u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] );
+  }
+  
+  if( cell.getCoordinates().z() == 0 )
+    c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ];
+  else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 )
+    c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ];
+  else
+  {
+    c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ],
+            u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] );
+  }
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() &&
+          fabs( c ) == std::numeric_limits< RealType >::max() )
+    return false;
+  
+  RealType pom[6] = { a, b, c, hx, hy, hz};
+  tmp = getNewValue( pom , value, v );
+  
+  u[ cell.getIndex() ] = tmp;
+  tmp = value - u[ cell.getIndex() ];
+  if ( fabs( tmp ) >  0.001*hx )
+    return true;
+  else
+    return false;
+  /*sortMinims( pom );   
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
+  
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) )
+  {
+    u[ cell.getIndex() ] = argAbsMin( value, tmp );
+    tmp = value - u[ cell.getIndex() ];
+    if ( fabs( tmp ) > 0.001*hx ){
+      return true;
+    }else{
+      return false;
+    }
+  }
+  else
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
+    {
+      u[ cell.getIndex() ] = argAbsMin( value, tmp );
+      tmp = value - u[ cell.getIndex() ];
+      if ( fabs( tmp ) > 0.001*hx ){
+        return true;
+      }else{
+        return false;
+      }
+    }
+    else
+    {
+      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
+              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
+              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
+      u[ cell.getIndex() ] = argAbsMin( value, tmp );
+      tmp = value - u[ cell.getIndex() ];
+      if ( fabs( tmp ) > 0.001*hx ){
+        return true;
+      }else{
+        return false;
+      }
+    }
+  }*/
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+__cuda_callable__ 
+bool 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateCell( volatile Real *sArray, int thri, int thrj, int thrk,
+        const Real hx, const Real hy, const Real hz, const Real v )
+{
+  const RealType value = sArray[thrk *sizeSArray * sizeSArray + thrj * sizeSArray + thri];
+  
+  RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
+  
+  c = TNL::argAbsMin( sArray[ (thrk+1)* sizeSArray*sizeSArray + thrj * sizeSArray + thri ],
+          sArray[ (thrk-1) * sizeSArray *sizeSArray + thrj* sizeSArray + thri ] );
+  
+  b = TNL::argAbsMin( sArray[ thrk* sizeSArray*sizeSArray + (thrj+1) * sizeSArray + thri ],
+          sArray[ thrk* sizeSArray * sizeSArray + (thrj-1)* sizeSArray +thri ] );
+  
+  a = TNL::argAbsMin( sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri+1 ],
+          sArray[ thrk* sizeSArray * sizeSArray + thrj* sizeSArray +thri-1 ] );
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() &&
+          fabs( c ) == std::numeric_limits< RealType >::max() )
+    return false;
+  
+  RealType pom[6] = { a, b, c, hx, hy, hz};
+  
+  tmp = getNewValue( pom , value, v );
+  
+  sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ] = tmp;
+  tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
+  if ( fabs( tmp ) >  0.001*hx )
+    return true;
+  else
+    return false;
+  /*sortMinims( pom );
+  
+  // calculation of real value taken from ZHAO
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+  {
+    sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
+    tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
+    if ( fabs( tmp ) >  0.001*hx )
+      return true;
+    else
+      return false;
+  }
+  else
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
+    {
+      sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
+      tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
+      if ( fabs( tmp ) > 0.001*hx )
+        return true;
+      else
+        return false;
+    }
+    else
+    {
+      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
+              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
+              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
+      sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
+      tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
+      if ( fabs( tmp ) > 0.001*hx )
+        return true;
+      else
+        return false;
+    }
+  }*/
+  
+}
+
+
+template< typename Real,
+        typename Device,
+        typename Index >
+__cuda_callable__ 
+Real
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+getNewValue( RealType valuesAndSteps[], const RealType originalValue, const RealType v )
+{
+  RealType newValue = std::numeric_limits< RealType >::max();
+  sortMinims( valuesAndSteps );
+  
+  // calculation of real value taken from ZHAO
+  newValue = valuesAndSteps[ 0 ] + TNL::sign( originalValue ) * valuesAndSteps[ 3 ]/v;
+  if( fabs( newValue ) < fabs( valuesAndSteps[ 1 ] ) ) 
+  {
+    newValue = argAbsMin( originalValue, newValue );
+  }
+  else
+  {
+    newValue = ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] * valuesAndSteps[ 1 ] + 
+            valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] * valuesAndSteps[ 0 ] + 
+            TNL::sign( originalValue ) * valuesAndSteps[ 3 ] * valuesAndSteps[ 4 ] * 
+            TNL::sqrt( ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] +  valuesAndSteps[ 4 ] *  valuesAndSteps[ 4 ] )/( v * v ) - 
+            ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) * 
+            ( valuesAndSteps[ 1 ] - valuesAndSteps[ 0 ] ) ) )/
+            ( valuesAndSteps[ 3 ] * valuesAndSteps[ 3 ] + valuesAndSteps[ 4 ] * valuesAndSteps[ 4 ] );
+    if( fabs( newValue ) < fabs( valuesAndSteps[ 2 ]) ) 
+    {
+      newValue = argAbsMin( originalValue, newValue );
+    }
+    else
+    {
+      // Value from algorithm by Zhao
+      newValue = ( valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[5] * valuesAndSteps[5] * valuesAndSteps[0] +
+              valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[5] * valuesAndSteps[5] * valuesAndSteps[1] +
+              valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[2] +
+              TNL::sign(originalValue) * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[5] * TNL::sqrt(
+              (valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[5] * valuesAndSteps[5] +
+              valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[5] * valuesAndSteps[5] +
+              valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[4]) / (v * v) -
+              valuesAndSteps[5] * valuesAndSteps[5] * (valuesAndSteps[0] - valuesAndSteps[1]) * (valuesAndSteps[0] - valuesAndSteps[1]) -
+              valuesAndSteps[4] * valuesAndSteps[4] * (valuesAndSteps[0] - valuesAndSteps[2]) * (valuesAndSteps[0] - valuesAndSteps[2]) -
+              valuesAndSteps[3] * valuesAndSteps[3] * (valuesAndSteps[1] - valuesAndSteps[2]) * (valuesAndSteps[1] - valuesAndSteps[2]))) / (
+              valuesAndSteps[3] * valuesAndSteps[3] * valuesAndSteps[4] * valuesAndSteps[4] +
+              valuesAndSteps[4] * valuesAndSteps[4] * valuesAndSteps[5] * valuesAndSteps[5] +
+              valuesAndSteps[5] * valuesAndSteps[5] * valuesAndSteps[3] * valuesAndSteps[3]);
+      newValue = argAbsMin( originalValue, newValue );
+    }
+  }
+  
+  return newValue;
+}
+
+
+#ifdef HAVE_CUDA
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+        Containers::StaticVector< 3, Index > vLower, Containers::StaticVector< 3, Index > vUpper )
+{
+  int i = threadIdx.x + blockDim.x*blockIdx.x;
+  int j = blockDim.y*blockIdx.y + threadIdx.y;
+  int k = blockDim.z*blockIdx.z + threadIdx.z;
+  const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  
+  if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
+  {
+    typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
+    Cell cell( mesh );
+    cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
+    cell.refresh();
+    const Index cind = cell.getIndex();
+    
+    
+    output[ cind ] =
+            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+              - std::numeric_limits< Real >::max();
+    interfaceMap[ cind ] = false; 
+    cell.refresh();
+    
+    if( i < mesh.getDimensions().x() - vUpper[0] && j < mesh.getDimensions().y() - vUpper[1] &&
+            k < mesh.getDimensions().y() - vUpper[2] && i>vLower[0]-1 && j> vLower[1]-1 && k>vLower[2]-1 )
+    {
+      const Real& hx = mesh.getSpaceSteps().x();
+      const Real& hy = mesh.getSpaceSteps().y();
+      const Real& hz = mesh.getSpaceSteps().z();
+      const Real& c = input( cell );
+      if( ! cell.isBoundaryEntity()  )
+      {
+        auto neighbors = cell.getNeighborEntities();
+        Real pom = 0;
+        const Index e = neighbors.template getEntityIndex<  1, 0, 0 >();
+        const Index w = neighbors.template getEntityIndex<  -1, 0, 0 >();
+        const Index n = neighbors.template getEntityIndex<  0, 1, 0 >();
+        const Index s = neighbors.template getEntityIndex<  0, -1, 0 >();
+        const Index t = neighbors.template getEntityIndex<  0, 0, 1 >();
+        const Index b = neighbors.template getEntityIndex<  0, 0, -1 >();
+        
+        if( c * input[ n ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+            output[ cind ] = pom;
+          
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ e ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+            output[ cind ] = pom;                       
+          
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ w ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+            output[ cind ] = pom;
+          
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ s ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+            output[ cind ] = pom;
+          
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ b ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hz * c )/( c - input[ b ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+            output[ cind ] = pom;
+          
+          interfaceMap[ cind ] = true;
+        }
+        if( c * input[ t ] <= 0 )
+        {
+          pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
+          if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+            output[ cind ] = pom;
+          
+          interfaceMap[ cind ] = true;
+        }
+      }
+    }
+  }
+}
+
+
+template < typename Index >
+__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
+        int numBlockX, int numBlockY, int numBlockZ )
+{
+  int i = blockIdx.x * 1024 + threadIdx.x;
+  
+  if( i < numBlockX * numBlockY * numBlockZ )
+  {
+    int pom = 0;//BlockIterPom[ i ] = 0;
+    int m=0, l=0, k=0;
+    l = i/( numBlockX * numBlockY );
+    k = (i-l*numBlockX * numBlockY )/(numBlockX );
+    m = (i-l*numBlockX * numBlockY )%( numBlockX );
+    if( m > 0 && BlockIterDevice[ i - 1 ] ){ // left neighbour
+      pom = 1;//BlockIterPom[ i ] = 1;
+    }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ // right neighbour
+      pom = 1;//BlockIterPom[ i ] = 1;
+    }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ // bottom neighbour
+      pom = 1;// BlockIterPom[ i ] = 1;
+    }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ // top neighbour
+      pom = 1;//BlockIterPom[ i ] = 1;
+    }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){ // neighbour behind 
+      pom = 1;
+    }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){ // neighbour in front
+      pom = 1;
+    }
+    
+    if( !BlockIterDevice[ i ] ) // only in CudaUpdateCellCaller can BlockIterDevice gain 0
+      BlockIterPom[ i ] = pom;
+    else
+      BlockIterPom[ i ] = 1;
+  }
+}
+
+
+template < int sizeSArray, typename Real, typename Device, typename Index >
+__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
+        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+        Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps )
+{
+  int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
+  int i = threadIdx.x + blockDim.x*blockIdx.x + vecLowerOverlaps[0]; // WITH OVERLAPS!!! i,j,k aren't coordinates of all values
+  int j = blockDim.y*blockIdx.y + threadIdx.y + vecLowerOverlaps[1];
+  int k = blockDim.z*blockIdx.z + threadIdx.z + vecLowerOverlaps[2];
+  int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
+  const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+  
+  // should this block calculate?
+  if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] ) 
+  {
+    __syncthreads();
+    
+    // Array indicates weather some threads calculated (for parallel reduction)
+    __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2) ];
+    changed[ currentIndex ] = false;
+    
+    if( thrj == 0 && thri == 0 && thrk == 0 )
+      changed[ 0 ] = true; // first indicates weather we should calculate again (princip of parallel reduction)
+    
+    //getting stepps and size of mesh
+    const Real hx = mesh.getSpaceSteps().x(); const int dimX = mesh.getDimensions().x(); 
+    const Real hy = mesh.getSpaceSteps().y(); const int dimY = mesh.getDimensions().y();
+    const Real hz = mesh.getSpaceSteps().z(); const int dimZ  = mesh.getDimensions().z();
+    
+    if( thrj == 1 && thri == 1 && thrk == 1 )
+    {
+      // we dont know if we will calculate in here, more info down in code
+      BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0;
+    }
+    
+    // sArray contains values of one block (coppied from aux) and edges (not MPI) of those blocks
+    __shared__ volatile Real sArray[ sizeSArray * sizeSArray * sizeSArray ];
+    sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = std::numeric_limits< Real >::max();
+    
+    
+    int xkolik = blockDim.x + 1;// maximum of threads in x direction (for all blocks different)
+    int ykolik = blockDim.y + 1;
+    int zkolik = blockDim.z + 1;
+    __syncthreads();
+    
+    if( gridDim.x - 1 == blockIdx.x )
+      xkolik = (dimX-vecUpperOverlaps[0]-vecLowerOverlaps[0]) -  blockIdx.x*blockDim.x+1;
+    if( gridDim.y -1 == blockIdx.y )
+      ykolik = (dimY-vecUpperOverlaps[1]-vecLowerOverlaps[1]) - (blockIdx.y)*blockDim.y+1;
+    if( gridDim.z-1 == blockIdx.z )
+      zkolik = (dimZ-vecUpperOverlaps[2]-vecLowerOverlaps[2]) - (blockIdx.z)*blockDim.z+1;
+    __syncthreads();
+    
+     //filling sArray edges
+    if( thri == 0 ) //x bottom
+    {        
+      if(  (blockIdx.x != 0 || vecLowerOverlaps[0] !=0) && thrj+1 < ykolik && thrk+1 < zkolik )
+        sArray[ (thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0 ] = 
+                aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX 
+                + blockIdx.x*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY + vecLowerOverlaps[0] ];
+    else
+        sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 1 ) //xtop
+    {
+      if( dimX - vecLowerOverlaps[ 0 ] >  (blockIdx.x+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] =
+                aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX
+                + blockIdx.x*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY + vecLowerOverlaps[0] ];
+     else
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max();
+    }
+    if( thri == 2 ) //y bottom
+    {        
+      if( (blockIdx.y != 0 || vecLowerOverlaps[1] !=0) && thrj+1 < xkolik && thrk+1 < zkolik )
+        sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] =
+                aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX
+                + blockIdx.x*blockDim.x - dimX + thrj + thrk*dimX*dimY + vecLowerOverlaps[0] ];
+      else
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + 0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 3 ) //y top
+    {
+      if( dimY - vecLowerOverlaps[ 1 ] > (blockIdx.y+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] =
+                aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + ((blockIdx.y+1) * blockDim.y+vecLowerOverlaps[1])*dimX
+                + blockIdx.x*blockDim.x + thrj + thrk*dimX*dimY + vecLowerOverlaps[0] ];
+     else
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
+    }
+    if( thri == 4 ) //z bottom
+    {        
+      if( (blockIdx.z != 0 || vecLowerOverlaps[2] !=0) && thrj+1 < ykolik && thrk+1 < xkolik )
+        sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] =
+                aux[ (blockIdx.z*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX
+                + blockIdx.x*blockDim.x - dimX * dimY + thrj * dimX + thrk + vecLowerOverlaps[0] ];
+     else
+        sArray[0 * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 5 ) //z top
+    {
+      if( dimZ - vecLowerOverlaps[ 2 ] > (blockIdx.z+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
+        sArray[ zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] =
+                aux[ ((blockIdx.z+1)*blockDim.z + vecLowerOverlaps[2]) * dimX * dimY + (blockIdx.y * blockDim.y+vecLowerOverlaps[1])*dimX
+                + blockIdx.x*blockDim.x + thrj * dimX + thrk + vecLowerOverlaps[0] ];
+     else
+        sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
+    }
+    
+    // Copy all other values that aren't edges
+    if( i - vecLowerOverlaps[0] < dimX && j - vecLowerOverlaps[1] < dimY && k - vecLowerOverlaps[2] < dimZ &&
+        thri+1 < xkolik + vecUpperOverlaps[0] && thrj+1 < ykolik + vecUpperOverlaps[1] && thrk+1 < zkolik + vecUpperOverlaps[2] )
+    {
+      sArray[(thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
+    }
+    __syncthreads(); 
+    
+    //main while cycle. each value can get information only from neighbour but that information has to spread there
+    while( changed[ 0 ] )
+    {
+      __syncthreads();
+      
+      changed[ currentIndex ] = false;
+      
+      //calculation of update cell
+      if( i < dimX - vecUpperOverlaps[0] && j < dimY - vecUpperOverlaps[1] && k < dimZ - vecUpperOverlaps[2] )
+      {
+        if( ! interfaceMap[ k*dimX*dimY + j * dimX + i ] )
+        {
+          // calculate new value depending on neighbours in sArray on (thri+1, thrj+1) coordinates
+          changed[ currentIndex ] = ptr.updateCell< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); 
+        }
+      }
+      __syncthreads();
+      
+      //pyramid reduction (parallel reduction)
+      if( blockDim.x*blockDim.y*blockDim.z == 1024 )
+      {
+        if( currentIndex < 512 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
+        }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 512 )
+      {
+        if( currentIndex < 256 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+        }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 256 )
+      {
+        if( currentIndex < 128 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
+        }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 128 )
+      {
+        if( currentIndex < 64 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+        }
+      }
+      __syncthreads();
+      if( currentIndex < 32 )
+      {
+        if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+        if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+        if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+        if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+        if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+        if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+      }
+      __syncthreads();
+      
+      // if we calculated, then the BlockIterDevice should contain the info about this whole block! (only one number for one block)
+      if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
+      {
+        BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 1;
+      }
+      __syncthreads();
+    }
+    
+    // copy results into helpFunc (not into aux bcs of conflicts)
+    if( i < dimX && j < dimY && k < dimZ && thri+1 < xkolik && thrj+1 < ykolik && thrk+1 < zkolik )
+      helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thri+1 ];
+    
+  }
+  else // if not, then it should at least copy the values from aux to helpFunc.
+  {
+    if( i < mesh.getDimensions().x() - vecUpperOverlaps[0] && j < mesh.getDimensions().y() - vecUpperOverlaps[1]
+            && k < mesh.getDimensions().z() - vecUpperOverlaps[2])
+      helpFunc[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ] =
+              aux[ k * mesh.getDimensions().x() * mesh.getDimensions().y() + j * mesh.getDimensions().x() + i ];
+  }
+}  
+#endif
+
+
+/// ==========================OPEN=MP=================================
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateBlocks( const InterfaceMapType interfaceMap,
+        const MeshFunctionType aux,
+        MeshFunctionType& helpFunc,
+        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
+{  
+  #pragma omp parallel for schedule( dynamic )
+  for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
+  {
+    if( BlockIterHost[ i ] )
+    {
+      MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
+      
+      int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+      int dimZ = mesh.getDimensions().z();
+      //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl;
+      int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlockz = dimZ/numThreadsPerBlock + ((dimZ%numThreadsPerBlock != 0) ? 1:0);
+      //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl;
+      int xkolik = numThreadsPerBlock + 1;
+      int ykolik = numThreadsPerBlock + 1;
+      int zkolik = numThreadsPerBlock + 1;
+      
+      
+      int blIdz = i/( numOfBlockx * numOfBlocky );
+      int blIdy = (i-blIdz*numOfBlockx * numOfBlocky )/(numOfBlockx );
+      int blIdx = (i-blIdz*numOfBlockx * numOfBlocky )%( numOfBlockx );
+      //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl;
+      
+      if( numOfBlockx - 1 == blIdx )
+        xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
+      if( numOfBlocky -1 == blIdy )
+        ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
+      if( numOfBlockz-1 == blIdz )
+        zkolik = dimZ - (blIdz)*numThreadsPerBlock+1;
+      //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl;
+      
+      
+      /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
+       changed[ 0 ] = 1;*/
+      Real hx = mesh.getSpaceSteps().x();
+      Real hy = mesh.getSpaceSteps().y();
+      Real hz = mesh.getSpaceSteps().z();
+      
+      bool changed = false;
+      BlockIterHost[ i ] = 0;
+      
+      
+      Real *sArray;
+      sArray = new Real[ sizeSArray * sizeSArray * sizeSArray ];
+      if( sArray == nullptr )
+        std::cout << "Error while allocating memory for sArray." << std::endl;
+      
+      for( IndexType k = 0; k < sizeSArray; k++ )
+        for( IndexType l = 0; l < sizeSArray; l++ )
+          for( IndexType m = 0; m < sizeSArray; m++ ){
+            sArray[ m * sizeSArray * sizeSArray + k * sizeSArray + l ] = std::numeric_limits< Real >::max();
+          }
+      
+      
+      for( IndexType thrk = 0; thrk < numThreadsPerBlock; thrk++ )
+        for( IndexType thrj = 0; thrj < numThreadsPerBlock; thrj++ )
+        {
+          if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
+            sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX -1 + thrk*dimX*dimY ];
+          
+          if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy *numThreadsPerBlock*dimX+ blIdx*numThreadsPerBlock + numThreadsPerBlock + thrj * dimX + thrk*dimX*dimY ];
+          
+          if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX + thrj + thrk*dimX*dimY ];
+          
+          if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + (blIdy+1) * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj + thrk*dimX*dimY ];
+          
+          if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
+            sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX * dimY + thrj * dimX + thrk ];
+          
+          if( dimZ > (blIdz+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < xkolik )
+            sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = 
+                    aux[ (blIdz+1)*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX + thrk ];
+        }
+      
+      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
+        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
+          for( IndexType l = 0; l < numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+              sArray[(m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1] = 
+                      aux[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ];
+          }
+        }
+      }
+      /*string s;
+       int numWhile = 0;
+       for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      
+      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
+        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
+          for( IndexType l = 0; l < numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){
+              //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                //printf("In with point m  = %d, k = %d, l = %d\n", m, k, l);
+                changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz) || changed;
+                
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      
+      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
+          for( IndexType l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      
+      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
+        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
+          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );
+       */
+      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
+          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell< sizeSArray >(  sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      
+      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
+        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( IndexType l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      
+      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( IndexType l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      
+      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
+        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      
+      
+      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+       for( int l = 0; l < numThreadsPerBlock; l++ ) 
+       for( int m = 0; m < numThreadsPerBlock; m++ )
+       if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+       helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+       } 
+       numWhile++;
+       s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+       helpFunc.save( s );*/
+      
+      if( changed ){
+        BlockIterHost[ i ] = 1;
+      }
+      
+      
+      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
+        for( IndexType l = 0; l < numThreadsPerBlock; l++ ) {
+          for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){      
+              helpFunc[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] = 
+                      sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+              //std::cout << helpFunc[ m*dimX*dimY + k*dimX + l ] << " ";
+            }
+          }
+          //std::cout << std::endl;
+        }
+        //std::cout << std::endl;
+      }
+      //helpFunc.save( "helpF.tnl");
+      delete []sArray;
+    }
+  }
+}
+
+template< typename Real,
+        typename Device,
+        typename Index >
+void 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
+{
+  int* BlockIterPom; 
+  BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ];
+  
+  for( int i = 0; i< BlockIterHost.getSize(); i++)
+  {
+    BlockIterPom[ i ] = 0;
+    
+    int m=0, l=0, k=0;
+    l = i/( numBlockX * numBlockY );
+    k = (i-l*numBlockX * numBlockY )/(numBlockX );
+    m = (i-l*numBlockX * numBlockY )%( numBlockX );
+    
+    if( m > 0 && BlockIterHost[ i - 1 ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( l > 0 && BlockIterHost[ i - numBlockX*numBlockY ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( l < numBlockZ-1 && BlockIterHost[ i + numBlockX*numBlockY ] ){
+      BlockIterPom[ i ] = 1;
+    }
+  }
+  for( int i = 0; i< BlockIterHost.getSize(); i++)
+  { 
+    BlockIterHost[ i ] = BlockIterPom[ i ];
+  }
+}
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index 24a3885540e091047cd2580468ff24d3f1da38d6..7cba99f6586b84e31fbe1a49da4223c9e621dd5b 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -7,9 +7,9 @@
 
 #pragma once 
 
-#include <TNL/Meshes/Grid.h>
-#include <TNL/Functions/MeshFunction.h>
-#include <TNL/Devices/Cuda.h>
+//#include <TNL/Meshes/Grid.h>
+//#include <TNL/Functions/MeshFunction.h>
+//#include <TNL/Devices/Cuda.h>
 
 using namespace TNL;
 
@@ -62,30 +62,44 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
     typedef Functions::MeshFunction< MeshType > MeshFunctionType;
     typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
     typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+    using ArrayContainerView = typename ArrayContainer::ViewType;
+    typedef Containers::StaticVector< 2, Index > StaticVector;
+    
+    using MeshPointer = Pointers::SharedPointer<  MeshType >;
     using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
     using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
     
+    // CALLER FOR HOST AND CUDA
     void initInterface( const MeshFunctionPointer& input,
             MeshFunctionPointer& output,
-            InterfaceMapPointer& interfaceMap );
-    
+            InterfaceMapPointer& interfaceMap,
+            StaticVector vLower, StaticVector vUpper );
+        
+    // FOR HOST
     template< typename MeshEntity >
-    __cuda_callable__ void updateCell( MeshFunctionType& u,
+    __cuda_callable__ bool updateCell( MeshFunctionType& u,
             const MeshEntity& cell,
             const RealType velocity = 1.0 );
     
+    // FOR CUDA
     template< int sizeSArray >
-    __cuda_callable__ bool updateCell( volatile Real *sArray,
-            int thri, int thrj, const Real hx, const Real hy,
-            const Real velocity = 1.0 );
-    
+    __cuda_callable__ bool updateCell( volatile RealType *sArray,
+            int thri, int thrj, const RealType hx, const RealType hy,
+            const RealType velocity = 1.0 );
+        
+// FOR OPENMP WILL BE REMOVED
+    void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY  );
+        
     template< int sizeSArray >
-    void updateBlocks( const InterfaceMapType& interfaceMap,
-            MeshFunctionType& aux,
-            MeshFunctionType& helpFunc,
-            ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
+    void updateBlocks( InterfaceMapType interfaceMap,
+            MeshFunctionType aux,
+            MeshFunctionType helpFunc,
+            ArrayContainerView BlockIterHost, int numThreadsPerBlock );
+    
+  protected:
     
-    void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY  );
+   __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[],
+           const RealType originalValue, const RealType v );
 };
 
 template< typename Real,
@@ -101,36 +115,49 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
     typedef Functions::MeshFunction< MeshType > MeshFunctionType;
     typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType;
     typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+    using ArrayContainerView = typename ArrayContainer::ViewType;
+    typedef Containers::StaticVector< 3, Index > StaticVector;
     using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
     using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
     
+     // CALLER FOR HOST AND CUDA
     void initInterface( const MeshFunctionPointer& input,
             MeshFunctionPointer& output,
-            InterfaceMapPointer& interfaceMap );
+            InterfaceMapPointer& interfaceMap,
+            StaticVector vLower, StaticVector vUpper );
     
+    // FOR HOST
     template< typename MeshEntity >
-    __cuda_callable__ void updateCell( MeshFunctionType& u,
+    __cuda_callable__ bool updateCell( MeshFunctionType& u,
             const MeshEntity& cell,
             const RealType velocity = 1.0);
     
+    // FOR CUDA
     template< int sizeSArray >
-    void updateBlocks( const InterfaceMapType& interfaceMap,
-            const MeshFunctionType& aux,
-            MeshFunctionType& helpFunc,
-            ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
+    __cuda_callable__ bool updateCell( volatile Real *sArray,
+            int thri, int thrj, int thrk, const RealType hx, const RealType hy, const RealType hz,
+            const RealType velocity = 1.0 );
     
-    void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
+    // OPENMP WILL BE REMOVED
+    void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
     
     template< int sizeSArray >
-    __cuda_callable__ bool updateCell3D( volatile Real *sArray,
-            int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz,
-            const Real velocity = 1.0 );
+    void updateBlocks( const InterfaceMapType interfaceMap,
+            const MeshFunctionType aux,
+            MeshFunctionType& helpFunc,
+            ArrayContainer BlockIterHost, int numThreadsPerBlock );
+    
+  protected:
+    
+    __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[],
+           const RealType originalValue, const RealType v );
 };
 
 template < typename T1 >
 __cuda_callable__ void sortMinims( T1 pom[] );
 
 #ifdef HAVE_CUDA
+// 1D
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
         Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
@@ -142,42 +169,53 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux,
         bool *BlockIterDevice );
 
+
+
+
+// 2D
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+        const Containers::StaticVector< 2, Index > vecLowerOverlas,
+        const Containers::StaticVector< 2, Index > vecUpperOerlaps );
+
 template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
         Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock =0);
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
+        const Containers::StaticVector< 2, Index > vecLowerOverlaps, 
+        const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock =0);
 
 template < typename Index >
-__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks );
+__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY );
 
-template < typename Index >
-__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
 
-template < typename Real, typename Device, typename Index >
-__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap );
 
+// 3D
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
         Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
-        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+        Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps );
 
 template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
         const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
         const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
         Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice );
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+        Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps );
 
 template < typename Index >
-__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
         TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
         int numBlockX, int numBlockY, int numBlockZ );
 #endif
 
-#include "tnlDirectEikonalMethodsBase_impl.h"
+#include "tnlDirectEikonalMethodBase1D_impl.h"
+#include "tnlDirectEikonalMethodBase2D_impl.h"
+#include "tnlDirectEikonalMethodBase3D_impl.h"
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
deleted file mode 100644
index 26444bcfa591d702fbf7fdb325db0fd846c31fa1..0000000000000000000000000000000000000000
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ /dev/null
@@ -1,1639 +0,0 @@
-/* 
- * File:   tnlDirectEikonalMethodsBase_impl.h
- * Author: oberhuber
- *
- * Created on July 14, 2016, 3:22 PM
- */
-
-#pragma once
-
-#include <limits>
-
-#include <iostream>
-#include "tnlFastSweepingMethod.h"
-#include "tnlDirectEikonalMethodsBase.h"
-
-template< typename Real,
-        typename Device,
-        typename Index >
-void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
-initInterface( const MeshFunctionPointer& _input,
-        MeshFunctionPointer& _output,
-        InterfaceMapPointer& _interfaceMap  )
-{
-  if( std::is_same< Device, Devices::Cuda >::value )
-  {
-#ifdef HAVE_CUDA
-    const MeshType& mesh = _input->getMesh();
-    
-    const int cudaBlockSize( 16 );
-    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
-    dim3 blockSize( cudaBlockSize );
-    dim3 gridSize( numBlocksX );
-    Devices::Cuda::synchronizeDevice();
-    CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
-            _output.template modifyData< Device >(),
-            _interfaceMap.template modifyData< Device >() );
-    cudaDeviceSynchronize();
-    TNL_CHECK_CUDA_DEVICE;
-#endif
-  }
-  if( std::is_same< Device, Devices::Host >::value )
-  {
-    const MeshType& mesh = _input->getMesh();
-    typedef typename MeshType::Cell Cell;
-    const MeshFunctionType& input = _input.getData();
-    MeshFunctionType& output = _output.modifyData();
-    InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
-    Cell cell( mesh );
-    for( cell.getCoordinates().x() = 0;
-            cell.getCoordinates().x() < mesh.getDimensions().x();
-            cell.getCoordinates().x() ++ )
-    {
-      cell.refresh();
-      output[ cell.getIndex() ] =
-              input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
-                -std::numeric_limits< RealType >::max();
-      interfaceMap[ cell.getIndex() ] = false;
-    }
-    
-    
-    const RealType& h = mesh.getSpaceSteps().x();
-    for( cell.getCoordinates().x() = 0;
-            cell.getCoordinates().x() < mesh.getDimensions().x() - 1;
-            cell.getCoordinates().x() ++ )
-    {
-      cell.refresh();
-      const RealType& c = input( cell );      
-      if( ! cell.isBoundaryEntity()  )
-      {
-        const auto& neighbors = cell.getNeighborEntities();
-        Real pom = 0;
-        //const IndexType& c = cell.getIndex();
-        const IndexType e = neighbors.template getEntityIndex<  1 >();
-        if( c * input[ e ] <= 0 )
-        {
-          pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
-          if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
-            output[ cell.getIndex() ] = pom;
-          
-          pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-          if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
-            output[ e ] = pom; 
-          
-          interfaceMap[ cell.getIndex() ] = true;
-          interfaceMap[ e ] = true;
-        }
-      }
-    }
-  }
-}
-
-template< typename Real,
-        typename Device,
-        typename Index >
-template< int sizeSArray >
-void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-updateBlocks( const InterfaceMapType& interfaceMap,
-        MeshFunctionType& aux,
-        MeshFunctionType& helpFunc,
-        ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
-{
-#pragma omp parallel for schedule( dynamic )
-  for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
-  {
-    if( BlockIterHost[ i ] )
-    {
-      MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
-      
-      int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
-      //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl;
-      int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
-      int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
-      //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl;
-      int xkolik = numThreadsPerBlock + 1;
-      int ykolik = numThreadsPerBlock + 1;
-      
-      int blIdx = i%numOfBlockx;
-      int blIdy = i/numOfBlockx;
-      //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl;
-      
-      if( numOfBlockx - 1 == blIdx )
-        xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
-      
-      if( numOfBlocky -1 == blIdy )
-        ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
-      //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl;
-      
-      
-      /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
-       changed[ 0 ] = 1;*/
-      Real hx = mesh.getSpaceSteps().x();
-      Real hy = mesh.getSpaceSteps().y();
-      
-      bool changed = false;
-      BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
-      
-      
-      Real *sArray;
-      sArray = new Real[ sizeSArray * sizeSArray ];
-      if( sArray == nullptr )
-        std::cout << "Error while allocating memory for sArray." << std::endl;
-      
-      for( IndexType thri = 0; thri < sizeSArray; thri++ ){
-        for( IndexType thrj = 0; thrj < sizeSArray; thrj++ )
-          sArray[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
-      }
-      
-      
-      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
-      for( IndexType thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
-      {        
-        if( dimX > (blIdx+1) * numThreadsPerBlock  && thrj+1 < ykolik )
-          sArray[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
-        
-        
-        if( blIdx != 0 && thrj+1 < ykolik )
-          sArray[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
-        
-        if( dimY > (blIdy+1) * numThreadsPerBlock  && thrj+1 < xkolik )
-          sArray[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
-        
-        if( blIdy != 0 && thrj+1 < xkolik )
-          sArray[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
-      }
-      
-      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
-        for( IndexType l = 0; l < numThreadsPerBlock; l++ )
-          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
-            sArray[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
-      }
-      
-      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
-        for( IndexType l = 0; l < numThreadsPerBlock; l++ ){
-          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){
-            //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
-            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
-            {
-              changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy) || changed;
-              
-            }
-          }
-        }
-      }
-      /*aux.save( "aux-1pruch.tnl" );
-       for( int k = 0; k < sizeSArray; k++ ){ 
-       for( int l = 0; l < sizeSArray; l++ ) {
-       std::cout << sArray[ k * sizeSArray + l] << " ";
-       }
-       std::cout << std::endl;
-       }*/
-      
-      for( IndexType k = 0; k < numThreadsPerBlock; k++ ) 
-        for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { 
-          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
-          {
-            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
-            {
-              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
-            }
-          }
-        }
-      /*aux.save( "aux-2pruch.tnl" );
-       for( int k = 0; k < sizeSArray; k++ ){ 
-       for( int l = 0; l < sizeSArray; l++ ) {
-       std::cout << sArray[ k * sizeSArray + l] << " ";
-       }
-       std::cout << std::endl;
-       }*/
-      
-      for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ) 
-        for( IndexType l = 0; l < numThreadsPerBlock; l++ ) {
-          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
-          {
-            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
-            {
-              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
-            }
-          }
-        }
-      /*aux.save( "aux-3pruch.tnl" );
-       for( int k = 0; k < sizeSArray; k++ ){ 
-       for( int l = 0; l < sizeSArray; l++ ) {
-       std::cout << sArray[ k * sizeSArray + l] << " ";
-       }
-       std::cout << std::endl;
-       }*/
-      
-      for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
-        for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ) { 
-          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
-          {
-            if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
-            {
-              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx, hy, 1.0);
-            }
-          }
-        }
-      }
-      /*aux.save( "aux-4pruch.tnl" );
-       for( int k = 0; k < sizeSArray; k++ ){ 
-       for( int l = 0; l < sizeSArray; l++ ) {
-       std::cout << sArray[ k * sizeSArray + l] << " ";
-       }
-       std::cout << std::endl;
-       }*/
-      
-      
-      if( changed ){
-        BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1;
-      }
-      
-      
-      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
-        for( IndexType l = 0; l < numThreadsPerBlock; l++ ) {
-          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )      
-            helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray[ (k + 1)* sizeSArray + l + 1 ];
-          //std::cout<< sArray[k+1][l+1];
-        }
-        //std::cout<<std::endl;
-      }
-      delete []sArray;
-    }
-  }
-}
-template< typename Real,
-        typename Device,
-        typename Index >
-template< int sizeSArray >
-void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-updateBlocks( const InterfaceMapType& interfaceMap,
-        const MeshFunctionType& aux,
-        MeshFunctionType& helpFunc,
-        ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
-{  
-//#pragma omp parallel for schedule( dynamic )
-  for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
-  {
-    if( BlockIterHost[ i ] )
-    {
-      MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
-      
-      int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
-      int dimZ = mesh.getDimensions().z();
-      //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl;
-      int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
-      int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
-      int numOfBlockz = dimZ/numThreadsPerBlock + ((dimZ%numThreadsPerBlock != 0) ? 1:0);
-      //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl;
-      int xkolik = numThreadsPerBlock + 1;
-      int ykolik = numThreadsPerBlock + 1;
-      int zkolik = numThreadsPerBlock + 1;
-      
-      
-      int blIdz = i/( numOfBlockx * numOfBlocky );
-      int blIdy = (i-blIdz*numOfBlockx * numOfBlocky )/(numOfBlockx );
-      int blIdx = (i-blIdz*numOfBlockx * numOfBlocky )%( numOfBlockx );
-      //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl;
-      
-      if( numOfBlockx - 1 == blIdx )
-        xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
-      if( numOfBlocky -1 == blIdy )
-        ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
-      if( numOfBlockz-1 == blIdz )
-        zkolik = dimZ - (blIdz)*numThreadsPerBlock+1;
-      //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl;
-      
-      
-      /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
-       changed[ 0 ] = 1;*/
-      Real hx = mesh.getSpaceSteps().x();
-      Real hy = mesh.getSpaceSteps().y();
-      Real hz = mesh.getSpaceSteps().z();
-      
-      bool changed = false;
-      BlockIterHost[ i ] = 0;
-      
-      
-      Real *sArray;
-      sArray = new Real[ sizeSArray * sizeSArray * sizeSArray ];
-      if( sArray == nullptr )
-        std::cout << "Error while allocating memory for sArray." << std::endl;
-      
-      for( IndexType k = 0; k < sizeSArray; k++ )
-        for( IndexType l = 0; l < sizeSArray; l++ )
-          for( IndexType m = 0; m < sizeSArray; m++ ){
-            sArray[ m * sizeSArray * sizeSArray + k * sizeSArray + l ] = std::numeric_limits< Real >::max();
-          }
-      
-      
-      for( IndexType thrk = 0; thrk < numThreadsPerBlock; thrk++ )
-        for( IndexType thrj = 0; thrj < numThreadsPerBlock; thrj++ )
-        {
-          if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
-            sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = 
-                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX -1 + thrk*dimX*dimY ];
-          
-          if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < zkolik )
-            sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = 
-                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy *numThreadsPerBlock*dimX+ blIdx*numThreadsPerBlock + numThreadsPerBlock + thrj * dimX + thrk*dimX*dimY ];
-          
-          if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
-            sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = 
-                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX + thrj + thrk*dimX*dimY ];
-          
-          if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik && thrk+1 < zkolik )
-            sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = 
-                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + (blIdy+1) * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj + thrk*dimX*dimY ];
-          
-          if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
-            sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = 
-                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX * dimY + thrj * dimX + thrk ];
-          
-          if( dimZ > (blIdz+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < xkolik )
-            sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = 
-                    aux[ (blIdz+1)*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX + thrk ];
-        }
-      
-      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
-        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
-          for( IndexType l = 0; l < numThreadsPerBlock; l++ ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-              sArray[(m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1] = 
-                      aux[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ];
-          }
-        }
-      }
-      /*string s;
-      int numWhile = 0;
-      for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
-        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
-          for( IndexType l = 0; l < numThreadsPerBlock; l++ ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){
-              //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
-              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                //printf("In with point m  = %d, k = %d, l = %d\n", m, k, l);
-                changed = this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz) || changed;
-                
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
-        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
-          for( IndexType l = 0; l <numThreadsPerBlock; l++ ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-            {
-              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
-        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
-          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-            {
-              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );
-      */
-      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
-        for( IndexType k = 0; k < numThreadsPerBlock; k++ ){
-          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-            {
-              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                this->template updateCell3D< sizeSArray >(  sArray, l+1, k+1, m+1, hx,hy,hz);
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
-        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
-          for( IndexType l = 0; l <numThreadsPerBlock; l++ ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-            {
-              if( ! interfaceMap[blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
-        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
-          for( IndexType l = 0; l <numThreadsPerBlock; l++ ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-            {
-              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
-        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
-          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-            {
-              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      
-      for( IndexType m = numThreadsPerBlock-1; m >-1; m-- ){
-        for( IndexType k = numThreadsPerBlock-1; k > -1; k-- ){
-          for( IndexType l = numThreadsPerBlock-1; l >-1; l-- ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
-            {
-              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
-              {
-                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
-              }
-            }
-          }
-        }
-      }
-      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          for( int m = 0; m < numThreadsPerBlock; m++ )
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
-              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-      } 
-      numWhile++;
-      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
-      helpFunc.save( s );*/
-      
-      if( changed ){
-        BlockIterHost[ i ] = 1;
-      }
-      
-      
-      for( IndexType k = 0; k < numThreadsPerBlock; k++ ){ 
-        for( IndexType l = 0; l < numThreadsPerBlock; l++ ) {
-          for( IndexType m = 0; m < numThreadsPerBlock; m++ ){
-            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){      
-              helpFunc[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] = 
-                      sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
-              //std::cout << helpFunc[ m*dimX*dimY + k*dimX + l ] << " ";
-            }
-          }
-          //std::cout << std::endl;
-        }
-        //std::cout << std::endl;
-      }
-      //helpFunc.save( "helpF.tnl");
-      delete []sArray;
-    }
-  }
-}
-template< typename Real,
-        typename Device,
-        typename Index >
-void 
-tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
-{
-  int* BlockIterPom; 
-  BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ];
-  
-  for( int i = 0; i< BlockIterHost.getSize(); i++)
-  {
-    BlockIterPom[ i ] = 0;
-    
-    int m=0, l=0, k=0;
-    l = i/( numBlockX * numBlockY );
-    k = (i-l*numBlockX * numBlockY )/(numBlockX );
-    m = (i-l*numBlockX * numBlockY )%( numBlockX );
-    
-    if( m > 0 && BlockIterHost[ i - 1 ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( l > 0 && BlockIterHost[ i - numBlockX*numBlockY ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( l < numBlockZ-1 && BlockIterHost[ i + numBlockX*numBlockY ] ){
-      BlockIterPom[ i ] = 1;
-    }
-  }
-  for( int i = 0; i< BlockIterHost.getSize(); i++)
-  { 
-    BlockIterHost[ i ] = BlockIterPom[ i ];
-  }
-}
-
-
-template< typename Real,
-        typename Device,
-        typename Index >
-void 
-tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY )
-{
-  int* BlockIterPom; 
-  BlockIterPom = new int [numBlockX * numBlockY];
-  
-  for(int i = 0; i < numBlockX * numBlockY; i++)
-  {
-    BlockIterPom[ i ] = 0;//BlockIterPom[ i ] = 0;
-    int m=0, k=0;
-    m = i%numBlockX;
-    k = i/numBlockX;
-    if( m > 0 && BlockIterHost[ i - 1 ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){
-      BlockIterPom[ i ] = 1;
-    }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){
-      BlockIterPom[ i ] = 1;
-    }
-    //BlockIterPom[ i ];
-  }
-  
-  for(int i = 0; i < numBlockX * numBlockY; i++)
-  {
-    if( !BlockIterHost[ i ] )
-      BlockIterHost[ i ] = BlockIterPom[ i ];
-  }
-  /*else
-   BlockIter[ i ] = 0;*/
-  /*for( int i = numBlockX-1; i > -1; i-- )
-   {
-   for( int j = 0; j< numBlockY; j++ )
-   std::cout << BlockIterHost[ i*numBlockY + j ];
-   std::cout << std::endl;
-   }
-   std::cout << std::endl;*/
-  delete[] BlockIterPom;
-}
-
-template< typename Real,
-        typename Device,
-        typename Index >
-template< typename MeshEntity >
-void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
-updateCell( MeshFunctionType& u,
-        const MeshEntity& cell, 
-        const RealType v )
-{
-  const auto& neighborEntities = cell.template getNeighborEntities< 1 >();
-  const MeshType& mesh = cell.getMesh();
-  const RealType& h = mesh.getSpaceSteps().x();
-  const RealType value = u( cell );
-  RealType a, tmp = std::numeric_limits< RealType >::max();
-  
-  if( cell.getCoordinates().x() == 0 )
-    a = u[ neighborEntities.template getEntityIndex< 1 >() ];
-  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
-    a = u[ neighborEntities.template getEntityIndex< -1 >() ];
-  else
-  {
-    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ],
-            u[ neighborEntities.template getEntityIndex<  1 >() ] );
-  }
-  
-  if( fabs( a ) == std::numeric_limits< RealType >::max() )
-    return;
-  
-  tmp = a + TNL::sign( value ) * h/v;
-  
-  u[ cell.getIndex() ] = argAbsMin( value, tmp );
-}
-
-template< typename Real,
-        typename Device,
-        typename Index >
-void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-initInterface( const MeshFunctionPointer& _input,
-        MeshFunctionPointer& _output,
-        InterfaceMapPointer& _interfaceMap )
-{
-  
-  if( std::is_same< Device, Devices::Cuda >::value )
-  {
-#ifdef HAVE_CUDA
-    const MeshType& mesh = _input->getMesh();
-    
-    const int cudaBlockSize( 16 );
-    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
-    int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
-    dim3 blockSize( cudaBlockSize, cudaBlockSize );
-    dim3 gridSize( numBlocksX, numBlocksY );
-    Devices::Cuda::synchronizeDevice();
-    CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
-            _output.template modifyData< Device >(),
-            _interfaceMap.template modifyData< Device >() );
-    cudaDeviceSynchronize();
-    TNL_CHECK_CUDA_DEVICE;
-#endif
-  }
-  if( std::is_same< Device, Devices::Host >::value )
-  {
-    MeshFunctionType input = _input.getData();
-    
-    /*double A[320][320];
-     std::ifstream fileInit("/home/maty/Downloads/initData.txt");
-     
-     for (int i = 0; i < 320; i++)
-     for (int j = 0; j < 320; j++)
-     fileInit >> A[j];
-     fileInit.close();
-     for (int i = 0; i < 320; i++)
-     for (int j = 0; j < 320; j++)
-     input[i*320 + j] = A[j];*/
-    
-    
-    MeshFunctionType& output = _output.modifyData();
-    InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
-    const MeshType& mesh = input.getMesh();
-    typedef typename MeshType::Cell Cell;
-    Cell cell( mesh );
-    for( cell.getCoordinates().y() = 0;
-            cell.getCoordinates().y() < mesh.getDimensions().y();
-            cell.getCoordinates().y() ++ )
-      for( cell.getCoordinates().x() = 0;
-              cell.getCoordinates().x() < mesh.getDimensions().x();
-              cell.getCoordinates().x() ++ )
-      {
-        cell.refresh();
-        output[ cell.getIndex() ] =
-                input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
-                  - std::numeric_limits< RealType >::max();
-        interfaceMap[ cell.getIndex() ] = false;
-      }
-    
-    const RealType& hx = mesh.getSpaceSteps().x();
-    const RealType& hy = mesh.getSpaceSteps().y();     
-    for( cell.getCoordinates().y() = 0;
-            cell.getCoordinates().y() < mesh.getDimensions().y();
-            cell.getCoordinates().y() ++ )
-      for( cell.getCoordinates().x() = 0;
-              cell.getCoordinates().x() < mesh.getDimensions().x();
-              cell.getCoordinates().x() ++ )
-      {
-        cell.refresh();
-        const RealType& c = input( cell );
-        if( ! cell.isBoundaryEntity()  )
-        {
-          auto neighbors = cell.getNeighborEntities();
-          Real pom = 0;
-          const IndexType e = neighbors.template getEntityIndex<  1,  0 >();
-          const IndexType n = neighbors.template getEntityIndex<  0,  1 >();
-          //Try init with exact data:
-          /*if( c * input[ n ] <= 0 )
-           {
-           output[ cell.getIndex() ] = c;
-           output[ n ] = input[ n ];
-           interfaceMap[ cell.getIndex() ] = true;
-           interfaceMap[ n ] = true;
-           }   
-           if( c * input[ e ] <= 0 )
-           {   
-           output[ cell.getIndex() ] = c;
-           output[ e ] = input[ e ];
-           interfaceMap[ cell.getIndex() ] = true;
-           interfaceMap[ e ] = true;
-           }*/
-          if( c * input[ n ] <= 0 )
-          {
-            /*if( c >= 0 )
-             {*/
-            pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-            if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
-              output[ cell.getIndex() ] = pom;
-            pom = pom - TNL::sign( c )*hy;
-            if( TNL::abs( output[ n ] ) > TNL::abs( pom ) )
-              output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy;
-            /*}else
-             {
-             pom = - ( hy * c )/( c - input[ n ]);
-             if( output[ cell.getIndex() ] < pom )
-             output[ cell.getIndex() ] = pom;
-             if( output[ n ] > hy + pom )
-             output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
-             }*/
-            interfaceMap[ cell.getIndex() ] = true;
-            interfaceMap[ n ] = true;
-          }
-          if( c * input[ e ] <= 0 )
-          {
-            /*if( c >= 0 )
-             {*/
-            pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-            if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
-              output[ cell.getIndex() ] = pom;
-            
-            pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-            if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
-              output[ e ] = pom; 
-            /*}else
-             {
-             pom = - (hx * c)/( c - input[ e ]);
-             if( output[ cell.getIndex() ] < pom )
-             output[ cell.getIndex() ] = pom;
-             
-             pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
-             if( output[ e ] > pom )
-             output[ e ] = pom;
-             }*/
-            interfaceMap[ cell.getIndex() ] = true;
-            interfaceMap[ e ] = true;
-          }
-        }
-      }
-  }
-}
-
-template< typename Real,
-        typename Device,
-        typename Index >
-template< typename MeshEntity >
-__cuda_callable__
-void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-updateCell( MeshFunctionType& u,
-        const MeshEntity& cell,   
-        const RealType v)
-{
-  const auto& neighborEntities = cell.template getNeighborEntities< 2 >();
-  const MeshType& mesh = cell.getMesh();
-  const RealType& hx = mesh.getSpaceSteps().x();
-  const RealType& hy = mesh.getSpaceSteps().y();
-  const RealType value = u( cell );
-  RealType a, b, tmp = std::numeric_limits< RealType >::max();
-  
-  if( cell.getCoordinates().x() == 0 )
-    a = u[ neighborEntities.template getEntityIndex< 1,  0 >() ];
-  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
-    a = u[ neighborEntities.template getEntityIndex< -1,  0 >() ];
-  else
-  {
-    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1,  0 >() ],
-            u[ neighborEntities.template getEntityIndex<  1,  0 >() ] );
-  }
-  
-  if( cell.getCoordinates().y() == 0 )
-    b = u[ neighborEntities.template getEntityIndex< 0,  1 >()];
-  else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
-    b = u[ neighborEntities.template getEntityIndex< 0,  -1 >() ];
-  else
-  {
-    b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0,  -1 >() ],
-            u[ neighborEntities.template getEntityIndex< 0,   1 >() ] );
-  }
-  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-          fabs( b ) == std::numeric_limits< RealType >::max() )
-    return;
-  /*if( fabs( a ) == TypeInfo< Real >::getMaxValue() ||
-   fabs( b ) == TypeInfo< Real >::getMaxValue() ||
-   fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) )
-   {
-   tmp = 
-   fabs( a ) >= fabs( b ) ? b + TNL::sign( value ) * hy :
-   a + TNL::sign( value ) * hx;
-   }*/
-  /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( b ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( a - b ) < TNL::sqrt( (hx * hx + hy * hy)/v ) )
-   {
-   tmp = ( hx * hx * b + hy * hy * a + 
-   sign( value ) * hx * hy * TNL::sqrt( ( hx * hx + hy * hy )/v - 
-   ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy );
-   u[ cell.getIndex() ] =  tmp;
-   }
-   else
-   {
-   tmp = 
-   fabs( a ) > fabs( b ) ? b + TNL::sign( value ) * hy/v :
-   a + TNL::sign( value ) * hx/v;
-   u[ cell.getIndex() ] = argAbsMin( value, tmp );
-   //tmp = TypeInfo< RealType >::getMaxValue();
-   }*/
-  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
-  sortMinims( pom );
-  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
-  
-  
-  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-    u[ cell.getIndex() ] = argAbsMin( value, tmp );
-  else
-  {
-    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-    u[ cell.getIndex() ] = argAbsMin( value, tmp );
-  }
-}
-
-template< typename Real,
-        typename Device,
-        typename Index >
-void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-initInterface( const MeshFunctionPointer& _input,
-        MeshFunctionPointer& _output,
-        InterfaceMapPointer& _interfaceMap  )
-{
-  if( std::is_same< Device, Devices::Cuda >::value )
-  {
-#ifdef HAVE_CUDA
-    const MeshType& mesh = _input->getMesh();
-    
-    const int cudaBlockSize( 8 );
-    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
-    int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
-    int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize );
-    if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
-      std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
-    dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
-    dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
-    Devices::Cuda::synchronizeDevice();
-    CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
-            _output.template modifyData< Device >(),
-            _interfaceMap.template modifyData< Device >() );
-    cudaDeviceSynchronize();
-    TNL_CHECK_CUDA_DEVICE;
-#endif
-  }
-  if( std::is_same< Device, Devices::Host >::value )
-  {
-    const MeshFunctionType& input =  _input.getData();
-    MeshFunctionType& output =  _output.modifyData();
-    InterfaceMapType& interfaceMap =  _interfaceMap.modifyData();
-    const MeshType& mesh = input.getMesh();
-    typedef typename MeshType::Cell Cell;
-    Cell cell( mesh );
-    for( cell.getCoordinates().z() = 0;
-            cell.getCoordinates().z() < mesh.getDimensions().z();
-            cell.getCoordinates().z() ++ )
-      for( cell.getCoordinates().y() = 0;
-              cell.getCoordinates().y() < mesh.getDimensions().y();
-              cell.getCoordinates().y() ++ )
-        for( cell.getCoordinates().x() = 0;
-                cell.getCoordinates().x() < mesh.getDimensions().x();
-                cell.getCoordinates().x() ++ )
-        {
-          cell.refresh();
-          output[ cell.getIndex() ] =
-                  input( cell ) > 0 ? 10://std::numeric_limits< RealType >::max() :
-                    -10;//- std::numeric_limits< RealType >::max();
-          interfaceMap[ cell.getIndex() ] = false;
-        }
-    
-    const RealType& hx = mesh.getSpaceSteps().x();
-    const RealType& hy = mesh.getSpaceSteps().y();
-    const RealType& hz = mesh.getSpaceSteps().z();
-    for( cell.getCoordinates().z() = 0;
-            cell.getCoordinates().z() < mesh.getDimensions().z();
-            cell.getCoordinates().z() ++ )   
-      for( cell.getCoordinates().y() = 0;
-              cell.getCoordinates().y() < mesh.getDimensions().y();
-              cell.getCoordinates().y() ++ )
-        for( cell.getCoordinates().x() = 0;
-                cell.getCoordinates().x() < mesh.getDimensions().x();
-                cell.getCoordinates().x() ++ )
-        {
-          cell.refresh();
-          const RealType& c = input( cell );
-          if( ! cell.isBoundaryEntity() )
-          {
-            auto neighbors = cell.getNeighborEntities();
-            Real pom = 0;
-            const IndexType e = neighbors.template getEntityIndex<  1,  0,  0 >();
-            const IndexType n = neighbors.template getEntityIndex<  0,  1,  0 >();
-            const IndexType t = neighbors.template getEntityIndex<  0,  0,  1 >();
-            //Try exact initiation
-            /*const IndexType w = neighbors.template getEntityIndex< -1,  0,  0 >();
-             const IndexType s = neighbors.template getEntityIndex<  0, -1,  0 >();
-             const IndexType b = neighbors.template getEntityIndex<  0,  0, -1 >();
-             if( c * input[ e ] <= 0 )
-             {
-             output[ cell.getIndex() ] = c;
-             output[ e ] = input[ e ];
-             interfaceMap[ e ] = true;   
-             interfaceMap[ cell.getIndex() ] = true;
-             }
-             else if( c * input[ n ] <= 0 )
-             {
-             output[ cell.getIndex() ] = c;
-             output[ n ] = input[ n ];
-             interfaceMap[ n ] = true;   
-             interfaceMap[ cell.getIndex() ] = true;
-             }
-             else if( c * input[ t ] <= 0 )
-             {
-             output[ cell.getIndex() ] = c;
-             output[ t ] = input[ t ];
-             interfaceMap[ t ] = true;   
-             interfaceMap[ cell.getIndex() ] = true;
-             }*/
-            if( c * input[ n ] <= 0 )
-            {
-              if( c >= 0 )
-              {
-                pom = ( hy * c )/( c - input[ n ]);
-                if( output[ cell.getIndex() ] > pom ) 
-                  output[ cell.getIndex() ] = pom;
-                
-                if ( output[ n ] < pom - hy)
-                  output[ n ] = pom - hy; // ( hy * c )/( c - input[ n ]) - hy;
-                
-              }else
-              {
-                pom = - ( hy * c )/( c - input[ n ]);
-                if( output[ cell.getIndex() ] < pom )
-                  output[ cell.getIndex() ] = pom;
-                if( output[ n ] > hy + pom )
-                  output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
-                
-              }
-              interfaceMap[ cell.getIndex() ] = true;
-              interfaceMap[ n ] = true;
-            }
-            if( c * input[ e ] <= 0 )
-            {
-              if( c >= 0 )
-              {
-                pom = ( hx * c )/( c - input[ e ]);
-                if( output[ cell.getIndex() ] > pom )
-                  output[ cell.getIndex() ] = pom;
-                
-                pom = pom - hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-                if( output[ e ] < pom )
-                  output[ e ] = pom;      
-                
-              }else
-              {
-                pom = - (hx * c)/( c - input[ e ]);
-                if( output[ cell.getIndex() ] < pom )
-                  output[ cell.getIndex() ] = pom;
-                
-                pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
-                if( output[ e ] > pom )
-                  output[ e ] = pom;
-              }
-              interfaceMap[ cell.getIndex() ] = true;
-              interfaceMap[ e ] = true;
-            }
-            if( c * input[ t ] <= 0 )
-            {
-              if( c >= 0 )
-              {
-                pom = ( hz * c )/( c - input[ t ]);
-                if( output[ cell.getIndex() ] > pom )
-                  output[ cell.getIndex() ] = pom;
-                
-                pom = pom - hz; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-                if( output[ t ] < pom )
-                  output[ t ] = pom; 
-                
-              }else
-              {
-                pom = - (hz * c)/( c - input[ t ]);
-                if( output[ cell.getIndex() ] < pom )
-                  output[ cell.getIndex() ] = pom;
-                
-                pom = pom + hz; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
-                if( output[ t ] > pom )
-                  output[ t ] = pom;
-                
-              }
-              interfaceMap[ cell.getIndex() ] = true;
-              interfaceMap[ t ] = true;
-            }    
-          }
-          /*output[ cell.getIndex() ] =
-           c > 0 ? TypeInfo< RealType >::getMaxValue() :
-           -TypeInfo< RealType >::getMaxValue();
-           interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245
-        }
-  }
-}
-
-template< typename Real,
-        typename Device,
-        typename Index >
-template< typename MeshEntity >
-__cuda_callable__
-void
-tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-updateCell( MeshFunctionType& u,
-        const MeshEntity& cell, 
-        const RealType v )
-{
-  const auto& neighborEntities = cell.template getNeighborEntities< 3 >();
-  const MeshType& mesh = cell.getMesh();
-  
-  const RealType& hx = mesh.getSpaceSteps().x();
-  const RealType& hy = mesh.getSpaceSteps().y();
-  const RealType& hz = mesh.getSpaceSteps().z();
-  const RealType value = u( cell );
-  //std::cout << value << std::endl;
-  RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
-  
-  
-  if( cell.getCoordinates().x() == 0 )
-    a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ];
-  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
-    a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ];
-  else
-  {
-    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ],
-            u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] );
-  }
-  if( cell.getCoordinates().y() == 0 )
-    b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ];
-  else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
-    b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ];
-  else
-  {
-    b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ],
-            u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] );
-  }if( cell.getCoordinates().z() == 0 )
-    c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ];
-  else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 )
-    c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ];
-  else
-  {
-    c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ],
-            u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] );
-  }
-  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-          fabs( b ) == std::numeric_limits< RealType >::max() &&
-          fabs( c ) == std::numeric_limits< RealType >::max() )
-    return;
-  
-  
-  /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( b ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) )
-   {
-   tmp = ( hx * hx * a + hy * hy * b + 
-   sign( value ) * hx * hy * sqrt( ( hx * hx + hy * hy )/v - 
-   ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy );
-   }
-   if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( c ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( a - c ) >= TNL::sqrt( (hx * hx + hz * hz)/v ) )
-   {
-   tmp = ( hx * hx * a + hz * hz * c + 
-   sign( value ) * hx * hz * sqrt( ( hx * hx + hz * hz )/v - 
-   ( a - c ) * ( a - c ) ) )/( hx * hx + hz * hz );
-   }
-   if( fabs( b ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( c ) != TypeInfo< Real >::getMaxValue() &&
-   fabs( b - c ) >= TNL::sqrt( (hy * hy + hz * hz)/v ) )
-   {
-   tmp = ( hy * hy * b + hz * hz * c + 
-   sign( value ) * hy * hz * sqrt( ( hy * hy + hz * hz )/v - 
-   ( b - c ) * ( b - c ) ) )/( hy * hy + hz * hz );
-   }*/
-  RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
-  sortMinims( pom );   
-  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
-  if( fabs( tmp ) < fabs( pom[ 1 ] ) )
-  {
-    u[ cell.getIndex() ] = argAbsMin( value, tmp ); 
-  }
-  else
-  {
-    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
-    {
-      u[ cell.getIndex() ] = argAbsMin( value, tmp );
-    }
-    else
-    {
-      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
-              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
-              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
-              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
-      u[ cell.getIndex() ] = argAbsMin( value, tmp );
-    }
-  }
-}
-
-template < typename T1 >
-__cuda_callable__ void sortMinims( T1 pom[] )
-{
-  T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; 
-  if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){
-    tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2];
-    tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5];
-    
-  }
-  else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){
-    tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1];
-    tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4];
-  }
-  else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){
-    tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2];
-    tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5];
-  }
-  else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){
-    tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0];
-    tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3];
-  }
-  else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){
-    tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1];
-    tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4];
-  }
-  else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){
-    tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0];
-    tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3];
-  }
-  
-  for( unsigned int i = 0; i < 6; i++ )
-  {
-    pom[ i ] = tmp[ i ];
-  }   
-}
-
-template< typename Real,
-        typename Device,
-        typename Index >
-template< int sizeSArray >
-__cuda_callable__
-bool
-tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy,
-        const Real v )
-{
-  const RealType value = sArray[ thrj * sizeSArray + thri ];
-  RealType a, b, tmp = std::numeric_limits< RealType >::max();
-  
-  b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ],
-          sArray[ (thrj-1) * sizeSArray + thri ] );
-  
-  a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ],
-          sArray[ thrj * sizeSArray + thri-1 ] );
-  
-  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-          fabs( b ) == std::numeric_limits< RealType >::max() )
-    return false;
-  
-  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
-  sortMinims( pom );
-  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
-  
-  
-  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-  {
-    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
-    tmp = value - sArray[ thrj * sizeSArray + thri ];
-    if ( fabs( tmp ) >  0.001*hx )
-      return true;
-    else
-      return false;
-  }
-  else
-  {
-    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
-    tmp = value - sArray[ thrj * sizeSArray + thri ];
-    if ( fabs( tmp ) > 0.001*hx )
-      return true;
-    else
-      return false;
-  }
-  
-  return false;
-}
-template< typename Real,
-        typename Device,
-        typename Index >
-template< int sizeSArray >
-__cuda_callable__ 
-bool 
-tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-updateCell3D( volatile Real *sArray, int thri, int thrj, int thrk,
-        const Real hx, const Real hy, const Real hz, const Real v )
-{
-  const RealType value = sArray[thrk *sizeSArray * sizeSArray + thrj * sizeSArray + thri];
-  
-  RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
-  
-  c = TNL::argAbsMin( sArray[ (thrk+1)* sizeSArray*sizeSArray + thrj * sizeSArray + thri ],
-          sArray[ (thrk-1) * sizeSArray *sizeSArray + thrj* sizeSArray + thri ] );
-  
-  b = TNL::argAbsMin( sArray[ thrk* sizeSArray*sizeSArray + (thrj+1) * sizeSArray + thri ],
-          sArray[ thrk* sizeSArray * sizeSArray + (thrj-1)* sizeSArray +thri ] );
-  
-  a = TNL::argAbsMin( sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri+1 ],
-          sArray[ thrk* sizeSArray * sizeSArray + thrj* sizeSArray +thri-1 ] );
-  
-  /*if( thrk == 8 )
-    printf("Calculating a = %f, b = %f, c = %f\n" , a, b, c );*/
-  
-  if( fabs( a ) == 10&& //std::numeric_limits< RealType >::max() && 
-          fabs( b ) == 10&&//std::numeric_limits< RealType >::max() &&
-          fabs( c ) == 10)//std::numeric_limits< RealType >::max() )
-    return false;
-  
-  RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
-  
-  sortMinims( pom );
-  
-  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
-  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-  {
-    sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
-    tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
-    if ( fabs( tmp ) >  0.001*hx )
-      return true;
-    else
-      return false;
-  }
-  else
-  {
-    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
-    {
-      sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
-      tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
-      if ( fabs( tmp ) > 0.001*hx )
-        return true;
-      else
-        return false;
-    }
-    else
-    {
-      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
-              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
-              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
-              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
-      sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
-      tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
-      if ( fabs( tmp ) > 0.001*hx )
-        return true;
-      else
-        return false;
-    }
-  }
-  
-  return false;
-}
-
-#ifdef HAVE_CUDA
-template < typename Real, typename Device, typename Index >
-__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
-        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
-        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap  )
-{
-  int i = threadIdx.x + blockDim.x*blockIdx.x;
-  const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
-  
-  if( i < mesh.getDimensions().x()  )
-  {
-    typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell;
-    Cell cell( mesh );
-    cell.getCoordinates().x() = i;
-    cell.refresh();
-    const Index cind = cell.getIndex();
-    
-    
-    output[ cind ] =
-            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
-              - std::numeric_limits< Real >::max();
-    interfaceMap[ cind ] = false; 
-    
-    const Real& h = mesh.getSpaceSteps().x();
-    cell.refresh();
-    const Real& c = input( cell );
-    if( ! cell.isBoundaryEntity()  )
-    {
-      auto neighbors = cell.getNeighborEntities();
-      Real pom = 0;
-      const Index e = neighbors.template getEntityIndex< 1 >();
-      const Index w = neighbors.template getEntityIndex< -1 >();
-      if( c * input[ e ] <= 0 )
-      {
-        pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
-          output[ cind ] = pom;                       
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ w ] <= 0 )
-      {
-        pom = TNL::sign( c )*( h * c )/( c - input[ w ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cind ] = true;
-      }
-    }
-  }
-  
-}
-template < typename Real, typename Device, typename Index >
-__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ) 
-{
-  int i = threadIdx.x + blockDim.x*blockIdx.x;
-  int j = blockDim.y*blockIdx.y + threadIdx.y;
-  const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
-  
-  if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
-  {
-    typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
-    Cell cell( mesh );
-    cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
-    cell.refresh();
-    const Index cind = cell.getIndex();
-    
-    
-    output[ cind ] =
-            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
-              - std::numeric_limits< Real >::max();
-    interfaceMap[ cind ] = false; 
-    
-    const Real& hx = mesh.getSpaceSteps().x();
-    const Real& hy = mesh.getSpaceSteps().y();
-    cell.refresh();
-    const Real& c = input( cell );
-    if( ! cell.isBoundaryEntity()  )
-    {
-      auto neighbors = cell.getNeighborEntities();
-      Real pom = 0;
-      const Index e = neighbors.template getEntityIndex<  1,  0 >();
-      const Index w = neighbors.template getEntityIndex<  -1,  0 >();
-      const Index n = neighbors.template getEntityIndex<  0,  1 >();
-      const Index s = neighbors.template getEntityIndex<  0,  -1 >();
-      
-      if( c * input[ n ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cell.getIndex() ] = true;
-      }
-      if( c * input[ e ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
-          output[ cind ] = pom;                       
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ w ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ s ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cind ] = true;
-      }
-    }
-  }
-}
-
-template < typename Real, typename Device, typename Index >
-__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
-        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
-        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap )
-{
-  int i = threadIdx.x + blockDim.x*blockIdx.x;
-  int j = blockDim.y*blockIdx.y + threadIdx.y;
-  int k = blockDim.z*blockIdx.z + threadIdx.z;
-  const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
-  
-  if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
-  {
-    typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
-    Cell cell( mesh );
-    cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
-    cell.refresh();
-    const Index cind = cell.getIndex();
-    
-    
-    output[ cind ] =
-            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
-              - std::numeric_limits< Real >::max();
-    interfaceMap[ cind ] = false; 
-    cell.refresh();
-    
-    const Real& hx = mesh.getSpaceSteps().x();
-    const Real& hy = mesh.getSpaceSteps().y();
-    const Real& hz = mesh.getSpaceSteps().z();
-    const Real& c = input( cell );
-    if( ! cell.isBoundaryEntity()  )
-    {
-      auto neighbors = cell.getNeighborEntities();
-      Real pom = 0;
-      const Index e = neighbors.template getEntityIndex<  1, 0, 0 >();
-      const Index w = neighbors.template getEntityIndex<  -1, 0, 0 >();
-      const Index n = neighbors.template getEntityIndex<  0, 1, 0 >();
-      const Index s = neighbors.template getEntityIndex<  0, -1, 0 >();
-      const Index t = neighbors.template getEntityIndex<  0, 0, 1 >();
-      const Index b = neighbors.template getEntityIndex<  0, 0, -1 >();
-      
-      if( c * input[ n ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ e ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
-          output[ cind ] = pom;                       
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ w ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ s ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ b ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hz * c )/( c - input[ b ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cind ] = true;
-      }
-      if( c * input[ t ] <= 0 )
-      {
-        pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
-        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-          output[ cind ] = pom;
-        
-        interfaceMap[ cind ] = true;
-      }
-    }
-  }
-}
-
-
-
-
-template< typename Real,
-        typename Device,
-        typename Index >
-__cuda_callable__
-bool
-tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
-updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
-{
-  const RealType value = sArray[ thri ];
-  RealType a, tmp = std::numeric_limits< RealType >::max();
-  
-  a = TNL::argAbsMin( sArray[ thri+1 ],
-          sArray[ thri-1 ] );
-  
-  if( fabs( a ) == std::numeric_limits< RealType >::max() )
-    return false;
-  
-  tmp = a + TNL::sign( value ) * h/v;
-  
-  
-  sArray[ thri ] = argAbsMin( value, tmp );
-  
-  tmp = value - sArray[ thri ];
-  if ( fabs( tmp ) >  0.001*h )
-    return true;
-  else
-    return false;
-}
-#endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h
index 61465bee9f9784130a547dfe7aaf8bdac6ebdc2d..41aea10a0ef559042e3a9d94c80ab983e47de737 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem.h
@@ -71,6 +71,8 @@ class tnlDirectEikonalProblem
       
       bool setInitialCondition( const Config::ParameterContainer& parameters,
                                 DofVectorPointer& dofs );
+      
+      bool makeSnapshot( );
 
       bool solve( DofVectorPointer& dosf );
 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
index 7437803e262b517632d43c590f485b5e923b2698..c36c4dca98d4c6bbdf27265a57eeef073512dc59 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h
@@ -12,6 +12,9 @@
  */
 
 #pragma once
+#include <TNL/FileName.h>
+
+#include "tnlDirectEikonalProblem.h"
 
 template< typename Mesh,
           typename Communicator,
@@ -76,6 +79,11 @@ tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >::
 setup( const Config::ParameterContainer& parameters,
        const String& prefix )
 {
+  String param=parameters.getParameter< String >( "distributed-grid-io-type" );
+   if(param=="MpiIO")
+        distributedIOType=Meshes::DistributedMeshes::MpiIO;
+   if(param=="LocalCopy")
+        distributedIOType=Meshes::DistributedMeshes::LocalCopy;
    return true;
 }
 
@@ -115,16 +123,15 @@ setInitialCondition( const Config::ParameterContainer& parameters,
 {
   this->bindDofs( dofs );
   String inputFile = parameters.getParameter< String >( "input-file" );
-  this->initialData->setMesh( this->getMesh() );
-  std::cout<<"setInitialCondition" <<std::endl; 
-  if(CommunicatorType::isDistributed())
+  this->initialData->setMesh( this->getMesh() ); 
+  if( CommunicatorType::isDistributed() )
   {
-    std::cout<<"Nodes Distribution: " << u->getMesh().getDistributedMesh()->printProcessDistr() << std::endl;
+    std::cout<<"Nodes Distribution: " << initialData->getMesh().getDistributedMesh()->printProcessDistr() << std::endl;
     if(distributedIOType==Meshes::DistributedMeshes::MpiIO)
-      Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::MpiIO> ::load(inputFile, *u );
+      Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::MpiIO> ::load(inputFile, *initialData );
     if(distributedIOType==Meshes::DistributedMeshes::LocalCopy)
-      Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::LocalCopy> ::load(inputFile, *u );
-    u->template synchronize<CommunicatorType>();
+      Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::LocalCopy> ::load(inputFile, *initialData );
+    initialData->template synchronize<CommunicatorType>();
   }
   else
   {
@@ -141,6 +148,35 @@ setInitialCondition( const Config::ParameterContainer& parameters,
   return true;
 }
 
+template< typename Mesh,
+          typename Communicator,
+          typename Anisotropy,
+          typename Real,
+          typename Index >
+bool
+tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >::
+makeSnapshot(  )
+{
+   std::cout << std::endl << "Writing output." << std::endl;
+
+   //this->bindDofs( dofs );
+
+   FileName fileName;
+   fileName.setFileNameBase( "u-" );
+   fileName.setExtension( "tnl" );
+
+   if(CommunicatorType::isDistributed())
+   {
+      if(distributedIOType==Meshes::DistributedMeshes::MpiIO)
+        Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::MpiIO> ::save(fileName.getFileName(), *u );
+      if(distributedIOType==Meshes::DistributedMeshes::LocalCopy)
+        Meshes::DistributedMeshes::DistributedGridIO<MeshFunctionType,Meshes::DistributedMeshes::LocalCopy> ::save(fileName.getFileName(), *u );
+   }
+   else
+      this->u->save( fileName.getFileName() );
+   return true;
+}
+
 
 template< typename Mesh,
           typename Communicator,
@@ -151,7 +187,9 @@ bool
 tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >::
 solve( DofVectorPointer& dofs )
 {
-   FastSweepingMethod< MeshType, AnisotropyType > fsm;
-   fsm.solve( this->getMesh(), anisotropy, initialData );
+   FastSweepingMethod< MeshType, Communicator,AnisotropyType > fsm;
+   fsm.solve( this->getMesh(), u, anisotropy, initialData );
+   
+   makeSnapshot();
    return true;
 }
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
index 57b1886e8b67c943051623c0a07f301faa76cad8..8c58ee6102ebe52cf71cdc6a8a6bff94d5792536 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
@@ -10,13 +10,14 @@
 
 #pragma once
 
-#include <TNL/Meshes/Grid.h>
-#include <TNL/Functions/Analytic/Constant.h>
-#include <TNL/Pointers/SharedPointer.h>
+//#include <TNL/Meshes/Grid.h>
+//#include <TNL/Functions/Analytic/Constant.h>
+//#include <TNL/Pointers/SharedPointer.h>
 #include "tnlDirectEikonalMethodsBase.h"
 
 
 template< typename Mesh,
+        typename Communicator,
         typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > >
 class FastSweepingMethod
 {   
@@ -25,8 +26,9 @@ class FastSweepingMethod
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
-class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >
+class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >
 : public tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >
 {
   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
@@ -47,7 +49,7 @@ class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >
     using typename BaseType::MeshFunctionType;
     using typename BaseType::InterfaceMapPointer;
     using typename BaseType::MeshFunctionPointer;
-    
+   
     
     FastSweepingMethod();
     
@@ -56,6 +58,7 @@ class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >
     void setMaxIterations( const IndexType& maxIterations );
     
     void solve( const MeshPointer& mesh,
+            MeshFunctionPointer& Aux,
             const AnisotropyPointer& anisotropy,
             MeshFunctionPointer& u );
     
@@ -68,8 +71,9 @@ class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
-class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
+class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >
 : public tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
 {
   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
@@ -82,15 +86,19 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
     typedef Index IndexType;
     typedef Anisotropy AnisotropyType;
     typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType;
+    typedef Communicator CommunicatorType;
+    typedef Containers::StaticVector< 2, Index > StaticVector;
+    
     using MeshPointer = Pointers::SharedPointer<  MeshType >;
     using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
+    using MPI = Communicators::MpiCommunicator;
     
     using typename BaseType::InterfaceMapType;
     using typename BaseType::MeshFunctionType;
     using typename BaseType::InterfaceMapPointer;
     using typename BaseType::MeshFunctionPointer;
     using typename BaseType::ArrayContainer;
-    
+        
     FastSweepingMethod();
     
     const IndexType& getMaxIterations() const;
@@ -98,19 +106,30 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
     void setMaxIterations( const IndexType& maxIterations );
     
     void solve( const MeshPointer& mesh,
+            MeshFunctionPointer& Aux,
             const AnisotropyPointer& anisotropy,
-            MeshFunctionPointer& u );
+            const MeshFunctionPointer& u );
     
     protected:
       
       const IndexType maxIterations;
+    
+      void setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
+              const MeshPointer& mesh);
+      
+      bool goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, 
+              MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
+              const AnisotropyPointer& anisotropy );
+      
+      void getInfoFromNeighbours( int& calculated, int& calculateAgain, const MeshPointer& mesh );
 };
 
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
-class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
+class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >
 : public tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
 {
   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
@@ -123,8 +142,12 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
     typedef Index IndexType;
     typedef Anisotropy AnisotropyType;
     typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > BaseType;
+    typedef Communicator CommunicatorType;
+    typedef Containers::StaticVector< 3, Index > StaticVector;
+    
     using MeshPointer = Pointers::SharedPointer<  MeshType >;
     using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
+    using MPI = Communicators::MpiCommunicator;
     
     using typename BaseType::InterfaceMapType;
     using typename BaseType::MeshFunctionType;
@@ -140,6 +163,7 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
     void setMaxIterations( const IndexType& maxIterations );
     
     void solve( const MeshPointer& mesh,
+            MeshFunctionPointer& Aux,
             const AnisotropyPointer& anisotropy,
             MeshFunctionPointer& u );
     
@@ -147,6 +171,15 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
     protected:
       
       const IndexType maxIterations;
+      
+      void setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
+              const MeshPointer& mesh);
+      
+      bool goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, 
+              MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
+              const AnisotropyPointer& anisotropy );
+      
+      void getInfoFromNeighbours( int& calculated, int& calculateAgain, const MeshPointer& mesh );
 };
 
 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
index 890c6cb4c9373917abf1d668ebf60a30cfbc17b3..f2f033ccbee3ffa5b71567ea1b54e2307ebe1713 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
@@ -18,8 +18,9 @@
 template< typename Real,
           typename Device,
           typename Index,
+        typename Communicator,
           typename Anisotropy >
-FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >::
 FastSweepingMethod()
 : maxIterations( 1 )
 {
@@ -29,9 +30,10 @@ FastSweepingMethod()
 template< typename Real,
           typename Device,
           typename Index,
+        typename Communicator,
           typename Anisotropy >
 const Index&
-FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >::
 getMaxIterations() const
 {
    
@@ -40,9 +42,10 @@ getMaxIterations() const
 template< typename Real,
           typename Device,
           typename Index,
+        typename Communicator,
           typename Anisotropy >
 void
-FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >::
 setMaxIterations( const IndexType& maxIterations )
 {
    
@@ -51,10 +54,12 @@ setMaxIterations( const IndexType& maxIterations )
 template< typename Real,
           typename Device,
           typename Index,
+        typename Communicator,
           typename Anisotropy >
 void
-FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Communicator, Anisotropy >::
 solve( const MeshPointer& mesh,
+        MeshFunctionPointer& Aux,
        const AnisotropyPointer& anisotropy,
        MeshFunctionPointer& u )
 {   
@@ -104,7 +109,7 @@ solve( const MeshPointer& mesh,
           dim3 blockSize( cudaBlockSize );
           dim3 gridSize( numBlocksX );
           
-          tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > ptr;
+          BaseType ptr;
           
           
           
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 89cb608810fd37823475c2fb08c1d0ad03397db2..e5638c11dd71d88d72d8d0590c8e51c0df6baaab 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -13,22 +13,12 @@
 
 #pragma once
 
-#include "tnlFastSweepingMethod.h"
-#include <TNL/Devices/Cuda.h>
-#include <TNL/Communicators/MpiDefs.h>
-
-
-
-
-#include <string.h>
-#include <iostream>
-#include <fstream>
-
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
-FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 FastSweepingMethod()
 : maxIterations( 1 )
 {
@@ -38,9 +28,10 @@ FastSweepingMethod()
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
 const Index&
-FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 getMaxIterations() const
 {
   
@@ -49,9 +40,10 @@ getMaxIterations() const
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
 void
-FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 setMaxIterations( const IndexType& maxIterations )
 {
   
@@ -60,711 +52,442 @@ setMaxIterations( const IndexType& maxIterations )
 template< typename Real,
         typename Device,
         typename Index,
-        typename Anisotropy >
+        typename Communicator,
+        typename Anisotropy > 
 void
-FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
 solve( const MeshPointer& mesh,
+        MeshFunctionPointer& Aux,
         const AnisotropyPointer& anisotropy,
-        MeshFunctionPointer& u )
+        const MeshFunctionPointer& u )
 {  
   MeshFunctionPointer auxPtr;
   InterfaceMapPointer interfaceMapPtr;
   auxPtr->setMesh( mesh );
   interfaceMapPtr->setMesh( mesh );
+  
+  // Setting overlaps ( WITHOUT MPI SHOULD BE 0 )
+  StaticVector vecLowerOverlaps, vecUpperOverlaps;
+  setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh );
+  
   std::cout << "Initiating the interface cells ..." << std::endl;
-  BaseType::initInterface( u, auxPtr, interfaceMapPtr );
+  BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps );
   
-  auxPtr->save( "aux-ini.tnl" );
+  //auxPtr->save( "aux-ini.tnl" );
   
   typename MeshType::Cell cell( *mesh );
   
   IndexType iteration( 0 );
   InterfaceMapType interfaceMap = *interfaceMapPtr;
   MeshFunctionType aux = *auxPtr;
+  aux.template synchronize< Communicator >(); //synchronize initialized overlaps
   
-  
-#ifdef HAVE_MPI
-  int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup );
-  //printf( "Hello world from rank: %d ", i );
-  //Communicators::MpiCommunicator::Request r = Communicators::MpiCommunicator::ISend( auxPtr, 0, 0, Communicators::MpiCommunicator::AllGroup );
-  if( i == 1 ) {
-    /*for( int k = 0; k < 16*16; k++ )
-      aux[ k ] = 10;*/
-    printf( "1: mesh x: %d\n", mesh->getDimensions().x() );
-    printf( "1: mesh y: %d\n", mesh->getDimensions().y() );
-    //aux.save("aux_proc1.tnl");
-  }
-  if( i == 0 ) {
-    printf( "0: mesh x: %d\n", mesh->getDimensions().x() );
-    printf( "0: mesh y: %d\n", mesh->getDimensions().y() );
-    //aux.save("aux_proc0.tnl");
-    /*for( int k = 0; k < mesh->getDimensions().x()*mesh->getDimensions().y(); k++ )
-      aux[ k ] = 10;
-    for( int k = 0; k < mesh->getDimensions().x(); k++ ){
-      for( int l = 0; l < mesh->getDimensions().y(); l++ )
-        printf("%f.2\t",aux[ k * 16 + l ] );
-    printf("\n");
-    }*/
-  }
-    
-  /*bool a = Communicators::MpiCommunicator::IsInitialized();
-  if( a )
-    printf("Je Init\n");
-  else
-    printf("Neni Init\n");*/
-#endif
-  
+  std::cout << "Calculating the values ..." << std::endl; 
   while( iteration < this->maxIterations )
   {
-    if( std::is_same< DeviceType, Devices::Host >::value )
-    {
-      int numThreadsPerBlock = -1;
-      
-      numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0));
-      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
-      if( numThreadsPerBlock <= 16 )
-        numThreadsPerBlock = 16;
-      else if(numThreadsPerBlock <= 32 )
-        numThreadsPerBlock = 32;
-      else if(numThreadsPerBlock <= 64 )
-        numThreadsPerBlock = 64;
-      else if(numThreadsPerBlock <= 128 )
-        numThreadsPerBlock = 128;
-      else if(numThreadsPerBlock <= 256 )
-        numThreadsPerBlock = 256;
-      else if(numThreadsPerBlock <= 512 )
-        numThreadsPerBlock = 512;
-      else
-        numThreadsPerBlock = 1024;
-      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
-      
-      if( numThreadsPerBlock == -1 ){
-        printf("Fail in setting numThreadsPerBlock.\n");
-        break;
-      }
-      
-      
-      
-      int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
-      int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
-      
-      //std::cout << "numBlocksX = " << numBlocksX << std::endl;
-      
-      /*Real **sArray = new Real*[numBlocksX*numBlocksY];
-       for( int i = 0; i < numBlocksX * numBlocksY; i++ )
-       sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];*/
-      
-      ArrayContainer BlockIterHost;
-      BlockIterHost.setSize( numBlocksX * numBlocksY );
-      BlockIterHost.setValue( 1 );
-      int IsCalculationDone = 1;
-      
-      MeshFunctionPointer helpFunc( mesh );
-      MeshFunctionPointer helpFunc1( mesh );
-      helpFunc1 = auxPtr;
-      auxPtr = helpFunc;
-      helpFunc = helpFunc1;
-      //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl;
-      /*for( int k = numBlocksX-1; k >-1; k-- ){
-       for( int l = 0; l < numBlocksY; l++ ){
-       std::cout<< BlockIterHost[ l*numBlocksX  + k ];
-       }
-       std::cout<<std::endl;
-       }
-       std::cout<<std::endl;*/
-      unsigned int numWhile = 0;
-      while( IsCalculationDone )
-      {      
-        IsCalculationDone = 0;
-        helpFunc1 = auxPtr;
-        auxPtr = helpFunc;
-        helpFunc = helpFunc1;
-        switch ( numThreadsPerBlock ){
-          case 16:
-            this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
-          case 32:
-            this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
-          case 64:
-            this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
-          case 128:
-            this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
-          case 256:
-            this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
-          case 512:
-            this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
-          default:
-            this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
-        }
-        
-        
-        //Reduction      
-        for( int i = 0; i < BlockIterHost.getSize(); i++ ){
-          if( IsCalculationDone == 0 ){
-            IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
-            //break;
-          }
-        }
-        numWhile++;
-        /*std::cout <<"numWhile = "<< numWhile <<std::endl;
-        
-        for( int j = numBlocksY-1; j>-1; j-- ){
-          for( int i = 0; i < numBlocksX; i++ )
-            std::cout << BlockIterHost[ j * numBlocksX + i ];
-          std::cout << std::endl;
-        }
-        std::cout << std::endl;*/
-        
-        this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY );
-        
-        /*for( int j = numBlocksY-1; j>-1; j-- ){
-         for( int i = 0; i < numBlocksX; i++ )
-         std::cout << "BlockIterHost = "<< j*numBlocksX + i<< " ," << BlockIterHost[ j * numBlocksX + i ];
-         std::cout << std::endl;
-         }
-         std::cout << std::endl;*/
-        
-        //std::cout<<std::endl;
-        //string s( "aux-"+ std::to_string(numWhile) + ".tnl");
-        //aux.save( s );
-      }
-      if( numWhile == 1 ){
-        auxPtr = helpFunc;
-      }
-      /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
-       delete []sArray[i];*/
-      
-      
-      /*for( cell.getCoordinates().y() = 0;
-       cell.getCoordinates().y() < mesh->getDimensions().y();
-       cell.getCoordinates().y()++ )
-       {
-       for( cell.getCoordinates().x() = 0;
-       cell.getCoordinates().x() < mesh->getDimensions().x();
-       cell.getCoordinates().x()++ )
-       {
-       cell.refresh();
-       if( ! interfaceMap( cell ) )
-       this->updateCell( aux, cell );
-       }
-       } 
-       
-       //aux.save( "aux-1.tnl" );
-       
-       for( cell.getCoordinates().y() = 0;
-       cell.getCoordinates().y() < mesh->getDimensions().y();
-       cell.getCoordinates().y()++ )
-       {
-       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-       cell.getCoordinates().x() >= 0 ;
-       cell.getCoordinates().x()-- )		
-       {
-       //std::cerr << "2 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       
-       //aux.save( "aux-2.tnl" );
-       
-       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-       cell.getCoordinates().y() >= 0 ;
-       cell.getCoordinates().y()-- )
-       {
-       for( cell.getCoordinates().x() = 0;
-       cell.getCoordinates().x() < mesh->getDimensions().x();
-       cell.getCoordinates().x()++ )
-       {
-       //std::cerr << "3 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       
-       //aux.save( "aux-3.tnl" );
-       
-       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-       cell.getCoordinates().y() >= 0;
-       cell.getCoordinates().y()-- )
-       {
-       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-       cell.getCoordinates().x() >= 0 ;
-       cell.getCoordinates().x()-- )		
-       {
-       //std::cerr << "4 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       
-       for( int j = 0;
-       j < mesh->getDimensions().y();
-       j++ )
-       {
-       for( int i = 0;
-       i < mesh->getDimensions().x();
-       i++ )
-       {
-       std::cout << aux[ i * mesh->getDimensions().y() + j ] << " ";
-       }
-       std::cout << std::endl;
-       }*/
-      
-    }
-    if( std::is_same< DeviceType, Devices::Cuda >::value )
+    // calculatedBefore indicates weather we calculated in the last passage of the while cycle 
+    // calculatedBefore is same for all ranks 
+    // without MPI should be FALSE at the end of while cycle body
+    int calculatedBefore = 1;
+    
+    // calculateMPIAgain indicates if the thread should calculate again in upcoming passage of while cycle
+    // calculateMPIAgain is a value that can differ in every rank
+    // without MPI should be FALSE at the end of while cycle body
+    int calculateMPIAgain = 1;  
+    
+    while( calculatedBefore )
     {
-      // TODO: CUDA code
-#ifdef HAVE_CUDA
-      TNL_CHECK_CUDA_DEVICE;
-      // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel.
-      const int cudaBlockSize( 16 );
-      
-      int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
-      int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
-      dim3 blockSize( cudaBlockSize, cudaBlockSize );
-      dim3 gridSize( numBlocksX, numBlocksY );
-      
-      tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
-      
-      int BlockIterD = 1;
-      
-      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
-      BlockIterDevice.setSize( numBlocksX * numBlocksY );
-      BlockIterDevice.setValue( 1 );
-      TNL_CHECK_CUDA_DEVICE;
-      
-      
-      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
-      BlockIterPom.setSize( numBlocksX * numBlocksY  );
-      BlockIterPom.setValue( 0 );
-      /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
-       BlockIterPom1.setSize( numBlocksX * numBlocksY  );
-       BlockIterPom1.setValue( 0 );*/
-      /*int *BlockIterDevice;
-       cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
-      int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
-      //std::cout << "nBlocksNeigh = " << nBlocksNeigh << std::endl;
-      //free( BlockIter );
-      /*int *BlockIterPom;
-       cudaMalloc((void**) &BlockIterPom, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
-      
-      int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
+      calculatedBefore = 0;
       
-      TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
-      dBlock.setSize( nBlocks );
-      TNL_CHECK_CUDA_DEVICE;
-      /*int *dBlock;
-       cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/
-      
-      
-      MeshFunctionPointer helpFunc1( mesh );      
-      MeshFunctionPointer helpFunc( mesh );
-      
-      helpFunc1 = auxPtr;
-      auxPtr = helpFunc;
-      helpFunc = helpFunc1;
-      
-      int numIter = 0;
-      
-      //int oddEvenBlock = 0;
-      while( BlockIterD )
+      if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host?
       {
-        /** HERE IS CHESS METHOD **/
+        calculateMPIAgain = 0;
         
-        /*auxPtr = helpFunc;
+  /**--HERE-IS-PARALLEL-OMP-CODE--!!!WITHOUT MPI!!!--------------------**/
+        /*
+         int numThreadsPerBlock = -1;
          
-         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-         interfaceMapPtr.template getData< Device >(),
-         auxPtr.template getData< Device>(),
-         helpFunc.template modifyData< Device>(),
-         BlockIterDevice,
-         oddEvenBlock.getView() );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         auxPtr = helpFunc;
+         numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0));
+         //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+         if( numThreadsPerBlock <= 16 )
+         numThreadsPerBlock = 16;
+         else if(numThreadsPerBlock <= 32 )
+         numThreadsPerBlock = 32;
+         else if(numThreadsPerBlock <= 64 )
+         numThreadsPerBlock = 64;
+         else if(numThreadsPerBlock <= 128 )
+         numThreadsPerBlock = 128;
+         else if(numThreadsPerBlock <= 256 )
+         numThreadsPerBlock = 256;
+         else if(numThreadsPerBlock <= 512 )
+         numThreadsPerBlock = 512;
+         else
+         numThreadsPerBlock = 1024;
+         //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+         
+         if( numThreadsPerBlock == -1 ){
+         printf("Fail in setting numThreadsPerBlock.\n");
+         break;
+         }
+         
+         
+         
+         int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
+         int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
+         
+         //std::cout << "numBlocksX = " << numBlocksX << std::endl;
          
-         oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+         //Real **sArray = new Real*[numBlocksX*numBlocksY];
+         //for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+         // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];
          
-         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-         interfaceMapPtr.template getData< Device >(),
-         auxPtr.template getData< Device>(),
-         helpFunc.template modifyData< Device>(),
-         BlockIterDevice,
-         oddEvenBlock.getView() );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
+         ArrayContainer BlockIterHost;
+         BlockIterHost.setSize( numBlocksX * numBlocksY );
+         BlockIterHost.setValue( 1 );
+         int IsCalculationDone = 1;
+         
+         MeshFunctionPointer helpFunc( mesh );
+         MeshFunctionPointer helpFunc1( mesh );
+         helpFunc1 = auxPtr;
+         auxPtr = helpFunc;
+         helpFunc = helpFunc1;
+         //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl;
+         //for( int k = numBlocksX-1; k >-1; k-- ){
+         // for( int l = 0; l < numBlocksY; l++ ){
+         // std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+         // }
+         // std::cout<<std::endl;
+         // }
+         // std::cout<<std::endl;
+         unsigned int numWhile = 0;
+         while( IsCalculationDone )
+         {      
+         IsCalculationDone = 0;
+         helpFunc1 = auxPtr;
          auxPtr = helpFunc;
+         helpFunc = helpFunc1;
+         switch ( numThreadsPerBlock ){
+         case 16:
+         this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 32:
+         this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 64:
+         this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 128:
+         this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 256:
+         this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 512:
+         this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         default:
+         this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         }
          
-         oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
          
-         CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
+         //Reduction      
+         for( int i = 0; i < BlockIterHost.getSize(); i++ ){
+         if( IsCalculationDone == 0 ){
+         IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
+         //break;
+         }
+         }
+         numWhile++;
+         //std::cout <<"numWhile = "<< numWhile <<std::endl;
          
-         BlockIterD = dBlock.getElement( 0 );*/
-        
-        /**------------------------------------------------------------------------------------------------*/
+         // for( int j = numBlocksY-1; j>-1; j-- ){
+         // for( int i = 0; i < numBlocksX; i++ )
+         // std::cout << BlockIterHost[ j * numBlocksX + i ];
+         // std::cout << std::endl;
+         // }
+         // std::cout << std::endl;
+         
+         this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY );
+         
+         //std::cout<<std::endl;
+         //String s( "aux-"+ std::to_string(numWhile) + ".tnl");
+         //aux.save( s );
+         }
+         if( numWhile == 1 ){
+         auxPtr = helpFunc;
+         }
+         */
+  /**-END-OF-OMP-PARALLEL------------------------------------------------**/
         
         
-        /** HERE IS FIM **/
+  // FSM FOR MPI and WITHOUT MPI
+        StaticVector boundsFrom; StaticVector boundsTo;
+    // UP and RIGHT
+        boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
+        boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
+        calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+        //aux.save("aux-1.tnl");
         
-        helpFunc1 = auxPtr;
-        auxPtr = helpFunc;
-        helpFunc = helpFunc1;
-        TNL_CHECK_CUDA_DEVICE;
+    // UP and LEFL
+        boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = -1 + vecLowerOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+        //aux.save( "aux-2.tnl" );
         
-        //int pocBloku = 0;
-        Devices::Cuda::synchronizeDevice();
-        CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-                interfaceMapPtr.template getData< Device >(),
-                auxPtr.template modifyData< Device>(),
-                helpFunc.template modifyData< Device>(),
-                BlockIterDevice.getView() );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+    // DOWN and RIGHT
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
+        boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+        //aux.save( "aux-3.tnl" );
         
-        //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl;
-        //BlockIterPom1 = BlockIterDevice;
-        ///for( int i =0; i< numBlocksX; i++ ){
-        //  for( int j = 0; j < numBlocksY; j++ )
-        //  {
-        //    std::cout << BlockIterPom1[j*numBlocksX + i];
-        //  }
-        //  std::cout << std::endl;
-        //}
-        //std::cout << std::endl;
+    // DOWN and LEFT
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         
-        GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY );
-        cudaDeviceSynchronize();
+      }
+      if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain ) // should we calculate on CUDA?
+      {
+        calculateMPIAgain = 0;
+          
+#ifdef HAVE_CUDA
         TNL_CHECK_CUDA_DEVICE;
-        BlockIterDevice = BlockIterPom;
-        
-        //std::cout<< "Probehlo" << std::endl;
+        // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel.
+        // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2)
+        const int cudaBlockSize( 16 );
         
-        //TNL::swap( auxPtr, helpFunc );
+        // Setting number of threads and blocks for kernel
+        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
+        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
+        dim3 blockSize( cudaBlockSize, cudaBlockSize );
+        dim3 gridSize( numBlocksX, numBlocksY );
         
+        // Need for calling functions from kernel
+        BaseType ptr;
         
-        CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) );
-        TNL_CHECK_CUDA_DEVICE;
+        // True if we should calculate again.
+        int calculateCudaBlocksAgain = 1;
         
-        CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
+        // Array that identifies which blocks should be calculated.
+        // All blocks should calculate in first passage ( setValue(1) )
+        TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicator( numBlocksX * numBlocksY );
+        blockCalculationIndicator.setValue( 1 );
         TNL_CHECK_CUDA_DEVICE;
         
+        // Array into which we identify the neighbours and then copy it into blockCalculationIndicator
+        TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicatorHelp(numBlocksX * numBlocksY );
+        blockCalculationIndicatorHelp.setValue( 0 );
         
-        BlockIterD = dBlock.getElement( 0 );
-        //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+        // number of Blocks for kernel that calculates neighbours.
+        int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
         
+        // Helping meshFunction that switches with AuxPtr in every calculation of CudaUpdateCellCaller<<<>>>()
+        MeshFunctionPointer helpFunc( mesh );
+        helpFunc.template modifyData() = auxPtr.template getData(); 
         
-        /**-----------------------------------------------------------------------------------------------------------*/
-        /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
-         BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
-        numIter ++;
-      }
-      if( numIter == 1 ){
-        auxPtr = helpFunc;
+        // number of iterations of while calculateCudaBlocksAgain
+        int numIter = 0;
+               
+        //int oddEvenBlock = 0;
+        while( calculateCudaBlocksAgain )
+        {
+  /** HERE IS CHESS METHOD (NO MPI) **/
+          
+          /*
+           CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+           interfaceMapPtr.template getData< Device >(),
+           auxPtr.template getData< Device>(),
+           helpFunc.template modifyData< Device>(),
+           blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps,
+           oddEvenBlock );
+           cudaDeviceSynchronize();
+           TNL_CHECK_CUDA_DEVICE;
+           
+           oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+           
+           CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+           interfaceMapPtr.template getData< Device >(),
+           helpFunc.template getData< Device>(),
+           auxPtr.template modifyData< Device>(),
+           blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps, 
+           oddEvenBlock );
+           cudaDeviceSynchronize();
+           TNL_CHECK_CUDA_DEVICE;
+           
+           oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+           
+           calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1);
+          */
+  /**------------------------------------------------------------------------------------------------*/
+          
+          
+  /** HERE IS FIM FOR MPI AND WITHOUT MPI **/
+          Devices::Cuda::synchronizeDevice();
+          CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(),
+                  auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(),
+                  blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          
+          // Switching helpFunc and auxPtr.
+          auxPtr.swap( helpFunc );
+          
+          // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now.
+          Devices::Cuda::synchronizeDevice(); 
+          GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          blockCalculationIndicator = blockCalculationIndicatorHelp;
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          
+          // "Parallel reduction" to see if we should calculate again calculateCudaBlocksAgain
+          calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1);
+          
+          // When we change something then we should caclucate again in the next passage of MPI ( calculated = true )
+         if( calculateCudaBlocksAgain ){
+            calculatedBefore = 1;
+          }
+          
+/**-----------------------------------------------------------------------------------------------------------*/
+          numIter ++;
+        }
+        if( numIter%2 == 1 ) // Need to check parity for MPI overlaps to synchronize ( otherwise doesnt work )
+        {
+          helpFunc.swap( auxPtr );
+          Devices::Cuda::synchronizeDevice();
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+        }
+        aux = *auxPtr;
+        interfaceMap = *interfaceMapPtr;
+#endif
       }
-      /*cudaFree( BlockIterDevice );
-       cudaFree( dBlock );
-       delete BlockIter;*/
-      cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
+
       
-      aux = *auxPtr;
-      interfaceMap = *interfaceMapPtr;
+/**----------------------MPI-TO-DO---------------------------------------------**/        
+#ifdef HAVE_MPI
+      if( CommunicatorType::isDistributed() ){
+        getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh );
+       
+        aux.template synchronize< Communicator >();
+      }
 #endif
+      if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculated 0!
+        calculatedBefore = 0;
     }
     iteration++;
   }
-  //#endif
-  aux.save("aux-final.tnl");
+  Aux=auxPtr; // copy it for MakeSnapshot
 }
 
 
-#ifdef HAVE_CUDA
-
+// PROTECTED FUNCTIONS:
 
-template < typename Index >
-__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY )
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+void 
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
+setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
+              const MeshPointer& mesh)
 {
-  int i = blockIdx.x * 1024 + threadIdx.x;
-  
-  if( i < numBlockX * numBlockY )
+  vecLowerOverlaps[0] = 0; vecLowerOverlaps[1] = 0; vecUpperOverlaps[0] = 0; vecUpperOverlaps[1] = 0;
+#ifdef HAVE_MPI
+  if( CommunicatorType::isDistributed() ) //If we started solver with MPI
   {
-    int pom = 0;//BlockIterPom[ i ] = 0;
-    int m=0, k=0;
-    m = i%numBlockX;
-    k = i/numBlockX;
-    if( m > 0 && BlockIterDevice[ i - 1 ] ){
-      pom = 1;//BlockIterPom[ i ] = 1;
-    }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){
-      pom = 1;//BlockIterPom[ i ] = 1;
-    }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){
-      pom = 1;// BlockIterPom[ i ] = 1;
-    }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){
-      pom = 1;//BlockIterPom[ i ] = 1;
-    }
-    
-    BlockIterPom[ i ] = pom;//BlockIterPom[ i ];
+    //Distributed mesh for MPI overlaps (without MPI null pointer)
+    const Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh();
+    vecLowerOverlaps = meshPom->getLowerOverlap();
+    vecUpperOverlaps = meshPom->getUpperOverlap();
   }
+#endif
 }
 
-template < typename Index >
-__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks )
+
+
+
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+bool 
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
+goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, 
+        MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
+        const AnisotropyPointer& anisotropy )
 {
-  int i = threadIdx.x;
-  int blId = blockIdx.x;
-  int blockSize = blockDim.x;
-  /*if ( i == 0 && blId == 0 ){
-   printf( "nBlocks = %d \n", nBlocks );
-   for( int j = nBlocks-1; j > -1 ; j--){
-   printf( "cislo = %d \n", BlockIterDevice[ j ] );
-   }
-   }*/
-  __shared__ int sArray[ 1024 ];
-  sArray[ i ] = 0;
-  if( blId * 1024 + i < nBlocks )
-    sArray[ i ] = BlockIterDevice[ blId * 1024 + i ];
-  __syncthreads();
-  /*extern __shared__ volatile int sArray[];
-   unsigned int i = threadIdx.x;
-   unsigned int gid = blockIdx.x * blockSize * 2 + threadIdx.x;
-   unsigned int gridSize = blockSize * 2 * gridDim.x;
-   sArray[ i ] = 0;
-   while( gid < nBlocks )
-   {
-   sArray[ i ] += BlockIterDevice[ gid ] + BlockIterDevice[ gid + blockSize ];
-   gid += gridSize;
-   }
-   __syncthreads();*/
+  bool calculated = false;
+  const MeshType& mesh = aux.getMesh();
+  const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1;
+  const IndexType stepY = boundsFrom[1] < boundsTo[1]? 1 : -1;
   
-  if ( blockSize == 1024) {
-    if (i < 512)
-      sArray[ i ] += sArray[ i + 512 ];
-  }
-  __syncthreads();
-  if (blockSize >= 512) {
-    if (i < 256) {
-      sArray[ i ] += sArray[ i + 256 ];
-    }
-  }
-  __syncthreads();
-  if (blockSize >= 256) {
-    if (i < 128) {
-      sArray[ i ] += sArray[ i + 128 ];
-    }
-  }
-  __syncthreads();
-  if (blockSize >= 128) {
-    if (i < 64) {
-      sArray[ i ] += sArray[ i + 64 ];
-    }
-  }
-  __syncthreads();
-  if (i < 32 )
+  typename MeshType::Cell cell( mesh );
+  cell.refresh();
+  
+  for( cell.getCoordinates().y() = boundsFrom[1];
+          TNL::abs( cell.getCoordinates().y() - boundsTo[1] ) > 0;
+          cell.getCoordinates().y() += stepY )
   {
-    if(  blockSize >= 64 ) sArray[ i ] += sArray[ i + 32 ];
-    if(  blockSize >= 32 )  sArray[ i ] += sArray[ i + 16 ];
-    if(  blockSize >= 16 )  sArray[ i ] += sArray[ i + 8 ];
-    if(  blockSize >= 8 )  sArray[ i ] += sArray[ i + 4 ];
-    if(  blockSize >= 4 )  sArray[ i ] += sArray[ i + 2 ];
-    if(  blockSize >= 2 )  sArray[ i ] += sArray[ i + 1 ];
+    for( cell.getCoordinates().x() = boundsFrom[0];
+           TNL::abs( cell.getCoordinates().x() - boundsTo[0] ) > 0;
+            cell.getCoordinates().x() += stepX )
+    {
+      cell.refresh();
+      if( ! interfaceMap( cell ) )
+      {
+        calculated = this->updateCell( aux, cell ) || calculated;
+      }
+    }
   }
-  
-  if( i == 0 )
-    dBlock[ blId ] = sArray[ 0 ];
+  return calculated;
 }
 
 
 
-template < int sizeSArray, typename Real, typename Device, typename Index >
-__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
-        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock )
+
+#ifdef HAVE_MPI
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+void 
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
+getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
 {
-  int thri = threadIdx.x; int thrj = threadIdx.y;
-  int i = threadIdx.x + blockDim.x*blockIdx.x;
-  int j = blockDim.y*blockIdx.y + threadIdx.y;
-  /** FOR CHESS METHOD */
-  //if( (blockIdx.y%2  + blockIdx.x) % 2 == oddEvenBlock )
-  //{
-  /**------------------------------------------*/
+  Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
   
+  int calculateFromNeighbours[4] = {0,0,0,0};
+  const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh
+  MPI::Request *requestsInformation;
+  requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ];  
   
-  /** FOR FIM METHOD */
+  int neighCount = 0; // should this thread calculate again?
   
-  if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] )
-  { 
-    __syncthreads();
-    /**-----------------------------------------*/
-    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
-    __shared__ int dimX;
-    __shared__ int dimY;
-    __shared__ Real hx;
-    __shared__ Real hy;
-    if( thri==0 && thrj == 0)
-    {
-      dimX = mesh.getDimensions().x();
-      dimY = mesh.getDimensions().y();
-      hx = mesh.getSpaceSteps().x();
-      hy = mesh.getSpaceSteps().y();
-      BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 0;
-    }
-    __syncthreads();
-    int numOfBlockx;
-    int numOfBlocky;
-    int xkolik;
-    int ykolik;
-    
-    xkolik = blockDim.x + 1;
-    ykolik = blockDim.y + 1;
-    numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
-    numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
-    
-    if( numOfBlockx - 1 == blockIdx.x )
-      xkolik = dimX - (blockIdx.x)*blockDim.x+1;
-    
-    if( numOfBlocky -1 == blockIdx.y )
-      ykolik = dimY - (blockIdx.y)*blockDim.y+1;
-    __syncthreads();
-    
-    int currentIndex = thrj * blockDim.x + thri;
-    //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
-    __shared__ volatile bool changed[ (sizeSArray-2)*(sizeSArray-2)];
-    changed[ currentIndex ] = false;
-    if( thrj == 0 && thri == 0 )
-      changed[ 0 ] = true;
-    
-    
-    //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
-    __shared__ volatile Real sArray[ sizeSArray * sizeSArray ];
-    sArray[ thrj * sizeSArray + thri ] = std::numeric_limits< Real >::max();
-    
-    //filling sArray edges
-    if( thri == 0 )
-    {      
-      if( dimX > (blockIdx.x+1) * blockDim.x  && thrj+1 < ykolik )
-        sArray[(thrj+1)*sizeSArray + xkolik] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
-      else
-        sArray[(thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 1 )
-    {
-      if( blockIdx.x != 0 && thrj+1 < ykolik )
-        sArray[(thrj+1)*sizeSArray + 0] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX ];
-      else
-        sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 2 )
-    {
-      if( dimY > (blockIdx.y+1) * blockDim.y  && thrj+1 < xkolik )
-        sArray[ ykolik*sizeSArray + thrj+1 ] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
-      else
-        sArray[ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
-      
-    }
-    
-    if( thri == 3 )
-    {
-      if( blockIdx.y != 0 && thrj+1 < xkolik )
-        sArray[0*sizeSArray + thrj+1] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + thrj+1 ];
-      else
-        sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
-    }
-    
-    if( i < dimX && j < dimY )
-    {    
-      sArray[(thrj+1)*sizeSArray + thri+1] = aux[ j*dimX + i ];
-    }
-    __syncthreads();  
-    
-    while( changed[ 0 ] )
-    {
-      __syncthreads();
-      
-      changed[ currentIndex] = false;
-      
-      //calculation of update cell
-      if( i < dimX && j < dimY )
-      {
-        if( ! interfaceMap[ j * dimX + i ] )
-        {
-          changed[ currentIndex ] = ptr.updateCell<sizeSArray>( sArray, thri+1, thrj+1, hx,hy);
-        }
-      }
-      __syncthreads();
-      
-      //pyramid reduction
-      if( blockDim.x*blockDim.y == 1024 )
-      {
-        if( currentIndex < 512 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
-        }
-      }
-      __syncthreads();
-      if( blockDim.x*blockDim.y >= 512 )
-      {
-        if( currentIndex < 256 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
-        }
-      }
-      __syncthreads();
-      if( blockDim.x*blockDim.y >= 256 )
-      {
-        if( currentIndex < 128 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
-        }
-      }
-      __syncthreads();
-      if( blockDim.x*blockDim.y >= 128 )
-      {
-        if( currentIndex < 64 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
-        }
-      }
-      __syncthreads();
-      if( currentIndex < 32 ) 
-      {
-        if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
-        if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
-        if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
-        if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
-        if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
-        if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
-      }
-      if( thri == 0 && thrj == 0 && changed[ 0 ] ){
-        BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 1;
-      }
-      /*if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 )
-       {
-       for( int k = 15; k>-1; k-- ){
-       for( int l = 0; l < 16; l++ )
-       printf( "%f\t", sArray[k * 16 + l]);
-       printf( "\n");
-       }
-       printf( "\n");
-       }*/
-      __syncthreads();
-    }
-    if( i < dimX && j < dimY )
-      helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ];
-    
-  } 
+  if( neighbours[0] != -1 ) // LEFT
+  {
+    requestsInformation[neighCount++] =
+            MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup );
+  }
+  
+  if( neighbours[1] != -1 ) // RIGHT
+  {
+    requestsInformation[neighCount++] =
+            MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); 
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup );
+  }
+  
+  if( neighbours[2] != -1 ) //UP
+  {
+    requestsInformation[neighCount++] = 
+            MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] =
+            MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup  );
+  }
+  
+  if( neighbours[5] != -1 ) //DOWN
+  {
+    requestsInformation[neighCount++] = 
+            MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup );
+  }
+  MPI::WaitAll( requestsInformation, neighCount );
+  
+  MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR,  MPI::AllGroup );
+  calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] ||
+              calculateFromNeighbours[2] || calculateFromNeighbours[3];
 }
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 2a1183bc2007c1c4c6498c1027b7cdc7ef3a3e1d..325b626f7bf5262637f8e1b43ec9e156bbeca26b 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -13,13 +13,13 @@
 
 #pragma once
 
-#include "tnlFastSweepingMethod.h"
 
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
-FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 FastSweepingMethod()
 : maxIterations( 1 )
 {
@@ -29,9 +29,10 @@ FastSweepingMethod()
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
 const Index&
-FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 getMaxIterations() const
 {
   
@@ -40,9 +41,10 @@ getMaxIterations() const
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
 void
-FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 setMaxIterations( const IndexType& maxIterations )
 {
   
@@ -51,10 +53,12 @@ setMaxIterations( const IndexType& maxIterations )
 template< typename Real,
         typename Device,
         typename Index,
+        typename Communicator,
         typename Anisotropy >
 void
-FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
 solve( const MeshPointer& mesh,
+        MeshFunctionPointer& Aux,
         const AnisotropyPointer& anisotropy,
         MeshFunctionPointer& u )
 {
@@ -62,8 +66,13 @@ solve( const MeshPointer& mesh,
   InterfaceMapPointer interfaceMapPtr;
   auxPtr->setMesh( mesh );
   interfaceMapPtr->setMesh( mesh );
+  
+  // getting overlaps ( WITHOUT MPI SHOULD BE 0 )
+  Containers::StaticVector< 3, IndexType > vecLowerOverlaps, vecUpperOverlaps;
+  setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh );
+  
   std::cout << "Initiating the interface cells ..." << std::endl;
-  BaseType::initInterface( u, auxPtr, interfaceMapPtr );
+  BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps );
   auxPtr->save( "aux-ini.tnl" );   
   
   typename MeshType::Cell cell( *mesh );
@@ -71,594 +80,436 @@ solve( const MeshPointer& mesh,
   IndexType iteration( 0 );
   MeshFunctionType aux = *auxPtr;
   InterfaceMapType interfaceMap = * interfaceMapPtr;
+  aux.template synchronize< Communicator >(); //synchronization of intial conditions
+  
   while( iteration < this->maxIterations )
   {
-    if( std::is_same< DeviceType, Devices::Host >::value )
+    // indicates weather we calculated in the last passage of the while cycle 
+    // calculatedBefore is same for all ranks 
+    // without MPI should be FALSE at the end of while cycle body
+    int calculatedBefore = 1; 
+    
+    // indicates if the MPI process should calculate again in upcoming passage of cycle
+    // calculateMPIAgain is a value that can differ in every rank
+    // without MPI should be FALSE at the end of while cycle body
+    int calculateMPIAgain = 1; 
+    
+    while( calculatedBefore )
     {
-      int numThreadsPerBlock = 64;
-      
+      calculatedBefore = 0;
       
-      int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
-      int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
-      int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 1:0);
-      //std::cout << "numBlocksX = " << numBlocksX << std::endl;
-      
-      /*Real **sArray = new Real*[numBlocksX*numBlocksY];
-       for( int i = 0; i < numBlocksX * numBlocksY; i++ )
-       sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];*/
-      
-      ArrayContainer BlockIterHost;
-      BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ );
-      BlockIterHost.setValue( 1 );
-      int IsCalculationDone = 1;
-      
-      MeshFunctionPointer helpFunc( mesh );
-      MeshFunctionPointer helpFunc1( mesh );
-      helpFunc1 = auxPtr;
-      auxPtr = helpFunc;
-      helpFunc = helpFunc1;
-      //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl;
-      /*for( int k = numBlocksX-1; k >-1; k-- ){
-       for( int l = 0; l < numBlocksY; l++ ){
-       std::cout<< BlockIterHost[ l*numBlocksX  + k ];
-       }
-       std::cout<<std::endl;
-       }
-       std::cout<<std::endl;*/
-      unsigned int numWhile = 0;
-      while( IsCalculationDone  )
-      {      
-        IsCalculationDone = 0;
-        helpFunc1 = auxPtr;
-        auxPtr = helpFunc;
-        helpFunc = helpFunc1;
-        this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+      if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host?
+      {
+        calculateMPIAgain = 0;
         
-        //Reduction      
-        for( int i = 0; i < BlockIterHost.getSize(); i++ ){
-          if( IsCalculationDone == 0 ){
-            IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
-            //break;
-          }
-        }
-        numWhile++;
-        std::cout <<"numWhile = "<< numWhile <<std::endl;
-        /*for( int k = 0; k < numBlocksZ; k++ ){
-          for( int j = numBlocksY-1; j>-1; j-- ){
-            for( int i = 0; i < numBlocksX; i++ ){
-              //std::cout << (*auxPtr)[ k * numBlocksX * numBlocksY + j * numBlocksX + i ] << " ";
-              std::cout << BlockIterHost[ k * numBlocksX * numBlocksY + j * numBlocksX + i ];
-            }
-            std::cout << std::endl;
-          }
-          std::cout << std::endl;
-        }
-        std::cout << std::endl;*/
+/** HERE IS FSM FOR OPENMP (NO MPI) - isnt worthy */
+        /*int numThreadsPerBlock = -1;
+         
+         numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0));
+         //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+         if( numThreadsPerBlock <= 16 )
+         numThreadsPerBlock = 16;
+         else if(numThreadsPerBlock <= 32 )
+         numThreadsPerBlock = 32;
+         else if(numThreadsPerBlock <= 64 )
+         numThreadsPerBlock = 64;
+         else if(numThreadsPerBlock <= 128 )
+         numThreadsPerBlock = 128;
+         else if(numThreadsPerBlock <= 256 )
+         numThreadsPerBlock = 256;
+         else if(numThreadsPerBlock <= 512 )
+         numThreadsPerBlock = 512;
+         else
+         numThreadsPerBlock = 1024;
+         //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+         
+         if( numThreadsPerBlock == -1 ){
+            printf("Fail in setting numThreadsPerBlock.\n");
+         break;
+         }
+         
+         int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
+         int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
+         int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 1:0);
+         //std::cout << "numBlocksX = " << numBlocksX << std::endl;
+         
+         //Real **sArray = new Real*[numBlocksX*numBlocksY];
+         // for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+         // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];
+         
+         ArrayContainer BlockIterHost;
+         BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ );
+         BlockIterHost.setValue( 1 );
+         int IsCalculationDone = 1;
+         
+         MeshFunctionPointer helpFunc( mesh );
+         MeshFunctionPointer helpFunc1( mesh );
+         helpFunc1 = auxPtr;
+         auxPtr = helpFunc;
+         helpFunc = helpFunc1;
+         //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl;
+         //for( int k = numBlocksX-1; k >-1; k-- ){
+         //for( int l = 0; l < numBlocksY; l++ ){
+         // std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+         // }
+         // std::cout<<std::endl;
+         // }
+         // std::cout<<std::endl;
+         unsigned int numWhile = 0;
+         while( IsCalculationDone  )
+         {      
+         IsCalculationDone = 0;
+         helpFunc1 = auxPtr;
+         auxPtr = helpFunc;
+         helpFunc = helpFunc1;
+         switch ( numThreadsPerBlock ){
+         case 16:
+         this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 32:
+         this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 64:
+         this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 128:
+         this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 256:
+         this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         case 512:
+         this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         default:
+         this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock );
+         }
+         //Reduction      
+         for( int i = 0; i < BlockIterHost.getSize(); i++ ){
+         if( IsCalculationDone == 0 ){
+         IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
+         //break;
+         }
+         }
+         numWhile++;
+         
+         
+         this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ );
+         
+         //string s( "aux-"+ std::to_string(numWhile) + ".tnl");
+         //aux.save( s );
+         }
+         if( numWhile == 1 ){
+         auxPtr = helpFunc;
+         }
+         aux = *auxPtr;*/
+/**------------------------------------------------------------------------------*/
         
-        this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ );
         
-        /*for( int k = 0; k < numBlocksZ; k++ ){
-          for( int j = numBlocksY-1; j>-1; j-- ){
-            for( int i = 0; i < numBlocksX; i++ ){
-              //std::cout << (*auxPtr)[ k * numBlocksX * numBlocksY + j * numBlocksX + i ] << " ";
-              std::cout << BlockIterHost[ k * numBlocksX * numBlocksY + j * numBlocksX + i ];
-            }
-            std::cout << std::endl;
-          }
-          std::cout << std::endl;
-        }*/
+/** HERE IS FSM WITH MPI AND WITHOUT MPI */
+        StaticVector boundsFrom; StaticVector boundsTo;
         
-        /*for( int j = numBlocksY-1; j>-1; j-- ){
-         for( int i = 0; i < numBlocksX; i++ )
-         std::cout << "BlockIterHost = "<< j*numBlocksX + i<< " ," << BlockIterHost[ j * numBlocksX + i ];
-         std::cout << std::endl;
-         }
-         std::cout << std::endl;*/
+    // TOP, NORTH and EAST        
+        boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
+        boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
+        boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
+        calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
         
-        //std::cout<<std::endl;
-        //string s( "aux-"+ std::to_string(numWhile) + ".tnl");
-        //aux.save( s );
-      }
-      if( numWhile == 1 ){
-        auxPtr = helpFunc;
+    // TOP, NORTH and WEST        
+        boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
+        boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+        
+    // TOP, SOUTH and EAST        
+        boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
+        boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+        
+    // TOP, SOUTH and WEST        
+        boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2];
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; 
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+            
+    // BOTTOM, NOTH and EAST        
+        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
+        boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
+        boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); 
+        
+    // BOTTOM, NOTH and WEST        
+        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
+        boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1];
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; 
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );
+        
+    // BOTTOM, SOUTH and EAST        
+        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
+        boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );    
+        
+    // BOTTOM, SOUTH and WEST        
+        boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2];
+        boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1];
+        boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0];
+        goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy );    
+        
+        
+  /**----------------------------------------------------------------------------------*/
       }
-      aux = *auxPtr;
-      
-      /*for( cell.getCoordinates().z() = 0;
-       cell.getCoordinates().z() < mesh->getDimensions().z();
-       cell.getCoordinates().z()++ )
-       {
-       for( cell.getCoordinates().y() = 0;
-       cell.getCoordinates().y() < mesh->getDimensions().y();
-       cell.getCoordinates().y()++ )
-       {
-       for( cell.getCoordinates().x() = 0;
-       cell.getCoordinates().x() < mesh->getDimensions().x();
-       cell.getCoordinates().x()++ )
-       {
-       cell.refresh();
-       if( ! interfaceMap( cell ) )
-       this->updateCell( aux, cell );
-       }
-       }
-       }
-       //aux.save( "aux-1.tnl" );
-       
-       for( cell.getCoordinates().z() = 0;
-       cell.getCoordinates().z() < mesh->getDimensions().z();
-       cell.getCoordinates().z()++ )
-       {
-       for( cell.getCoordinates().y() = 0;
-       cell.getCoordinates().y() < mesh->getDimensions().y();
-       cell.getCoordinates().y()++ )
-       {
-       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-       cell.getCoordinates().x() >= 0 ;
-       cell.getCoordinates().x()-- )		
-       {
-       //std::cerr << "2 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       }
-       //aux.save( "aux-2.tnl" );
-       for( cell.getCoordinates().z() = 0;
-       cell.getCoordinates().z() < mesh->getDimensions().z();
-       cell.getCoordinates().z()++ )
-       {
-       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-       cell.getCoordinates().y() >= 0 ;
-       cell.getCoordinates().y()-- )
-       {
-       for( cell.getCoordinates().x() = 0;
-       cell.getCoordinates().x() < mesh->getDimensions().x();
-       cell.getCoordinates().x()++ )
-       {
-       //std::cerr << "3 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       }
-       //aux.save( "aux-3.tnl" );
-       
-       for( cell.getCoordinates().z() = 0;
-       cell.getCoordinates().z() < mesh->getDimensions().z();
-       cell.getCoordinates().z()++ )
-       {
-       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-       cell.getCoordinates().y() >= 0;
-       cell.getCoordinates().y()-- )
-       {
-       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-       cell.getCoordinates().x() >= 0 ;
-       cell.getCoordinates().x()-- )		
-       {
-       //std::cerr << "4 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       }     
-       //aux.save( "aux-4.tnl" );
-       
-       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-       cell.getCoordinates().z() >= 0;
-       cell.getCoordinates().z()-- )
-       {
-       for( cell.getCoordinates().y() = 0;
-       cell.getCoordinates().y() < mesh->getDimensions().y();
-       cell.getCoordinates().y()++ )
-       {
-       for( cell.getCoordinates().x() = 0;
-       cell.getCoordinates().x() < mesh->getDimensions().x();
-       cell.getCoordinates().x()++ )
-       {
-       //std::cerr << "5 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )
-       this->updateCell( aux, cell );
-       }
-       }
-       }
-       //aux.save( "aux-5.tnl" );
-       
-       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-       cell.getCoordinates().z() >= 0;
-       cell.getCoordinates().z()-- )
-       {
-       for( cell.getCoordinates().y() = 0;
-       cell.getCoordinates().y() < mesh->getDimensions().y();
-       cell.getCoordinates().y()++ )
-       {
-       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-       cell.getCoordinates().x() >= 0 ;
-       cell.getCoordinates().x()-- )		
-       {
-       //std::cerr << "6 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       }
-       //aux.save( "aux-6.tnl" );
-       
-       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-       cell.getCoordinates().z() >= 0;
-       cell.getCoordinates().z()-- )
-       {
-       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-       cell.getCoordinates().y() >= 0 ;
-       cell.getCoordinates().y()-- )
-       {
-       for( cell.getCoordinates().x() = 0;
-       cell.getCoordinates().x() < mesh->getDimensions().x();
-       cell.getCoordinates().x()++ )
-       {
-       //std::cerr << "7 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       }
-       //aux.save( "aux-7.tnl" );
-       
-       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-       cell.getCoordinates().z() >= 0;
-       cell.getCoordinates().z()-- )
-       {
-       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-       cell.getCoordinates().y() >= 0;
-       cell.getCoordinates().y()-- )
-       {
-       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-       cell.getCoordinates().x() >= 0 ;
-       cell.getCoordinates().x()-- )		
-       {
-       //std::cerr << "8 -> ";
-       cell.refresh();
-       if( ! interfaceMap( cell ) )            
-       this->updateCell( aux, cell );
-       }
-       }
-       }*/
-    }
-    if( std::is_same< DeviceType, Devices::Cuda >::value )
-    {
-      // TODO: CUDA code
-#ifdef HAVE_CUDA
-      const int cudaBlockSize( 8 );
-      int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
-      int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
-      int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); 
-      if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
-        std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
-      dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
-      dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
-      
-      tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
-      
-      
-      int BlockIterD = 1;
-      
-      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
-      BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ );
-      BlockIterDevice.setValue( 1 );
-      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
-      BlockIterPom.setSize( numBlocksX * numBlocksY * numBlocksZ );
-      BlockIterPom.setValue( 0 );
-      /*int *BlockIterDevice;
-       cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/
-      int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
-      
-      TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
-      dBlock.setSize( nBlocks );
-      dBlock.setValue( 0 );
-      
-      int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0);
-      /*int *dBlock;
-       cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
-      MeshFunctionPointer helpFunc1( mesh );      
-      MeshFunctionPointer helpFunc( mesh );
-      
-      helpFunc1 = auxPtr;
-      auxPtr = helpFunc;
-      helpFunc = helpFunc1;
-      int numIter = 0;
-      
-      while( BlockIterD )
+      if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain )
       {
-        helpFunc1 = auxPtr;
-        auxPtr = helpFunc;
-        helpFunc = helpFunc1;
-        TNL_CHECK_CUDA_DEVICE;
+#ifdef HAVE_CUDA
+        // cudaBlockSize is a size of blocks. It's the number raised to the 3 power.
+        // the number should be less than 10^3 (num of threads in one grid is maximally 1024)
+        // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2)
+        const int cudaBlockSize( 8 );
         
-        CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
-                interfaceMapPtr.template getData< Device >(),
-                auxPtr.template getData< Device>(),
-                helpFunc.template modifyData< Device>(),
-                BlockIterDevice.getView() );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+        // Getting the number of blocks in grid in each direction (without overlaps bcs we dont calculate on overlaps)
+        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
+        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
+        int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); 
+        if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
+          std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
         
-        GetNeighbours3D<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
-        BlockIterDevice = BlockIterPom;
+        // Making the variables for global function CudaUpdateCellCaller.
+        dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+        dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
         
-        CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY * numBlocksZ ) );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+        BaseType ptr; // tnlDirectEikonalMethodBase type for calling of function inside CudaUpdateCellCaller
+        
+        
+        int BlockIterD = 1; //variable that tells us weather we should calculate the main cuda body again
         
-        CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
+        // Array containing information about each block in grid, answering question (Have we calculated in this block?)
+        TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice( numBlocksX * numBlocksY * numBlocksZ );
+        BlockIterDevice.setValue( 1 ); // calculate all in the first passage
+        
+        // Helping Array for GetNeighbours3D
+        TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom( numBlocksX * numBlocksY * numBlocksZ );
+        BlockIterPom.setValue( 0 ); //doesnt matter what number
+        
+        
+        
+        // number of neighbours in one block (1024 threads) for GetNeighbours3D
+        int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0);
+        
+        
+        //MeshFunctionPointer helpFunc1( mesh );      
+        MeshFunctionPointer helpFunc( mesh );
+        helpFunc.template modifyData() = auxPtr.template getData();
+        Devices::Cuda::synchronizeDevice(); 
+                
+        int numIter = 0; // number of passages of following while cycle
+        
+        while( BlockIterD ) //main body of cuda code
+        {
+          
+          Devices::Cuda::synchronizeDevice();          
+          // main function that calculates all values in each blocks
+          // calculated values are in helpFunc
+          CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
+                  interfaceMapPtr.template getData< Device >(),
+                  auxPtr.template getData< Device>(),
+                  helpFunc.template modifyData< Device>(),
+                  BlockIterDevice.getView(), vecLowerOverlaps, vecUpperOverlaps );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          // Switching pointers to helpFunc and auxPtr so real results are in memory of helpFunc but here under variable auxPtr
+          auxPtr.swap( helpFunc );
+          
+          Devices::Cuda::synchronizeDevice();
+          // Neighbours of blocks that calculatedBefore in this passage should calculate in the next!
+          // BlockIterDevice contains blocks that calculatedBefore in this passage and BlockIterPom those that should calculate in next (are neighbours)
+          GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          BlockIterDevice = BlockIterPom;
+          Devices::Cuda::synchronizeDevice();
+          
+          // .containsValue(1) is actually parallel reduction implemented in TNL
+          BlockIterD = BlockIterDevice.containsValue(1);
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          
+          numIter++;
+          if( BlockIterD ){ 
+            // if we calculated in this passage, we should send the info via MPI so neighbours should calculate after synchronization
+            calculatedBefore = 1;
+          }
+        }
+        if( numIter%2 == 1 ){
+          
+          // We need auxPtr to point on memory of original auxPtr (not to helpFunc)
+          // last passage of previous while cycle didnt calculate any number anyway so switching names doesnt effect values
+          auxPtr.swap( helpFunc ); 
+          Devices::Cuda::synchronizeDevice();
+        }
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
-        cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
-        numIter++;
-        /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
-         BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
-        
+        aux = *auxPtr;
+        interfaceMap = *interfaceMapPtr;
+#endif
       }
-      if( numIter == 1 ){
-        auxPtr = helpFunc;
+      
+#ifdef HAVE_MPI
+      if( CommunicatorType::isDistributed() )
+      {
+        getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh );
+
+        // synchronizate the overlaps 
+        aux.template synchronize< Communicator >();
+
       }
-      //cudaFree( BlockIterDevice );
-      //cudaFree( dBlock );
-      cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
-      aux = *auxPtr;
-      interfaceMap = *interfaceMapPtr;
 #endif
+      
+      if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculatedBefore 0!
+        calculatedBefore = 0; //otherwise we would go throw the FSM code and CUDA FSM code again uselessly
     }
-    
     //aux.save( "aux-8.tnl" );
     iteration++;
     
   }
+  // Saving the results into Aux for MakeSnapshot function.
+  Aux = auxPtr; 
   aux.save("aux-final.tnl");
 }
 
-#ifdef HAVE_CUDA
-template < typename Index >
-__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
-        int numBlockX, int numBlockY, int numBlockZ )
+// PROTECTED FUNCTIONS:
+
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+void 
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
+setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
+              const MeshPointer& mesh)
 {
-  int i = blockIdx.x * 1024 + threadIdx.x;
-  
-  if( i < numBlockX * numBlockY * numBlockZ )
+  vecLowerOverlaps[0] = 0; vecLowerOverlaps[1] = 0; vecLowerOverlaps[2] = 0;
+  vecUpperOverlaps[0] = 0; vecUpperOverlaps[1] = 0; vecUpperOverlaps[2] = 0;
+#ifdef HAVE_MPI
+  if( CommunicatorType::isDistributed() ) //If we started solver with MPI
   {
-    int pom = 0;//BlockIterPom[ i ] = 0;
-    int m=0, l=0, k=0;
-    l = i/( numBlockX * numBlockY );
-    k = (i-l*numBlockX * numBlockY )/(numBlockX );
-    m = (i-l*numBlockX * numBlockY )%( numBlockX );
-    if( m > 0 && BlockIterDevice[ i - 1 ] ){
-      pom = 1;//BlockIterPom[ i ] = 1;
-    }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){
-      pom = 1;//BlockIterPom[ i ] = 1;
-    }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){
-      pom = 1;// BlockIterPom[ i ] = 1;
-    }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){
-      pom = 1;//BlockIterPom[ i ] = 1;
-    }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){
-      pom = 1;
-    }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){
-      pom = 1;
-    }
-    
-    BlockIterPom[ i ] = pom;//BlockIterPom[ i ];
+    //Distributed mesh for MPI overlaps (without MPI null pointer)
+    const Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshPom = mesh->getDistributedMesh();
+    vecLowerOverlaps = meshPom->getLowerOverlap();
+    vecUpperOverlaps = meshPom->getUpperOverlap();
   }
+#endif
 }
 
-template < int sizeSArray, typename Real, typename Device, typename Index >
-__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
-        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice )
+
+
+
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+bool 
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
+goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, 
+        MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
+        const AnisotropyPointer& anisotropy )
 {
-  int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
-  int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
-  int i = threadIdx.x + blockDim.x*blockIdx.x;
-  int j = blockDim.y*blockIdx.y + threadIdx.y;
-  int k = blockDim.z*blockIdx.z + threadIdx.z;
-  int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
+  bool calculated = false;
+  const MeshType& mesh = aux.getMesh();
+  const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1;
+  const IndexType stepY = boundsFrom[1] < boundsTo[1]? 1 : -1;
+  const IndexType stepZ = boundsFrom[2] < boundsTo[2]? 1 : -1;
   
-  if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] )
+  typename MeshType::Cell cell( mesh );
+  cell.refresh();
+  
+  for( cell.getCoordinates().z() = boundsFrom[2];
+          TNL::abs( cell.getCoordinates().z() - boundsTo[2] ) > 0;
+          cell.getCoordinates().z() += stepZ )
   {
-    __syncthreads();
-    
-    __shared__ volatile bool changed[ 8*8*8/*(sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)*/];
-    
-    changed[ currentIndex ] = false;
-    if( thrj == 0 && thri == 0 && thrk == 0 )
-      changed[ 0 ] = true;
-    
-    const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
-    __shared__ Real hx; __shared__ int dimX;
-    __shared__ Real hy; __shared__ int dimY;
-    __shared__ Real hz; __shared__ int dimZ;
-    
-    if( thrj == 1 && thri == 1 && thrk == 1 )
-    {
-      //printf( "We are in the calculation. Block = %d.\n",blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x  );
-      hx = mesh.getSpaceSteps().x();
-      hy = mesh.getSpaceSteps().y();
-      hz = mesh.getSpaceSteps().z();
-      dimX = mesh.getDimensions().x();
-      dimY = mesh.getDimensions().y();
-      dimZ = mesh.getDimensions().z();
-      BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0;
-    }
-    __shared__ volatile Real sArray[ 10*10*10/*sizeSArray * sizeSArray * sizeSArray*/ ];
-    sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = std::numeric_limits< Real >::max();
-    
-    //filling sArray edges
-    int numOfBlockx;
-    int numOfBlocky;
-    int numOfBlockz;
-    int xkolik;
-    int ykolik;
-    int zkolik;
-    xkolik = blockDim.x + 1;
-    ykolik = blockDim.y + 1;
-    zkolik = blockDim.z + 1;
-    numOfBlockx = gridDim.x;
-    numOfBlocky = gridDim.y;
-    numOfBlockz = gridDim.z;
-    __syncthreads();
-    
-    if( numOfBlockx - 1 == blIdx )
-      xkolik = dimX - (blIdx)*blockDim.x+1;
-    if( numOfBlocky -1 == blIdy )
-      ykolik = dimY - (blIdy)*blockDim.y+1;
-    if( numOfBlockz-1 == blIdz )
-      zkolik = dimZ - (blIdz)*blockDim.z+1;
-    __syncthreads();
-    
-    if( thri == 0 )
-    {        
-      if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
-        sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
-      else
-        sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 1 )
-    {
-      if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
-        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
-      else
-        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max();
-    }
-    if( thri == 2 )
-    {        
-      if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
-        sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
-      else
-        sArray[ (thrk+1) * sizeSArray * sizeSArray + 0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 3 )
-    {
-      if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
-        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
-      else
-        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
-    }
-    if( thri == 4 )
-    {        
-      if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
-        sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
-      else
-        sArray[0 * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 5 )
-    {
-      if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
-        sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
-      else
-        sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
-    }
-    
-    if( i < dimX && j < dimY && k < dimZ )
+    for( cell.getCoordinates().y() = boundsFrom[1];
+            TNL::abs( cell.getCoordinates().y() - boundsTo[1] ) > 0;
+            cell.getCoordinates().y() += stepY )
     {
-      sArray[(thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
-    }
-    __syncthreads(); 
-    
-    while( changed[ 0 ] )
-    {
-      __syncthreads();
-      
-      changed[ currentIndex ] = false;
-      
-      //calculation of update cell
-      if( i < dimX && j < dimY && k < dimZ )
-      {
-        if( ! interfaceMap[ k*dimX*dimY + j * dimX + i ] )
-        {
-          changed[ currentIndex ] = ptr.updateCell3D< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
-        }
-      }
-      __syncthreads();
-      
-      //pyramid reduction
-      if( blockDim.x*blockDim.y*blockDim.z == 1024 )
-      {
-        if( currentIndex < 512 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
-        }
-      }
-      __syncthreads();
-      if( blockDim.x*blockDim.y*blockDim.z >= 512 )
-      {
-        if( currentIndex < 256 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
-        }
-      }
-      __syncthreads();
-      if( blockDim.x*blockDim.y*blockDim.z >= 256 )
-      {
-        if( currentIndex < 128 )
-        {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
-        }
-      }
-      __syncthreads();
-      if( blockDim.x*blockDim.y*blockDim.z >= 128 )
+      for( cell.getCoordinates().x() = boundsFrom[0];
+              TNL::abs( cell.getCoordinates().x() - boundsTo[0] ) > 0;
+              cell.getCoordinates().x() += stepX )
       {
-        if( currentIndex < 64 )
+        cell.refresh();
+        if( ! interfaceMap( cell ) )
         {
-          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+          calculated = this->updateCell( aux, cell ) || calculated;
         }
       }
-      __syncthreads();
-      if( currentIndex < 32 )
-      {
-        if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
-        if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
-        if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
-        if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
-        if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
-        if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
-      }
-      __syncthreads();
-      
-      /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0)
-       {
-       //for(int m = 0; m < 8; m++){
-       int m = 4;
-       for(int n = 0; n<8; n++){
-       for(int b=0; b<8; b++)
-       printf(" %i ", changed[m*64 + n*8 + b]);
-       printf("\n");
-       }
-       printf("\n \n");
-       }
-       //}*/
-      
-      if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
-      {
-        //printf( "Setting block calculation. Block = %d.\n",blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x  );
-        BlockIterDevice[ blIdz * gridDim.x * gridDim.y + blIdy * gridDim.x + blIdx ] = 1;
-      }
-      __syncthreads();
     }
-    
-    if( i < dimX && j < dimY && k < dimZ )
-      helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thri+1 ];
-    
-  } 
-}  
+  }
+  return calculated;
+}
+
+
+
+
+#ifdef HAVE_MPI
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+void 
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
+getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
+{
+  Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
+  
+  int calculateFromNeighbours[6] = {0,0,0,0,0,0};
+        
+  const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh
+  MPI::Request *requestsInformation;
+  requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ];  
+  
+  int neighCount = 0; // should this thread calculate again?
+  
+  if( neighbours[0] != -1 ) // WEST
+  {
+    requestsInformation[neighCount++] =
+            MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup );
+  }
+  
+  if( neighbours[1] != -1 ) // EAST
+  {
+    requestsInformation[neighCount++] =
+            MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); 
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup );
+  }
+  
+  if( neighbours[2] != -1 ) //NORTH
+  {
+    requestsInformation[neighCount++] = 
+            MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] =
+            MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup );
+  }
+  
+  if( neighbours[5] != -1 ) //SOUTH
+  {
+    requestsInformation[neighCount++] = 
+            MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup );
+  }
+  
+  if( neighbours[8] != -1 ) // TOP 
+  {
+    requestsInformation[neighCount++] = 
+            MPI::ISend( &calculatedBefore, 1, neighbours[8], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[4], 1, neighbours[8], 0, MPI::AllGroup );
+  }
+  
+  if( neighbours[17] != -1 ) //BOTTOM
+  {
+    requestsInformation[neighCount++] =
+            MPI::ISend( &calculatedBefore, 1, neighbours[17], 0, MPI::AllGroup );
+    requestsInformation[neighCount++] = 
+            MPI::IRecv( &calculateFromNeighbours[5], 1, neighbours[17], 0, MPI::AllGroup );
+  }
+  
+  MPI::WaitAll( requestsInformation, neighCount );
+  
+  MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR,  MPI::AllGroup );
+  calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] ||
+                      calculateFromNeighbours[2] || calculateFromNeighbours[3] ||
+                      calculateFromNeighbours[4] || calculateFromNeighbours[5];
+}
 #endif