diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index 7d990c1bb1a5ae471337f35d33401d2cc88743e5..f712ce2cc9ff3de5701a8550722b8904d33c4229 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -19,102 +19,112 @@ class tnlDirectEikonalMethodsBase
 };
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 class tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >
 {
-   public:
-      
-      typedef Meshes::Grid< 1, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DevcieType;
-      typedef Index IndexType;
-      typedef Functions::MeshFunction< MeshType > MeshFunctionType;
-      typedef Functions::MeshFunction< MeshType, 1, bool > InterfaceMapType;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
-      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
-      
-      void initInterface( const MeshFunctionPointer& input,
-                          MeshFunctionPointer& output,
-                          InterfaceMapPointer& interfaceMap );
-      
-      template< typename MeshEntity >
-      __cuda_callable__ void updateCell( MeshFunctionType& u,
-                                         const MeshEntity& cell,
-                                         const RealType velocity = 1.0  );
-      
-      __cuda_callable__ bool updateCell( volatile Real sArray[18],
-                                         int thri, const Real h,
-                                         const Real velocity = 1.0 );
+  public:
+    
+    typedef Meshes::Grid< 1, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DevcieType;
+    typedef Index IndexType;
+    typedef Functions::MeshFunction< MeshType > MeshFunctionType;
+    typedef Functions::MeshFunction< MeshType, 1, bool > InterfaceMapType;
+    using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
+    using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
+    
+    void initInterface( const MeshFunctionPointer& input,
+            MeshFunctionPointer& output,
+            InterfaceMapPointer& interfaceMap );
+    
+    template< typename MeshEntity >
+    __cuda_callable__ void updateCell( MeshFunctionType& u,
+            const MeshEntity& cell,
+            const RealType velocity = 1.0  );
+    
+    __cuda_callable__ bool updateCell( volatile Real sArray[18],
+            int thri, const Real h,
+            const Real velocity = 1.0 );
 };
 
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
 {
-   public:
-      typedef Meshes::Grid< 2, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DevcieType;
-      typedef Index IndexType;
-      typedef Functions::MeshFunction< MeshType > MeshFunctionType;
-      typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
-      typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
-      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
-
-      void initInterface( const MeshFunctionPointer& input,
-                          MeshFunctionPointer& output,
-                          InterfaceMapPointer& interfaceMap );
-      
-      template< typename MeshEntity >
-      __cuda_callable__ void updateCell( MeshFunctionType& u,
-                                         const MeshEntity& cell,
-                                         const RealType velocity = 1.0 );
-      
-      template< int sizeSArray >
-      __cuda_callable__ bool updateCell( volatile Real *sArray,
-                                         int thri, int thrj, const Real hx, const Real hy,
-                                         const Real velocity = 1.0 );
-      
-      template< int sizeSArray >
-      void updateBlocks( InterfaceMapType interfaceMap,
-                         MeshFunctionType aux,
-                         MeshFunctionType helpFunc,
-                         ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
-      
-      void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
+  public:
+    typedef Meshes::Grid< 2, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DevcieType;
+    typedef Index IndexType;
+    typedef Functions::MeshFunction< MeshType > MeshFunctionType;
+    typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
+    typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+    using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
+    using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
+    
+    void initInterface( const MeshFunctionPointer& input,
+            MeshFunctionPointer& output,
+            InterfaceMapPointer& interfaceMap );
+    
+    template< typename MeshEntity >
+    __cuda_callable__ void updateCell( MeshFunctionType& u,
+            const MeshEntity& cell,
+            const RealType velocity = 1.0 );
+    
+    template< int sizeSArray >
+    __cuda_callable__ bool updateCell( volatile Real *sArray,
+            int thri, int thrj, const Real hx, const Real hy,
+            const Real velocity = 1.0 );
+    
+    template< int sizeSArray >
+    void updateBlocks( InterfaceMapType interfaceMap,
+            MeshFunctionType aux,
+            MeshFunctionType helpFunc,
+            ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
+    
+    void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
 };
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
 {
-   public:
-      typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DevcieType;
-      typedef Index IndexType;
-      typedef Functions::MeshFunction< MeshType > MeshFunctionType;
-      typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
-      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
-
-      void initInterface( const MeshFunctionPointer& input,
-                          MeshFunctionPointer& output,
-                          InterfaceMapPointer& interfaceMap );
-      
-      template< typename MeshEntity >
-      __cuda_callable__ void updateCell( MeshFunctionType& u,
-                                         const MeshEntity& cell,
-                                         const RealType velocity = 1.0);
-      
-      __cuda_callable__ bool updateCell( volatile Real sArray[10][10][10],
-                                         int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz,
-                                         const Real velocity = 1.0 );
+  public:
+    typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DevcieType;
+    typedef Index IndexType;
+    typedef Functions::MeshFunction< MeshType > MeshFunctionType;
+    typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType;
+    typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+    using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
+    using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
+    
+    void initInterface( const MeshFunctionPointer& input,
+            MeshFunctionPointer& output,
+            InterfaceMapPointer& interfaceMap );
+    
+    template< typename MeshEntity >
+    __cuda_callable__ void updateCell( MeshFunctionType& u,
+            const MeshEntity& cell,
+            const RealType velocity = 1.0);
+    
+    template< int sizeSArray >
+    void updateBlocks( const InterfaceMapType interfaceMap,
+            const MeshFunctionType aux,
+            MeshFunctionType& helpFunc,
+            ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
+    
+    void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
+    
+    template< int sizeSArray >
+    __cuda_callable__ bool updateCell3D( volatile Real *sArray,
+            int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz,
+            const Real velocity = 1.0 );
 };
 
 template < typename T1, typename T2 >
@@ -126,46 +136,46 @@ __cuda_callable__ void sortMinims( T1 pom[] );
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
-                                Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
-                                Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap  );
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap  );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux,
-                                      bool *BlockIterDevice );
+        const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap,
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux,
+        bool *BlockIterDevice );
 
 template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock =0);
+        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock =0);
 
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks );
+        TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks );
 
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                               TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
-                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
-                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap );
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
-                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
-                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );
 
 template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
+        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
 
 template < typename Index >
 __global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 5083544e2b31fbcd5fa02e727b4dd0d0756385bf..8f7937541d4708f519cd73a1cb50f0b582fbad48 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -148,6 +148,7 @@ updateBlocks( InterfaceMapType interfaceMap,
       }
       
       
+      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
       for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
       {        
         if( dimX > (blIdx+1) * numThreadsPerBlock  && thrj+1 < ykolik )
@@ -263,6 +264,370 @@ updateBlocks( InterfaceMapType interfaceMap,
     }
   }
 }
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateBlocks( const InterfaceMapType interfaceMap,
+        const MeshFunctionType aux,
+        MeshFunctionType& helpFunc,
+        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
+{  
+//#pragma omp parallel for schedule( dynamic )
+  for( int i = 0; i < BlockIterHost.getSize(); i++ )
+  {
+    if( BlockIterHost[ i ] )
+    {
+      MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
+      
+      int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+      int dimZ = mesh.getDimensions().z();
+      //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl;
+      int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlockz = dimZ/numThreadsPerBlock + ((dimZ%numThreadsPerBlock != 0) ? 1:0);
+      //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl;
+      int xkolik = numThreadsPerBlock + 1;
+      int ykolik = numThreadsPerBlock + 1;
+      int zkolik = numThreadsPerBlock + 1;
+      
+      
+      int blIdz = i/( numOfBlockx * numOfBlocky );
+      int blIdy = (i-blIdz*numOfBlockx * numOfBlocky )/(numOfBlockx );
+      int blIdx = (i-blIdz*numOfBlockx * numOfBlocky )%( numOfBlockx );
+      //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl;
+      
+      if( numOfBlockx - 1 == blIdx )
+        xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
+      if( numOfBlocky -1 == blIdy )
+        ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
+      if( numOfBlockz-1 == blIdz )
+        zkolik = dimZ - (blIdz)*numThreadsPerBlock+1;
+      //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl;
+      
+      
+      /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
+       changed[ 0 ] = 1;*/
+      Real hx = mesh.getSpaceSteps().x();
+      Real hy = mesh.getSpaceSteps().y();
+      Real hz = mesh.getSpaceSteps().z();
+      
+      bool changed = false;
+      BlockIterHost[ i ] = 0;
+      
+      
+      Real *sArray;
+      sArray = new Real[ sizeSArray * sizeSArray * sizeSArray ];
+      if( sArray == nullptr )
+        std::cout << "Error while allocating memory for sArray." << std::endl;
+      
+      for( int k = 0; k < sizeSArray; k++ )
+        for( int l = 0; l < sizeSArray; l++ )
+          for( int m = 0; m < sizeSArray; m++ ){
+            sArray[ m * sizeSArray * sizeSArray + k * sizeSArray + l ] = std::numeric_limits< Real >::max();
+          }
+      
+      
+      for( int thrk = 0; thrk < numThreadsPerBlock; thrk++ )
+        for( int thrj = 0; thrj < numThreadsPerBlock; thrj++ )
+        {
+          if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
+            sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX -1 + thrk*dimX*dimY ];
+          
+          if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy *numThreadsPerBlock*dimX+ blIdx*numThreadsPerBlock + numThreadsPerBlock + thrj * dimX + thrk*dimX*dimY ];
+          
+          if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX + thrj + thrk*dimX*dimY ];
+          
+          if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + (blIdy+1) * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj + thrk*dimX*dimY ];
+          
+          if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
+            sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX * dimY + thrj * dimX + thrk ];
+          
+          if( dimZ > (blIdz+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < xkolik )
+            sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = 
+                    aux[ (blIdz+1)*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX + thrk ];
+        }
+      
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){
+          for( int l = 0; l < numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+              sArray[(m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1] = 
+                      aux[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ];
+          }
+        }
+      }
+      /*string s;
+      int numWhile = 0;
+      for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){ 
+          for( int l = 0; l < numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){
+              //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                //printf("In with point m  = %d, k = %d, l = %d\n", m, k, l);
+                changed = this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz) || changed;
+                
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      
+      for( int m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){
+          for( int l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){
+          for( int l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );
+      */
+      for( int m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){
+          for( int l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >(  sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( int l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      
+      for( int m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( int k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( int l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( int l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      
+      
+      for( int m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( int k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( int l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      
+      if( changed ){
+        BlockIterHost[ i ] = 1;
+      }
+      
+      
+      for( int k = 0; k < numThreadsPerBlock; k++ ){ 
+        for( int l = 0; l < numThreadsPerBlock; l++ ) {
+          for( int m = 0; m < numThreadsPerBlock; m++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){      
+              helpFunc[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] = 
+                      sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+              //std::cout << helpFunc[ m*dimX*dimY + k*dimX + l ] << " ";
+            }
+          }
+          //std::cout << std::endl;
+        }
+        //std::cout << std::endl;
+      }
+      //helpFunc.save( "helpF.tnl");
+      delete []sArray;
+    }
+  }
+}
+template< typename Real,
+        typename Device,
+        typename Index >
+void 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
+{
+  int* BlockIterPom; 
+  BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ];
+  
+  for( int i = 0; i< BlockIterHost.getSize(); i++)
+  {
+    BlockIterPom[ i ] = 0;
+    
+    int m=0, l=0, k=0;
+    l = i/( numBlockX * numBlockY );
+    k = (i-l*numBlockX * numBlockY )/(numBlockX );
+    m = (i-l*numBlockX * numBlockY )%( numBlockX );
+    
+    if( m > 0 && BlockIterHost[ i - 1 ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( l > 0 && BlockIterHost[ i - numBlockX*numBlockY ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( l < numBlockZ-1 && BlockIterHost[ i + numBlockX*numBlockY ] ){
+      BlockIterPom[ i ] = 1;
+    }
+  }
+  for( int i = 0; i< BlockIterHost.getSize(); i++)
+  { 
+    BlockIterHost[ i ] = BlockIterPom[ i ];
+  }
+}
+
 
 template< typename Real,
         typename Device,
@@ -619,8 +984,8 @@ initInterface( const MeshFunctionPointer& _input,
         {
           cell.refresh();
           output[ cell.getIndex() ] =
-                  input( cell ) > 0 ? std::numeric_limits< RealType >::max() :
-                    - std::numeric_limits< RealType >::max();
+                  input( cell ) > 0 ? 10://std::numeric_limits< RealType >::max() :
+                    -10;//- std::numeric_limits< RealType >::max();
           interfaceMap[ cell.getIndex() ] = false;
         }
     
@@ -967,6 +1332,82 @@ updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real
   
   return false;
 }
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+__cuda_callable__ 
+bool 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateCell3D( volatile Real *sArray, int thri, int thrj, int thrk,
+        const Real hx, const Real hy, const Real hz, const Real v )
+{
+  const RealType value = sArray[thrk *sizeSArray * sizeSArray + thrj * sizeSArray + thri];
+  
+  RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
+  
+  c = TNL::argAbsMin( sArray[ (thrk+1)* sizeSArray*sizeSArray + thrj * sizeSArray + thri ],
+          sArray[ (thrk-1) * sizeSArray *sizeSArray + thrj* sizeSArray + thri ] );
+  
+  b = TNL::argAbsMin( sArray[ thrk* sizeSArray*sizeSArray + (thrj+1) * sizeSArray + thri ],
+          sArray[ thrk* sizeSArray * sizeSArray + (thrj-1)* sizeSArray +thri ] );
+  
+  a = TNL::argAbsMin( sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri+1 ],
+          sArray[ thrk* sizeSArray * sizeSArray + thrj* sizeSArray +thri-1 ] );
+  
+  /*if( thrk == 8 )
+    printf("Calculating a = %f, b = %f, c = %f\n" , a, b, c );*/
+  
+  if( fabs( a ) == 10&& //std::numeric_limits< RealType >::max() && 
+          fabs( b ) == 10&&//std::numeric_limits< RealType >::max() &&
+          fabs( c ) == 10)//std::numeric_limits< RealType >::max() )
+    return false;
+  
+  RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
+  
+  sortMinims( pom );
+  
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+  {
+    sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
+    tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
+    if ( fabs( tmp ) >  0.001*hx )
+      return true;
+    else
+      return false;
+  }
+  else
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
+    {
+      sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
+      tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
+      if ( fabs( tmp ) > 0.001*hx )
+        return true;
+      else
+        return false;
+    }
+    else
+    {
+      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
+              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
+              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
+      sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
+      tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
+      if ( fabs( tmp ) > 0.001*hx )
+        return true;
+      else
+        return false;
+    }
+  }
+  
+  return false;
+}
 
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
@@ -1215,78 +1656,4 @@ updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
   else
     return false;
 }
-
-template< typename Real,
-        typename Device,
-        typename Index >
-__cuda_callable__ 
-bool 
-tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
-        const Real hx, const Real hy, const Real hz, const Real v )
-{
-  const RealType value = sArray[thrk][thrj][thri];
-  //std::cout << value << std::endl;
-  RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
-  
-  c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ],
-          sArray[ thrk-1 ][ thrj ][ thri ] );
-  
-  b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ],
-          sArray[ thrk ][ thrj-1 ][ thri ] );
-  
-  a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ],
-          sArray[ thrk ][ thrj ][ thri-1 ] );
-  
-  
-  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-          fabs( b ) == std::numeric_limits< RealType >::max() &&
-          fabs( c ) == std::numeric_limits< RealType >::max() )
-    return false;
-  
-  RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
-  
-  sortMinims( pom );
-  
-  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
-  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-  {
-    sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-    tmp = value - sArray[ thrk ][ thrj ][ thri ];
-    if ( fabs( tmp ) >  0.001*hx )
-      return true;
-    else
-      return false;
-  }
-  else
-  {
-    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
-    {
-      sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-      tmp = value - sArray[ thrk ][ thrj ][ thri ];
-      if ( fabs( tmp ) > 0.001*hx )
-        return true;
-      else
-        return false;
-    }
-    else
-    {
-      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
-              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
-              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
-              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
-      sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-      tmp = value - sArray[ thrk ][ thrj ][ thri ];
-      if ( fabs( tmp ) > 0.001*hx )
-        return true;
-      else
-        return false;
-    }
-  }
-  
-  return false;
-}
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
index 60c690e0623c82f0a50d12823ad0bcee648e1d86..57b1886e8b67c943051623c0a07f301faa76cad8 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
@@ -1,9 +1,9 @@
 /***************************************************************************
-                          FastSweepingMethod.h  -  description
-                             -------------------
-    begin                : Jul 14, 2016
-    copyright            : (C) 2017 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
+ FastSweepingMethod.h  -  description
+ -------------------
+ begin                : Jul 14, 2016
+ copyright            : (C) 2017 by Tomas Oberhuber
+ email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
 /* See Copyright Notice in tnl/Copyright */
@@ -17,132 +17,134 @@
 
 
 template< typename Mesh,
-          typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > >
+        typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > >
 class FastSweepingMethod
 {   
 };
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >
-   : public tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >
+: public tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >
 {
-   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
-   
-   public:
-      
-      typedef Meshes::Grid< 1, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DeviceType;
-      typedef Index IndexType;
-      typedef Anisotropy AnisotropyType;
-      typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > BaseType;
-      using MeshPointer = Pointers::SharedPointer<  MeshType >;
-      using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
-      
-      
-      using typename BaseType::InterfaceMapType;
-      using typename BaseType::MeshFunctionType;
-      using typename BaseType::InterfaceMapPointer;
-      using typename BaseType::MeshFunctionPointer;
-      
-      
-      FastSweepingMethod();
-      
-      const IndexType& getMaxIterations() const;
-      
-      void setMaxIterations( const IndexType& maxIterations );
-      
-      void solve( const MeshPointer& mesh,
-                  const AnisotropyPointer& anisotropy,
-                  MeshFunctionPointer& u );
-      
-      
-   protected:
+  //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
+  
+  public:
+    
+    typedef Meshes::Grid< 1, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DeviceType;
+    typedef Index IndexType;
+    typedef Anisotropy AnisotropyType;
+    typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > BaseType;
+    using MeshPointer = Pointers::SharedPointer<  MeshType >;
+    using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
+    
+    
+    using typename BaseType::InterfaceMapType;
+    using typename BaseType::MeshFunctionType;
+    using typename BaseType::InterfaceMapPointer;
+    using typename BaseType::MeshFunctionPointer;
+    
+    
+    FastSweepingMethod();
+    
+    const IndexType& getMaxIterations() const;
+    
+    void setMaxIterations( const IndexType& maxIterations );
+    
+    void solve( const MeshPointer& mesh,
+            const AnisotropyPointer& anisotropy,
+            MeshFunctionPointer& u );
+    
+    
+    protected:
       
       const IndexType maxIterations;
 };
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
-   : public tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
+: public tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
 {
-   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
-   
-   public:
-      
-      typedef Meshes::Grid< 2, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DeviceType;
-      typedef Index IndexType;
-      typedef Anisotropy AnisotropyType;
-      typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType;
-      using MeshPointer = Pointers::SharedPointer<  MeshType >;
-      using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
-
-      using typename BaseType::InterfaceMapType;
-      using typename BaseType::MeshFunctionType;
-      using typename BaseType::InterfaceMapPointer;
-      using typename BaseType::MeshFunctionPointer;
-      using typename BaseType::ArrayContainer;
-
-      FastSweepingMethod();
-      
-      const IndexType& getMaxIterations() const;
-      
-      void setMaxIterations( const IndexType& maxIterations );
-      
-      void solve( const MeshPointer& mesh,
-                  const AnisotropyPointer& anisotropy,
-                  MeshFunctionPointer& u );
-      
-   protected:
+  //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
+  
+  public:
+    
+    typedef Meshes::Grid< 2, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DeviceType;
+    typedef Index IndexType;
+    typedef Anisotropy AnisotropyType;
+    typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType;
+    using MeshPointer = Pointers::SharedPointer<  MeshType >;
+    using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
+    
+    using typename BaseType::InterfaceMapType;
+    using typename BaseType::MeshFunctionType;
+    using typename BaseType::InterfaceMapPointer;
+    using typename BaseType::MeshFunctionPointer;
+    using typename BaseType::ArrayContainer;
+    
+    FastSweepingMethod();
+    
+    const IndexType& getMaxIterations() const;
+    
+    void setMaxIterations( const IndexType& maxIterations );
+    
+    void solve( const MeshPointer& mesh,
+            const AnisotropyPointer& anisotropy,
+            MeshFunctionPointer& u );
+    
+    protected:
       
       const IndexType maxIterations;
 };
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
-   : public tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
+: public tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
 {
-   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
-   
-   public:
-      
-      typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DeviceType;
-      typedef Index IndexType;
-      typedef Anisotropy AnisotropyType;
-      typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > BaseType;
-      using MeshPointer = Pointers::SharedPointer<  MeshType >;
-      using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
-      
-      using typename BaseType::InterfaceMapType;
-      using typename BaseType::MeshFunctionType;
-      using typename BaseType::InterfaceMapPointer;
-      using typename BaseType::MeshFunctionPointer;      
-      
-      FastSweepingMethod();
-      
-      const IndexType& getMaxIterations() const;
-      
-      void setMaxIterations( const IndexType& maxIterations );
-      
-      void solve( const MeshPointer& mesh,
-                  const AnisotropyPointer& anisotropy,
-                  MeshFunctionPointer& u );
-      
-      
-   protected:
+  //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
+  
+  public:
+    
+    typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DeviceType;
+    typedef Index IndexType;
+    typedef Anisotropy AnisotropyType;
+    typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > BaseType;
+    using MeshPointer = Pointers::SharedPointer<  MeshType >;
+    using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
+    
+    using typename BaseType::InterfaceMapType;
+    using typename BaseType::MeshFunctionType;
+    using typename BaseType::InterfaceMapPointer;
+    using typename BaseType::MeshFunctionPointer;   
+    using typename BaseType::ArrayContainer;
+    
+    
+    FastSweepingMethod();
+    
+    const IndexType& getMaxIterations() const;
+    
+    void setMaxIterations( const IndexType& maxIterations );
+    
+    void solve( const MeshPointer& mesh,
+            const AnisotropyPointer& anisotropy,
+            MeshFunctionPointer& u );
+    
+    
+    protected:
       
       const IndexType maxIterations;
 };
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index fa2716897d0cfd685f563b837796b033dd4cc91f..d5ce1efe164bc835866dc7fc95909b3defdf846f 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -15,9 +15,12 @@
 
 #include "tnlFastSweepingMethod.h"
 #include <TNL/Devices/Cuda.h>
-#include <string.h>
+#include <TNL/Communicators/MpiDefs.h>
+
 
 
+
+#include <string.h>
 #include <iostream>
 #include <fstream>
 
@@ -80,16 +83,48 @@ solve( const MeshPointer& mesh,
   MeshFunctionType aux = *auxPtr;
   
   
+//#ifdef HAVE_MPI
+  bool a = Communicators::MpiCommunicator::IsInitialized();
+  if( a )
+    printf("Je Init\n");
+  else
+    printf("Neni Init\n");
+//#endif
   
   while( iteration < this->maxIterations )
   {
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
-      int numThreadsPerBlock = 16;
+      int numThreadsPerBlock = -1;
+      
+      numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0));
+      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+      if( numThreadsPerBlock <= 16 )
+        numThreadsPerBlock = 16;
+      else if(numThreadsPerBlock <= 32 )
+        numThreadsPerBlock = 32;
+      else if(numThreadsPerBlock <= 64 )
+        numThreadsPerBlock = 64;
+      else if(numThreadsPerBlock <= 128 )
+        numThreadsPerBlock = 128;
+      else if(numThreadsPerBlock <= 256 )
+        numThreadsPerBlock = 256;
+      else if(numThreadsPerBlock <= 512 )
+        numThreadsPerBlock = 512;
+      else
+        numThreadsPerBlock = 1024;
+      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+      
+      if( numThreadsPerBlock == -1 ){
+        printf("Fail in setting numThreadsPerBlock.\n");
+        break;
+      }
+      
       
       
       int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
       int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
+      
       //std::cout << "numBlocksX = " << numBlocksX << std::endl;
       
       /*Real **sArray = new Real*[numBlocksX*numBlocksY];
@@ -115,13 +150,29 @@ solve( const MeshPointer& mesh,
        }
        std::cout<<std::endl;*/
       unsigned int numWhile = 0;
-      while( IsCalculationDone && numWhile < 1 )
+      while( IsCalculationDone )
       {      
         IsCalculationDone = 0;
         helpFunc1 = auxPtr;
         auxPtr = helpFunc;
         helpFunc = helpFunc1;
-        this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+        switch ( numThreadsPerBlock ){
+          case 16:
+            this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+          case 32:
+            this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+          case 64:
+            this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+          case 128:
+            this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+          case 256:
+            this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+          case 512:
+            this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+          default:
+            this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+        }
+        
         
         //Reduction      
         for( int i = 0; i < BlockIterHost.getSize(); i++ ){
@@ -131,14 +182,14 @@ solve( const MeshPointer& mesh,
           }
         }
         numWhile++;
-        std::cout <<"numWhile = "<< numWhile <<std::endl;
+        /*std::cout <<"numWhile = "<< numWhile <<std::endl;
         
         for( int j = numBlocksY-1; j>-1; j-- ){
           for( int i = 0; i < numBlocksX; i++ )
             std::cout << BlockIterHost[ j * numBlocksX + i ];
           std::cout << std::endl;
         }
-        std::cout << std::endl;
+        std::cout << std::endl;*/
         
         this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY );
         
@@ -150,8 +201,8 @@ solve( const MeshPointer& mesh,
          std::cout << std::endl;*/
         
         //std::cout<<std::endl;
-        string s( "aux-"+ std::to_string(numWhile) + ".tnl");
-        aux.save( s );
+        //string s( "aux-"+ std::to_string(numWhile) + ".tnl");
+        //aux.save( s );
       }
       if( numWhile == 1 ){
         auxPtr = helpFunc;
@@ -266,8 +317,8 @@ solve( const MeshPointer& mesh,
       BlockIterPom.setSize( numBlocksX * numBlocksY  );
       BlockIterPom.setValue( 0 );
       /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
-      BlockIterPom1.setSize( numBlocksX * numBlocksY  );
-      BlockIterPom1.setValue( 0 );*/
+       BlockIterPom1.setSize( numBlocksX * numBlocksY  );
+       BlockIterPom1.setValue( 0 );*/
       /*int *BlockIterDevice;
        cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
       int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
@@ -408,6 +459,7 @@ solve( const MeshPointer& mesh,
     }
     iteration++;
   }
+  //#endif
   aux.save("aux-final.tnl");
 }
 
@@ -527,7 +579,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
   
   
   /** FOR FIM METHOD */
-    
+  
   if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] )
   { 
     __syncthreads();
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 65aba5bf58daa953ee43b68efc47db3d263a90cc..5af33cf29605ce983d75b61846ceeafeada7fde0 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -64,9 +64,6 @@ solve( const MeshPointer& mesh,
   interfaceMapPtr->setMesh( mesh );
   std::cout << "Initiating the interface cells ..." << std::endl;
   BaseType::initInterface( u, auxPtr, interfaceMapPtr );
-#ifdef HAVE_CUDA
-  cudaDeviceSynchronize();
-#endif
   auxPtr->save( "aux-ini.tnl" );   
   
   typename MeshType::Cell cell( *mesh );
@@ -78,170 +75,259 @@ solve( const MeshPointer& mesh,
   {
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
-      for( cell.getCoordinates().z() = 0;
-              cell.getCoordinates().z() < mesh->getDimensions().z();
-              cell.getCoordinates().z()++ )
-      {
-        for( cell.getCoordinates().y() = 0;
-                cell.getCoordinates().y() < mesh->getDimensions().y();
-                cell.getCoordinates().y()++ )
-        {
-          for( cell.getCoordinates().x() = 0;
-                  cell.getCoordinates().x() < mesh->getDimensions().x();
-                  cell.getCoordinates().x()++ )
-          {
-            cell.refresh();
-            if( ! interfaceMap( cell ) )
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-1.tnl" );
+      int numThreadsPerBlock = 64;
       
-      for( cell.getCoordinates().z() = 0;
-              cell.getCoordinates().z() < mesh->getDimensions().z();
-              cell.getCoordinates().z()++ )
-      {
-        for( cell.getCoordinates().y() = 0;
-                cell.getCoordinates().y() < mesh->getDimensions().y();
-                cell.getCoordinates().y()++ )
-        {
-          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                  cell.getCoordinates().x() >= 0 ;
-                  cell.getCoordinates().x()-- )		
-          {
-            //std::cerr << "2 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-2.tnl" );
-      for( cell.getCoordinates().z() = 0;
-              cell.getCoordinates().z() < mesh->getDimensions().z();
-              cell.getCoordinates().z()++ )
-      {
-        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                cell.getCoordinates().y() >= 0 ;
-                cell.getCoordinates().y()-- )
-        {
-          for( cell.getCoordinates().x() = 0;
-                  cell.getCoordinates().x() < mesh->getDimensions().x();
-                  cell.getCoordinates().x()++ )
-          {
-            //std::cerr << "3 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-3.tnl" );
       
-      for( cell.getCoordinates().z() = 0;
-              cell.getCoordinates().z() < mesh->getDimensions().z();
-              cell.getCoordinates().z()++ )
-      {
-        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                cell.getCoordinates().y() >= 0;
-                cell.getCoordinates().y()-- )
-        {
-          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                  cell.getCoordinates().x() >= 0 ;
-                  cell.getCoordinates().x()-- )		
-          {
-            //std::cerr << "4 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
-      }     
-      //aux.save( "aux-4.tnl" );
+      int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
+      int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
+      int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 1:0);
+      //std::cout << "numBlocksX = " << numBlocksX << std::endl;
       
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-              cell.getCoordinates().z() >= 0;
-              cell.getCoordinates().z()-- )
-      {
-        for( cell.getCoordinates().y() = 0;
-                cell.getCoordinates().y() < mesh->getDimensions().y();
-                cell.getCoordinates().y()++ )
-        {
-          for( cell.getCoordinates().x() = 0;
-                  cell.getCoordinates().x() < mesh->getDimensions().x();
-                  cell.getCoordinates().x()++ )
-          {
-            //std::cerr << "5 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-5.tnl" );
+      /*Real **sArray = new Real*[numBlocksX*numBlocksY];
+       for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+       sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];*/
       
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-              cell.getCoordinates().z() >= 0;
-              cell.getCoordinates().z()-- )
-      {
-        for( cell.getCoordinates().y() = 0;
-                cell.getCoordinates().y() < mesh->getDimensions().y();
-                cell.getCoordinates().y()++ )
-        {
-          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                  cell.getCoordinates().x() >= 0 ;
-                  cell.getCoordinates().x()-- )		
-          {
-            //std::cerr << "6 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-6.tnl" );
+      ArrayContainer BlockIterHost;
+      BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ );
+      BlockIterHost.setValue( 1 );
+      int IsCalculationDone = 1;
       
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-              cell.getCoordinates().z() >= 0;
-              cell.getCoordinates().z()-- )
-      {
-        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                cell.getCoordinates().y() >= 0 ;
-                cell.getCoordinates().y()-- )
-        {
-          for( cell.getCoordinates().x() = 0;
-                  cell.getCoordinates().x() < mesh->getDimensions().x();
-                  cell.getCoordinates().x()++ )
-          {
-            //std::cerr << "7 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
+      MeshFunctionPointer helpFunc( mesh );
+      MeshFunctionPointer helpFunc1( mesh );
+      helpFunc1 = auxPtr;
+      auxPtr = helpFunc;
+      helpFunc = helpFunc1;
+      //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl;
+      /*for( int k = numBlocksX-1; k >-1; k-- ){
+       for( int l = 0; l < numBlocksY; l++ ){
+       std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+       }
+       std::cout<<std::endl;
+       }
+       std::cout<<std::endl;*/
+      unsigned int numWhile = 0;
+      while( IsCalculationDone  )
+      {      
+        IsCalculationDone = 0;
+        helpFunc1 = auxPtr;
+        auxPtr = helpFunc;
+        helpFunc = helpFunc1;
+        this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+        
+        //Reduction      
+        for( int i = 0; i < BlockIterHost.getSize(); i++ ){
+          if( IsCalculationDone == 0 ){
+            IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
+            //break;
           }
         }
-      }
-      //aux.save( "aux-7.tnl" );
-      
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-              cell.getCoordinates().z() >= 0;
-              cell.getCoordinates().z()-- )
-      {
-        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                cell.getCoordinates().y() >= 0;
-                cell.getCoordinates().y()-- )
-        {
-          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                  cell.getCoordinates().x() >= 0 ;
-                  cell.getCoordinates().x()-- )		
-          {
-            //std::cerr << "8 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
+        numWhile++;
+        std::cout <<"numWhile = "<< numWhile <<std::endl;
+        /*for( int k = 0; k < numBlocksZ; k++ ){
+          for( int j = numBlocksY-1; j>-1; j-- ){
+            for( int i = 0; i < numBlocksX; i++ ){
+              //std::cout << (*auxPtr)[ k * numBlocksX * numBlocksY + j * numBlocksX + i ] << " ";
+              std::cout << BlockIterHost[ k * numBlocksX * numBlocksY + j * numBlocksX + i ];
+            }
+            std::cout << std::endl;
           }
+          std::cout << std::endl;
         }
+        std::cout << std::endl;*/
+        
+        this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ );
+        
+        /*for( int k = 0; k < numBlocksZ; k++ ){
+          for( int j = numBlocksY-1; j>-1; j-- ){
+            for( int i = 0; i < numBlocksX; i++ ){
+              //std::cout << (*auxPtr)[ k * numBlocksX * numBlocksY + j * numBlocksX + i ] << " ";
+              std::cout << BlockIterHost[ k * numBlocksX * numBlocksY + j * numBlocksX + i ];
+            }
+            std::cout << std::endl;
+          }
+          std::cout << std::endl;
+        }*/
+        
+        /*for( int j = numBlocksY-1; j>-1; j-- ){
+         for( int i = 0; i < numBlocksX; i++ )
+         std::cout << "BlockIterHost = "<< j*numBlocksX + i<< " ," << BlockIterHost[ j * numBlocksX + i ];
+         std::cout << std::endl;
+         }
+         std::cout << std::endl;*/
+        
+        //std::cout<<std::endl;
+        //string s( "aux-"+ std::to_string(numWhile) + ".tnl");
+        //aux.save( s );
+      }
+      if( numWhile == 1 ){
+        auxPtr = helpFunc;
       }
+      aux = *auxPtr;
+      
+      /*for( cell.getCoordinates().z() = 0;
+       cell.getCoordinates().z() < mesh->getDimensions().z();
+       cell.getCoordinates().z()++ )
+       {
+       for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       cell.refresh();
+       if( ! interfaceMap( cell ) )
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-1.tnl" );
+       
+       for( cell.getCoordinates().z() = 0;
+       cell.getCoordinates().z() < mesh->getDimensions().z();
+       cell.getCoordinates().z()++ )
+       {
+       for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "2 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-2.tnl" );
+       for( cell.getCoordinates().z() = 0;
+       cell.getCoordinates().z() < mesh->getDimensions().z();
+       cell.getCoordinates().z()++ )
+       {
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0 ;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       //std::cerr << "3 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-3.tnl" );
+       
+       for( cell.getCoordinates().z() = 0;
+       cell.getCoordinates().z() < mesh->getDimensions().z();
+       cell.getCoordinates().z()++ )
+       {
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "4 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }     
+       //aux.save( "aux-4.tnl" );
+       
+       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+       cell.getCoordinates().z() >= 0;
+       cell.getCoordinates().z()-- )
+       {
+       for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       //std::cerr << "5 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-5.tnl" );
+       
+       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+       cell.getCoordinates().z() >= 0;
+       cell.getCoordinates().z()-- )
+       {
+       for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "6 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-6.tnl" );
+       
+       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+       cell.getCoordinates().z() >= 0;
+       cell.getCoordinates().z()-- )
+       {
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0 ;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       //std::cerr << "7 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-7.tnl" );
+       
+       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+       cell.getCoordinates().z() >= 0;
+       cell.getCoordinates().z()-- )
+       {
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "8 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }*/
     }
     if( std::is_same< DeviceType, Devices::Cuda >::value )
     {
@@ -389,7 +475,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
   {
     __syncthreads();
     
-    __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)];
+    __shared__ volatile bool changed[ 8*8*8/*(sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)*/];
     
     changed[ currentIndex ] = false;
     if( thrj == 0 && thri == 0 && thrk == 0 )
@@ -402,6 +488,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     
     if( thrj == 1 && thri == 1 && thrk == 1 )
     {
+      //printf( "We are in the calculation. Block = %d.\n",blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x  );
       hx = mesh.getSpaceSteps().x();
       hy = mesh.getSpaceSteps().y();
       hz = mesh.getSpaceSteps().z();
@@ -410,8 +497,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       dimZ = mesh.getDimensions().z();
       BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0;
     }
-    __shared__ volatile Real sArray[sizeSArray][sizeSArray][sizeSArray];
-    sArray[thrk+1][thrj+1][thri+1] = std::numeric_limits< Real >::max();
+    __shared__ volatile Real sArray[ 10*10*10/*sizeSArray * sizeSArray * sizeSArray*/ ];
+    sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = std::numeric_limits< Real >::max();
     
     //filling sArray edges
     int numOfBlockx;
@@ -426,6 +513,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     numOfBlockx = gridDim.x;
     numOfBlocky = gridDim.y;
     numOfBlockz = gridDim.z;
+    __syncthreads();
     
     if( numOfBlockx - 1 == blIdx )
       xkolik = dimX - (blIdx)*blockDim.x+1;
@@ -438,54 +526,55 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     if( thri == 0 )
     {        
       if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
-        sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
+        sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
       else
-        sArray[thrk+1][thrj+1][0] = std::numeric_limits< Real >::max();
+        sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 1 )
     {
       if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
-        sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
       else
-        sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max();
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max();
     }
     if( thri == 2 )
     {        
       if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
-        sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
+        sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
       else
-        sArray[thrk+1][0][thrj+1] = std::numeric_limits< Real >::max();
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + 0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 3 )
     {
       if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
-        sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
       else
-        sArray[thrk+1][9][thrj+1] = std::numeric_limits< Real >::max();
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
     }
     if( thri == 4 )
     {        
       if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
-        sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
+        sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
       else
-        sArray[0][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
+        sArray[0 * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 5 )
     {
       if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
-        sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
+        sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
       else
-        sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
+        sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
     }
     
     if( i < dimX && j < dimY && k < dimZ )
     {
-      sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
+      sArray[(thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
     }
     __syncthreads(); 
+    
     while( changed[ 0 ] )
     {
       __syncthreads();
@@ -493,11 +582,11 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       changed[ currentIndex ] = false;
       
       //calculation of update cell
-      if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ )
+      if( i < dimX && j < dimY && k < dimZ )
       {
-        if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] )
+        if( ! interfaceMap[ k*dimX*dimY + j * dimX + i ] )
         {
-          changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
+          changed[ currentIndex ] = ptr.updateCell3D< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
         }
       }
       __syncthreads();
@@ -535,7 +624,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         }
       }
       __syncthreads();
-      if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
+      if( currentIndex < 32 )
       {
         if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
         if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
@@ -548,7 +637,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       
       /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0)
        {
-       for(int m = 0; m < 8; m++){
+       //for(int m = 0; m < 8; m++){
+       int m = 4;
        for(int n = 0; n<8; n++){
        for(int b=0; b<8; b++)
        printf(" %i ", changed[m*64 + n*8 + b]);
@@ -556,16 +646,19 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
        }
        printf("\n \n");
        }
-       }*/
+       //}*/
+      
       if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
       {
-        BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1;
+        //printf( "Setting block calculation. Block = %d.\n",blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x  );
+        BlockIterDevice[ blIdz * gridDim.x * gridDim.y + blIdy * gridDim.x + blIdx ] = 1;
       }
       __syncthreads();
     }
     
     if( i < dimX && j < dimY && k < dimZ )
-      helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
+      helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thri+1 ];
+    
   } 
 }  
 #endif