diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index 138671a3b7c9fa6bae88e2bc6f2651726b3fae5c..69fce1467ff594daef81e35d1c49118ec8252637 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -67,7 +67,17 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
       template< typename MeshEntity >
       __cuda_callable__ void updateCell( MeshFunctionType& u,
                                          const MeshEntity& cell,
-                                         const RealType velocity = 1.0 );      
+                                         const RealType velocity = 1.0 );
+      
+      __cuda_callable__ void updateCell( Real sArray[18][18],
+                                         int thri, int thrj, const Real hx, const Real hy,
+                                         const Real velocity = 1.0 );
+      
+      __cuda_callable__ void setsArray( MeshFunctionType& aux, Real sArray[18][18],
+                                            int dimX, int dimY, int i, int j );
+      
+      /*__cuda_callable__ void getsArray( MeshFunctionType& aux, Real sArray[18][18],
+                                            int dimX, int dimY, int i, int j );*/
 };
 
 template< typename Real,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index bf78a02a483083f84e6ded44bc823674b61bb96f..a89d230f6497fc0145712d002c38be1a110157f0 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -218,7 +218,6 @@ updateCell( MeshFunctionType& u,
 {
    const auto& neighborEntities = cell.template getNeighborEntities< 2 >();
    const MeshType& mesh = cell.getMesh();
-  
    const RealType& hx = mesh.getSpaceSteps().x();
    const RealType& hy = mesh.getSpaceSteps().y();
    const RealType value = u( cell );
@@ -598,7 +597,7 @@ T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double
 }
 
 template < typename T1 >
-__cuda_callable__ void sortMinims( T1 pom[])
+__cuda_callable__ void sortMinims( T1 pom[] )
 {
     T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; 
     if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){
@@ -796,3 +795,231 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
         }
     }
 }
+
+template< typename Real,
+          typename Device,
+          typename Index >
+__cuda_callable__ void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+setsArray( MeshFunctionType& aux, Real sArray[18][18], int dimX, int dimY, int blockIdx, int blockIdy )
+{
+    int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0);
+    int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0);
+    int xkolik = 18;
+    int ykolik = 18;
+    int xOd = 0;
+    int yOd = 0;
+    
+    if( blockIdx == 0 )
+        xOd = 1;
+    if( blockIdy == 0 )
+        yOd = 1;
+    
+    if( numOfBlockx - 1 == blockIdx )
+        xkolik = dimX - (numOfBlockx-1)*16+1;
+
+    if( numOfBlocky -1 == blockIdy )
+        ykolik = dimY - (numOfBlocky-1)*16+1;
+    
+    if( dimX > (blockIdx+1) * 16 )
+    {
+        for( int i = yOd; i < ykolik; i++ )
+        {
+            sArray[i][17] = aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + i*dimX + 17 ];
+        }
+    }
+    if( blockIdx != 0 )
+    {
+        for( int i = yOd; i < ykolik; i++ )
+        {
+            sArray[i][0] = aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + i*dimX + 0];
+        }
+    } 
+    if( dimY > (blockIdy+1) * 16 )
+    {
+        for( int i = xOd; i < xkolik; i++ )
+        {
+            sArray[17][i] = aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + 17*dimX + i ];
+        }
+    }
+    if( blockIdy != 0 )
+    {
+        for( int i = xOd; i < xkolik; i++ )
+        {
+            sArray[0][i] = aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + 0*dimX + i ];
+        }
+    }
+    
+    /*int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0);
+    int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0);
+    int xkolik = 18;
+    int ykolik = 18;
+
+    if( numOfBlockx - 1 == blockIdx )
+        xkolik = dimX - (numOfBlockx-1)*16+1;
+
+    if( numOfBlocky -1 == blockIdy )
+        ykolik = dimY - (numOfBlocky-1)*16+1;
+
+
+    if( numOfBlockx == 0 || numOfBlocky == 0 )
+        return;
+    else
+    {
+        if( blockIdx == 0 )
+        {
+            if( blockIdy == 0 )
+            {
+                for( int j = 0; j < ykolik-1; j++ )
+                  for( int i = 0; i < xkolik-1; i++ )
+                    {
+                      sArray[ j+1 ][ i+1 ] = aux[ j*dimX + i ];
+                    }
+
+                //vypis( *sArray );
+            }
+            else
+            {
+                for( int j = 0; j < ykolik; j++ )
+                  for( int i = 0; i < xkolik-1; i++ )
+                    {
+                      sArray[ j ][ i+1 ] = aux[ blockIdy*16*dimX - dimX + j*dimX + i ];
+                    }
+            }
+        }
+        else if( blockIdy == 0 )
+        {
+            for( int j = 0; j < ykolik-1; j++ )
+              for( int i = 0; i < xkolik; i++ )
+                 {
+                   sArray[ j+1 ][ i ] = aux[ blockIdx*16 - 1 + j*dimX + i ];
+                 }
+        }
+        else
+        {
+           for( int j = 0; j < ykolik; j++ )
+              for( int i = 0; i < xkolik; i++ )
+                {
+                  //if( dimX*dimY-1 >  blockIdy*16*dimX - dimX + blockIdx*16 - 1 + j*dimX + i )
+                    sArray[ j ][ i ] = aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + j*dimX + i ];
+                }
+        }
+    }*/
+}
+
+/*template< typename Real,
+          typename Device,
+          typename Index >
+__cuda_callable__ void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+getsArray( MeshFunctionType& aux, Real sArray[18][18], int dimX, int dimY, int blockIdx, int blockIdy )
+{
+    int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0);
+    int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0);
+    int xkolik = 18;
+    int ykolik = 18;
+    
+    if( numOfBlockx - 1 == blockIdx )
+        xkolik = dimX - (numOfBlockx-1)*16+2;
+
+    if( numOfBlocky -1 == blockIdy )
+        ykolik = dimY - (numOfBlocky-1)*16+2;
+
+    if( numOfBlockx == 0 || numOfBlocky == 0 )
+        return;
+    else
+    {
+        if( blockIdx == 0 )
+        {
+            if( blockIdy == 0 )
+            {
+               for( int j = 0; j < ykolik-2; j++ )
+                  for( int i = 0; i < xkolik-2; i++ )
+                    {
+                      aux[ j*dimX + i ] = sArray[ j+1 ][ i+1 ];
+                    }
+            }
+            else
+            {
+               for( int j = 1; j < ykolik-1; j++ )
+                  for( int i = 0; i < xkolik-2; i++ )
+                    {
+                      aux[ blockIdy*16*dimX - dimX + j*dimX + i ] = sArray[ j ][ i+1 ];
+                    }
+            }
+        }
+        else if( blockIdy == 0 )
+        {
+            for( int j = 0; j < ykolik-2; j++ )
+               for( int i = 1; i < xkolik-1; i++ )
+                 {
+                   aux[ blockIdx*16 - 1 + j*dimX + i ] = sArray[ j+1 ][ i ];
+                 }
+        }
+        else
+        {
+            for( int j = 1; j < ykolik-1; j++ )    
+              for( int i = 1; i < xkolik-1; i++ )
+                {
+                    aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + j*dimX + i ] = sArray[ j ][ i ];
+                }
+        }
+    }
+}*/
+
+template< typename Real,
+          typename Device,
+          typename Index >
+__cuda_callable__
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+updateCell( Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy,
+            const Real v )
+{
+   
+   //const Meshes::Grid< 2, Real, Device, Index >& mesh = u.template getMesh< Devices::Cuda >();
+   /*typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
+   Cell cell( mesh );
+   cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
+   cell.refresh();
+   
+   const auto& neighborEntities = cell.template getNeighborEntities< 2 >();
+   const RealType& hx = mesh.getSpaceSteps().x();
+   const RealType& hy = mesh.getSpaceSteps().y();*/
+   const RealType value = sArray[ thrj ][ thri ];//u( cell );
+   RealType a, b, tmp = TypeInfo< RealType >::getMaxValue();
+   
+   /*if( value != u[ cell.getIndex() ] )
+       return;*/
+   
+   b = TNL::argAbsMin( sArray[ thrj+1 ][ thri ],
+                        sArray[ thrj-1 ][ thri ] );
+    
+   a = TNL::argAbsMin( sArray[ thrj ][ thri+1 ],
+                        sArray[ thrj ][ thri-1 ] );
+
+
+    
+    if( fabs( a ) == TypeInfo< RealType >::getMaxValue() && 
+        fabs( b ) == TypeInfo< RealType >::getMaxValue() )
+       return;
+   
+    RealType pom[6] = { a, b, TypeInfo< Real >::getMaxValue(), (RealType)hx, (RealType)hy, 0.0 };
+    sortMinims( pom );
+    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
+    
+                                
+    if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+    {
+        //u[ cell.getIndex() ]= argAbsMin( value, tmp );
+        sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
+    }
+    else
+    {
+        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+        //u[ cell.getIndex() ]= argAbsMin( value, tmp );
+        sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
+    }
+}
\ No newline at end of file
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
index e44aaca871ee1f9118351c5032b5cb68f3536caf..54e577dacde239171ae299ee62b078bd7a406c35 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
@@ -99,7 +99,6 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
                   const AnisotropyPointer& anisotropy,
                   MeshFunctionPointer& u );
       
-      
    protected:
       
       const IndexType maxIterations;
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 73e170895872d8b1a572031da4ea3baa423de24c..80976772cde6e1d9ffcad92bf8779cea17906e29 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -221,11 +221,12 @@ solve( const MeshPointer& mesh,
           dim3 blockSize( cudaBlockSize, cudaBlockSize );
           dim3 gridSize( numBlocksX, numBlocksY );
           Devices::Cuda::synchronizeDevice();
-          int DIM = mesh->getDimensions().x();
           
           tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
-          for( int k = 0; k < numBlocksX; k++)
-            CudaUpdateCellCaller< Real, Device, Index ><<< gridSize, blockSize >>>( ptr,
+          int nBlockIter = numBlocksX > numBlocksY ? numBlocksX : numBlocksY;
+          nBlockIter = numBlocksX == numBlocksY ? nBlockIter + 1 : nBlockIter;
+          for( int k = 0; k < nBlockIter; k++)
+            CudaUpdateCellCaller<<< gridSize, blockSize, 18 * 18 * sizeof( Real ) >>>( ptr,
                                                                                     interfaceMapPtr.template getData< Device >(),
                                                                                     auxPtr.template modifyData< Device>() );
           cudaDeviceSynchronize();
@@ -245,24 +246,44 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux )
 {
-    int i = threadIdx.x + blockDim.x*blockIdx.x;
-    int j = blockDim.y*blockIdx.y + threadIdx.y;
+    int thri = threadIdx.x; int thrj = threadIdx.y; // nelze ke stejnym pristupovat znovu pres threadIdx (vzdy da jine hodnoty)
+    int blIdx = blockIdx.x; int blIdy = blockIdx.y;
+    int i = thri + blockDim.x*blIdx;
+    int j = blockDim.y*blIdy + thrj;
     const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+    const Real hx = mesh.getSpaceSteps().x();
+    const Real hy = mesh.getSpaceSteps().y();
+    
+    __shared__ Real sArray[ 18 ][ 18 ];
+    for( int k = 0; k < 18; k++ )
+        for( int l = 0; l < 18; l++ )
+            sArray[ k ][ l ] = TypeInfo< Real >::getMaxValue();
+    __syncthreads();
+    /*//filling shared array
+    ptr.setsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy );
+    __syncthreads();*/
     
     if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
     {
-        typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
-        Cell cell( mesh );
-        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
-        cell.refresh();
-        //tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
-        for( int k = 0; k < 16; k++ )
+        sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
+        ptr.setsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy ); //fill edges of sArray
+        __syncthreads();
+            
+        
+        if( ! interfaceMap[ j*mesh.getDimensions().x() + i ] ) 
         {
-            if( ! interfaceMap( cell ) )
+            for( int k = 0; k < 17; k++ )
             {
-               ptr.updateCell( aux, cell );
+                ptr.updateCell( sArray, thri+1, thrj+1, hx, hy );
+                __syncthreads();
             }
         }
+        /*for( int k = 0; k < mesh.getDimensions().x(); k++)
+            for( int l = 0; l < mesh.getDimensions().y(); l++)
+                aux[ k*mesh.getDimensions().x() + l ] = TypeInfo< Real >::getMaxValue();*/
+        aux[j*mesh.getDimensions().x() + i] = sArray[thrj+1][thri+1];
+        __syncthreads();
     }
+    //ptr.getsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy );
 }
 //#endif
\ No newline at end of file
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 3e8cb7bb19c054a35a282e09b70d3b38ef6cda6e..5ddd3bdeb039e38ef904a4b574c90bbd4dd89e68 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -21,7 +21,7 @@ template< typename Real,
           typename Anisotropy >
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
 FastSweepingMethod()
-: maxIterations( 2 )
+: maxIterations( 1 )
 {
    
 }
@@ -257,7 +257,7 @@ solve( const MeshPointer& mesh,
           
           tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
           for( int k = 0; k < numBlocksX; k++)
-          CudaUpdateCellCaller< Real, Device, Index ><<< gridSize, blockSize >>>( ptr,
+          CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
                                                                                   interfaceMapPtr.template getData< Device >(),
                                                                                   auxPtr.template modifyData< Device>() );
           cudaDeviceSynchronize();
@@ -291,7 +291,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
         cell.refresh();
         //tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
-        for( int l = 0; l < 8; l++ )
+        for( int l = 0; l < 10; l++ )
         {
             if( ! interfaceMap( cell ) )
             {