From f5c32276c88cf7bd2ca5bc45b3bfc06768486275 Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Thu, 4 Oct 2018 19:30:14 +0200
Subject: [PATCH 01/20] Chess model implemented in 2D.

---
 .../tnlDirectEikonalMethodsBase.h             |   8 +-
 .../tnlDirectEikonalMethodsBase_impl.h        |  12 +-
 .../tnlFastSweepingMethod2D_impl.h            | 212 ++++++++++--------
 3 files changed, 124 insertions(+), 108 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index b981a92a8..eb7cbd2a5 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -129,12 +129,12 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      Real *aux,
-                                      int *BlockIterDevice);
+                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+                                      int *BlockIterDevice, int oddEvenBlock);
 __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks );
 
-template < typename Real, typename Device, typename Index >
-__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );
+/*template < typename Real, typename Device, typename Index >
+__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );*/
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 649a5ad43..cfea6aca0 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -945,7 +945,7 @@ updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, con
     {
         sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
         tmp = value - sArray[ thrj ][ thri ];
-        if ( fabs( tmp ) >  0.01*hx )
+        if ( fabs( tmp ) >  0.001*hx )
             return true;
         else
             return false;
@@ -957,7 +957,7 @@ updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, con
             ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
         sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
         tmp = value - sArray[ thrj ][ thri ];
-        if ( fabs( tmp ) > 0.01*hx )
+        if ( fabs( tmp ) > 0.001*hx )
             return true;
         else
             return false;
@@ -989,7 +989,7 @@ updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
     sArray[ thri ] = argAbsMin( value, tmp );
     
     tmp = value - sArray[ thri ];
-    if ( fabs( tmp ) >  0.01*h )
+    if ( fabs( tmp ) >  0.001*h )
         return true;
     else
         return false;
@@ -1032,7 +1032,7 @@ updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
     {
         sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
         tmp = value - sArray[ thrk ][ thrj ][ thri ];
-        if ( fabs( tmp ) >  0.01*hx )
+        if ( fabs( tmp ) >  0.001*hx )
             return true;
         else
             return false;
@@ -1046,7 +1046,7 @@ updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
         {
             sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
             tmp = value - sArray[ thrk ][ thrj ][ thri ];
-            if ( fabs( tmp ) > 0.01*hx )
+            if ( fabs( tmp ) > 0.001*hx )
                 return true;
             else
                 return false;
@@ -1059,7 +1059,7 @@ updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
                 hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
             sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
             tmp = value - sArray[ thrk ][ thrj ][ thri ];
-            if ( fabs( tmp ) > 0.01*hx )
+            if ( fabs( tmp ) > 0.001*hx )
                 return true;
             else
                 return false;
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 6703843c1..7e4028fbe 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -26,7 +26,7 @@ template< typename Real,
           typename Anisotropy >
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 FastSweepingMethod()
-: maxIterations( 100 )
+: maxIterations( 1 )
 {
    
 }
@@ -250,7 +250,7 @@ solve( const MeshPointer& mesh,
           
           tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
           
-          aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 );
+          //aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 );
           
           //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) );
 
@@ -261,7 +261,7 @@ solve( const MeshPointer& mesh,
           int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
           int *dBlock;
           cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
-          
+          int oddEvenBlock = 0;
           while( BlockIterD )
           {
            /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
@@ -269,19 +269,30 @@ solve( const MeshPointer& mesh,
                        
             CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
                                                              interfaceMapPtr.template getData< Device >(),
-                                                             dAux,
-                                                             BlockIterDevice );
+                                                             auxPtr.template modifyData< Device>(),
+                                                             BlockIterDevice,
+                                                             oddEvenBlock );
+	    TNL_CHECK_CUDA_DEVICE;
+            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+            CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
+                                                             interfaceMapPtr.template getData< Device >(),
+                                                             auxPtr.template modifyData< Device>(),
+                                                             BlockIterDevice,
+                                                             oddEvenBlock );
+	    TNL_CHECK_CUDA_DEVICE;
+            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
             
             CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+	    TNL_CHECK_CUDA_DEVICE;
             CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
-            
+            TNL_CHECK_CUDA_DEVICE;
             cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
                                    
             /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
                 BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
             
           }
-          aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 );
+          //aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 );
           cudaFree( dAux );
           cudaFree( BlockIterDevice );
           cudaFree( dBlock );
@@ -299,7 +310,7 @@ solve( const MeshPointer& mesh,
 }
 
 #ifdef HAVE_CUDA
-template < typename Real, typename Device, typename Index >
+/*template < typename Real, typename Device, typename Index >
 __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a )
 {
     int i = threadIdx.x + blockDim.x*blockIdx.x;
@@ -314,7 +325,7 @@ __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, In
         aux[ j*mesh.getDimensions().x() + i ] = dAux[ j*mesh.getDimensions().x() + i ];
     }
     
-}
+}*/
 
 __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks )
 {
@@ -366,8 +377,8 @@ __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlock
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      Real *aux,
-                                      int *BlockIterDevice )
+                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+                                      int *BlockIterDevice, int oddEvenBlock )
 {
     int thri = threadIdx.x; int thrj = threadIdx.y;
     int blIdx = blockIdx.x; int blIdy = blockIdx.y;
@@ -417,109 +428,114 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     }
     __syncthreads();
     
-    if( thri == 0 )
-    {        
-        if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik )
-            sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
-        else
-            sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 1 )
-    {
-        if( blIdx != 0 && thrj+1 < ykolik )
-            sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ];
-        else
-            sArray[thrj+1][0] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 2 )
-    {
-        if( dimY > (blIdy+1) * blockDim.y  && thri+1 < xkolik )
-            sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
-        else
-           sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 3 )
+    if( (blIdy%2  + blIdx) % 2 == oddEvenBlock )
     {
-        if( blIdy != 0 && thrj+1 < xkolik )
-            sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ];
-        else
-            sArray[0][thrj+1] = std::numeric_limits< Real >::max();
-    }
     
-        
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
-    {    
-        sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
-    }
-    __syncthreads();  
+        if( thri == 0 )
+        {        
+            if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik )
+                sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
+            else
+                sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
+        }
+
+        if( thri == 1 )
+        {
+            if( blIdx != 0 && thrj+1 < ykolik )
+                sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ];
+            else
+                sArray[thrj+1][0] = std::numeric_limits< Real >::max();
+        }
+
+        if( thri == 2 )
+        {
+            if( dimY > (blIdy+1) * blockDim.y  && thri+1 < xkolik )
+                sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
+            else
+               sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
+        }
+
+        if( thri == 3 )
+        {
+            if( blIdy != 0 && thrj+1 < xkolik )
+                sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ];
+            else
+                sArray[0][thrj+1] = std::numeric_limits< Real >::max();
+        }
+
 
-    while( changed[ 0 ] )
-    {
-        __syncthreads();
-        
-        changed[ currentIndex] = false;
-        
-    //calculation of update cell
         if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+        {    
+            sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
+        }
+        __syncthreads();  
+
+        while( changed[ 0 ] )
         {
-            if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
+            __syncthreads();
+
+            changed[ currentIndex] = false;
+
+        //calculation of update cell
+            if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
             {
-                changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
+                if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
+                {
+                    changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
+                }
             }
-        }
-        __syncthreads();
-        
-    //pyramid reduction
-        if( blockDim.x*blockDim.y == 1024 )
-        {
-            if( currentIndex < 512 )
+            __syncthreads();
+
+        //pyramid reduction
+            if( blockDim.x*blockDim.y == 1024 )
             {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
+                if( currentIndex < 512 )
+                {
+                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
+                }
             }
-        }
-        __syncthreads();
-        if( blockDim.x*blockDim.y >= 512 )
-        {
-            if( currentIndex < 256 )
+            __syncthreads();
+            if( blockDim.x*blockDim.y >= 512 )
             {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+                if( currentIndex < 256 )
+                {
+                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+                }
             }
-        }
-        __syncthreads();
-        if( blockDim.x*blockDim.y >= 256 )
-        {
-            if( currentIndex < 128 )
+            __syncthreads();
+            if( blockDim.x*blockDim.y >= 256 )
             {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
+                if( currentIndex < 128 )
+                {
+                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
+                }
             }
-        }
-        __syncthreads();
-        if( blockDim.x*blockDim.y >= 128 )
-        {
-            if( currentIndex < 64 )
+            __syncthreads();
+            if( blockDim.x*blockDim.y >= 128 )
             {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+                if( currentIndex < 64 )
+                {
+                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+                }
             }
+            __syncthreads();
+            if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
+            {
+                if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+                if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+                if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+                if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+                if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+                if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+            }
+            if( changed[ 0 ] && thri == 0 && thrj == 0 )
+                BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
+            __syncthreads();
         }
-        __syncthreads();
-        if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
-        {
-            if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
-            if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
-            if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
-            if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
-            if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
-            if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
-        }
-        if( changed[ 0 ] && thri == 0 && thrj == 0 )
-            BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
-        __syncthreads();
+        
+        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
+            aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
+
     }
-  
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
-        aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
 }
 #endif
-- 
GitLab


From da336fb8bd927bc927bde8bde5876b18f07a23cf Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Sun, 7 Oct 2018 12:55:16 +0200
Subject: [PATCH 02/20] FIM method implemented. Neighbours are being found on
 CPU. 3D parallel method disabled because of Array changes.

---
 .../tnlDirectEikonalMethodsBase.h             |   9 +-
 .../tnlFastSweepingMethod2D_impl.h            | 199 +++++++++++-------
 .../tnlFastSweepingMethod3D_impl.h            |   4 +-
 3 files changed, 134 insertions(+), 78 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index eb7cbd2a5..c92368deb 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -113,6 +113,8 @@ T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double
 template < typename T1 >
 __cuda_callable__ void sortMinims( T1 pom[] );
 
+template < typename Index >
+void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY );
 
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
@@ -130,8 +132,11 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice, int oddEvenBlock);
-__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks );
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
+
+template < typename Index >
+__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks );
 
 /*template < typename Real, typename Device, typename Index >
 __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );*/
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 7e4028fbe..817811c84 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -235,13 +235,6 @@ solve( const MeshPointer& mesh,
       {
          // TODO: CUDA code
 #ifdef HAVE_CUDA
-          
-          Real *dAux;
-          cudaMalloc(&dAux, ( mesh->getDimensions().x() * mesh->getDimensions().y() ) * sizeof( Real ) );
-          
-          
-          
-          
           const int cudaBlockSize( 16 );
           int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
           int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
@@ -250,18 +243,30 @@ solve( const MeshPointer& mesh,
           
           tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
           
-          //aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 );
+          TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter;
+          BlockIter.setSize( numBlocksX * numBlocksY );
+          BlockIter.setValue( 0 );
+          /*int* BlockIter = (int*)malloc( ( numBlocksX * numBlocksY ) * sizeof( int ) );
+          for( int i = 0; i < numBlocksX*numBlocksY +1; i++)
+              BlockIter[i] = 1;*/
           
-          //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) );
-
-          int *BlockIterDevice;
           int BlockIterD = 1;
           
+          TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
+          BlockIterDevice.setSize( numBlocksX * numBlocksY );
+          BlockIterDevice.setValue( 1 );
+          /*int *BlockIterDevice;
           cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );
+          cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/
+          
           int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
-          int *dBlock;
-          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
-          int oddEvenBlock = 0;
+          
+          TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
+          dBlock.setSize( nBlocks );
+          dBlock.setValue( 0 );
+          /*int *dBlock;
+          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
+          
           while( BlockIterD )
           {
            /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
@@ -270,89 +275,132 @@ solve( const MeshPointer& mesh,
             CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
                                                              interfaceMapPtr.template getData< Device >(),
                                                              auxPtr.template modifyData< Device>(),
-                                                             BlockIterDevice,
-                                                             oddEvenBlock );
-	    TNL_CHECK_CUDA_DEVICE;
-            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-            CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
-                                                             interfaceMapPtr.template getData< Device >(),
-                                                             auxPtr.template modifyData< Device>(),
-                                                             BlockIterDevice,
-                                                             oddEvenBlock );
-	    TNL_CHECK_CUDA_DEVICE;
-            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+                                                             BlockIterDevice );
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+            
+            BlockIter = BlockIterDevice;
+            //cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost);
+            GetNeighbours( BlockIter, numBlocksX, numBlocksY );
+            
+            BlockIterDevice = BlockIter;
+            //cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+            
+            
+            CudaParallelReduc<<<  nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
             
-            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
-	    TNL_CHECK_CUDA_DEVICE;
             CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+            cudaDeviceSynchronize();
             TNL_CHECK_CUDA_DEVICE;
+            
             cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
                                    
             /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
                 BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
             
           }
-          //aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 );
-          cudaFree( dAux );
-          cudaFree( BlockIterDevice );
+          /*cudaFree( BlockIterDevice );
           cudaFree( dBlock );
+          delete BlockIter;*/
           cudaDeviceSynchronize();
           
           TNL_CHECK_CUDA_DEVICE;
               
-          //aux = *auxPtr;
-          //interfaceMap = *interfaceMapPtr;
+          aux = *auxPtr;
+          interfaceMap = *interfaceMapPtr;
 #endif
       }
       iteration++;
    }
    aux.save("aux-final.tnl");
 }
-
-#ifdef HAVE_CUDA
-/*template < typename Real, typename Device, typename Index >
-__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a )
+/**
+ * Marks the 4-neighbours of every non-zero entry of BlockIter and then overwrites
+ * BlockIter with those marks. Blocks are stored row-major, i = row * numBlockX + col,
+ * matching the kernels' indexing ( blIdy * numOfBlockx + blIdx ).
+ * NOTE(review): BlockIter is taken by value - confirm that the Array copy shares data
+ * with the caller's array, otherwise the computed marks never reach the caller.
+ */
+template < typename Index >
+void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY )
 {
-    int i = threadIdx.x + blockDim.x*blockIdx.x;
-    int j = blockDim.y*blockIdx.y + threadIdx.y;
-    const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 1 )
-    {    
-        dAux[ j*mesh.getDimensions().x() + i ] = aux[ j*mesh.getDimensions().x() + i ];
-    }
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 0 )
-    {    
-        aux[ j*mesh.getDimensions().x() + i ] = dAux[ j*mesh.getDimensions().x() + i ];
-    }
-    
-}*/
+    TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom;
+    BlockIterPom.setSize( numBlockX * numBlockY );
+    BlockIterPom.setValue( 0 );
+  for(int i = 0; i < numBlockX * numBlockY; i++)
+  {
+      
+      if( BlockIter[ i ] )
+      {
+          // i = k*numBlockX + m  ( k = block row, m = block column )
+          const int m = i % numBlockX;
+          const int k = i / numBlockX;
+          // neighbours one block row below / above
+          if( k > 0 )
+            BlockIterPom[ i - numBlockX ] = 1;
+          if( k < numBlockY - 1 )
+            BlockIterPom[ i + numBlockX ] = 1;
+          // right / left neighbours; the column tests keep i+1 and i-1 inside row k
+          if( m < numBlockX - 1 )
+              BlockIterPom[ i+1 ] = 1;
+          if( m > 0 )
+              BlockIterPom[ i-1 ] = 1;
+      }
+  }
+  // hand the neighbour marks back; the previous content of BlockIter is discarded
+  for(int i = 0; i < numBlockX * numBlockY; i++ ){
+        BlockIter[ i ] = BlockIterPom[ i ];
+  }
+  /* Debug dump of the block map, top block row first:
+  for( int k = numBlockY-1; k > -1; k-- )
+  {
+      for( int m = 0; m < numBlockX; m++ )
+          std::cout << BlockIter[ k*numBlockX + m ];
+      std::cout << std::endl;
+  }*/
+}
 
-__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks )
+#ifdef HAVE_CUDA
+template < typename Index >
+__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks )
 {
     int i = threadIdx.x;
     int blId = blockIdx.x;
+    /*if ( i == 0 && blId == 0 ){
+            printf( "nBlocks = %d \n", nBlocks );
+        for( int j = nBlocks-1; j > -1 ; j--){
+            printf( "cislo = %d \n", BlockIterDevice[ j ] );
+        }
+    }*/
     __shared__ volatile int sArray[ 512 ];
-    sArray[ i ] = false;
-    if(blId * 1024 + i < nBlocks )
-        sArray[ i ] = BlockIterDevice[ blId * 1024 + i ];
+    sArray[ i ] = 0;
+    if( blId * 512 + i < nBlocks )
+        sArray[ i ] = BlockIterDevice[ blId * 512 + i ];
+    __syncthreads();
     
-    if (blockDim.x * blockDim.y == 1024) {
+    if (blockDim.x == 1024) {
         if (i < 512)
-            sArray[ i ] += sArray[ i ];
+            sArray[ i ] += sArray[ i + 512 ];
     }
     __syncthreads();
-    if (blockDim.x * blockDim.y >= 512) {
+    if (blockDim.x >= 512) {
         if (i < 256) {
-            sArray[ i ] += sArray[ i ];
+            sArray[ i ] += sArray[ i + 256 ];
         }
     }
-    if (blockDim.x * blockDim.y >= 256) {
+    __syncthreads();
+    if (blockDim.x >= 256) {
         if (i < 128) {
             sArray[ i ] += sArray[ i + 128 ];
         }
     }
     __syncthreads();
-    if (blockDim.x * blockDim.y >= 128) {
+    if (blockDim.x >= 128) {
         if (i < 64) {
             sArray[ i ] += sArray[ i + 64 ];
         }
@@ -360,12 +408,12 @@ __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlock
     __syncthreads();
     if (i < 32 )
     {
-        if(  blockDim.x * blockDim.y >= 64 ) sArray[ i ] += sArray[ i + 32 ];
-        if(  blockDim.x * blockDim.y >= 32 )  sArray[ i ] += sArray[ i + 16 ];
-        if(  blockDim.x * blockDim.y >= 16 )  sArray[ i ] += sArray[ i + 8 ];
-        if(  blockDim.x * blockDim.y >= 8 )  sArray[ i ] += sArray[ i + 4 ];
-        if(  blockDim.x * blockDim.y >= 4 )  sArray[ i ] += sArray[ i + 2 ];
-        if(  blockDim.x * blockDim.y >= 2 )  sArray[ i ] += sArray[ i + 1 ];
+        if(  blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ];
+        if(  blockDim.x >= 32 )  sArray[ i ] += sArray[ i + 16 ];
+        if(  blockDim.x >= 16 )  sArray[ i ] += sArray[ i + 8 ];
+        if(  blockDim.x >= 8 )  sArray[ i ] += sArray[ i + 4 ];
+        if(  blockDim.x >= 4 )  sArray[ i ] += sArray[ i + 2 ];
+        if(  blockDim.x >= 2 )  sArray[ i ] += sArray[ i + 1 ];
     }
     
     if( i == 0 )
@@ -378,14 +426,15 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice, int oddEvenBlock )
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
 {
     int thri = threadIdx.x; int thrj = threadIdx.y;
     int blIdx = blockIdx.x; int blIdy = blockIdx.y;
     int i = thri + blockDim.x*blIdx;
     int j = blockDim.y*blIdy + thrj;
     int currentIndex = thrj * blockDim.x + thri;
-    
+    if( BlockIterDevice[ blIdy * gridDim.x + blIdx] )
+    {
     //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
     __shared__ volatile bool changed[16*16];
     changed[ currentIndex ] = false;
@@ -424,13 +473,13 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
 
         if( numOfBlocky -1 == blIdy )
             ykolik = dimY - (blIdy)*blockDim.y+1;
-        BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
+        //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
     }
     __syncthreads();
     
-    if( (blIdy%2  + blIdx) % 2 == oddEvenBlock )
-    {
-    
+        if(thri == 0 && thrj == 0 )
+            BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
+
         if( thri == 0 )
         {        
             if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik )
@@ -528,14 +577,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
                 if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
                 if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
             }
-            if( changed[ 0 ] && thri == 0 && thrj == 0 )
+            if( changed[ 0 ] && thri == 0 && thrj == 0 ){
                 BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
+            }
             __syncthreads();
         }
-        
+
         if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
             aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
-
     }
+    /*if( thri == 0 && thrj == 0 )
+        printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/
 }
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index b024979cc..8c85745cd 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -272,8 +272,8 @@ solve( const MeshPointer& mesh,
                                                               interfaceMapPtr.template getData< Device >(),
                                                               auxPtr.template modifyData< Device>(),
                                                               BlockIterDevice );
-            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
-            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+            //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
+            //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
             
             cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
                                    
-- 
GitLab


From 444e01c4c17449ccfcf432caf9bcf32ff6e80dfc Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Mon, 22 Oct 2018 21:13:54 +0200
Subject: [PATCH 03/20] FIM method is now faster than chess method but some
 random error occurs.

---
 .../tnlDirectEikonalMethodsBase.h             |  24 +-
 .../tnlDirectEikonalMethodsBase_impl.h        | 193 ++++
 .../hamilton-jacobi/tnlFastSweepingMethod.h   |   3 +-
 .../tnlFastSweepingMethod2D_impl.h            | 846 +++++++++---------
 .../tnlFastSweepingMethod3D_impl.h            |  31 +-
 5 files changed, 657 insertions(+), 440 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index eb7cbd2a5..08ed947ed 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -61,8 +61,9 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
       typedef Index IndexType;
       typedef Functions::MeshFunction< MeshType > MeshFunctionType;
       typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
+      typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
-      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
+      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
 
       void initInterface( const MeshFunctionPointer& input,
                           MeshFunctionPointer& output,
@@ -76,6 +77,11 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
       __cuda_callable__ bool updateCell( volatile Real sArray[18][18],
                                          int thri, int thrj, const Real hx, const Real hy,
                                          const Real velocity = 1.0 );
+      void updateBlocks( InterfaceMapType interfaceMap,
+                         MeshFunctionType aux,
+                         ArrayContainer BlockIterHost, int numThreadsPerBlock );
+      
+      void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
 };
 
 template< typename Real,
@@ -113,6 +119,8 @@ T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double
 template < typename T1 >
 __cuda_callable__ void sortMinims( T1 pom[] );
 
+template < typename Index >
+void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY );
 
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
@@ -130,11 +138,15 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice, int oddEvenBlock);
-__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks );
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 );
+
+template < typename Index >
+__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks );
 
-/*template < typename Real, typename Device, typename Index >
-__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );*/
+template < typename Index >
+__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+                               /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
@@ -150,7 +162,7 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice );
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
 #endif
 
 #include "tnlDirectEikonalMethodsBase_impl.h"
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index cfea6aca0..1f9fc5eeb 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -89,6 +89,199 @@ initInterface( const MeshFunctionPointer& _input,
     }
 }
 
+// Host (CPU) version of the block-wise update: every block whose flag in
+// BlockIterHost is non-zero is copied, together with a one-cell halo, into a
+// local 18x18 buffer, updated by four sweeps of updateCell() in alternating
+// directions and written back to aux.
+//
+// Blocks on the far mesh boundary may be covered by the mesh only partially;
+// cells of such a block that lie outside the mesh are skipped.
+//
+// interfaceMap       - cells flagged here are neither updated nor written back
+// aux                - values that are read and overwritten through operator[]
+// BlockIterHost      - one flag per block, stored row by row
+//                      ( index = blIdy * numOfBlockx + blIdx ); the flag of a
+//                      processed block becomes 1 if updateCell() returned true
+//                      for any cell in the first sweep and 0 otherwise
+// numThreadsPerBlock - edge length of a block in cells; the local buffer is
+//                      fixed at 18x18, so the value must not exceed 16
+//
+// NOTE(review): all arguments are taken by value, so the results reach the
+// caller only if copies of these types share their data - confirm.
+template< typename Real,
+          typename Device,
+          typename Index >
+void 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+updateBlocks( InterfaceMapType interfaceMap,
+              MeshFunctionType aux,
+              ArrayContainer BlockIterHost, int numThreadsPerBlock )
+{
+  const int n = numThreadsPerBlock;
+  const Real inf = std::numeric_limits< Real >::max();
+
+  for( int i = 0; i < BlockIterHost.getSize(); i++ )
+  {
+    if( ! BlockIterHost[ i ] )
+      continue;
+
+    MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
+    const int dimX = mesh.getDimensions().x();
+    const int dimY = mesh.getDimensions().y();
+    const Real hx = mesh.getSpaceSteps().x();
+    const Real hy = mesh.getSpaceSteps().y();
+
+    // The flags are stored row by row, hence the length of a row of blocks is
+    // the block count along x ( derived from dimX ) and both block coordinates
+    // are obtained from that same count.
+    const int numOfBlockx = dimX / n + ( ( dimX % n != 0 ) ? 1 : 0 );
+    const int numOfBlocky = dimY / n + ( ( dimY % n != 0 ) ? 1 : 0 );
+    const int blIdx = i % numOfBlockx;
+    const int blIdy = i / numOfBlockx;
+
+    // Global coordinates of the first cell of the block.
+    const int x0 = blIdx * n;
+    const int y0 = blIdy * n;
+
+    // Position of the far halo column / row inside the local buffer. It is
+    // n + 1 for a full block and moves closer for the last block in a
+    // direction, which may be only partially covered by the mesh.
+    int xkolik = n + 1;
+    int ykolik = n + 1;
+    if( numOfBlockx - 1 == blIdx )
+      xkolik = dimX - x0 + 1;
+    if( numOfBlocky - 1 == blIdy )
+      ykolik = dimY - y0 + 1;
+
+    // Local copy of the block; everything that is not loaded below ( cells
+    // outside the mesh, halo on the mesh boundary ) keeps the value "infinity".
+    Real sArray[ 18 ][ 18 ];
+    for( int thrj = 0; thrj < n + 2; thrj++ )
+      for( int thri = 0; thri < n + 2; thri++ )
+        sArray[ thrj ][ thri ] = inf;
+
+    BlockIterHost[ i ] = 0;
+
+    // Global index that corresponds to the local cell sArray[ 0 ][ 0 ]. It is
+    // used only together with an offset that leads to a cell inside the mesh.
+    const int base = ( y0 - 1 ) * dimX + x0 - 1;
+    for( int t = 1; t < n + 2; t++ )
+    {
+      // halo column with larger x, then the one with smaller x
+      if( dimX > x0 + n && t < ykolik )
+        sArray[ t ][ xkolik ] = aux[ base + t * dimX + xkolik ];
+      if( blIdx != 0 && t < ykolik )
+        sArray[ t ][ 0 ] = aux[ base + t * dimX ];
+      // halo row with larger y, then the one with smaller y
+      if( dimY > y0 + n && t < xkolik )
+        sArray[ ykolik ][ t ] = aux[ base + ykolik * dimX + t ];
+      if( blIdy != 0 && t < xkolik )
+        sArray[ 0 ][ t ] = aux[ base + t ];
+    }
+
+    // Interior of the block. Only cells inside the mesh are loaded; without
+    // the limits a partial block would read past the end of a mesh row ( or of
+    // the whole array ) and overwrite the halo prepared above.
+    for( int k = 0; k < n && y0 + k < dimY; k++ )
+      for( int l = 0; l < n && x0 + l < dimX; l++ )
+        sArray[ k + 1 ][ l + 1 ] = aux[ ( y0 + k ) * dimX + x0 + l ];
+
+    // Updates the local cell ( k, l ) unless it lies outside the mesh or on the
+    // interface; returns the value reported by updateCell(), false if skipped.
+    auto relax = [&]( int k, int l ) -> bool
+    {
+      if( y0 + k >= dimY || x0 + l >= dimX )
+        return false;
+      if( interfaceMap[ ( y0 + k ) * dimX + x0 + l ] )
+        return false;
+      return this->updateCell( sArray, l + 1, k + 1, hx, hy );
+    };
+
+    // Sweep 1: rows ascending, columns ascending. Only this sweep decides
+    // whether the block is reported as changed.
+    // NOTE(review): changes made by sweeps 2-4 are not reported - confirm that
+    // this is intended.
+    bool changed = false;
+    for( int k = 0; k < n; k++ )
+      for( int l = 0; l < n; l++ )
+        if( relax( k, l ) )
+          changed = true;
+
+    // Sweep 2: rows descending, columns ascending.
+    for( int k = n - 1; k > -1; k-- )
+      for( int l = 0; l < n; l++ )
+        relax( k, l );
+
+    // Sweep 3: rows ascending, columns descending.
+    for( int k = 0; k < n; k++ )
+      for( int l = n - 1; l > -1; l-- )
+        relax( k, l );
+
+    // Sweep 4: rows descending, columns descending.
+    for( int k = n - 1; k > -1; k-- )
+      for( int l = n - 1; l > -1; l-- )
+        relax( k, l );
+
+    if( changed )
+      BlockIterHost[ i ] = 1;
+
+    // Write the block back; interface cells and cells outside the mesh are
+    // left untouched.
+    for( int k = 0; k < n && y0 + k < dimY; k++ )
+      for( int l = 0; l < n && x0 + l < dimX; l++ )
+      {
+        const int cell = ( y0 + k ) * dimX + x0 + l;
+        if( ! interfaceMap[ cell ] )
+          aux[ cell ] = sArray[ k + 1 ][ l + 1 ];
+      }
+  }
+}
+
+// Replaces the block flags by the flags of the neighbourhood: a block is set
+// to 1 exactly when at least one of its four edge neighbours was non-zero on
+// entry and to 0 otherwise ( its own previous flag is not taken into account ).
+//
+// BlockIterHost - one flag per block, stored row by row
+//                 ( index = k * numBlockX + m, column m, row k )
+// numBlockX     - number of blocks in one row
+// numBlockY     - number of rows of blocks
+//
+// NOTE(review): BlockIterHost is taken by value, so the result reaches the
+// caller only if copies of ArrayContainer share their data - confirm.
+template< typename Real,
+          typename Device,
+          typename Index >
+void 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )
+{
+  const int numBlocks = numBlockX * numBlockY;
+  // The new flags are collected in a scratch array that is zeroed as a whole
+  // before the scan. Clearing entry i only when the scan reaches block i would
+  // wipe the marks already set by the neighbours i - 1 and i - numBlockX.
+  int* BlockIterPom = new int[ numBlocks ]();
+
+  for( int i = 0; i < numBlocks; i++ )
+  {
+    if( ! BlockIterHost[ i ] )
+      continue;
+    const int m = i % numBlockX;   // column of block i
+    const int k = i / numBlockX;   // row of block i
+    if( k > 0 )
+      BlockIterPom[ i - numBlockX ] = 1;
+    if( k < numBlockY - 1 )
+      BlockIterPom[ i + numBlockX ] = 1;
+    if( m < numBlockX - 1 )
+      BlockIterPom[ i + 1 ] = 1;
+    if( m > 0 )
+      BlockIterPom[ i - 1 ] = 1;
+  }
+
+  for( int i = 0; i < numBlocks; i++ )
+    BlockIterHost[ i ] = BlockIterPom[ i ];
+  delete[] BlockIterPom;
+}
+
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
index fa8077427..60c690e06 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
@@ -88,7 +88,8 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
       using typename BaseType::InterfaceMapType;
       using typename BaseType::MeshFunctionType;
       using typename BaseType::InterfaceMapPointer;
-      using typename BaseType::MeshFunctionPointer;      
+      using typename BaseType::MeshFunctionPointer;
+      using typename BaseType::ArrayContainer;
 
       FastSweepingMethod();
       
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 7e4028fbe..e23148db5 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -21,355 +21,348 @@
 #include <fstream>
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 FastSweepingMethod()
 : maxIterations( 1 )
 {
-   
+  // Nothing else to set up: the initializer list sets maxIterations to 1.
 }
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 const Index&
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 getMaxIterations() const
 {
-   
+  return this->maxIterations; // an empty body here would be undefined behaviour for a non-void function
 }
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 void
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 setMaxIterations( const IndexType& maxIterations )
 {
-   
+  this->maxIterations = maxIterations; // store the limit that solve() compares its iteration counter against
 }
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 void
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 solve( const MeshPointer& mesh,
-       const AnisotropyPointer& anisotropy,
-       MeshFunctionPointer& u )
-{
-   /*MeshFunctionType v;
-   v.setMesh(mesh);
-   double A[320][320];
-    for (int i = 0; i < 320; i++)
-        for (int j = 0; j < 320; j++)
-            A[i][j] = 0;
-    
-    std::ifstream file("/home/maty/Downloads/mapa2.txt");
-
-    for (int i = 0; i < 320; i++)
-        for (int j = 0; j < 320; j++)
-            file >> A[i][j];
-    file.close();
-    for (int i = 0; i < 320; i++)
-        for (int j = 0; j < 320; j++)
-            v[i*320 + j] = A[i][j];
-   v.save("mapa.tnl");*/
-   
-       
-   MeshFunctionPointer auxPtr;
-   InterfaceMapPointer interfaceMapPtr;
-   auxPtr->setMesh( mesh );
-   interfaceMapPtr->setMesh( mesh );
-   std::cout << "Initiating the interface cells ..." << std::endl;
-   BaseType::initInterface( u, auxPtr, interfaceMapPtr );
+        const AnisotropyPointer& anisotropy,
+        MeshFunctionPointer& u )
+{  
+  MeshFunctionPointer auxPtr;
+  InterfaceMapPointer interfaceMapPtr;
+  auxPtr->setMesh( mesh );
+  interfaceMapPtr->setMesh( mesh );
+  std::cout << "Initiating the interface cells ..." << std::endl;
+  BaseType::initInterface( u, auxPtr, interfaceMapPtr );
+  
+  auxPtr->save( "aux-ini.tnl" );
+  
+  typename MeshType::Cell cell( *mesh );
+  
+  IndexType iteration( 0 );
+  InterfaceMapType interfaceMap = *interfaceMapPtr;
+  MeshFunctionType aux = *auxPtr;
+  
+  
+  
+  
+  while( iteration < this->maxIterations )
+  {
+    if( std::is_same< DeviceType, Devices::Host >::value )
+    {
+      int numThreadsPerBlock = 16;
+      
+      int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
+      int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
+      
+          
+      ArrayContainer BlockIterHost;
+      BlockIterHost.setSize( numBlocksX * numBlocksY );
+      BlockIterHost.setValue( 1 );
+      /*for( int k = numBlocksX-1; k >-1; k-- ){
+        for( int l = 0; l < numBlocksY; l++ ){
+          std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+        }
+        std::cout<<std::endl;
+      }
+      std::cout<<std::endl;*/
+      
+      while( BlockIterHost[ 0 ] )
+      {          
+        this->updateBlocks( interfaceMap, aux, BlockIterHost, numThreadsPerBlock);
         
-   auxPtr->save( "aux-ini.tnl" );
-
-   typename MeshType::Cell cell( *mesh );
-   
-   IndexType iteration( 0 );
-   InterfaceMapType interfaceMap = *interfaceMapPtr;
-   MeshFunctionType aux = *auxPtr;
-   while( iteration < this->maxIterations )
-   {
-      if( std::is_same< DeviceType, Devices::Host >::value )
-      {
-         for( cell.getCoordinates().y() = 0;
+        this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY );
+        
+  //Reduction      
+        for( int k = numBlocksX-1; k >-1; k-- ){
+          for( int l = 0; l < numBlocksY; l++ ){
+            //std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+            BlockIterHost[ 0 ] = BlockIterHost[ 0 ] || BlockIterHost[ l*numBlocksX + k ];
+          }
+          //std::cout<<std::endl;
+        }
+        //std::cout<<std::endl;
+      }
+      /*for( cell.getCoordinates().y() = 0;
               cell.getCoordinates().y() < mesh->getDimensions().y();
               cell.getCoordinates().y()++ )
-         {
-            for( cell.getCoordinates().x() = 0;
-                 cell.getCoordinates().x() < mesh->getDimensions().x();
-                 cell.getCoordinates().x()++ )
-               {
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )
-                     this->updateCell( aux, cell );
-               }
-         }
-
-         //aux.save( "aux-1.tnl" );
-
-         for( cell.getCoordinates().y() = 0;
+      {
+        for( cell.getCoordinates().x() = 0;
+                cell.getCoordinates().x() < mesh->getDimensions().x();
+                cell.getCoordinates().x()++ )
+        {
+          cell.refresh();
+          if( ! interfaceMap( cell ) )
+            this->updateCell( aux, cell );
+        }
+      }
+      
+      //aux.save( "aux-1.tnl" );
+      
+      for( cell.getCoordinates().y() = 0;
               cell.getCoordinates().y() < mesh->getDimensions().y();
               cell.getCoordinates().y()++ )
-         {
-            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                 cell.getCoordinates().x() >= 0 ;
-                 cell.getCoordinates().x()-- )		
-               {
-                  //std::cerr << "2 -> ";
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )            
-                     this->updateCell( aux, cell );
-               }
-         }
-
-         //aux.save( "aux-2.tnl" );
-
-         for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+      {
+        for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                cell.getCoordinates().x() >= 0 ;
+                cell.getCoordinates().x()-- )		
+        {
+          //std::cerr << "2 -> ";
+          cell.refresh();
+          if( ! interfaceMap( cell ) )            
+            this->updateCell( aux, cell );
+        }
+      }
+      
+      //aux.save( "aux-2.tnl" );
+      
+      for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
               cell.getCoordinates().y() >= 0 ;
               cell.getCoordinates().y()-- )
-            {
-            for( cell.getCoordinates().x() = 0;
-                 cell.getCoordinates().x() < mesh->getDimensions().x();
-                 cell.getCoordinates().x()++ )
-               {
-                  //std::cerr << "3 -> ";
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )            
-                     this->updateCell( aux, cell );
-               }
-            }
-
-         //aux.save( "aux-3.tnl" );
-
-         for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+      {
+        for( cell.getCoordinates().x() = 0;
+                cell.getCoordinates().x() < mesh->getDimensions().x();
+                cell.getCoordinates().x()++ )
+        {
+          //std::cerr << "3 -> ";
+          cell.refresh();
+          if( ! interfaceMap( cell ) )            
+            this->updateCell( aux, cell );
+        }
+      }
+      
+      //aux.save( "aux-3.tnl" );
+      
+      for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
               cell.getCoordinates().y() >= 0;
               cell.getCoordinates().y()-- )
-            {
-            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                 cell.getCoordinates().x() >= 0 ;
-                 cell.getCoordinates().x()-- )		
-               {
-                  //std::cerr << "4 -> ";
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )            
-                     this->updateCell( aux, cell );
-               }
-            }
-
-         //aux.save( "aux-4.tnl" );
-
-         /*for( cell.getCoordinates().x() = 0;
-              cell.getCoordinates().x() < mesh->getDimensions().y();
-              cell.getCoordinates().x()++ )
-         {
-            for( cell.getCoordinates().y() = 0;
-                 cell.getCoordinates().y() < mesh->getDimensions().x();
-                 cell.getCoordinates().y()++ )
-               {
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )
-                     this->updateCell( aux, cell );
-               }
-         }     
-
-
-         aux.save( "aux-5.tnl" );
-
-         for( cell.getCoordinates().x() = 0;
-              cell.getCoordinates().x() < mesh->getDimensions().y();
-              cell.getCoordinates().x()++ )
-         {
-            for( cell.getCoordinates().y() = mesh->getDimensions().x() - 1;
-                 cell.getCoordinates().y() >= 0 ;
-                 cell.getCoordinates().y()-- )		
-               {
-                  //std::cerr << "2 -> ";
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )            
-                     this->updateCell( aux, cell );
-               }
-         }
-         aux.save( "aux-6.tnl" );
-
-         for( cell.getCoordinates().x() = mesh->getDimensions().y() - 1;
-              cell.getCoordinates().x() >= 0 ;
-              cell.getCoordinates().x()-- )
-            {
-            for( cell.getCoordinates().y() = 0;
-                 cell.getCoordinates().y() < mesh->getDimensions().x();
-                 cell.getCoordinates().y()++ )
-               {
-                  //std::cerr << "3 -> ";
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )            
-                     this->updateCell( aux, cell );
-               }
-            }
-         aux.save( "aux-7.tnl" );
-
-         for( cell.getCoordinates().x() = mesh->getDimensions().y() - 1;
-              cell.getCoordinates().x() >= 0;
-              cell.getCoordinates().x()-- )
-            {
-            for( cell.getCoordinates().y() = mesh->getDimensions().x() - 1;
-                 cell.getCoordinates().y() >= 0 ;
-                 cell.getCoordinates().y()-- )		
-               {
-                  //std::cerr << "4 -> ";
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )            
-                     this->updateCell( aux, cell );
-               }
-            }*/
-      }
-      if( std::is_same< DeviceType, Devices::Cuda >::value )
       {
-         // TODO: CUDA code
+        for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                cell.getCoordinates().x() >= 0 ;
+                cell.getCoordinates().x()-- )		
+        {
+          //std::cerr << "4 -> ";
+          cell.refresh();
+          if( ! interfaceMap( cell ) )            
+            this->updateCell( aux, cell );
+        }
+      }*/
+    }
+    if( std::is_same< DeviceType, Devices::Cuda >::value )
+    {
+      // TODO: CUDA code
 #ifdef HAVE_CUDA
-          
-          Real *dAux;
-          cudaMalloc(&dAux, ( mesh->getDimensions().x() * mesh->getDimensions().y() ) * sizeof( Real ) );
-          
-          
-          
-          
-          const int cudaBlockSize( 16 );
-          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
-          int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
-          dim3 blockSize( cudaBlockSize, cudaBlockSize );
-          dim3 gridSize( numBlocksX, numBlocksY );
-          
-          tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
-          
-          //aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 );
-          
-          //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) );
-
-          int *BlockIterDevice;
-          int BlockIterD = 1;
-          
-          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );
-          int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
-          int *dBlock;
-          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
-          int oddEvenBlock = 0;
-          while( BlockIterD )
-          {
-           /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
-                BlockIter[ i ] = false;*/
-                       
-            CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
-                                                             interfaceMapPtr.template getData< Device >(),
-                                                             auxPtr.template modifyData< Device>(),
-                                                             BlockIterDevice,
-                                                             oddEvenBlock );
-	    TNL_CHECK_CUDA_DEVICE;
-            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-            CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
-                                                             interfaceMapPtr.template getData< Device >(),
-                                                             auxPtr.template modifyData< Device>(),
-                                                             BlockIterDevice,
-                                                             oddEvenBlock );
-	    TNL_CHECK_CUDA_DEVICE;
-            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-            
-            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
-	    TNL_CHECK_CUDA_DEVICE;
-            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
-            TNL_CHECK_CUDA_DEVICE;
-            cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
-                                   
-            /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
-                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
-            
-          }
-          //aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 );
-          cudaFree( dAux );
-          cudaFree( BlockIterDevice );
-          cudaFree( dBlock );
-          cudaDeviceSynchronize();
-          
-          TNL_CHECK_CUDA_DEVICE;
-              
-          //aux = *auxPtr;
-          //interfaceMap = *interfaceMapPtr;
-#endif
+      TNL_CHECK_CUDA_DEVICE;
+      const int cudaBlockSize( 16 );
+      int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+      int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
+      dim3 blockSize( cudaBlockSize, cudaBlockSize );
+      dim3 gridSize( numBlocksX, numBlocksY );
+      
+      tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
+      
+      int BlockIterD = 1;
+      
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
+      BlockIterDevice.setSize( numBlocksX * numBlocksY );
+      BlockIterDevice.setValue( 1 );
+      TNL_CHECK_CUDA_DEVICE;
+      int ne = 0;
+      CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
+                                                       interfaceMapPtr.template getData< Device >(),
+                                                       auxPtr.template modifyData< Device>(),
+                                                       BlockIterDevice, ne);
+      TNL_CHECK_CUDA_DEVICE;
+      
+      /*TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
+      BlockIterPom.setSize( numBlocksX * numBlocksY  );
+      BlockIterPom.setValue( 0 );*/
+      /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
+      BlockIterPom1.setSize( numBlocksX * numBlocksY  );
+      BlockIterPom1.setValue( 0 );*/
+      /*int *BlockIterDevice;
+       cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
+      int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
+      //std::cout << "nBlocksNeigh = " << nBlocksNeigh << std::endl;
+      //free( BlockIter );
+      /*int *BlockIterPom;
+       cudaMalloc((void**) &BlockIterPom, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
+      
+      int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
+      dBlock.setSize( nBlocks  );
+      TNL_CHECK_CUDA_DEVICE;
+      /*int *dBlock;
+       cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/
+      //int pocIter = 0;
+      while( BlockIterD )
+      {
+        /*BlockIterPom1 = BlockIterDevice;
+        for( int j = numBlocksY-1; j>-1; j-- ){
+          for( int i = 0; i < numBlocksX; i++ )
+            std::cout << BlockIterPom1[ j * numBlocksX + i ];
+          std::cout << std::endl;
+        }
+        std::cout << std::endl;*/
+        
+        CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
+                                                         interfaceMapPtr.template getData< Device >(),
+                                                         auxPtr.template modifyData< Device>(),
+                                                         BlockIterDevice, 1);
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        
+        /*int poc = 0;
+        for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+          if( BlockIterPom1[ i ] )
+            poc = poc+1;
+        std::cout << "pocet bloku, ktere se pocitali = " << poc << std::endl;*/
+        
+        GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, /*BlockIterPom,*/ numBlocksX, numBlocksY );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        
+        CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+        TNL_CHECK_CUDA_DEVICE;
+        
+        CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+        TNL_CHECK_CUDA_DEVICE;
+        
+        BlockIterD = dBlock.getElement( 0 );
+        //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        
+        /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
+         BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
+        //pocIter ++;
       }
-      iteration++;
-   }
-   aux.save("aux-final.tnl");
+      cudaDeviceSynchronize();
+      TNL_CHECK_CUDA_DEVICE;
+      
+      //std::cout<< pocIter << std::endl;
+      
+      aux = *auxPtr;
+      interfaceMap = *interfaceMapPtr;
+#endif
+    }
+    iteration++;
+  }
+  aux.save("aux-final.tnl");
 }
 
 #ifdef HAVE_CUDA
-/*template < typename Real, typename Device, typename Index >
-__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a )
+template < typename Index >
+__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, // Kernel: replaces every flag in BlockIterDevice by 1 if any of its four neighbours' flags is non-zero, otherwise by 0.
+                               /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY ) // flags are indexed row-major: index = row * numBlockX + column
 {
-    int i = threadIdx.x + blockDim.x*blockIdx.x;
-    int j = blockDim.y*blockIdx.y + threadIdx.y;
-    const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 1 )
-    {    
-        dAux[ j*mesh.getDimensions().x() + i ] = aux[ j*mesh.getDimensions().x() + i ];
-    }
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 0 )
-    {    
-        aux[ j*mesh.getDimensions().x() + i ] = dAux[ j*mesh.getDimensions().x() + i ];
-    }
+  int i = blockIdx.x * 1024 + threadIdx.x; // flat flag index; 1024 is hard-coded and must equal the thread count of the launch
+  
+  if( i < numBlockX * numBlockY ) // threads past the last flag do nothing
+  {
+    int pom = 0;//BlockIterPom[ i ] = 0;
+    int m=0, k=0;
+    m = i%numBlockX; // column of flag i
+    k = i/numBlockX; // row of flag i
+    if( m > 0 ) // neighbour in the previous column
+      if( BlockIterDevice[ i - 1 ] )
+        pom = 1;//BlockIterPom[ i ] = 1;
+    if( m < numBlockX -1 && pom == 0 ) // neighbour in the next column
+      if( BlockIterDevice[ i + 1 ] )
+        pom = 1;//BlockIterPom[ i ] = 1;
+    if( k > 0 && pom == 0 ) // neighbour in the previous row
+      if( BlockIterDevice[ i - numBlockX ] )
+        pom = 1;// BlockIterPom[ i ] = 1;
+    if( k < numBlockY -1 && pom == 0 ) // neighbour in the next row
+      if( BlockIterDevice[ i + numBlockX ] )
+        pom = 1;//BlockIterPom[ i ] = 1;
     
+          
+      
+    BlockIterDevice[ i ] = pom;//BlockIterPom[ i ]; NOTE(review): written in place while other threads may still read this entry as their neighbour (the separate BlockIterPom output is commented out) - confirm the result does not depend on thread ordering
+  }
+}
 
-__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks )
+template < typename Index >
+__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, // Kernel: each CUDA block sums up to 512 int entries of BlockIterDevice in shared memory and stores its sum in dBlock[ blockIdx.x ].
+                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks ) // nBlocks = number of valid entries in BlockIterDevice
 {
-    int i = threadIdx.x;
-    int blId = blockIdx.x;
-    __shared__ volatile int sArray[ 512 ];
-    sArray[ i ] = false;
-    if(blId * 1024 + i < nBlocks )
-        sArray[ i ] = BlockIterDevice[ blId * 1024 + i ];
-    
-    if (blockDim.x * blockDim.y == 1024) {
-        if (i < 512)
-            sArray[ i ] += sArray[ i ];
+  int i = threadIdx.x;
+  int blId = blockIdx.x;
+  __shared__ volatile int sArray[ 512 ]; // one slot per thread, so at most 512 threads per block fit
+  sArray[ i ] = 0; // slots whose input index is >= nBlocks contribute nothing
+  if(blId * 512 + i < nBlocks ) // each block consumes a 512-entry chunk of the input
+    sArray[ i ] = BlockIterDevice[ blId * 512 + i ];
+  __syncthreads();
+  if (blockDim.x == 1024) { // NOTE(review): with 1024 threads sArray[ i + 512 ] and the writes of threads 512..1023 above fall outside sArray[ 512 ]; the visible launches use 512 and nBlocks threads - confirm 1024 is never used
+    if (i < 512)
+      sArray[ i ] += sArray[ i + 512 ];
+  }
+  __syncthreads();
+  if (blockDim.x  >= 512) {
+    if (i < 256) {
+      sArray[ i ] += sArray[ i + 256 ];
     }
-    __syncthreads();
-    if (blockDim.x * blockDim.y >= 512) {
-        if (i < 256) {
-            sArray[ i ] += sArray[ i ];
-        }
+  } // NOTE(review): no __syncthreads() between this stride-256 step and the stride-128 step below, although slots 128..255 were just written by other warps - looks like a missing barrier
+  if (blockDim.x >= 256) {
+    if (i < 128) {
+      sArray[ i ] += sArray[ i + 128 ];
     }
-    if (blockDim.x * blockDim.y >= 256) {
-        if (i < 128) {
-            sArray[ i ] += sArray[ i + 128 ];
-        }
+  }
+  __syncthreads();
+  if (blockDim.x >= 128) {
+    if (i < 64) {
+      sArray[ i ] += sArray[ i + 64 ];
     }
-    __syncthreads();
-    if (blockDim.x * blockDim.y >= 128) {
-        if (i < 64) {
-            sArray[ i ] += sArray[ i + 64 ];
-        }
-    }
-    __syncthreads();
-    if (i < 32 )
-    {
-        if(  blockDim.x * blockDim.y >= 64 ) sArray[ i ] += sArray[ i + 32 ];
-        if(  blockDim.x * blockDim.y >= 32 )  sArray[ i ] += sArray[ i + 16 ];
-        if(  blockDim.x * blockDim.y >= 16 )  sArray[ i ] += sArray[ i + 8 ];
-        if(  blockDim.x * blockDim.y >= 8 )  sArray[ i ] += sArray[ i + 4 ];
-        if(  blockDim.x * blockDim.y >= 4 )  sArray[ i ] += sArray[ i + 2 ];
-        if(  blockDim.x * blockDim.y >= 2 )  sArray[ i ] += sArray[ i + 1 ];
-    }
-    
-    if( i == 0 )
-        dBlock[ blId ] = sArray[ 0 ];
+  }
+  __syncthreads();
+  if (i < 32 ) // last steps run in threads 0..31 without barriers, relying only on sArray being volatile - NOTE(review): confirm this is safe on the target GPU architecture
+  {
+    if(  blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ]; // NOTE(review): strides are picked from blockDim.x; when it is not a power of two (the second launch uses nBlocks threads) some slots are never added and slots >= blockDim.x are read without having been written - verify
+    if(  blockDim.x >= 32 )  sArray[ i ] += sArray[ i + 16 ];
+    if(  blockDim.x >= 16 )  sArray[ i ] += sArray[ i + 8 ];
+    if(  blockDim.x >= 8 )  sArray[ i ] += sArray[ i + 4 ];
+    if(  blockDim.x >= 4 )  sArray[ i ] += sArray[ i + 2 ];
+    if(  blockDim.x >= 2 )  sArray[ i ] += sArray[ i + 1 ];
+  }
+  
+  if( i == 0 ) // thread 0 publishes the block's sum
+    dBlock[ blId ] = sArray[ 0 ];
 }
 
 
@@ -378,10 +371,40 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice, int oddEvenBlock )
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne )
 {
-    int thri = threadIdx.x; int thrj = threadIdx.y;
-    int blIdx = blockIdx.x; int blIdy = blockIdx.y;
+  int thri = threadIdx.x; int thrj = threadIdx.y;
+  int blIdx = blockIdx.x; int blIdy = blockIdx.y;
+  int grIdx = gridDim.x;
+  
+  if( BlockIterDevice[ blIdy * grIdx + blIdx] )
+  {
+  
+    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+    
+    int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+    __shared__ volatile int numOfBlockx;
+    __shared__ volatile int numOfBlocky;
+    __shared__ int xkolik;
+    __shared__ int ykolik;
+    __shared__ volatile int NE;
+    if( thri == 0 && thrj == 0 )
+    {
+      xkolik = blockDim.x + 1;
+      ykolik = blockDim.y + 1;
+      numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
+      numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
+      
+      if( numOfBlockx - 1 == blIdx )
+        xkolik = dimX - (blIdx)*blockDim.x+1;
+      
+      if( numOfBlocky -1 == blIdy )
+        ykolik = dimY - (blIdy)*blockDim.y+1;
+        BlockIterDevice[ blIdy * grIdx + blIdx ] = 0;
+        NE = ne;
+    }
+    __syncthreads();
+   
     int i = thri + blockDim.x*blIdx;
     int j = blockDim.y*blIdy + thrj;
     int currentIndex = thrj * blockDim.x + thri;
@@ -389,17 +412,15 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
     __shared__ volatile bool changed[16*16];
     changed[ currentIndex ] = false;
-    
     if( thrj == 0 && thri == 0 )
-        changed[ 0 ] = true;
+      changed[ 0 ] = true;
     
-    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
     __shared__ Real hx;
     __shared__ Real hy;
     if( thrj == 1 && thri == 1 )
     {
-        hx = mesh.getSpaceSteps().x();
-        hy = mesh.getSpaceSteps().y();
+      hx = mesh.getSpaceSteps().x();
+      hy = mesh.getSpaceSteps().y();
     }
     
     //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
@@ -407,135 +428,110 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     sArray[thrj][thri] = std::numeric_limits< Real >::max();
     
     //filling sArray edges
-    int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
-    __shared__ volatile int numOfBlockx;
-    __shared__ volatile int numOfBlocky;
-    __shared__ int xkolik;
-    __shared__ int ykolik;
-    if( thri == 0 && thrj == 0 )
+    if( thri == 0 )
+    {        
+      if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik && NE == 1 )
+        sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
+      else
+        sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 1 )
     {
-        xkolik = blockDim.x + 1;
-        ykolik = blockDim.y + 1;
-        numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
-        numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
+      if( blIdx != 0 && thrj+1 < ykolik && NE == 1 )
+        sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ];
+      else
+        sArray[thrj+1][0] = std::numeric_limits< Real >::max();
+    }
     
-        if( numOfBlockx - 1 == blIdx )
-            xkolik = dimX - (blIdx)*blockDim.x+1;
-
-        if( numOfBlocky -1 == blIdy )
-            ykolik = dimY - (blIdy)*blockDim.y+1;
-        BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
+    if( thri == 2 )
+    {
+      if( dimY > (blIdy+1) * blockDim.y  && thri+1 < xkolik && NE == 1 )
+        sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
+      else
+        sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
     }
-    __syncthreads();
     
-    if( (blIdy%2  + blIdx) % 2 == oddEvenBlock )
+    if( thri == 3 )
     {
+      if( blIdy != 0 && thrj+1 < xkolik && NE == 1 )
+        sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ];
+      else
+        sArray[0][thrj+1] = std::numeric_limits< Real >::max();
+    }
     
-        if( thri == 0 )
-        {        
-            if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik )
-                sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
-            else
-                sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
-        }
-
-        if( thri == 1 )
+    
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+    {    
+      sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
+    }
+    __syncthreads();  
+    
+    while( changed[ 0 ] )
+    {
+      __syncthreads();
+      
+      changed[ currentIndex] = false;
+      
+      //calculation of update cell
+      if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+      {
+        if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
         {
-            if( blIdx != 0 && thrj+1 < ykolik )
-                sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ];
-            else
-                sArray[thrj+1][0] = std::numeric_limits< Real >::max();
+          changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
         }
-
-        if( thri == 2 )
+      }
+      __syncthreads();
+      
+      //pyramid reduction
+      if( blockDim.x*blockDim.y == 1024 )
+      {
+        if( currentIndex < 512 )
         {
-            if( dimY > (blIdy+1) * blockDim.y  && thri+1 < xkolik )
-                sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
-            else
-               sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
         }
-
-        if( thri == 3 )
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y >= 512 )
+      {
+        if( currentIndex < 256 )
         {
-            if( blIdy != 0 && thrj+1 < xkolik )
-                sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ];
-            else
-                sArray[0][thrj+1] = std::numeric_limits< Real >::max();
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
         }
-
-
-        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
-        {    
-            sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y >= 256 )
+      {
+        if( currentIndex < 128 )
+        {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
         }
-        __syncthreads();  
-
-        while( changed[ 0 ] )
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y >= 128 )
+      {
+        if( currentIndex < 64 )
         {
-            __syncthreads();
-
-            changed[ currentIndex] = false;
-
-        //calculation of update cell
-            if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
-            {
-                if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
-                {
-                    changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
-                }
-            }
-            __syncthreads();
-
-        //pyramid reduction
-            if( blockDim.x*blockDim.y == 1024 )
-            {
-                if( currentIndex < 512 )
-                {
-                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
-                }
-            }
-            __syncthreads();
-            if( blockDim.x*blockDim.y >= 512 )
-            {
-                if( currentIndex < 256 )
-                {
-                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
-                }
-            }
-            __syncthreads();
-            if( blockDim.x*blockDim.y >= 256 )
-            {
-                if( currentIndex < 128 )
-                {
-                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
-                }
-            }
-            __syncthreads();
-            if( blockDim.x*blockDim.y >= 128 )
-            {
-                if( currentIndex < 64 )
-                {
-                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
-                }
-            }
-            __syncthreads();
-            if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
-            {
-                if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
-                if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
-                if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
-                if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
-                if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
-                if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
-            }
-            if( changed[ 0 ] && thri == 0 && thrj == 0 )
-                BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
-            __syncthreads();
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
         }
-        
-        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
-            aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
-
+      }
+      __syncthreads();
+      if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
+      {
+        if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+        if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+        if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+        if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+        if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+        if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+      }
+      if( changed[ 0 ] && thri == 0 && thrj == 0 )
+        BlockIterDevice[ blIdy * grIdx + blIdx ] = 1;
+      __syncthreads();
     }
+    
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
+      aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
+  }
 }
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index b024979cc..4daf9fc92 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -258,13 +258,21 @@ solve( const MeshPointer& mesh,
                  
           tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
           
-          int *BlockIterDevice;
+          
           int BlockIterD = 1;
           
-          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );
+          TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
+          BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ );
+          BlockIterDevice.setValue( 1 );
+          /*int *BlockIterDevice;
+          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/
           int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
-          int *dBlock;
-          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
+          
+          TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
+          dBlock.setSize( nBlocks );
+          dBlock.setValue( 0 );
+          /*int *dBlock;
+          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
           
           while( BlockIterD )
           {
@@ -272,17 +280,24 @@ solve( const MeshPointer& mesh,
                                                               interfaceMapPtr.template getData< Device >(),
                                                               auxPtr.template modifyData< Device>(),
                                                               BlockIterDevice );
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+            
             CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
-            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
             
+            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
             cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
                                    
             /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
                 BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
             
           }
-          cudaFree( BlockIterDevice );
-          cudaFree( dBlock );
+          //cudaFree( BlockIterDevice );
+          //cudaFree( dBlock );
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
           aux = *auxPtr;
@@ -302,7 +317,7 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice )
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
 {
     int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
     int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
-- 
GitLab


From 1607d6774ba61d957b87e37d820e21b697d96b4a Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Tue, 30 Oct 2018 18:38:41 +0100
Subject: [PATCH 04/20] FIM method implemented for GPU and FIM-FSM implemented
 for CPU (parallel).

---
 .../tnlDirectEikonalMethodsBase.h             |   22 +-
 .../tnlDirectEikonalMethodsBase_impl.h        | 2045 +++++++++--------
 .../tnlFastSweepingMethod2D_impl.h            |  863 +++----
 .../tnlFastSweepingMethod3D_impl.h            |    5 -
 4 files changed, 1411 insertions(+), 1524 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index 0f45be71c..cbb1a1ff6 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -74,12 +74,16 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
                                          const MeshEntity& cell,
                                          const RealType velocity = 1.0 );
       
-      __cuda_callable__ bool updateCell( volatile Real sArray[18][18],
+      template< int sizeSArray >
+      __cuda_callable__ bool updateCell( volatile Real *sArray,
                                          int thri, int thrj, const Real hx, const Real hy,
                                          const Real velocity = 1.0 );
+      
+      template< int sizeSArray >
       void updateBlocks( InterfaceMapType interfaceMap,
                          MeshFunctionType aux,
-                         ArrayContainer BlockIterHost, int numThreadsPerBlock );
+                         MeshFunctionType helpFunc,
+                         ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
       
       void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
 };
@@ -119,9 +123,6 @@ T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double
 template < typename T1 >
 __cuda_callable__ void sortMinims( T1 pom[] );
 
-template < typename Index >
-void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY );
-
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
@@ -134,15 +135,12 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
                                       Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux,
                                       bool *BlockIterDevice );
 
-template < typename Real, typename Device, typename Index >
+template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-<<<<<<< HEAD
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 );
-=======
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
+                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock =0);
 
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 1f9fc5eeb..95971c9b8 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -1,4 +1,4 @@
- /* 
+/* 
  * File:   tnlDirectEikonalMethodsBase_impl.h
  * Author: oberhuber
  *
@@ -13,233 +13,259 @@
 #include "tnlFastSweepingMethod.h"
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
 initInterface( const MeshFunctionPointer& _input,
-               MeshFunctionPointer& _output,
-               InterfaceMapPointer& _interfaceMap  )
+        MeshFunctionPointer& _output,
+        InterfaceMapPointer& _interfaceMap  )
 {
-    if( std::is_same< Device, Devices::Cuda >::value )
-    {
+  if( std::is_same< Device, Devices::Cuda >::value )
+  {
 #ifdef HAVE_CUDA
-        const MeshType& mesh = _input->getMesh();
-        
-        const int cudaBlockSize( 16 );
-        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
-        dim3 blockSize( cudaBlockSize );
-        dim3 gridSize( numBlocksX );
-        Devices::Cuda::synchronizeDevice();
-        CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
-                                                   _output.template modifyData< Device >(),
-                                                   _interfaceMap.template modifyData< Device >() );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+    const MeshType& mesh = _input->getMesh();
+    
+    const int cudaBlockSize( 16 );
+    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    dim3 blockSize( cudaBlockSize );
+    dim3 gridSize( numBlocksX );
+    Devices::Cuda::synchronizeDevice();
+    CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+            _output.template modifyData< Device >(),
+            _interfaceMap.template modifyData< Device >() );
+    cudaDeviceSynchronize();
+    TNL_CHECK_CUDA_DEVICE;
 #endif
-    }
-    if( std::is_same< Device, Devices::Host >::value )
-    {
-        const MeshType& mesh = _input->getMesh();
-        typedef typename MeshType::Cell Cell;
-        const MeshFunctionType& input = _input.getData();
-        MeshFunctionType& output = _output.modifyData();
-        InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
-        Cell cell( mesh );
-        for( cell.getCoordinates().x() = 0;
+  }
+  if( std::is_same< Device, Devices::Host >::value )
+  {
+    const MeshType& mesh = _input->getMesh();
+    typedef typename MeshType::Cell Cell;
+    const MeshFunctionType& input = _input.getData();
+    MeshFunctionType& output = _output.modifyData();
+    InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
+    Cell cell( mesh );
+    for( cell.getCoordinates().x() = 0;
             cell.getCoordinates().x() < mesh.getDimensions().x();
             cell.getCoordinates().x() ++ )
-           {
-               cell.refresh();
-               output[ cell.getIndex() ] =
-               input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
-                                  -std::numeric_limits< RealType >::max();
-               interfaceMap[ cell.getIndex() ] = false;
-           }
-        
-        
-        const RealType& h = mesh.getSpaceSteps().x();
-        for( cell.getCoordinates().x() = 0;
-             cell.getCoordinates().x() < mesh.getDimensions().x() - 1;
-             cell.getCoordinates().x() ++ )
+    {
+      cell.refresh();
+      output[ cell.getIndex() ] =
+              input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
+                -std::numeric_limits< RealType >::max();
+      interfaceMap[ cell.getIndex() ] = false;
+    }
+    
+    
+    const RealType& h = mesh.getSpaceSteps().x();
+    for( cell.getCoordinates().x() = 0;
+            cell.getCoordinates().x() < mesh.getDimensions().x() - 1;
+            cell.getCoordinates().x() ++ )
+    {
+      cell.refresh();
+      const RealType& c = input( cell );      
+      if( ! cell.isBoundaryEntity()  )
+      {
+        const auto& neighbors = cell.getNeighborEntities();
+        Real pom = 0;
+        //const IndexType& c = cell.getIndex();
+        const IndexType e = neighbors.template getEntityIndex<  1 >();
+        if( c * input[ e ] <= 0 )
         {
-           cell.refresh();
-           const RealType& c = input( cell );      
-           if( ! cell.isBoundaryEntity()  )
-           {
-              const auto& neighbors = cell.getNeighborEntities();
-              Real pom = 0;
-              //const IndexType& c = cell.getIndex();
-              const IndexType e = neighbors.template getEntityIndex<  1 >();
-              if( c * input[ e ] <= 0 )
-              {
-                pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
-                if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
-                    output[ cell.getIndex() ] = pom;
-
-                pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-                if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
-                    output[ e ] = pom; 
-
-                interfaceMap[ cell.getIndex() ] = true;
-                interfaceMap[ e ] = true;
-              }
-           }
+          pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
+          if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
+            output[ cell.getIndex() ] = pom;
+          
+          pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+          if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
+            output[ e ] = pom; 
+          
+          interfaceMap[ cell.getIndex() ] = true;
+          interfaceMap[ e ] = true;
         }
+      }
     }
+  }
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
-void 
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 updateBlocks( InterfaceMapType interfaceMap,
-              MeshFunctionType aux,
-              ArrayContainer BlockIterHost, int numThreadsPerBlock )
+        MeshFunctionType aux,
+        MeshFunctionType helpFunc,
+        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
 {
+#pragma omp parallel for schedule( dynamic )
   for( int i = 0; i < BlockIterHost.getSize(); i++ )
   {
     if( BlockIterHost[ i ] )
     {
       MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
-    
+      
       int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
-      int numOfBlockx = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
-      int numOfBlocky = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
+      //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl;
+      int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
+      //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl;
       int xkolik = numThreadsPerBlock + 1;
       int ykolik = numThreadsPerBlock + 1;
       
       int blIdx = i%numOfBlockx;
-      int blIdy = i/numOfBlocky;
+      int blIdy = i/numOfBlockx;
+      //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl;
       
       if( numOfBlockx - 1 == blIdx )
         xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
       
       if( numOfBlocky -1 == blIdy )
         ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
-    
-        
+      //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl;
+      
+      
       /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
-      changed[ 0 ] = 1;*/
+       changed[ 0 ] = 1;*/
       Real hx = mesh.getSpaceSteps().x();
       Real hy = mesh.getSpaceSteps().y();
       
-      Real changed1[ 16*16 ];
-      /*Real changed2[ 16*16 ];
-      Real changed3[ 16*16 ];
-      Real changed4[ 16*16 ];*/
-      Real sArray[18][18];
+      bool changed = false;
+      
+      
+      RealType *sArray;
+      sArray = new Real[ sizeSArray * sizeSArray ];
+      if( sArray == nullptr )
+        std::cout << "Error while allocating memory for sArray." << std::endl;
+      
+      for( int thri = 0; thri < sizeSArray; thri++ ){
+        for( int thrj = 0; thrj < sizeSArray; thrj++ )
+          sArray/*[i]*/[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
+      }
       
-      for( int thri = 0; thri < numThreadsPerBlock + 2; thri++ )
-        for( int thrj = 0; thrj < numThreadsPerBlock + 2; thrj++ )
-          sArray[thrj][thri] = std::numeric_limits< Real >::max();
-    
       BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
-    
+      
       for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
       {        
         if( dimX > (blIdx+1) * numThreadsPerBlock  && thrj+1 < ykolik )
-          sArray[thrj+1][xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
-        else
-         sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
-      
-    
+          sArray/*[i]*/[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
+        
+        
         if( blIdx != 0 && thrj+1 < ykolik )
-          sArray[thrj+1][0] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
-        else
-          sArray[thrj+1][0] = std::numeric_limits< Real >::max();
-    
+          sArray/*[i]*/[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
+        
         if( dimY > (blIdy+1) * numThreadsPerBlock  && thrj+1 < xkolik )
-          sArray[ykolik][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
-        else
-          sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
-      
+          sArray/*[i]*/[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
+        
         if( blIdy != 0 && thrj+1 < xkolik )
-          sArray[0][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
-        else
-          sArray[0][thrj+1] = std::numeric_limits< Real >::max();
+          sArray/*[i]*/[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
       }
-    
-      for( int k = 0; k < numThreadsPerBlock; k++ )
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          sArray[k+1][l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
-
-      for( int k = 0; k < numThreadsPerBlock; k++ ) 
+      
+      for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ )
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
+            sArray/*[i]*/[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
+      }
+      bool pom = false;
+      for( int k = 0; k < numThreadsPerBlock; k++ ){ 
         for( int l = 0; l < numThreadsPerBlock; l++ ){
-          changed1[ k*numThreadsPerBlock + l ] = 0;
-          /*changed2[ k*numThreadsPerBlock + l ] = 0;
-          changed3[ k*numThreadsPerBlock + l ] = 0;
-          changed4[ k*numThreadsPerBlock + l ] = 0;*/
-          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
-          {
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){
+            //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              changed1[ k*numThreadsPerBlock + l ] = this->updateCell( sArray, l+1, k+1, hx,hy);
+              pom = this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
+              changed = changed || pom;
             }
           }
         }
-
-      for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
-        for( int l = 0; l < numThreadsPerBlock; l++ ) { 
-          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
+      }
+      /*aux.save( "aux-1pruch.tnl" );
+      for( int k = 0; k < sizeSArray; k++ ){ 
+        for( int l = 0; l < sizeSArray; l++ ) {
+          std::cout << sArray[ k * sizeSArray + l] << " ";
+        }
+        std::cout << std::endl;
+      }*/
+           
+      for( int k = 0; k < numThreadsPerBlock; k++ ) 
+        for( int l = numThreadsPerBlock-1; l >-1; l-- ) { 
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
           {
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              /*changed2[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
+              this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
             }
           }
         }
-
-      for( int k = 0; k < numThreadsPerBlock; k++ ) 
-        for( int l = numThreadsPerBlock-1; l >-1; l-- ) { 
-          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
+      /*aux.save( "aux-2pruch.tnl" );
+      for( int k = 0; k < sizeSArray; k++ ){ 
+        for( int l = 0; l < sizeSArray; l++ ) {
+          std::cout << sArray[ k * sizeSArray + l] << " ";
+        }
+        std::cout << std::endl;
+      }*/
+      
+      for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
+        for( int l = 0; l < numThreadsPerBlock; l++ ) {
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
           {
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              /*changed3[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
+              this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
             }
           }
         }
-
-      for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
+      /*aux.save( "aux-3pruch.tnl" );
+      for( int k = 0; k < sizeSArray; k++ ){ 
+        for( int l = 0; l < sizeSArray; l++ ) {
+          std::cout << sArray[ k * sizeSArray + l] << " ";
+        }
+        std::cout << std::endl;
+      }*/
+      
+      for( int k = numThreadsPerBlock-1; k > -1; k-- ){
         for( int l = numThreadsPerBlock-1; l >-1; l-- ) { 
-          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
           {
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              /*changed4[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
+              this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
             }
           }
         }
-
-      for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
-        for( int l = numThreadsPerBlock-1; l >-1; l-- ){
-          changed1[ 0 ] = changed1[ 0 ] || changed1[ k*numThreadsPerBlock + l ];
-          /*changed2[ 0 ] = changed2[ 0 ] || changed2[ k*numThreadsPerBlock + l ];
-          changed3[ 0 ] = changed3[ 0 ] || changed3[ k*numThreadsPerBlock + l ];
-          changed4[ 0 ] = changed4[ 0 ] || changed4[ k*numThreadsPerBlock + l ];*/
+      }
+      /*aux.save( "aux-4pruch.tnl" );
+      for( int k = 0; k < sizeSArray; k++ ){ 
+        for( int l = 0; l < sizeSArray; l++ ) {
+          std::cout << sArray[ k * sizeSArray + l] << " ";
         }
+        std::cout << std::endl;
+      }*/
+      
       
-      if( changed1[ 0 ] /*|| changed2[ 0 ] ||changed3[ 0 ] ||changed4[ 0 ]*/ )
+      if( changed ){
         BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1;
-
+      }
+      
+      
       for( int k = 0; k < numThreadsPerBlock; k++ ){ 
-        for( int l = 0; l < numThreadsPerBlock; l++ ) {       
-          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY &&
-              (!interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ]) )
-            aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray[ k + 1 ][ l + 1 ];
+        for( int l = 0; l < numThreadsPerBlock; l++ ) {
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )      
+            helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray/*[i]*/[ (k + 1)* sizeSArray + l + 1 ];
           //std::cout<< sArray[k+1][l+1];
         }
         //std::cout<<std::endl;
       }
+      //delete []sArray;
     }
   }
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 void 
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )
@@ -249,643 +275,643 @@ getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )
   
   for(int i = 0; i < numBlockX * numBlockY; i++)
   {
-    BlockIterPom[ i ] = 0;  
-    if( BlockIterHost[ i ] )
-    {
-      // i = k*numBlockY + m;
-      int m=0, k=0;
-      m = i%numBlockX;
-      k = i/numBlockX;
-      if( k > 0 )
-        BlockIterPom[i - numBlockX] = 1;
-      if( k < numBlockY - 1 )
-        BlockIterPom[i + numBlockX] = 1;
-      
-      if( m < numBlockX - 1 )
-        BlockIterPom[ i+1 ] = 1;
-      if( m > 0 )
-        BlockIterPom[ i-1 ] = 1;
+    BlockIterPom[ i ] = 0;//BlockIterPom[ i ] = 0;
+    int m=0, k=0;
+    m = i%numBlockX;
+    k = i/numBlockX;
+    if( m > 0 && BlockIterHost[ i - 1 ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){
+      BlockIterPom[ i ] = 1;
     }
+    //BlockIterPom[ i ];
   }
-  for(int i = 0; i < numBlockX * numBlockY; i++ )
-      //if( !BlockIter[ i ] )
-        BlockIterHost[ i ] = BlockIterPom[ i ];
-      /*else
-        BlockIter[ i ] = 0;*/
-  /*for( int i = numBlockX-1; i > -1; i-- )
+  
+  for(int i = 0; i < numBlockX * numBlockY; i++)
   {
-      for( int j = 0; j< numBlockY; j++ )
-          std::cout << BlockIterHost[ i*numBlockY + j ];
-      std::cout << std::endl;
+    if( !BlockIterHost[ i ] )
+      BlockIterHost[ i ] = BlockIterPom[ i ];
   }
-  std::cout << std::endl;*/
+  /*else
+   BlockIter[ i ] = 0;*/
+  /*for( int i = numBlockX-1; i > -1; i-- )
+   {
+   for( int j = 0; j< numBlockY; j++ )
+   std::cout << BlockIterHost[ i*numBlockY + j ];
+   std::cout << std::endl;
+   }
+   std::cout << std::endl;*/
   delete[] BlockIterPom;
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename MeshEntity >
+        typename Device,
+        typename Index >
+template< typename MeshEntity >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
 updateCell( MeshFunctionType& u,
-            const MeshEntity& cell, 
-            const RealType v )
+        const MeshEntity& cell, 
+        const RealType v )
 {
-    const auto& neighborEntities = cell.template getNeighborEntities< 1 >();
-    const MeshType& mesh = cell.getMesh();
-    const RealType& h = mesh.getSpaceSteps().x();
-    const RealType value = u( cell );
-    RealType a, tmp = std::numeric_limits< RealType >::max();
-
-    if( cell.getCoordinates().x() == 0 )
-       a = u[ neighborEntities.template getEntityIndex< 1 >() ];
-    else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
-       a = u[ neighborEntities.template getEntityIndex< -1 >() ];
-    else
-    {
-       a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ],
-                           u[ neighborEntities.template getEntityIndex<  1 >() ] );
-    }
-
-    if( fabs( a ) == std::numeric_limits< RealType >::max() )
-      return;
-   
-    tmp = a + TNL::sign( value ) * h/v;
-    
-    u[ cell.getIndex() ] = argAbsMin( value, tmp );
+  const auto& neighborEntities = cell.template getNeighborEntities< 1 >();
+  const MeshType& mesh = cell.getMesh();
+  const RealType& h = mesh.getSpaceSteps().x();
+  const RealType value = u( cell );
+  RealType a, tmp = std::numeric_limits< RealType >::max();
+  
+  if( cell.getCoordinates().x() == 0 )
+    a = u[ neighborEntities.template getEntityIndex< 1 >() ];
+  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
+    a = u[ neighborEntities.template getEntityIndex< -1 >() ];
+  else
+  {
+    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ],
+            u[ neighborEntities.template getEntityIndex<  1 >() ] );
+  }
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() )
+    return;
+  
+  tmp = a + TNL::sign( value ) * h/v;
+  
+  u[ cell.getIndex() ] = argAbsMin( value, tmp );
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 initInterface( const MeshFunctionPointer& _input,
-               MeshFunctionPointer& _output,
-               InterfaceMapPointer& _interfaceMap )
+        MeshFunctionPointer& _output,
+        InterfaceMapPointer& _interfaceMap )
 {
-            
-    if( std::is_same< Device, Devices::Cuda >::value )
-    {
+  
+  if( std::is_same< Device, Devices::Cuda >::value )
+  {
 #ifdef HAVE_CUDA
-        const MeshType& mesh = _input->getMesh();
-        
-        const int cudaBlockSize( 16 );
-        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
-        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
-        dim3 blockSize( cudaBlockSize, cudaBlockSize );
-        dim3 gridSize( numBlocksX, numBlocksY );
-        Devices::Cuda::synchronizeDevice();
-        CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
-                                                   _output.template modifyData< Device >(),
-                                                   _interfaceMap.template modifyData< Device >() );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+    const MeshType& mesh = _input->getMesh();
+    
+    const int cudaBlockSize( 16 );
+    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
+    dim3 blockSize( cudaBlockSize, cudaBlockSize );
+    dim3 gridSize( numBlocksX, numBlocksY );
+    Devices::Cuda::synchronizeDevice();
+    CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+            _output.template modifyData< Device >(),
+            _interfaceMap.template modifyData< Device >() );
+    cudaDeviceSynchronize();
+    TNL_CHECK_CUDA_DEVICE;
 #endif
-    }
-    if( std::is_same< Device, Devices::Host >::value )
-    {
-        MeshFunctionType input = _input.getData();
-        
-        /*double A[320][320];
-        std::ifstream fileInit("/home/maty/Downloads/initData.txt");
-
-        for (int i = 0; i < 320; i++)
-            for (int j = 0; j < 320; j++)
-                fileInit >> A[i][j];
-        fileInit.close();
-        for (int i = 0; i < 320; i++)
-            for (int j = 0; j < 320; j++)
-                input[i*320 + j] = A[i][j];*/
-        
-        
-         MeshFunctionType& output = _output.modifyData();
-         InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
-        const MeshType& mesh = input.getMesh();
-        typedef typename MeshType::Cell Cell;
-        Cell cell( mesh );
-        for( cell.getCoordinates().y() = 0;
-             cell.getCoordinates().y() < mesh.getDimensions().y();
-             cell.getCoordinates().y() ++ )
-            for( cell.getCoordinates().x() = 0;
-                 cell.getCoordinates().x() < mesh.getDimensions().x();
-                 cell.getCoordinates().x() ++ )
-                {
-                    cell.refresh();
-                    output[ cell.getIndex() ] =
-                    input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
-                                       - std::numeric_limits< RealType >::max();
-                    interfaceMap[ cell.getIndex() ] = false;
-                }
-
-       const RealType& hx = mesh.getSpaceSteps().x();
-       const RealType& hy = mesh.getSpaceSteps().y();     
-       for( cell.getCoordinates().y() = 0;
+  }
+  if( std::is_same< Device, Devices::Host >::value )
+  {
+    MeshFunctionType input = _input.getData();
+    
+    /*double A[320][320];
+     std::ifstream fileInit("/home/maty/Downloads/initData.txt");
+     
+     for (int i = 0; i < 320; i++)
+     for (int j = 0; j < 320; j++)
+     fileInit >> A[j];
+     fileInit.close();
+     for (int i = 0; i < 320; i++)
+     for (int j = 0; j < 320; j++)
+     input[i*320 + j] = A[j];*/
+    
+    
+    MeshFunctionType& output = _output.modifyData();
+    InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
+    const MeshType& mesh = input.getMesh();
+    typedef typename MeshType::Cell Cell;
+    Cell cell( mesh );
+    for( cell.getCoordinates().y() = 0;
             cell.getCoordinates().y() < mesh.getDimensions().y();
             cell.getCoordinates().y() ++ )
-          for( cell.getCoordinates().x() = 0;
-               cell.getCoordinates().x() < mesh.getDimensions().x();
-               cell.getCoordinates().x() ++ )
+      for( cell.getCoordinates().x() = 0;
+              cell.getCoordinates().x() < mesh.getDimensions().x();
+              cell.getCoordinates().x() ++ )
+      {
+        cell.refresh();
+        output[ cell.getIndex() ] =
+                input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
+                  - std::numeric_limits< RealType >::max();
+        interfaceMap[ cell.getIndex() ] = false;
+      }
+    
+    const RealType& hx = mesh.getSpaceSteps().x();
+    const RealType& hy = mesh.getSpaceSteps().y();     
+    for( cell.getCoordinates().y() = 0;
+            cell.getCoordinates().y() < mesh.getDimensions().y();
+            cell.getCoordinates().y() ++ )
+      for( cell.getCoordinates().x() = 0;
+              cell.getCoordinates().x() < mesh.getDimensions().x();
+              cell.getCoordinates().x() ++ )
+      {
+        cell.refresh();
+        const RealType& c = input( cell );
+        if( ! cell.isBoundaryEntity()  )
+        {
+          auto neighbors = cell.getNeighborEntities();
+          Real pom = 0;
+          const IndexType e = neighbors.template getEntityIndex<  1,  0 >();
+          const IndexType n = neighbors.template getEntityIndex<  0,  1 >();
+          //Try init with exact data:
+          /*if( c * input[ n ] <= 0 )
+           {
+           output[ cell.getIndex() ] = c;
+           output[ n ] = input[ n ];
+           interfaceMap[ cell.getIndex() ] = true;
+           interfaceMap[ n ] = true;
+           }   
+           if( c * input[ e ] <= 0 )
+           {   
+           output[ cell.getIndex() ] = c;
+           output[ e ] = input[ e ];
+           interfaceMap[ cell.getIndex() ] = true;
+           interfaceMap[ e ] = true;
+           }*/
+          if( c * input[ n ] <= 0 )
           {
-             cell.refresh();
-             const RealType& c = input( cell );
-             if( ! cell.isBoundaryEntity()  )
+            /*if( c >= 0 )
+             {*/
+            pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+            if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
+              output[ cell.getIndex() ] = pom;
+            pom = pom - TNL::sign( c )*hy;
+            if( TNL::abs( output[ n ] ) > TNL::abs( pom ) )
+              output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy;
+            /*}else
              {
-                auto neighbors = cell.getNeighborEntities();
-                Real pom = 0;
-                const IndexType e = neighbors.template getEntityIndex<  1,  0 >();
-                const IndexType n = neighbors.template getEntityIndex<  0,  1 >();
-                //Try init with exact data:
-                /*if( c * input[ n ] <= 0 )
-                {
-                    output[ cell.getIndex() ] = c;
-                    output[ n ] = input[ n ];
-                    interfaceMap[ cell.getIndex() ] = true;
-                    interfaceMap[ n ] = true;
-                }   
-                if( c * input[ e ] <= 0 )
-                {   
-                    output[ cell.getIndex() ] = c;
-                    output[ e ] = input[ e ];
-                    interfaceMap[ cell.getIndex() ] = true;
-                    interfaceMap[ e ] = true;
-                }*/
-                if( c * input[ n ] <= 0 )
-                {
-                    /*if( c >= 0 )
-                    {*/
-                        pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-                        if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
-                            output[ cell.getIndex() ] = pom;
-                        pom = pom - TNL::sign( c )*hy;
-                        if( TNL::abs( output[ n ] ) > TNL::abs( pom ) )
-                            output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy;
-                    /*}else
-                    {
-                        pom = - ( hy * c )/( c - input[ n ]);
-                        if( output[ cell.getIndex() ] < pom )
-                            output[ cell.getIndex() ] = pom;
-                        if( output[ n ] > hy + pom )
-                            output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
-                    }*/
-                    interfaceMap[ cell.getIndex() ] = true;
-                    interfaceMap[ n ] = true;
-                }
-                if( c * input[ e ] <= 0 )
-                {
-                    /*if( c >= 0 )
-                    {*/
-                        pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-                        if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
-                            output[ cell.getIndex() ] = pom;
-
-                        pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-                        if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
-                            output[ e ] = pom; 
-                    /*}else
-                    {
-                        pom = - (hx * c)/( c - input[ e ]);
-                        if( output[ cell.getIndex() ] < pom )
-                            output[ cell.getIndex() ] = pom;
-
-                        pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
-                        if( output[ e ] > pom )
-                            output[ e ] = pom;
-                    }*/
-                    interfaceMap[ cell.getIndex() ] = true;
-                    interfaceMap[ e ] = true;
-                }
-             }
+             pom = - ( hy * c )/( c - input[ n ]);
+             if( output[ cell.getIndex() ] < pom )
+             output[ cell.getIndex() ] = pom;
+             if( output[ n ] > hy + pom )
+             output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
+             }*/
+            interfaceMap[ cell.getIndex() ] = true;
+            interfaceMap[ n ] = true;
           }
+          if( c * input[ e ] <= 0 )
+          {
+            /*if( c >= 0 )
+             {*/
+            pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+            if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
+              output[ cell.getIndex() ] = pom;
+            
+            pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+            if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
+              output[ e ] = pom; 
+            /*}else
+             {
+             pom = - (hx * c)/( c - input[ e ]);
+             if( output[ cell.getIndex() ] < pom )
+             output[ cell.getIndex() ] = pom;
+             
+             pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
+             if( output[ e ] > pom )
+             output[ e ] = pom;
+             }*/
+            interfaceMap[ cell.getIndex() ] = true;
+            interfaceMap[ e ] = true;
+          }
+        }
       }
+  }
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename MeshEntity >
+        typename Device,
+        typename Index >
+template< typename MeshEntity > // 2D local update: recomputes u at `cell` from its x- and y-neighbours and stores the result in u.
 __cuda_callable__
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 updateCell( MeshFunctionType& u,
-            const MeshEntity& cell,   
-            const RealType v)
+        const MeshEntity& cell,   // grid cell whose value in u is updated
+        const RealType v) // divides the space step in the one-neighbour update and enters the two-neighbour update as v * v
 {
-   const auto& neighborEntities = cell.template getNeighborEntities< 2 >();
-   const MeshType& mesh = cell.getMesh();
-   const RealType& hx = mesh.getSpaceSteps().x();
-   const RealType& hy = mesh.getSpaceSteps().y();
-   const RealType value = u( cell );
-   RealType a, b, tmp = std::numeric_limits< RealType >::max();
-   
-   if( cell.getCoordinates().x() == 0 )
-      a = u[ neighborEntities.template getEntityIndex< 1,  0 >() ];
-   else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
-      a = u[ neighborEntities.template getEntityIndex< -1,  0 >() ];
-   else
-   {
-      a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1,  0 >() ],
-                          u[ neighborEntities.template getEntityIndex<  1,  0 >() ] );
-   }
-
-   if( cell.getCoordinates().y() == 0 )
-      b = u[ neighborEntities.template getEntityIndex< 0,  1 >()];
-   else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
-      b = u[ neighborEntities.template getEntityIndex< 0,  -1 >() ];
-   else
-   {
-      b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0,  -1 >() ],
-                          u[ neighborEntities.template getEntityIndex< 0,   1 >() ] );
-   }
-   if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-       fabs( b ) == std::numeric_limits< RealType >::max() )
-      return;
-   /*if( fabs( a ) == TypeInfo< Real >::getMaxValue() ||
-       fabs( b ) == TypeInfo< Real >::getMaxValue() ||
-       fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) )
+  const auto& neighborEntities = cell.template getNeighborEntities< 2 >();
+  const MeshType& mesh = cell.getMesh();
+  const RealType& hx = mesh.getSpaceSteps().x();
+  const RealType& hy = mesh.getSpaceSteps().y();
+  const RealType value = u( cell ); // current value; its sign picks the direction of both candidate updates below
+  RealType a, b, tmp = std::numeric_limits< RealType >::max(); // a, b: neighbour values along x and y; tmp: candidate value
+  
+  if( cell.getCoordinates().x() == 0 ) // first cell along x: only the +x neighbour is read
+    a = u[ neighborEntities.template getEntityIndex< 1,  0 >() ];
+  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) // last cell along x: only the -x neighbour is read
+    a = u[ neighborEntities.template getEntityIndex< -1,  0 >() ];
+  else
+  {
+    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1,  0 >() ], // NOTE(review): argAbsMin presumably returns the argument of smaller magnitude -- helper not visible here
+            u[ neighborEntities.template getEntityIndex<  1,  0 >() ] );
+  }
+  
+  if( cell.getCoordinates().y() == 0 ) // first cell along y: only the +y neighbour is read
+    b = u[ neighborEntities.template getEntityIndex< 0,  1 >()];
+  else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) // last cell along y: only the -y neighbour is read
+    b = u[ neighborEntities.template getEntityIndex< 0,  -1 >() ];
+  else
+  {
+    b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0,  -1 >() ],
+            u[ neighborEntities.template getEntityIndex< 0,   1 >() ] );
+  }
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && // both neighbour values have magnitude numeric_limits::max():
+          fabs( b ) == std::numeric_limits< RealType >::max() )
+    return; // leave u at this cell untouched
+  /*if( fabs( a ) == TypeInfo< Real >::getMaxValue() ||
+   fabs( b ) == TypeInfo< Real >::getMaxValue() ||
+   fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) )
    {
-      tmp = 
-        fabs( a ) >= fabs( b ) ? b + TNL::sign( value ) * hy :
-                                 a + TNL::sign( value ) * hx;
+   tmp = 
+   fabs( a ) >= fabs( b ) ? b + TNL::sign( value ) * hy :
+   a + TNL::sign( value ) * hx;
    }*/
-   /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
-       fabs( b ) != TypeInfo< Real >::getMaxValue() &&
-       fabs( a - b ) < TNL::sqrt( (hx * hx + hy * hy)/v ) )
+  /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( b ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( a - b ) < TNL::sqrt( (hx * hx + hy * hy)/v ) )
    {
-       tmp = ( hx * hx * b + hy * hy * a + 
-            sign( value ) * hx * hy * TNL::sqrt( ( hx * hx + hy * hy )/v - 
-            ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy );
-       u[ cell.getIndex() ] =  tmp;
+   tmp = ( hx * hx * b + hy * hy * a + 
+   sign( value ) * hx * hy * TNL::sqrt( ( hx * hx + hy * hy )/v - 
+   ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy );
+   u[ cell.getIndex() ] =  tmp;
    }
    else
    {
-       tmp = 
-          fabs( a ) > fabs( b ) ? b + TNL::sign( value ) * hy/v :
-                                   a + TNL::sign( value ) * hx/v;
-       u[ cell.getIndex() ] = argAbsMin( value, tmp );
-       //tmp = TypeInfo< RealType >::getMaxValue();
+   tmp = 
+   fabs( a ) > fabs( b ) ? b + TNL::sign( value ) * hy/v :
+   a + TNL::sign( value ) * hx/v;
+   u[ cell.getIndex() ] = argAbsMin( value, tmp );
+   //tmp = TypeInfo< RealType >::getMaxValue();
    }*/
-    RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
-    sortMinims( pom );
-    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
-    
-                                
-    if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-        u[ cell.getIndex() ] = argAbsMin( value, tmp );
-    else
-    {
-        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; // slots 0-2: values (third one is a max filler), slots 3-5: steps
+  sortMinims( pom ); // NOTE(review): helper not visible here; the code below assumes pom[0], pom[1] come out ordered by magnitude with pom[3], pom[4] as their matching steps -- confirm
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; // one-neighbour candidate: pom[0] shifted by pom[3]/v in the direction of sign( value )
+  
+  
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) // candidate is smaller in magnitude than pom[1]: use the one-neighbour update
+    u[ cell.getIndex() ] = argAbsMin( value, tmp ); // stored value is whichever of value and tmp argAbsMin selects
+  else
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + // two-neighbour candidate built from pom[0] and pom[1]
             TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
             ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-        u[ cell.getIndex() ] = argAbsMin( value, tmp );
-    }
+    u[ cell.getIndex() ] = argAbsMin( value, tmp ); // same acceptance rule as the one-neighbour branch
+  }
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index > // 3D initInterface: seeds _output and _interfaceMap from the sign changes of _input.
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
 initInterface( const MeshFunctionPointer& _input,
-               MeshFunctionPointer& _output,
-               InterfaceMapPointer& _interfaceMap  )
+        MeshFunctionPointer& _output, // host path: +max or -max by sign of _input, interpolated distances next to a sign change
+        InterfaceMapPointer& _interfaceMap  ) // host path: true on both cells of every detected sign change, false elsewhere
 {
-    if( std::is_same< Device, Devices::Cuda >::value )
-    {
+  if( std::is_same< Device, Devices::Cuda >::value ) // CUDA path: the work is delegated to the CudaInitCaller3d kernel
+  {
 #ifdef HAVE_CUDA
-        const MeshType& mesh = _input->getMesh();
-        
-        const int cudaBlockSize( 8 );
-        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
-        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
-        int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize );
-        if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
-            std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
-        dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
-        dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
-        Devices::Cuda::synchronizeDevice();
-        CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
-                                                     _output.template modifyData< Device >(),
-                                                     _interfaceMap.template modifyData< Device >() );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+    const MeshType& mesh = _input->getMesh();
+    
+    const int cudaBlockSize( 8 ); // threads per block along each axis, i.e. 8 x 8 x 8 = 512 threads per block
+    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
+    int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize );
+    if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) // only prints the message below; the launch is not skipped
+      std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
+    dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+    dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
+    Devices::Cuda::synchronizeDevice(); // NOTE(review): presumably pushes the pointer-wrapped objects to the device before the launch -- confirm
+    CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+            _output.template modifyData< Device >(),
+            _interfaceMap.template modifyData< Device >() );
+    cudaDeviceSynchronize(); // return value is not inspected here
+    TNL_CHECK_CUDA_DEVICE; // NOTE(review): presumably reports a pending CUDA error -- macro not visible here
 #endif
-    }
-    if( std::is_same< Device, Devices::Host >::value )
-    {
-        const MeshFunctionType& input =  _input.getData();
-        MeshFunctionType& output =  _output.modifyData();
-        InterfaceMapType& interfaceMap =  _interfaceMap.modifyData();
-        const MeshType& mesh = input.getMesh();
-        typedef typename MeshType::Cell Cell;
-        Cell cell( mesh );
-        for( cell.getCoordinates().z() = 0;
-             cell.getCoordinates().z() < mesh.getDimensions().z();
-             cell.getCoordinates().z() ++ )
-             for( cell.getCoordinates().y() = 0;
-                  cell.getCoordinates().y() < mesh.getDimensions().y();
-                  cell.getCoordinates().y() ++ )
-                 for( cell.getCoordinates().x() = 0;
-                      cell.getCoordinates().x() < mesh.getDimensions().x();
-                      cell.getCoordinates().x() ++ )
-                 {
-                     cell.refresh();
-                     output[ cell.getIndex() ] =
-                     input( cell ) > 0 ? std::numeric_limits< RealType >::max() :
-                                        - std::numeric_limits< RealType >::max();
-                     interfaceMap[ cell.getIndex() ] = false;
-                 }
-
-        const RealType& hx = mesh.getSpaceSteps().x();
-        const RealType& hy = mesh.getSpaceSteps().y();
-        const RealType& hz = mesh.getSpaceSteps().z();
-        for( cell.getCoordinates().z() = 0;
-             cell.getCoordinates().z() < mesh.getDimensions().z();
-             cell.getCoordinates().z() ++ )   
-           for( cell.getCoordinates().y() = 0;
-                cell.getCoordinates().y() < mesh.getDimensions().y();
-                cell.getCoordinates().y() ++ )
-              for( cell.getCoordinates().x() = 0;
-                   cell.getCoordinates().x() < mesh.getDimensions().x();
-                   cell.getCoordinates().x() ++ )
+  }
+  if( std::is_same< Device, Devices::Host >::value ) // host path: two sequential passes over the grid
+  {
+    const MeshFunctionType& input =  _input.getData();
+    MeshFunctionType& output =  _output.modifyData();
+    InterfaceMapType& interfaceMap =  _interfaceMap.modifyData();
+    const MeshType& mesh = input.getMesh();
+    typedef typename MeshType::Cell Cell;
+    Cell cell( mesh );
+    for( cell.getCoordinates().z() = 0; // pass 1 over all cells: reset output and interfaceMap
+            cell.getCoordinates().z() < mesh.getDimensions().z();
+            cell.getCoordinates().z() ++ )
+      for( cell.getCoordinates().y() = 0;
+              cell.getCoordinates().y() < mesh.getDimensions().y();
+              cell.getCoordinates().y() ++ )
+        for( cell.getCoordinates().x() = 0;
+                cell.getCoordinates().x() < mesh.getDimensions().x();
+                cell.getCoordinates().x() ++ )
+        {
+          cell.refresh(); // NOTE(review): presumably recomputes the cell index after the coordinates changed -- confirm
+          output[ cell.getIndex() ] =
+                  input( cell ) > 0 ? std::numeric_limits< RealType >::max() : // positive input -> +max, otherwise -max
+                    - std::numeric_limits< RealType >::max();
+          interfaceMap[ cell.getIndex() ] = false;
+        }
+    
+    const RealType& hx = mesh.getSpaceSteps().x();
+    const RealType& hy = mesh.getSpaceSteps().y();
+    const RealType& hz = mesh.getSpaceSteps().z();
+    for( cell.getCoordinates().z() = 0; // pass 2 over all cells: look for sign changes towards +y, +x and +z
+            cell.getCoordinates().z() < mesh.getDimensions().z();
+            cell.getCoordinates().z() ++ )   
+      for( cell.getCoordinates().y() = 0;
+              cell.getCoordinates().y() < mesh.getDimensions().y();
+              cell.getCoordinates().y() ++ )
+        for( cell.getCoordinates().x() = 0;
+                cell.getCoordinates().x() < mesh.getDimensions().x();
+                cell.getCoordinates().x() ++ )
+        {
+          cell.refresh();
+          const RealType& c = input( cell ); // input value at the centre cell
+          if( ! cell.isBoundaryEntity() ) // boundary cells are never used as the centre cell
+          {
+            auto neighbors = cell.getNeighborEntities();
+            Real pom = 0; // scratch value for the interpolated distances
+            const IndexType e = neighbors.template getEntityIndex<  1,  0,  0 >(); // +x neighbour
+            const IndexType n = neighbors.template getEntityIndex<  0,  1,  0 >(); // +y neighbour
+            const IndexType t = neighbors.template getEntityIndex<  0,  0,  1 >(); // +z neighbour
+            //Try exact initiation
+            /*const IndexType w = neighbors.template getEntityIndex< -1,  0,  0 >();
+             const IndexType s = neighbors.template getEntityIndex<  0, -1,  0 >();
+             const IndexType b = neighbors.template getEntityIndex<  0,  0, -1 >();
+             if( c * input[ e ] <= 0 )
+             {
+             output[ cell.getIndex() ] = c;
+             output[ e ] = input[ e ];
+             interfaceMap[ e ] = true;   
+             interfaceMap[ cell.getIndex() ] = true;
+             }
+             else if( c * input[ n ] <= 0 )
+             {
+             output[ cell.getIndex() ] = c;
+             output[ n ] = input[ n ];
+             interfaceMap[ n ] = true;   
+             interfaceMap[ cell.getIndex() ] = true;
+             }
+             else if( c * input[ t ] <= 0 )
+             {
+             output[ cell.getIndex() ] = c;
+             output[ t ] = input[ t ];
+             interfaceMap[ t ] = true;   
+             interfaceMap[ cell.getIndex() ] = true;
+             }*/
+            if( c * input[ n ] <= 0 ) // product is zero or negative between the cell and its +y neighbour
+            {
+              if( c >= 0 ) // centre value is non-negative
               {
-                 cell.refresh();
-                 const RealType& c = input( cell );
-                 if( ! cell.isBoundaryEntity() )
-                 {
-                    auto neighbors = cell.getNeighborEntities();
-                    Real pom = 0;
-                    const IndexType e = neighbors.template getEntityIndex<  1,  0,  0 >();
-                    const IndexType n = neighbors.template getEntityIndex<  0,  1,  0 >();
-                    const IndexType t = neighbors.template getEntityIndex<  0,  0,  1 >();
-                    //Try exact initiation
-                    /*const IndexType w = neighbors.template getEntityIndex< -1,  0,  0 >();
-                    const IndexType s = neighbors.template getEntityIndex<  0, -1,  0 >();
-                    const IndexType b = neighbors.template getEntityIndex<  0,  0, -1 >();
-                    if( c * input[ e ] <= 0 )
-                    {
-                       output[ cell.getIndex() ] = c;
-                       output[ e ] = input[ e ];
-                       interfaceMap[ e ] = true;   
-                       interfaceMap[ cell.getIndex() ] = true;
-                    }
-                    else if( c * input[ n ] <= 0 )
-                    {
-                       output[ cell.getIndex() ] = c;
-                       output[ n ] = input[ n ];
-                       interfaceMap[ n ] = true;   
-                       interfaceMap[ cell.getIndex() ] = true;
-                    }
-                    else if( c * input[ t ] <= 0 )
-                    {
-                       output[ cell.getIndex() ] = c;
-                       output[ t ] = input[ t ];
-                       interfaceMap[ t ] = true;   
-                       interfaceMap[ cell.getIndex() ] = true;
-                    }*/
-                    if( c * input[ n ] <= 0 )
-                    {
-                        if( c >= 0 )
-                        {
-                        pom = ( hy * c )/( c - input[ n ]);
-                        if( output[ cell.getIndex() ] > pom ) 
-                            output[ cell.getIndex() ] = pom;
-
-                        if ( output[ n ] < pom - hy)
-                             output[ n ] = pom - hy; // ( hy * c )/( c - input[ n ]) - hy;
-
-                        }else
-                        {
-                          pom = - ( hy * c )/( c - input[ n ]);
-                          if( output[ cell.getIndex() ] < pom )
-                              output[ cell.getIndex() ] = pom;
-                          if( output[ n ] > hy + pom )
-                              output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
-
-                        }
-                    interfaceMap[ cell.getIndex() ] = true;
-                    interfaceMap[ n ] = true;
-                    }
-                    if( c * input[ e ] <= 0 )
-                    {
-                        if( c >= 0 )
-                        {
-                            pom = ( hx * c )/( c - input[ e ]);
-                            if( output[ cell.getIndex() ] > pom )
-                                output[ cell.getIndex() ] = pom;
-
-                            pom = pom - hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-                            if( output[ e ] < pom )
-                                output[ e ] = pom;      
-
-                        }else
-                        {
-                            pom = - (hx * c)/( c - input[ e ]);
-                            if( output[ cell.getIndex() ] < pom )
-                                output[ cell.getIndex() ] = pom;
-
-                            pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
-                            if( output[ e ] > pom )
-                                output[ e ] = pom;
-                        }
-                    interfaceMap[ cell.getIndex() ] = true;
-                    interfaceMap[ e ] = true;
-                    }
-                    if( c * input[ t ] <= 0 )
-                    {
-                        if( c >= 0 )
-                        {
-                            pom = ( hz * c )/( c - input[ t ]);
-                            if( output[ cell.getIndex() ] > pom )
-                                output[ cell.getIndex() ] = pom;
-
-                            pom = pom - hz; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-                            if( output[ t ] < pom )
-                                output[ t ] = pom; 
-
-                        }else
-                        {
-                            pom = - (hz * c)/( c - input[ t ]);
-                            if( output[ cell.getIndex() ] < pom )
-                                output[ cell.getIndex() ] = pom;
-
-                            pom = pom + hz; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
-                            if( output[ t ] > pom )
-                                output[ t ] = pom;
-
-                        }
-                    interfaceMap[ cell.getIndex() ] = true;
-                    interfaceMap[ t ] = true;
-                    }    
-                 }
-                 /*output[ cell.getIndex() ] =
-                    c > 0 ? TypeInfo< RealType >::getMaxValue() :
-                           -TypeInfo< RealType >::getMaxValue();
-                 interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245
+                pom = ( hy * c )/( c - input[ n ]); // linear interpolation of the zero crossing along y; NOTE(review): 0/0 when c and input[ n ] are both 0 -- confirm inputs exclude it
+                if( output[ cell.getIndex() ] > pom ) // keep the smaller value at the centre cell
+                  output[ cell.getIndex() ] = pom;
+                
+                if ( output[ n ] < pom - hy) // neighbour gets pom - hy; the larger value is kept
+                  output[ n ] = pom - hy; // ( hy * c )/( c - input[ n ]) - hy;
+                
+              }else
+              {
+                pom = - ( hy * c )/( c - input[ n ]); // same interpolation with the sign flipped for a negative centre value
+                if( output[ cell.getIndex() ] < pom ) // comparisons are mirrored: keep the larger value at the centre cell
+                  output[ cell.getIndex() ] = pom;
+                if( output[ n ] > hy + pom )
+                  output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
+                
               }
-    }
+              interfaceMap[ cell.getIndex() ] = true; // both cells of the pair are flagged
+              interfaceMap[ n ] = true;
+            }
+            if( c * input[ e ] <= 0 ) // same treatment for the +x neighbour, using hx
+            {
+              if( c >= 0 )
+              {
+                pom = ( hx * c )/( c - input[ e ]);
+                if( output[ cell.getIndex() ] > pom )
+                  output[ cell.getIndex() ] = pom;
+                
+                pom = pom - hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+                if( output[ e ] < pom )
+                  output[ e ] = pom;      
+                
+              }else
+              {
+                pom = - (hx * c)/( c - input[ e ]);
+                if( output[ cell.getIndex() ] < pom )
+                  output[ cell.getIndex() ] = pom;
+                
+                pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
+                if( output[ e ] > pom )
+                  output[ e ] = pom;
+              }
+              interfaceMap[ cell.getIndex() ] = true;
+              interfaceMap[ e ] = true;
+            }
+            if( c * input[ t ] <= 0 ) // same treatment for the +z neighbour, using hz
+            {
+              if( c >= 0 )
+              {
+                pom = ( hz * c )/( c - input[ t ]);
+                if( output[ cell.getIndex() ] > pom )
+                  output[ cell.getIndex() ] = pom;
+                
+                pom = pom - hz; //output[ t ] = (hz * c)/( c - input[ t ]) - hz;
+                if( output[ t ] < pom )
+                  output[ t ] = pom; 
+                
+              }else
+              {
+                pom = - (hz * c)/( c - input[ t ]);
+                if( output[ cell.getIndex() ] < pom )
+                  output[ cell.getIndex() ] = pom;
+                
+                pom = pom + hz; //output[ t ] = hz - (hz * c)/( c - input[ t ]);
+                if( output[ t ] > pom )
+                  output[ t ] = pom;
+                
+              }
+              interfaceMap[ cell.getIndex() ] = true;
+              interfaceMap[ t ] = true;
+            }    
+          }
+          /*output[ cell.getIndex() ] =
+           c > 0 ? TypeInfo< RealType >::getMaxValue() :
+           -TypeInfo< RealType >::getMaxValue();
+           interfaceMap[ cell.getIndex() ] = false;*/ //this reset is already done by the first loop nest of the host path
+        }
+  }
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename MeshEntity >
+        typename Device,
+        typename Index >
+template< typename MeshEntity > // 3D local update: recomputes u at `cell` from its x-, y- and z-neighbours and stores the result in u.
 __cuda_callable__
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
 updateCell( MeshFunctionType& u,
-            const MeshEntity& cell, 
-            const RealType v )
+        const MeshEntity& cell, // grid cell whose value in u is updated
+        const RealType v ) // divides the space step in the one-neighbour update and enters the other updates as v * v
 {
-   const auto& neighborEntities = cell.template getNeighborEntities< 3 >();
-   const MeshType& mesh = cell.getMesh();
+  const auto& neighborEntities = cell.template getNeighborEntities< 3 >();
+  const MeshType& mesh = cell.getMesh();
   
-   const RealType& hx = mesh.getSpaceSteps().x();
-   const RealType& hy = mesh.getSpaceSteps().y();
-   const RealType& hz = mesh.getSpaceSteps().z();
-   const RealType value = u( cell );
-   //std::cout << value << std::endl;
-   RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
-   
-   
-   if( cell.getCoordinates().x() == 0 )
-      a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ];
-   else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
-      a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ];
-   else
+  const RealType& hx = mesh.getSpaceSteps().x();
+  const RealType& hy = mesh.getSpaceSteps().y();
+  const RealType& hz = mesh.getSpaceSteps().z();
+  const RealType value = u( cell ); // current value; its sign picks the direction of every candidate update below
+  //std::cout << value << std::endl;
+  RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); // a, b, c: neighbour values along x, y, z; tmp: candidate value
+  
+  
+  if( cell.getCoordinates().x() == 0 ) // first cell along x: only the +x neighbour is read
+    a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ];
+  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) // last cell along x: only the -x neighbour is read
+    a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ];
+  else
+  {
+    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ], // NOTE(review): argAbsMin presumably returns the argument of smaller magnitude -- helper not visible here
+            u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] );
+  }
+  if( cell.getCoordinates().y() == 0 ) // same selection along y
+    b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ];
+  else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
+    b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ];
+  else
+  {
+    b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ],
+            u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] );
+  }if( cell.getCoordinates().z() == 0 ) // same selection along z
+    c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ];
+  else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 )
+    c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ];
+  else
+  {
+    c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ],
+            u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] );
+  }
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && // all three neighbour values have magnitude numeric_limits::max():
+          fabs( b ) == std::numeric_limits< RealType >::max() &&
+          fabs( c ) == std::numeric_limits< RealType >::max() )
+    return; // leave u at this cell untouched
+  
+  
+  /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( b ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) )
    {
-      a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ],
-                        u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] );
+   tmp = ( hx * hx * a + hy * hy * b + 
+   sign( value ) * hx * hy * sqrt( ( hx * hx + hy * hy )/v - 
+   ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy );
    }
-   if( cell.getCoordinates().y() == 0 )
-      b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ];
-   else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
-      b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ];
-   else
-   {
-      b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ],
-                        u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] );
-   }if( cell.getCoordinates().z() == 0 )
-      c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ];
-   else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 )
-      c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ];
-   else
+   if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( c ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( a - c ) >= TNL::sqrt( (hx * hx + hz * hz)/v ) )
    {
-      c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ],
-                         u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] );
+   tmp = ( hx * hx * a + hz * hz * c + 
+   sign( value ) * hx * hz * sqrt( ( hx * hx + hz * hz )/v - 
+   ( a - c ) * ( a - c ) ) )/( hx * hx + hz * hz );
    }
-   if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-       fabs( b ) == std::numeric_limits< RealType >::max() &&
-       fabs( c ) == std::numeric_limits< RealType >::max() )
-      return;
-   
-   
-       /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
-           fabs( b ) != TypeInfo< Real >::getMaxValue() &&
-           fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) )
-       {
-           tmp = ( hx * hx * a + hy * hy * b + 
-                sign( value ) * hx * hy * sqrt( ( hx * hx + hy * hy )/v - 
-                ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy );
-       }
-       if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
-           fabs( c ) != TypeInfo< Real >::getMaxValue() &&
-           fabs( a - c ) >= TNL::sqrt( (hx * hx + hz * hz)/v ) )
-       {
-           tmp = ( hx * hx * a + hz * hz * c + 
-                sign( value ) * hx * hz * sqrt( ( hx * hx + hz * hz )/v - 
-                ( a - c ) * ( a - c ) ) )/( hx * hx + hz * hz );
-       }
-       if( fabs( b ) != TypeInfo< Real >::getMaxValue() &&
-           fabs( c ) != TypeInfo< Real >::getMaxValue() &&
-           fabs( b - c ) >= TNL::sqrt( (hy * hy + hz * hz)/v ) )
-       {
-           tmp = ( hy * hy * b + hz * hz * c + 
-                sign( value ) * hy * hz * sqrt( ( hy * hy + hz * hz )/v - 
-                ( b - c ) * ( b - c ) ) )/( hy * hy + hz * hz );
-       }*/
-    RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
-    sortMinims( pom );   
-    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
-    if( fabs( tmp ) < fabs( pom[ 1 ] ) )
+   if( fabs( b ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( c ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( b - c ) >= TNL::sqrt( (hy * hy + hz * hz)/v ) )
+   {
+   tmp = ( hy * hy * b + hz * hz * c + 
+   sign( value ) * hy * hz * sqrt( ( hy * hy + hz * hz )/v - 
+   ( b - c ) * ( b - c ) ) )/( hy * hy + hz * hz );
+   }*/
+  RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; // slots 0-2: neighbour values, slots 3-5: space steps
+  sortMinims( pom );   // NOTE(review): helper not visible here; the code below assumes pom[0..2] come out ordered by magnitude with pom[3..5] as their matching steps -- confirm
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; // one-neighbour candidate; the step is divided by v, consistent with the /( v * v ) terms of the candidates below
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) // candidate is smaller in magnitude than pom[1]: use the one-neighbour update
+  {
+    u[ cell.getIndex() ] = argAbsMin( value, tmp ); // stored value is whichever of value and tmp argAbsMin selects
+  }
+  else
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + // two-neighbour candidate built from pom[0] and pom[1]
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+    if( fabs( tmp ) < fabs( pom[ 2 ]) ) // candidate is smaller in magnitude than pom[2]: use the two-neighbour update
     {
-        u[ cell.getIndex() ] = argAbsMin( value, tmp ); 
+      u[ cell.getIndex() ] = argAbsMin( value, tmp );
     }
     else
     {
-        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-        if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
-        {
-            u[ cell.getIndex() ] = argAbsMin( value, tmp );
-        }
-        else
-        {
-            tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
-                TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
-                hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
-                hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
-            u[ cell.getIndex() ] = argAbsMin( value, tmp );
-        }
+      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + // three-neighbour candidate built from the unsorted a, b, c and hx, hy, hz
+              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
+              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
+      u[ cell.getIndex() ] = argAbsMin( value, tmp );
     }
+  }
 }
 
 template < typename T1, typename T2 >
 T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double v)
 {
-   T1 tmp;
-   if( fabs( a ) != std::numeric_limits< T1 >::max &&
-       fabs( b ) != std::numeric_limits< T1 >::max &&
-       fabs( a - b ) < ha/v )//TNL::sqrt( (ha * ha + hb * hb)/2 )/v )
-   {
-      tmp = ( ha * ha * b + hb * hb * a + 
+  T1 tmp; // result of the two-sided update; set to max() when the guard below fails
+  if( fabs( a ) != std::numeric_limits< T1 >::max() && // max is a function: it must be called to get the value
+          fabs( b ) != std::numeric_limits< T1 >::max() &&
+          fabs( a - b ) < ha/v )//TNL::sqrt( (ha * ha + hb * hb)/2 )/v )
+  {
+    tmp = ( ha * ha * b + hb * hb * a + 
             TNL::sign( value ) * ha * hb * TNL::sqrt( ( ha * ha + hb * hb )/( v * v ) - 
             ( a - b ) * ( a - b ) ) )/( ha * ha + hb * hb );
-   }
-   else
-   {
-       tmp = std::numeric_limits< T1 >::max;
-   }
-   
-   return tmp;
+  }
+  else
+  {
+    tmp = std::numeric_limits< T1 >::max(); // guard failed: return the largest finite T1 instead of an update
+  }
+  
+  return tmp;
 }
 
 template < typename T1 >
 __cuda_callable__ void sortMinims( T1 pom[] )
 {
-    T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; 
-    if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){
-        tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2];
-        tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5];
-        
-    }
-    else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){
-        tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1];
-        tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4];
-    }
-    else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){
-        tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2];
-        tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5];
-    }
-    else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){
-        tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0];
-        tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3];
-    }
-    else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){
-        tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1];
-        tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4];
-    }
-    else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){
-        tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0];
-        tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3];
-    }
+  T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; 
+  if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){
+    tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2];
+    tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5];
     
-    for( int i = 0; i < 6; i++ )
-    {
-        pom[ i ] = tmp[ i ];
-    }   
+  }
+  else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){
+    tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1];
+    tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4];
+  }
+  else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){
+    tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2];
+    tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5];
+  }
+  else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){
+    tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0];
+    tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3];
+  }
+  else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){
+    tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1];
+    tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4];
+  }
+  else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){
+    tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0];
+    tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3];
+  }
+  
+  for( int i = 0; i < 6; i++ )
+  {
+    pom[ i ] = tmp[ i ];
+  }   
 }
 
 
@@ -893,372 +919,373 @@ __cuda_callable__ void sortMinims( T1 pom[] )
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
-                                Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
-                                Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap  )
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap  )
 {
-    int i = threadIdx.x + blockDim.x*blockIdx.x;
-    const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  int i = threadIdx.x + blockDim.x*blockIdx.x;
+  const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  
+  if( i < mesh.getDimensions().x()  )
+  {
+    typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell;
+    Cell cell( mesh );
+    cell.getCoordinates().x() = i;
+    cell.refresh();
+    const Index cind = cell.getIndex();
+    
     
-    if( i < mesh.getDimensions().x()  )
+    output[ cind ] =
+            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+              - std::numeric_limits< Real >::max();
+    interfaceMap[ cind ] = false; 
+    
+    const Real& h = mesh.getSpaceSteps().x();
+    cell.refresh();
+    const Real& c = input( cell );
+    if( ! cell.isBoundaryEntity()  )
     {
-        typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell;
-        Cell cell( mesh );
-        cell.getCoordinates().x() = i;
-        cell.refresh();
-        const Index cind = cell.getIndex();
-
-
-        output[ cind ] =
-               input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
-                                    - std::numeric_limits< Real >::max();
-        interfaceMap[ cind ] = false; 
-
-        const Real& h = mesh.getSpaceSteps().x();
-        cell.refresh();
-        const Real& c = input( cell );
-        if( ! cell.isBoundaryEntity()  )
-        {
-           auto neighbors = cell.getNeighborEntities();
-           Real pom = 0;
-           const Index e = neighbors.template getEntityIndex< 1 >();
-           const Index w = neighbors.template getEntityIndex< -1 >();
-           if( c * input[ e ] <= 0 )
-           {
-               pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
-                   output[ cind ] = pom;                       
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ w ] <= 0 )
-           {
-               pom = TNL::sign( c )*( h * c )/( c - input[ w ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-        }
+      auto neighbors = cell.getNeighborEntities();
+      Real pom = 0;
+      const Index e = neighbors.template getEntityIndex< 1 >();
+      const Index w = neighbors.template getEntityIndex< -1 >();
+      if( c * input[ e ] <= 0 )
+      {
+        pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+          output[ cind ] = pom;                       
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ w ] <= 0 )
+      {
+        pom = TNL::sign( c )*( h * c )/( c - input[ w ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
     }
-           
+  }
+  
 }
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
-                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
-                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ) 
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ) 
 {
-    int i = threadIdx.x + blockDim.x*blockIdx.x;
-    int j = blockDim.y*blockIdx.y + threadIdx.y;
-    const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  int i = threadIdx.x + blockDim.x*blockIdx.x;
+  int j = blockDim.y*blockIdx.y + threadIdx.y;
+  const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  
+  if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+  {
+    typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
+    Cell cell( mesh );
+    cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
+    cell.refresh();
+    const Index cind = cell.getIndex();
+    
     
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+    output[ cind ] =
+            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+              - std::numeric_limits< Real >::max();
+    interfaceMap[ cind ] = false; 
+    
+    const Real& hx = mesh.getSpaceSteps().x();
+    const Real& hy = mesh.getSpaceSteps().y();
+    cell.refresh();
+    const Real& c = input( cell );
+    if( ! cell.isBoundaryEntity()  )
     {
-        typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
-        Cell cell( mesh );
-        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
-        cell.refresh();
-        const Index cind = cell.getIndex();
-
-
-        output[ cind ] =
-               input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
-                                    - std::numeric_limits< Real >::max();
-        interfaceMap[ cind ] = false; 
-
-        const Real& hx = mesh.getSpaceSteps().x();
-        const Real& hy = mesh.getSpaceSteps().y();
-        cell.refresh();
-        const Real& c = input( cell );
-        if( ! cell.isBoundaryEntity()  )
-        {
-           auto neighbors = cell.getNeighborEntities();
-           Real pom = 0;
-           const Index e = neighbors.template getEntityIndex<  1,  0 >();
-           const Index w = neighbors.template getEntityIndex<  -1,  0 >();
-           const Index n = neighbors.template getEntityIndex<  0,  1 >();
-           const Index s = neighbors.template getEntityIndex<  0,  -1 >();
-           
-           if( c * input[ n ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cell.getIndex() ] = true;
-           }
-           if( c * input[ e ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
-                   output[ cind ] = pom;                       
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ w ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ s ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-        }
+      auto neighbors = cell.getNeighborEntities();
+      Real pom = 0;
+      const Index e = neighbors.template getEntityIndex<  1,  0 >();
+      const Index w = neighbors.template getEntityIndex<  -1,  0 >();
+      const Index n = neighbors.template getEntityIndex<  0,  1 >();
+      const Index s = neighbors.template getEntityIndex<  0,  -1 >();
+      
+      if( c * input[ n ] <= 0 )
+      {
+        pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cell.getIndex() ] = true;
+      }
+      if( c * input[ e ] <= 0 )
+      {
+        pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+          output[ cind ] = pom;                       
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ w ] <= 0 )
+      {
+        pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ s ] <= 0 )
+      {
+        pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
     }
+  }
 }
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
-                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
-                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap )
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap )
 {
-    int i = threadIdx.x + blockDim.x*blockIdx.x;
-    int j = blockDim.y*blockIdx.y + threadIdx.y;
-    int k = blockDim.z*blockIdx.z + threadIdx.z;
-    const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  int i = threadIdx.x + blockDim.x*blockIdx.x;
+  int j = blockDim.y*blockIdx.y + threadIdx.y;
+  int k = blockDim.z*blockIdx.z + threadIdx.z;
+  const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  
+  if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
+  {
+    typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
+    Cell cell( mesh );
+    cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
+    cell.refresh();
+    const Index cind = cell.getIndex();
+    
+    
+    output[ cind ] =
+            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+              - std::numeric_limits< Real >::max();
+    interfaceMap[ cind ] = false; 
+    cell.refresh();
     
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
+    const Real& hx = mesh.getSpaceSteps().x();
+    const Real& hy = mesh.getSpaceSteps().y();
+    const Real& hz = mesh.getSpaceSteps().z();
+    const Real& c = input( cell );
+    if( ! cell.isBoundaryEntity()  )
     {
-        typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
-        Cell cell( mesh );
-        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
-        cell.refresh();
-        const Index cind = cell.getIndex();
-
-
-        output[ cind ] =
-               input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
-                                    - std::numeric_limits< Real >::max();
-        interfaceMap[ cind ] = false; 
-        cell.refresh();
-
-        const Real& hx = mesh.getSpaceSteps().x();
-        const Real& hy = mesh.getSpaceSteps().y();
-        const Real& hz = mesh.getSpaceSteps().z();
-        const Real& c = input( cell );
-        if( ! cell.isBoundaryEntity()  )
-        {
-           auto neighbors = cell.getNeighborEntities();
-           Real pom = 0;
-           const Index e = neighbors.template getEntityIndex<  1, 0, 0 >();
-           const Index w = neighbors.template getEntityIndex<  -1, 0, 0 >();
-           const Index n = neighbors.template getEntityIndex<  0, 1, 0 >();
-           const Index s = neighbors.template getEntityIndex<  0, -1, 0 >();
-           const Index t = neighbors.template getEntityIndex<  0, 0, 1 >();
-           const Index b = neighbors.template getEntityIndex<  0, 0, -1 >();
-           
-           if( c * input[ n ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ e ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
-                   output[ cind ] = pom;                       
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ w ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ s ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ b ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hz * c )/( c - input[ b ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ t ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-        }
+      auto neighbors = cell.getNeighborEntities();
+      Real pom = 0;
+      const Index e = neighbors.template getEntityIndex<  1, 0, 0 >();
+      const Index w = neighbors.template getEntityIndex<  -1, 0, 0 >();
+      const Index n = neighbors.template getEntityIndex<  0, 1, 0 >();
+      const Index s = neighbors.template getEntityIndex<  0, -1, 0 >();
+      const Index t = neighbors.template getEntityIndex<  0, 0, 1 >();
+      const Index b = neighbors.template getEntityIndex<  0, 0, -1 >();
+      
+      if( c * input[ n ] <= 0 )
+      {
+        pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ e ] <= 0 )
+      {
+        pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+          output[ cind ] = pom;                       
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ w ] <= 0 )
+      {
+        pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ s ] <= 0 )
+      {
+        pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ b ] <= 0 )
+      {
+        pom = TNL::sign( c )*( hz * c )/( c - input[ b ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ t ] <= 0 )
+      {
+        pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
     }
+  }
 }
 
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
+template< int sizeSArray >
 __cuda_callable__
 bool
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy,
-            const Real v )
+updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy,
+        const Real v )
 {
-   const RealType value = sArray[ thrj ][ thri ];
-   RealType a, b, tmp = std::numeric_limits< RealType >::max();
-      
-   b = TNL::argAbsMin( sArray[ thrj+1 ][ thri ],
-                        sArray[ thrj-1 ][ thri ] );
-    
-   a = TNL::argAbsMin( sArray[ thrj ][ thri+1 ],
-                        sArray[ thrj ][ thri-1 ] );
-
-    if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-        fabs( b ) == std::numeric_limits< RealType >::max() )
-       return false;
-   
-    RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
-    sortMinims( pom );
-    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
-    
-                                
-    if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-    {
-        sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
-        tmp = value - sArray[ thrj ][ thri ];
-        if ( fabs( tmp ) >  0.001*hx )
-            return true;
-        else
-            return false;
-    }
+  const RealType value = sArray[ thrj * sizeSArray + thri ];
+  RealType a, b, tmp = std::numeric_limits< RealType >::max();
+  
+  b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ],
+          sArray[ (thrj-1) * sizeSArray + thri ] );
+  
+  a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ],
+          sArray[ thrj * sizeSArray + thri-1 ] );
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() )
+    return false;
+  
+  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
+  sortMinims( pom );
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
+  
+  
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+  {
+    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
+    tmp = value - sArray[ thrj * sizeSArray + thri ];
+    if ( fabs( tmp ) >  0.001*hx )
+      return true;
     else
-    {
-        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+      return false;
+  }
+  else
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
             TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
             ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-        sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
-        tmp = value - sArray[ thrj ][ thri ];
-        if ( fabs( tmp ) > 0.001*hx )
-            return true;
-        else
-            return false;
-    }
-    
-    return false;
+    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
+    tmp = value - sArray[ thrj * sizeSArray + thri ];
+    if ( fabs( tmp ) > 0.001*hx )
+      return true;
+    else
+      return false;
+  }
+  
+  return false;
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 __cuda_callable__
 bool
 tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
 updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
 {
-   const RealType value = sArray[ thri ];
-   RealType a, tmp = std::numeric_limits< RealType >::max();
-      
-   a = TNL::argAbsMin( sArray[ thri+1 ],
-                       sArray[ thri-1 ] );
-
-    if( fabs( a ) == std::numeric_limits< RealType >::max() )
-       return false;
-   
-    tmp = a + TNL::sign( value ) * h/v;
-    
-                                
-    sArray[ thri ] = argAbsMin( value, tmp );
-    
-    tmp = value - sArray[ thri ];
-    if ( fabs( tmp ) >  0.001*h )
-        return true;
-    else
-        return false;
+  const RealType value = sArray[ thri ];
+  RealType a, tmp = std::numeric_limits< RealType >::max();
+  
+  a = TNL::argAbsMin( sArray[ thri+1 ],
+          sArray[ thri-1 ] );
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() )
+    return false;
+  
+  tmp = a + TNL::sign( value ) * h/v;
+  
+  
+  sArray[ thri ] = argAbsMin( value, tmp );
+  
+  tmp = value - sArray[ thri ];
+  if ( fabs( tmp ) >  0.001*h )
+    return true;
+  else
+    return false;
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 __cuda_callable__ 
 bool 
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
 updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
         const Real hx, const Real hy, const Real hz, const Real v )
 {
-   const RealType value = sArray[thrk][thrj][thri];
-   //std::cout << value << std::endl;
-   RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
-   
-   c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ],
-                        sArray[ thrk-1 ][ thrj ][ thri ] );
-    
-   b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ],
-                        sArray[ thrk ][ thrj-1 ][ thri ] );
-   
-   a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ],
-                        sArray[ thrk ][ thrj ][ thri-1 ] );
-   
-   
-   if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-       fabs( b ) == std::numeric_limits< RealType >::max() &&
-       fabs( c ) == std::numeric_limits< RealType >::max() )
+  const RealType value = sArray[thrk][thrj][thri];
+  //std::cout << value << std::endl;
+  RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
+  
+  c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ],
+          sArray[ thrk-1 ][ thrj ][ thri ] );
+  
+  b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ],
+          sArray[ thrk ][ thrj-1 ][ thri ] );
+  
+  a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ],
+          sArray[ thrk ][ thrj ][ thri-1 ] );
+  
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() &&
+          fabs( c ) == std::numeric_limits< RealType >::max() )
+    return false;
+  
+  RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
+  
+  sortMinims( pom );
+  
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+  {
+    sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
+    tmp = value - sArray[ thrk ][ thrj ][ thri ];
+    if ( fabs( tmp ) >  0.001*hx )
+      return true;
+    else
       return false;
-   
-    RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
-    
-    sortMinims( pom );
-    
-    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
-    if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+  }
+  else
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
     {
-        sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-        tmp = value - sArray[ thrk ][ thrj ][ thri ];
-        if ( fabs( tmp ) >  0.001*hx )
-            return true;
-        else
-            return false;
+      sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
+      tmp = value - sArray[ thrk ][ thrj ][ thri ];
+      if ( fabs( tmp ) > 0.001*hx )
+        return true;
+      else
+        return false;
     }
     else
     {
-        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-        if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
-        {
-            sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-            tmp = value - sArray[ thrk ][ thrj ][ thri ];
-            if ( fabs( tmp ) > 0.001*hx )
-                return true;
-            else
-                return false;
-        }
-        else
-        {
-            tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
-                TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
-                hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
-                hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
-            sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-            tmp = value - sArray[ thrk ][ thrj ][ thri ];
-            if ( fabs( tmp ) > 0.001*hx )
-                return true;
-            else
-                return false;
-        }
+      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
+              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
+              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
+      sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
+      tmp = value - sArray[ thrk ][ thrj ][ thri ];
+      if ( fabs( tmp ) > 0.001*hx )
+        return true;
+      else
+        return false;
     }
-    
-    return false;
+  }
+  
+  return false;
 }
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 4520eab0a..e29421bb1 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -15,6 +15,7 @@
 
 #include "tnlFastSweepingMethod.h"
 #include <TNL/Devices/Cuda.h>
+#include <string.h>
 
 
 #include <iostream>
@@ -80,116 +81,171 @@ solve( const MeshPointer& mesh,
   
   
-  
   while( iteration < this->maxIterations )
   {
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
-      int numThreadsPerBlock = 16;
+      int numThreadsPerBlock = 1024;
+      
       
       int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
       int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
+      //std::cout << "numBlocksX = " << numBlocksX << std::endl;
+      
+      /*Real **sArray = new Real*[numBlocksX*numBlocksY];
+       for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+       sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];*/
       
-          
       ArrayContainer BlockIterHost;
       BlockIterHost.setSize( numBlocksX * numBlocksY );
       BlockIterHost.setValue( 1 );
+      int IsCalculationDone = 1;
+      
+      MeshFunctionPointer helpFunc( mesh );
+      MeshFunctionPointer helpFunc1( mesh );
+      helpFunc1 = auxPtr;
+      auxPtr = helpFunc;
+      helpFunc = helpFunc1;
+      //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl;
       /*for( int k = numBlocksX-1; k >-1; k-- ){
-        for( int l = 0; l < numBlocksY; l++ ){
-          std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+       for( int l = 0; l < numBlocksY; l++ ){
+       std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+       }
+       std::cout<<std::endl;
+       }
+       std::cout<<std::endl;*/
+      unsigned int numWhile = 0;
+      while( IsCalculationDone  )
+      {      
+        IsCalculationDone = 0;
+        helpFunc1 = auxPtr;
+        auxPtr = helpFunc;
+        helpFunc = helpFunc1;
+        this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+        
+        for( int i = 0; i < BlockIterHost.getSize(); i++ ){
+          if( IsCalculationDone == 0 ){
+            IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
+            //break;
+          }
         }
-        std::cout<<std::endl;
-      }
-      std::cout<<std::endl;*/
-      
-      while( BlockIterHost[ 0 ] )
-      {          
-        this->updateBlocks( interfaceMap, aux, BlockIterHost, numThreadsPerBlock);
+        numWhile++;
+        
+        for( int j = numBlocksY-1; j>-1; j-- ){
+          for( int i = 0; i < numBlocksX; i++ )
+            std::cout << BlockIterHost[ j * numBlocksX + i ];
+          std::cout << std::endl;
+        }
+        std::cout << std::endl;
         
         this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY );
         
-  //Reduction      
-        for( int k = numBlocksX-1; k >-1; k-- ){
-          for( int l = 0; l < numBlocksY; l++ ){
-            //std::cout<< BlockIterHost[ l*numBlocksX  + k ];
-            BlockIterHost[ 0 ] = BlockIterHost[ 0 ] || BlockIterHost[ l*numBlocksX + k ];
-          }
-          //std::cout<<std::endl;
-        }
+        /*for( int j = numBlocksY-1; j>-1; j-- ){
+         for( int i = 0; i < numBlocksX; i++ )
+         std::cout << "BlockIterHost = "<< j*numBlocksX + i<< " ," << BlockIterHost[ j * numBlocksX + i ];
+         std::cout << std::endl;
+         }
+         std::cout << std::endl;*/
+        //Reduction      
+        
         //std::cout<<std::endl;
+        string s( "aux-"+ std::to_string(numWhile) + ".tnl");
+        aux.save( s );
       }
-      /*for( cell.getCoordinates().y() = 0;
-              cell.getCoordinates().y() < mesh->getDimensions().y();
-              cell.getCoordinates().y()++ )
-      {
-        for( cell.getCoordinates().x() = 0;
-                cell.getCoordinates().x() < mesh->getDimensions().x();
-                cell.getCoordinates().x()++ )
-        {
-          cell.refresh();
-          if( ! interfaceMap( cell ) )
-            this->updateCell( aux, cell );
-        }
-      }
-      
-      //aux.save( "aux-1.tnl" );
-      
-      for( cell.getCoordinates().y() = 0;
-              cell.getCoordinates().y() < mesh->getDimensions().y();
-              cell.getCoordinates().y()++ )
-      {
-        for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                cell.getCoordinates().x() >= 0 ;
-                cell.getCoordinates().x()-- )		
-        {
-          //std::cerr << "2 -> ";
-          cell.refresh();
-          if( ! interfaceMap( cell ) )            
-            this->updateCell( aux, cell );
-        }
+      if( numWhile == 1 ){
+        auxPtr = helpFunc;
       }
+      /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+       delete []sArray[i];*/
       
-      //aux.save( "aux-2.tnl" );
       
-      for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-              cell.getCoordinates().y() >= 0 ;
-              cell.getCoordinates().y()-- )
-      {
-        for( cell.getCoordinates().x() = 0;
-                cell.getCoordinates().x() < mesh->getDimensions().x();
-                cell.getCoordinates().x()++ )
-        {
-          //std::cerr << "3 -> ";
-          cell.refresh();
-          if( ! interfaceMap( cell ) )            
-            this->updateCell( aux, cell );
-        }
-      }
-      
-      //aux.save( "aux-3.tnl" );
+      /*for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       cell.refresh();
+       if( ! interfaceMap( cell ) )
+       this->updateCell( aux, cell );
+       }
+       }
+       
+       //aux.save( "aux-1.tnl" );
+       
+       for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "2 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       
+       //aux.save( "aux-2.tnl" );
+       
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0 ;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       //std::cerr << "3 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       
+       //aux.save( "aux-3.tnl" );
+       
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "4 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       
+       for( int j = 0;
+       j < mesh->getDimensions().y();
+       j++ )
+       {
+       for( int i = 0;
+       i < mesh->getDimensions().x();
+       i++ )
+       {
+       std::cout << aux[ i * mesh->getDimensions().y() + j ] << " ";
+       }
+       std::cout << std::endl;
+       }*/
       
-      for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-              cell.getCoordinates().y() >= 0;
-              cell.getCoordinates().y()-- )
-      {
-        for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                cell.getCoordinates().x() >= 0 ;
-                cell.getCoordinates().x()-- )		
-        {
-          //std::cerr << "4 -> ";
-          cell.refresh();
-          if( ! interfaceMap( cell ) )            
-            this->updateCell( aux, cell );
-        }
-      }*/
     }
     if( std::is_same< DeviceType, Devices::Cuda >::value )
     {
       // TODO: CUDA code
 #ifdef HAVE_CUDA
-<<<<<<< HEAD
       TNL_CHECK_CUDA_DEVICE;
+      // Maximum cudaBlockSize is 32: blockSize is cudaBlockSize x cudaBlockSize threads and a CUDA block is limited to 1024 threads.
       const int cudaBlockSize( 16 );
+      
       int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
       int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
       dim3 blockSize( cudaBlockSize, cudaBlockSize );
@@ -203,19 +259,14 @@ solve( const MeshPointer& mesh,
       BlockIterDevice.setSize( numBlocksX * numBlocksY );
       BlockIterDevice.setValue( 1 );
       TNL_CHECK_CUDA_DEVICE;
-      int ne = 0;
-      CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
-                                                       interfaceMapPtr.template getData< Device >(),
-                                                       auxPtr.template modifyData< Device>(),
-                                                       BlockIterDevice, ne);
-      TNL_CHECK_CUDA_DEVICE;
+      
       
       /*TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
-      BlockIterPom.setSize( numBlocksX * numBlocksY  );
-      BlockIterPom.setValue( 0 );*/
+       BlockIterPom.setSize( numBlocksX * numBlocksY  );
+       BlockIterPom.setValue( 0 );*/
       /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
-      BlockIterPom1.setSize( numBlocksX * numBlocksY  );
-      BlockIterPom1.setValue( 0 );*/
+       BlockIterPom1.setSize( numBlocksX * numBlocksY  );
+       BlockIterPom1.setValue( 0 );*/
       /*int *BlockIterDevice;
        cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
       int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
@@ -224,139 +275,125 @@ solve( const MeshPointer& mesh,
       /*int *BlockIterPom;
        cudaMalloc((void**) &BlockIterPom, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
       
-      int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
+      int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
+      
       TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
-      dBlock.setSize( nBlocks  );
+      dBlock.setSize( nBlocks );
       TNL_CHECK_CUDA_DEVICE;
       /*int *dBlock;
        cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/
-      //int pocIter = 0;
+      
+      
+      MeshFunctionPointer helpFunc1;
+      helpFunc1->setMesh(mesh);
+      
+      MeshFunctionPointer helpFunc( mesh );
+      
+      helpFunc1 = auxPtr;
+      auxPtr = helpFunc;
+      helpFunc = helpFunc1;
+      
+      int numIter = 0;
+      
+      //int oddEvenBlock = 0;
       while( BlockIterD )
       {
-        /*BlockIterPom1 = BlockIterDevice;
-        for( int j = numBlocksY-1; j>-1; j-- ){
-          for( int i = 0; i < numBlocksX; i++ )
-            std::cout << BlockIterPom1[ j * numBlocksX + i ];
-          std::cout << std::endl;
-        }
-        std::cout << std::endl;*/
+        /** CHESS METHOD -- host-side driver, currently disabled (the whole block below is commented out) **/
         
-        CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
-                                                         interfaceMapPtr.template getData< Device >(),
-                                                         auxPtr.template modifyData< Device>(),
-                                                         BlockIterDevice, 1);
+        /*auxPtr = helpFunc;
+        
+        CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+                interfaceMapPtr.template getData< Device >(),
+                auxPtr.template getData< Device>(),
+                helpFunc.template modifyData< Device>(),
+                BlockIterDevice,
+                oddEvenBlock );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
+        auxPtr = helpFunc;
         
-        /*int poc = 0;
-        for( int i = 0; i < numBlocksX * numBlocksY; i++ )
-          if( BlockIterPom1[ i ] )
-            poc = poc+1;
-        std::cout << "pocet bloku, ktere se pocitali = " << poc << std::endl;*/
+        oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
         
-        GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, /*BlockIterPom,*/ numBlocksX, numBlocksY );
+        CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+                interfaceMapPtr.template getData< Device >(),
+                auxPtr.template getData< Device>(),
+                helpFunc.template modifyData< Device>(),
+                BlockIterDevice,
+                oddEvenBlock );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
+        auxPtr = helpFunc;
         
-        CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
-        TNL_CHECK_CUDA_DEVICE;
+        oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
         
-        CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+        CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+        cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
-        
-        BlockIterD = dBlock.getElement( 0 );
-        //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+        CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         
+        BlockIterD = dBlock.getElement( 0 );*/
+        
+        /**------------------------------------------------------------------------------------------------*/
+        
+        
+        /** HERE IS FIM **/
+        
+         helpFunc1 = auxPtr;
+         auxPtr = helpFunc;
+         helpFunc = helpFunc1;
+         
+         //int pocBloku = 0;
+         Devices::Cuda::synchronizeDevice();
+         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+         interfaceMapPtr.template getData< Device >(),
+         auxPtr.template modifyData< Device>(),
+         helpFunc.template modifyData< Device>(),
+         BlockIterDevice );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+         
+         //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl;
+         
+         GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, numBlocksX, numBlocksY );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+         
+         //std::cout<< "Probehlo" << std::endl;
+         
+         //TNL::swap( auxPtr, helpFunc );
+         
+         
+         CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+         TNL_CHECK_CUDA_DEVICE;
+         
+         CudaParallelReduc<<< 1, 1024 >>>( dBlock, dBlock, nBlocks ); // full 1024-thread block: the kernel zero-fills and guards with < nBlocks, and its tree reduction drops entries when blockDim.x is not a power of two
+         TNL_CHECK_CUDA_DEVICE;
+         
+         
+         BlockIterD = dBlock.getElement( 0 );
+         //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+         
+        
+        /**-----------------------------------------------------------------------------------------------------------*/
         /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
          BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
-        //pocIter ++;
-=======
-          const int cudaBlockSize( 16 );
-          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
-          int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
-          dim3 blockSize( cudaBlockSize, cudaBlockSize );
-          dim3 gridSize( numBlocksX, numBlocksY );
-          
-          tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
-          
-          TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter;
-          BlockIter.setSize( numBlocksX * numBlocksY );
-          BlockIter.setValue( 0 );
-          /*int* BlockIter = (int*)malloc( ( numBlocksX * numBlocksY ) * sizeof( int ) );
-          for( int i = 0; i < numBlocksX*numBlocksY +1; i++)
-              BlockIter[i] = 1;*/
-          
-          int BlockIterD = 1;
-          
-          TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
-          BlockIterDevice.setSize( numBlocksX * numBlocksY );
-          BlockIterDevice.setValue( 1 );
-          /*int *BlockIterDevice;
-          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );
-          cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/
-          
-          int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
-          
-          TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
-          dBlock.setSize( nBlocks );
-          dBlock.setValue( 0 );
-          /*int *dBlock;
-          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
-          
-          while( BlockIterD )
-          {
-           /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
-                BlockIter[ i ] = false;*/
-                       
-            CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
-                                                             interfaceMapPtr.template getData< Device >(),
-                                                             auxPtr.template modifyData< Device>(),
-                                                             BlockIterDevice );
-            cudaDeviceSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-            
-            BlockIter = BlockIterDevice;
-            //cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost);
-            GetNeighbours( BlockIter, numBlocksX, numBlocksY );
-            
-            BlockIterDevice = BlockIter;
-            //cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);
-            cudaDeviceSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-            
-            
-            CudaParallelReduc<<<  nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
-            cudaDeviceSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-            
-            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
-            cudaDeviceSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-            
-            cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
-                                   
-            /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
-                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
-            
-          }
-          /*cudaFree( BlockIterDevice );
-          cudaFree( dBlock );
-          delete BlockIter;*/
-          cudaDeviceSynchronize();
-          
-          TNL_CHECK_CUDA_DEVICE;
-              
-          aux = *auxPtr;
-          interfaceMap = *interfaceMapPtr;
-#endif
->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
+        numIter ++;
       }
+      if( numIter == 1 ){
+        helpFunc1 = auxPtr;
+        auxPtr = helpFunc;
+        helpFunc = helpFunc1;
+      }
+      /*cudaFree( BlockIterDevice );
+       cudaFree( dBlock );
+       delete BlockIter;*/
       cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
       
-      //std::cout<< pocIter << std::endl;
+      TNL_CHECK_CUDA_DEVICE;
       
       aux = *auxPtr;
       interfaceMap = *interfaceMapPtr;
@@ -366,12 +403,14 @@ solve( const MeshPointer& mesh,
   }
   aux.save("aux-final.tnl");
 }
-<<<<<<< HEAD
+
 
 #ifdef HAVE_CUDA
+
+
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                               /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY )
+        /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY )
 {
   int i = blockIdx.x * 1024 + threadIdx.x;
   
@@ -381,103 +420,68 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index
     int m=0, k=0;
     m = i%numBlockX;
     k = i/numBlockX;
-    if( m > 0 )
-      if( BlockIterDevice[ i - 1 ] )
-        pom = 1;//BlockIterPom[ i ] = 1;
-    if( m < numBlockX -1 && pom == 0 )
-      if( BlockIterDevice[ i + 1 ] )
-        pom = 1;//BlockIterPom[ i ] = 1;
-    if( k > 0 && pom == 0 )
-      if( BlockIterDevice[ i - numBlockX ] )
-        pom = 1;// BlockIterPom[ i ] = 1;
-    if( k < numBlockY -1 && pom == 0 )
-      if( BlockIterDevice[ i + numBlockX ] )
-        pom = 1;//BlockIterPom[ i ] = 1;
+    if( m > 0 && BlockIterDevice[ i - 1 ] ){
+      pom = 1;//BlockIterPom[ i ] = 1;
+    }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){
+      pom = 1;//BlockIterPom[ i ] = 1;
+    }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){
+      pom = 1;// BlockIterPom[ i ] = 1;
+    }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){
+      pom = 1;//BlockIterPom[ i ] = 1;
+    }
     
-          
-      
     BlockIterDevice[ i ] = pom;//BlockIterPom[ i ];
   }
 }
 
-=======
-template < typename Index >
-void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY )
-{
-    TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom;
-    BlockIterPom.setSize( numBlockX * numBlockY );
-    BlockIterPom.setValue( 0 );
-  /*int* BlockIterPom; 
-  BlockIterPom = new int[numBlockX * numBlockY];*/
-  /*for(int i = 0; i < numBlockX * numBlockY; i++)
-    BlockIterPom[ i ] = 0;*/
-  for(int i = 0; i < numBlockX * numBlockY; i++)
-  {
-      
-      if( BlockIter[ i ] )
-      {
-          // i = k*numBlockY + m;
-          int m=0, k=0;
-          m = i%numBlockY;
-          k = i/numBlockY;
-          if( k > 0 && numBlockY > 1 )
-            BlockIterPom[i - numBlockX] = 1;
-          if( k < numBlockY-1 && numBlockY > 1 )
-            BlockIterPom[i + numBlockX] = 1;
-          
-          if( m >= 0 && m < numBlockX - 1 && numBlockX > 1 )
-              BlockIterPom[ i+1 ] = 1;
-          if( m <= numBlockX -1 && m > 0 && numBlockX > 1 )
-              BlockIterPom[ i-1 ] = 1;
-      }
-  }
-  for(int i = 0; i < numBlockX * numBlockY; i++ ){
-///      if( !BlockIter[ i ] )
-        BlockIter[ i ] = BlockIterPom[ i ];
-///      else
-///        BlockIter[ i ] = 0;
-  }
-  /*for( int i = numBlockX-1; i > -1; i-- )
-  {
-      for( int j = 0; j< numBlockY; j++ )
-          std::cout << BlockIter[ i*numBlockY + j ];
-      std::cout << std::endl;
-  }
-  std::cout << std::endl;*/
-  //delete[] BlockIterPom;
-}
-
-#ifdef HAVE_CUDA
->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks )
+        TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks )
 {
-<<<<<<< HEAD
   int i = threadIdx.x;
   int blId = blockIdx.x;
-  __shared__ volatile int sArray[ 512 ];
+  int blockSize = blockDim.x;
+  /*if ( i == 0 && blId == 0 ){
+   printf( "nBlocks = %d \n", nBlocks );
+   for( int j = nBlocks-1; j > -1 ; j--){
+   printf( "cislo = %d \n", BlockIterDevice[ j ] );
+   }
+   }*/
+  __shared__ volatile int sArray[ 1024 ]; // volatile: the i < 32 tail below updates sArray with no barrier between steps
   sArray[ i ] = 0;
-  if(blId * 512 + i < nBlocks )
-    sArray[ i ] = BlockIterDevice[ blId * 512 + i ];
+  if( blId * 1024 + i < nBlocks )
+    sArray[ i ] = BlockIterDevice[ blId * 1024 + i ];
   __syncthreads();
-  if (blockDim.x == 1024) {
+  /*extern __shared__ volatile int sArray[];
+   unsigned int i = threadIdx.x;
+   unsigned int gid = blockIdx.x * blockSize * 2 + threadIdx.x;
+   unsigned int gridSize = blockSize * 2 * gridDim.x;
+   sArray[ i ] = 0;
+   while( gid < nBlocks )
+   {
+   sArray[ i ] += BlockIterDevice[ gid ] + BlockIterDevice[ gid + blockSize ];
+   gid += gridSize;
+   }
+   __syncthreads();*/
+  
+  if ( blockSize == 1024) {
     if (i < 512)
       sArray[ i ] += sArray[ i + 512 ];
   }
   __syncthreads();
-  if (blockDim.x  >= 512) {
+  if (blockSize >= 512) {
     if (i < 256) {
       sArray[ i ] += sArray[ i + 256 ];
     }
   }
-  if (blockDim.x >= 256) {
+  __syncthreads();
+  if (blockSize >= 256) {
     if (i < 128) {
       sArray[ i ] += sArray[ i + 128 ];
     }
   }
   __syncthreads();
-  if (blockDim.x >= 128) {
+  if (blockSize >= 128) {
     if (i < 64) {
       sArray[ i ] += sArray[ i + 64 ];
     }
@@ -485,183 +489,120 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I
   __syncthreads();
   if (i < 32 )
   {
-    if(  blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ];
-    if(  blockDim.x >= 32 )  sArray[ i ] += sArray[ i + 16 ];
-    if(  blockDim.x >= 16 )  sArray[ i ] += sArray[ i + 8 ];
-    if(  blockDim.x >= 8 )  sArray[ i ] += sArray[ i + 4 ];
-    if(  blockDim.x >= 4 )  sArray[ i ] += sArray[ i + 2 ];
-    if(  blockDim.x >= 2 )  sArray[ i ] += sArray[ i + 1 ];
+    if(  blockSize >= 64 ) sArray[ i ] += sArray[ i + 32 ];
+    if(  blockSize >= 32 )  sArray[ i ] += sArray[ i + 16 ];
+    if(  blockSize >= 16 )  sArray[ i ] += sArray[ i + 8 ];
+    if(  blockSize >= 8 )  sArray[ i ] += sArray[ i + 4 ];
+    if(  blockSize >= 4 )  sArray[ i ] += sArray[ i + 2 ];
+    if(  blockSize >= 2 )  sArray[ i ] += sArray[ i + 1 ];
   }
   
   if( i == 0 )
     dBlock[ blId ] = sArray[ 0 ];
-=======
-    int i = threadIdx.x;
-    int blId = blockIdx.x;
-    /*if ( i == 0 && blId == 0 ){
-            printf( "nBlocks = %d \n", nBlocks );
-        for( int j = nBlocks-1; j > -1 ; j--){
-            printf( "cislo = %d \n", BlockIterDevice[ j ] );
-        }
-    }*/
-    __shared__ volatile int sArray[ 512 ];
-    sArray[ i ] = 0;
-    if( blId * 512 + i < nBlocks )
-        sArray[ i ] = BlockIterDevice[ blId * 512 + i ];
-    __syncthreads();
-    
-    if (blockDim.x == 1024) {
-        if (i < 512)
-            sArray[ i ] += sArray[ i + 512 ];
-    }
-    __syncthreads();
-    if (blockDim.x >= 512) {
-        if (i < 256) {
-            sArray[ i ] += sArray[ i + 256 ];
-        }
-    }
-    __syncthreads();
-    if (blockDim.x >= 256) {
-        if (i < 128) {
-            sArray[ i ] += sArray[ i + 128 ];
-        }
-    }
-    __syncthreads();
-    if (blockDim.x >= 128) {
-        if (i < 64) {
-            sArray[ i ] += sArray[ i + 64 ];
-        }
-    }
-    __syncthreads();
-    if (i < 32 )
-    {
-        if(  blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ];
-        if(  blockDim.x >= 32 )  sArray[ i ] += sArray[ i + 16 ];
-        if(  blockDim.x >= 16 )  sArray[ i ] += sArray[ i + 8 ];
-        if(  blockDim.x >= 8 )  sArray[ i ] += sArray[ i + 4 ];
-        if(  blockDim.x >= 4 )  sArray[ i ] += sArray[ i + 2 ];
-        if(  blockDim.x >= 2 )  sArray[ i ] += sArray[ i + 1 ];
-    }
-    
-    if( i == 0 )
-        dBlock[ blId ] = sArray[ 0 ];
->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
 }
 
 
-template < typename Real, typename Device, typename Index >
+template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-<<<<<<< HEAD
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne )
-=======
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
+        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock )
 {
   int thri = threadIdx.x; int thrj = threadIdx.y;
-  int blIdx = blockIdx.x; int blIdy = blockIdx.y;
-  int grIdx = gridDim.x;
-  
-  if( BlockIterDevice[ blIdy * grIdx + blIdx] )
+  int i = threadIdx.x + blockDim.x*blockIdx.x;
+  int j = blockDim.y*blockIdx.y + threadIdx.y;
+  /** FOR CHESS METHOD */
+  if( (blockIdx.y%2  + blockIdx.x) % 2 == oddEvenBlock )
   {
-  
-    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+    /**-----------------------------------------*/
     
-    int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
-    __shared__ volatile int numOfBlockx;
-    __shared__ volatile int numOfBlocky;
-    __shared__ int xkolik;
-    __shared__ int ykolik;
-    __shared__ volatile int NE;
-    if( thri == 0 && thrj == 0 )
+    
+    /** FOR FIM METHOD */
+    /*if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x] )
+     {*/ 
+    /**-----------------------------------------*/
+    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+    __shared__ volatile int dimX;
+    __shared__ volatile int dimY;
+    __shared__ volatile Real hx;
+    __shared__ volatile Real hy;
+    if( thri==0 && thrj == 0)
     {
-      xkolik = blockDim.x + 1;
-      ykolik = blockDim.y + 1;
-      numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
-      numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
-      
-      if( numOfBlockx - 1 == blIdx )
-        xkolik = dimX - (blIdx)*blockDim.x+1;
-      
-      if( numOfBlocky -1 == blIdy )
-        ykolik = dimY - (blIdy)*blockDim.y+1;
-        BlockIterDevice[ blIdy * grIdx + blIdx ] = 0;
-        NE = ne;
+      dimX = mesh.getDimensions().x();
+      dimY = mesh.getDimensions().y();
+      hx = mesh.getSpaceSteps().x();
+      hy = mesh.getSpaceSteps().y();
+      BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 0;
     }
     __syncthreads();
-   
-    int i = thri + blockDim.x*blIdx;
-    int j = blockDim.y*blIdy + thrj;
+    int numOfBlockx;
+    int numOfBlocky;
+    int xkolik;
+    int ykolik;
+    
+    xkolik = blockDim.x + 1;
+    ykolik = blockDim.y + 1;
+    numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
+    numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
+    
+    if( numOfBlockx - 1 == blockIdx.x )
+      xkolik = dimX - (blockIdx.x)*blockDim.x+1;
+    
+    if( numOfBlocky -1 == blockIdx.y )
+      ykolik = dimY - (blockIdx.y)*blockDim.y+1;
+    __syncthreads();
+    
     int currentIndex = thrj * blockDim.x + thri;
-    if( BlockIterDevice[ blIdy * gridDim.x + blIdx] )
-    {
     //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
-    __shared__ volatile bool changed[16*16];
+    __shared__ volatile bool changed[ (sizeSArray-2)*(sizeSArray-2)];
     changed[ currentIndex ] = false;
     if( thrj == 0 && thri == 0 )
       changed[ 0 ] = true;
     
-    __shared__ Real hx;
-    __shared__ Real hy;
-    if( thrj == 1 && thri == 1 )
-    {
-      hx = mesh.getSpaceSteps().x();
-      hy = mesh.getSpaceSteps().y();
-    }
     
     //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
-    __shared__ volatile Real sArray[18][18];
-    sArray[thrj][thri] = std::numeric_limits< Real >::max();
+    __shared__ volatile Real sArray[ sizeSArray * sizeSArray ];
+    sArray[ thrj * sizeSArray + thri ] = std::numeric_limits< Real >::max();
     
     //filling sArray edges
     if( thri == 0 )
-    {        
-      if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik && NE == 1 )
-        sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
+    {      
+      if( dimX > (blockIdx.x+1) * blockDim.x  && thrj+1 < ykolik )
+        sArray[(thrj+1)*sizeSArray + xkolik] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
       else
-        sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
+        sArray[(thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 1 )
     {
-      if( blIdx != 0 && thrj+1 < ykolik && NE == 1 )
-        sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ];
+      if( blockIdx.x != 0 && thrj+1 < ykolik )
+        sArray[(thrj+1)*sizeSArray + 0] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX ];
       else
-        sArray[thrj+1][0] = std::numeric_limits< Real >::max();
+        sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
     }
     
-<<<<<<< HEAD
     if( thri == 2 )
     {
-      if( dimY > (blIdy+1) * blockDim.y  && thri+1 < xkolik && NE == 1 )
-        sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
+      if( dimY > (blockIdx.y+1) * blockDim.y  && thrj+1 < xkolik )
+        sArray[ ykolik*sizeSArray + thrj+1 ] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
       else
-        sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
-=======
-        if( numOfBlockx - 1 == blIdx )
-            xkolik = dimX - (blIdx)*blockDim.x+1;
-
-        if( numOfBlocky -1 == blIdy )
-            ykolik = dimY - (blIdy)*blockDim.y+1;
-        //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
+        sArray[ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
+      
     }
     
-<<<<<<< HEAD
     if( thri == 3 )
     {
-      if( blIdy != 0 && thrj+1 < xkolik && NE == 1 )
-        sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ];
+      if( blockIdx.y != 0 && thrj+1 < xkolik )
+        sArray[0*sizeSArray + thrj+1] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + thrj+1 ];
       else
-        sArray[0][thrj+1] = std::numeric_limits< Real >::max();
+        sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
     }
     
-    
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+    if( i < dimX && j < dimY )
     {    
-      sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
+      sArray[(thrj+1)*sizeSArray + thri+1] = aux[ j*dimX + i ];
     }
     __syncthreads();  
     
@@ -672,25 +613,11 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       changed[ currentIndex] = false;
       
       //calculation of update cell
-      if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+      if( i < dimX && j < dimY )
       {
-        if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
-=======
-        if(thri == 0 && thrj == 0 )
-            BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
-
-        if( thri == 0 )
-        {        
-            if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik )
-                sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
-            else
-                sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
-        }
-
-        if( thri == 1 )
->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
+        if( ! interfaceMap[ j * dimX + i ] )
         {
-          changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
+          changed[ currentIndex ] = ptr.updateCell<sizeSArray>( sArray, thri+1, thrj+1, hx,hy);
         }
       }
       __syncthreads();
@@ -724,12 +651,11 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       {
         if( currentIndex < 64 )
         {
-<<<<<<< HEAD
           changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
         }
       }
       __syncthreads();
-      if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
+      if( currentIndex < 32 ) 
       {
         if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
         if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
@@ -738,82 +664,23 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
         if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
       }
-      if( changed[ 0 ] && thri == 0 && thrj == 0 )
-        BlockIterDevice[ blIdy * grIdx + blIdx ] = 1;
+      if( thri == 0 && thrj == 0 && changed[ 0 ] ){
+        BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 1;
+      }
+      /*if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 )
+       {
+       for( int k = 15; k>-1; k-- ){
+       for( int l = 0; l < 16; l++ )
+       printf( "%f\t", sArray[k * 16 + l]);
+       printf( "\n");
+       }
+       printf( "\n");
+       }*/
       __syncthreads();
     }
+    if( i < dimX && j < dimY )
+      helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ];
     
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
-      aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
-  }
-=======
-            __syncthreads();
-
-            changed[ currentIndex] = false;
-
-        //calculation of update cell
-            if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
-            {
-                if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
-                {
-                    changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
-                }
-            }
-            __syncthreads();
-
-        //pyramid reduction
-            if( blockDim.x*blockDim.y == 1024 )
-            {
-                if( currentIndex < 512 )
-                {
-                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
-                }
-            }
-            __syncthreads();
-            if( blockDim.x*blockDim.y >= 512 )
-            {
-                if( currentIndex < 256 )
-                {
-                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
-                }
-            }
-            __syncthreads();
-            if( blockDim.x*blockDim.y >= 256 )
-            {
-                if( currentIndex < 128 )
-                {
-                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
-                }
-            }
-            __syncthreads();
-            if( blockDim.x*blockDim.y >= 128 )
-            {
-                if( currentIndex < 64 )
-                {
-                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
-                }
-            }
-            __syncthreads();
-            if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
-            {
-                if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
-                if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
-                if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
-                if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
-                if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
-                if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
-            }
-            if( changed[ 0 ] && thri == 0 && thrj == 0 ){
-                BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
-            }
-            __syncthreads();
-        }
-
-        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
-            aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
-    }
-    /*if( thri == 0 && thrj == 0 )
-        printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/
->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
+  } 
 }
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 8d71bfe06..4daf9fc92 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -280,17 +280,12 @@ solve( const MeshPointer& mesh,
                                                               interfaceMapPtr.template getData< Device >(),
                                                               auxPtr.template modifyData< Device>(),
                                                               BlockIterDevice );
-<<<<<<< HEAD
             cudaDeviceSynchronize();
             TNL_CHECK_CUDA_DEVICE;
             
             CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
             cudaDeviceSynchronize();
             TNL_CHECK_CUDA_DEVICE;
-=======
-            //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
-            //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
             
             CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
             cudaDeviceSynchronize();
-- 
GitLab


From 04c3e8bc96332c62efd318cb971193cfb274c490 Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Wed, 31 Oct 2018 06:44:59 +0100
Subject: [PATCH 05/20] Repair of last commit (build error without CUDA): FIM
 method implemented for 2D GPU and FIM-FSM implemented for 2D CPU (parallel).

---
 .../tnlDirectEikonalMethodsBase_impl.h        | 119 +++++++++---------
 1 file changed, 60 insertions(+), 59 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 95971c9b8..500d1bf03 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -11,6 +11,7 @@
 
 #include <iostream>
 #include "tnlFastSweepingMethod.h"
+#include "tnlDirectEikonalMethodsBase.h"
 
 template< typename Real,
         typename Device,
@@ -135,7 +136,7 @@ updateBlocks( InterfaceMapType interfaceMap,
       bool changed = false;
       
       
-      RealType *sArray;
+      Real *sArray;
       sArray = new Real[ sizeSArray * sizeSArray ];
       if( sArray == nullptr )
         std::cout << "Error while allocating memory for sArray." << std::endl;
@@ -175,7 +176,7 @@ updateBlocks( InterfaceMapType interfaceMap,
             //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              pom = this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
+              pom = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
               changed = changed || pom;
             }
           }
@@ -195,7 +196,7 @@ updateBlocks( InterfaceMapType interfaceMap,
           {
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
+              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
             }
           }
         }
@@ -213,7 +214,7 @@ updateBlocks( InterfaceMapType interfaceMap,
           {
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
+              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
             }
           }
         }
@@ -231,7 +232,7 @@ updateBlocks( InterfaceMapType interfaceMap,
           {
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
+              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx, hy, 1.0);
             }
           }
         }
@@ -258,7 +259,7 @@ updateBlocks( InterfaceMapType interfaceMap,
         }
         //std::cout<<std::endl;
       }
-      //delete []sArray;
+      delete []sArray;
     }
   }
 }
@@ -914,7 +915,58 @@ __cuda_callable__ void sortMinims( T1 pom[] )
   }   
 }
 
-
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+__cuda_callable__
+bool
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy,
+        const Real v )
+{
+  const RealType value = sArray[ thrj * sizeSArray + thri ];
+  RealType a, b, tmp = std::numeric_limits< RealType >::max();
+  
+  b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ],
+          sArray[ (thrj-1) * sizeSArray + thri ] );
+  
+  a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ],
+          sArray[ thrj * sizeSArray + thri-1 ] );
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() )
+    return false;
+  
+  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
+  sortMinims( pom );
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
+  
+  
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+  {
+    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
+    tmp = value - sArray[ thrj * sizeSArray + thri ];
+    if ( fabs( tmp ) >  0.001*hx )
+      return true;
+    else
+      return false;
+  }
+  else
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
+    tmp = value - sArray[ thrj * sizeSArray + thri ];
+    if ( fabs( tmp ) > 0.001*hx )
+      return true;
+    else
+      return false;
+  }
+  
+  return false;
+}
 
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
@@ -1133,58 +1185,7 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
 }
 
 
-template< typename Real,
-        typename Device,
-        typename Index >
-template< int sizeSArray >
-__cuda_callable__
-bool
-tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy,
-        const Real v )
-{
-  const RealType value = sArray[ thrj * sizeSArray + thri ];
-  RealType a, b, tmp = std::numeric_limits< RealType >::max();
-  
-  b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ],
-          sArray[ (thrj-1) * sizeSArray + thri ] );
-  
-  a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ],
-          sArray[ thrj * sizeSArray + thri-1 ] );
-  
-  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-          fabs( b ) == std::numeric_limits< RealType >::max() )
-    return false;
-  
-  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
-  sortMinims( pom );
-  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
-  
-  
-  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-  {
-    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
-    tmp = value - sArray[ thrj * sizeSArray + thri ];
-    if ( fabs( tmp ) >  0.001*hx )
-      return true;
-    else
-      return false;
-  }
-  else
-  {
-    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
-    tmp = value - sArray[ thrj * sizeSArray + thri ];
-    if ( fabs( tmp ) > 0.001*hx )
-      return true;
-    else
-      return false;
-  }
-  
-  return false;
-}
+
 
 template< typename Real,
         typename Device,
-- 
GitLab


From b95d6c7b98234dd1e93f245e0d2076ac91ba14bc Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Thu, 1 Nov 2018 16:26:36 +0100
Subject: [PATCH 06/20] Last repair of FIM for GPU.

---
 .../tnlDirectEikonalMethodsBase.h             |   2 +-
 .../tnlDirectEikonalMethodsBase_impl.h        |  72 ++++----
 .../tnlFastSweepingMethod2D_impl.h            | 165 ++++++++++--------
 3 files changed, 125 insertions(+), 114 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index cbb1a1ff6..ccbae8abe 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -148,7 +148,7 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I
 
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                               /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY );
+                               TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 500d1bf03..5083544e2 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -134,6 +134,7 @@ updateBlocks( InterfaceMapType interfaceMap,
       Real hy = mesh.getSpaceSteps().y();
       
       bool changed = false;
+      BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
       
       
       Real *sArray;
@@ -143,53 +144,52 @@ updateBlocks( InterfaceMapType interfaceMap,
       
       for( int thri = 0; thri < sizeSArray; thri++ ){
         for( int thrj = 0; thrj < sizeSArray; thrj++ )
-          sArray/*[i]*/[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
+          sArray[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
       }
       
-      BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
       
       for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
       {        
         if( dimX > (blIdx+1) * numThreadsPerBlock  && thrj+1 < ykolik )
-          sArray/*[i]*/[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
+          sArray[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
         
         
         if( blIdx != 0 && thrj+1 < ykolik )
-          sArray/*[i]*/[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
+          sArray[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
         
         if( dimY > (blIdy+1) * numThreadsPerBlock  && thrj+1 < xkolik )
-          sArray/*[i]*/[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
+          sArray[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
         
         if( blIdy != 0 && thrj+1 < xkolik )
-          sArray/*[i]*/[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
+          sArray[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
       }
       
       for( int k = 0; k < numThreadsPerBlock; k++ ){
         for( int l = 0; l < numThreadsPerBlock; l++ )
           if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
-            sArray/*[i]*/[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
+            sArray[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
       }
-      bool pom = false;
+      
       for( int k = 0; k < numThreadsPerBlock; k++ ){ 
         for( int l = 0; l < numThreadsPerBlock; l++ ){
           if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){
             //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              pom = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
-              changed = changed || pom;
+              changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy) || changed;
+              
             }
           }
         }
       }
       /*aux.save( "aux-1pruch.tnl" );
-      for( int k = 0; k < sizeSArray; k++ ){ 
-        for( int l = 0; l < sizeSArray; l++ ) {
-          std::cout << sArray[ k * sizeSArray + l] << " ";
-        }
-        std::cout << std::endl;
-      }*/
-           
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
+      
       for( int k = 0; k < numThreadsPerBlock; k++ ) 
         for( int l = numThreadsPerBlock-1; l >-1; l-- ) { 
           if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
@@ -201,12 +201,12 @@ updateBlocks( InterfaceMapType interfaceMap,
           }
         }
       /*aux.save( "aux-2pruch.tnl" );
-      for( int k = 0; k < sizeSArray; k++ ){ 
-        for( int l = 0; l < sizeSArray; l++ ) {
-          std::cout << sArray[ k * sizeSArray + l] << " ";
-        }
-        std::cout << std::endl;
-      }*/
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
       
       for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
         for( int l = 0; l < numThreadsPerBlock; l++ ) {
@@ -219,12 +219,12 @@ updateBlocks( InterfaceMapType interfaceMap,
           }
         }
       /*aux.save( "aux-3pruch.tnl" );
-      for( int k = 0; k < sizeSArray; k++ ){ 
-        for( int l = 0; l < sizeSArray; l++ ) {
-          std::cout << sArray[ k * sizeSArray + l] << " ";
-        }
-        std::cout << std::endl;
-      }*/
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
       
       for( int k = numThreadsPerBlock-1; k > -1; k-- ){
         for( int l = numThreadsPerBlock-1; l >-1; l-- ) { 
@@ -238,12 +238,12 @@ updateBlocks( InterfaceMapType interfaceMap,
         }
       }
       /*aux.save( "aux-4pruch.tnl" );
-      for( int k = 0; k < sizeSArray; k++ ){ 
-        for( int l = 0; l < sizeSArray; l++ ) {
-          std::cout << sArray[ k * sizeSArray + l] << " ";
-        }
-        std::cout << std::endl;
-      }*/
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
       
       
       if( changed ){
@@ -254,7 +254,7 @@ updateBlocks( InterfaceMapType interfaceMap,
       for( int k = 0; k < numThreadsPerBlock; k++ ){ 
         for( int l = 0; l < numThreadsPerBlock; l++ ) {
           if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )      
-            helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray/*[i]*/[ (k + 1)* sizeSArray + l + 1 ];
+            helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray[ (k + 1)* sizeSArray + l + 1 ];
           //std::cout<< sArray[k+1][l+1];
         }
         //std::cout<<std::endl;
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index e29421bb1..bc82b7a2c 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -123,6 +123,7 @@ solve( const MeshPointer& mesh,
         helpFunc = helpFunc1;
         this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
         
+        //Reduction      
         for( int i = 0; i < BlockIterHost.getSize(); i++ ){
           if( IsCalculationDone == 0 ){
             IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
@@ -130,6 +131,7 @@ solve( const MeshPointer& mesh,
           }
         }
         numWhile++;
+        std::cout <<"numWhile = "<< numWhile <<std::endl;
         
         for( int j = numBlocksY-1; j>-1; j-- ){
           for( int i = 0; i < numBlocksX; i++ )
@@ -146,7 +148,6 @@ solve( const MeshPointer& mesh,
          std::cout << std::endl;
          }
          std::cout << std::endl;*/
-        //Reduction      
         
         //std::cout<<std::endl;
         string s( "aux-"+ std::to_string(numWhile) + ".tnl");
@@ -171,7 +172,7 @@ solve( const MeshPointer& mesh,
        if( ! interfaceMap( cell ) )
        this->updateCell( aux, cell );
        }
-       }
+       } 
        
        //aux.save( "aux-1.tnl" );
        
@@ -261,12 +262,12 @@ solve( const MeshPointer& mesh,
       TNL_CHECK_CUDA_DEVICE;
       
       
-      /*TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
-       BlockIterPom.setSize( numBlocksX * numBlocksY  );
-       BlockIterPom.setValue( 0 );*/
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
+      BlockIterPom.setSize( numBlocksX * numBlocksY  );
+      BlockIterPom.setValue( 0 );
       /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
-       BlockIterPom1.setSize( numBlocksX * numBlocksY  );
-       BlockIterPom1.setValue( 0 );*/
+      BlockIterPom1.setSize( numBlocksX * numBlocksY  );
+      BlockIterPom1.setValue( 0 );*/
       /*int *BlockIterDevice;
        cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
       int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
@@ -284,9 +285,7 @@ solve( const MeshPointer& mesh,
        cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/
       
       
-      MeshFunctionPointer helpFunc1;
-      helpFunc1->setMesh(mesh);
-      
+      MeshFunctionPointer helpFunc1( mesh );      
       MeshFunctionPointer helpFunc( mesh );
       
       helpFunc1 = auxPtr;
@@ -301,83 +300,94 @@ solve( const MeshPointer& mesh,
         /** HERE IS CHESS METHOD **/
         
         /*auxPtr = helpFunc;
+         
+         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+         interfaceMapPtr.template getData< Device >(),
+         auxPtr.template getData< Device>(),
+         helpFunc.template modifyData< Device>(),
+         BlockIterDevice,
+         oddEvenBlock );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+         auxPtr = helpFunc;
+         
+         oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+         
+         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+         interfaceMapPtr.template getData< Device >(),
+         auxPtr.template getData< Device>(),
+         helpFunc.template modifyData< Device>(),
+         BlockIterDevice,
+         oddEvenBlock );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+         auxPtr = helpFunc;
+         
+         oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+         
+         CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+         CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+         
+         BlockIterD = dBlock.getElement( 0 );*/
+        
+        /**------------------------------------------------------------------------------------------------*/
+        
+        
+        /** HERE IS FIM **/
         
+        helpFunc1 = auxPtr;
+        auxPtr = helpFunc;
+        helpFunc = helpFunc1;
+        TNL_CHECK_CUDA_DEVICE;
+        
+        //int pocBloku = 0;
+        Devices::Cuda::synchronizeDevice();
         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
                 interfaceMapPtr.template getData< Device >(),
-                auxPtr.template getData< Device>(),
+                auxPtr.template modifyData< Device>(),
                 helpFunc.template modifyData< Device>(),
-                BlockIterDevice,
-                oddEvenBlock );
+                BlockIterDevice );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
-        auxPtr = helpFunc;
         
-        oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+        //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl;
+        //BlockIterPom1 = BlockIterDevice;
+        ///for( int i =0; i< numBlocksX; i++ ){
+        //  for( int j = 0; j < numBlocksY; j++ )
+        //  {
+        //    std::cout << BlockIterPom1[j*numBlocksX + i];
+        //  }
+        //  std::cout << std::endl;
+        //}
+        //std::cout << std::endl;
         
-        CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-                interfaceMapPtr.template getData< Device >(),
-                auxPtr.template getData< Device>(),
-                helpFunc.template modifyData< Device>(),
-                BlockIterDevice,
-                oddEvenBlock );
+        GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
-        auxPtr = helpFunc;
+        BlockIterDevice = BlockIterPom;
+        
+        //std::cout<< "Probehlo" << std::endl;
+        
+        //TNL::swap( auxPtr, helpFunc );
         
-        oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
         
         CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
-        cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
+        
         CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
-        cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         
-        BlockIterD = dBlock.getElement( 0 );*/
         
-        /**------------------------------------------------------------------------------------------------*/
+        BlockIterD = dBlock.getElement( 0 );
+        //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
         
         
-        /** HERE IS FIM **/
-        
-         helpFunc1 = auxPtr;
-         auxPtr = helpFunc;
-         helpFunc = helpFunc1;
-         
-         //int pocBloku = 0;
-         Devices::Cuda::synchronizeDevice();
-         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-         interfaceMapPtr.template getData< Device >(),
-         auxPtr.template modifyData< Device>(),
-         helpFunc.template modifyData< Device>(),
-         BlockIterDevice );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         
-         //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl;
-         
-         GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, numBlocksX, numBlocksY );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         
-         //std::cout<< "Probehlo" << std::endl;
-         
-         //TNL::swap( auxPtr, helpFunc );
-         
-         
-         CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
-         TNL_CHECK_CUDA_DEVICE;
-         
-         CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
-         TNL_CHECK_CUDA_DEVICE;
-         
-         
-         BlockIterD = dBlock.getElement( 0 );
-         //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         
-        
         /**-----------------------------------------------------------------------------------------------------------*/
         /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
          BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
@@ -392,7 +402,6 @@ solve( const MeshPointer& mesh,
        cudaFree( dBlock );
        delete BlockIter;*/
       cudaDeviceSynchronize();
-      
       TNL_CHECK_CUDA_DEVICE;
       
       aux = *auxPtr;
@@ -410,7 +419,7 @@ solve( const MeshPointer& mesh,
 
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-        /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY )
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY )
 {
   int i = blockIdx.x * 1024 + threadIdx.x;
   
@@ -430,7 +439,7 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index
       pom = 1;//BlockIterPom[ i ] = 1;
     }
     
-    BlockIterDevice[ i ] = pom;//BlockIterPom[ i ];
+    BlockIterPom[ i ] = pom;//BlockIterPom[ i ];
   }
 }
 
@@ -514,14 +523,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
   int i = threadIdx.x + blockDim.x*blockIdx.x;
   int j = blockDim.y*blockIdx.y + threadIdx.y;
   /** FOR CHESS METHOD */
-  if( (blockIdx.y%2  + blockIdx.x) % 2 == oddEvenBlock )
-  {
-    /**-----------------------------------------*/
-    
+  //if( (blockIdx.y%2  + blockIdx.x) % 2 == oddEvenBlock )
+  //{
+  /**------------------------------------------*/
+  
+  
+  /** FOR FIM METHOD */
     
-    /** FOR FIM METHOD */
-    /*if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x] )
-     {*/ 
+  if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] )
+  { 
+    __syncthreads();
     /**-----------------------------------------*/
     const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
     __shared__ volatile int dimX;
-- 
GitLab


From 0dcd35d5f5d5445a7ecb05cf8b9522a0cf29bca8 Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Mon, 5 Nov 2018 14:43:21 +0100
Subject: [PATCH 07/20] FIM implemented in 3D

---
 .../tnlDirectEikonalMethodsBase.h             |  10 +-
 .../tnlFastSweepingMethod2D_impl.h            |  16 +-
 .../tnlFastSweepingMethod3D_impl.h            | 881 +++++++++---------
 3 files changed, 478 insertions(+), 429 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index ccbae8abe..7d990c1bb 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -160,11 +160,17 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
                                   Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
                                   Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );
 
-template < typename Real, typename Device, typename Index >
+template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
                                       TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
+
+template < typename Index >
+__global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,
+        int numBlockX, int numBlockY, int numBlockZ );
 #endif
 
 #include "tnlDirectEikonalMethodsBase_impl.h"
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index bc82b7a2c..fa2716897 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -85,7 +85,7 @@ solve( const MeshPointer& mesh,
   {
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
-      int numThreadsPerBlock = 1024;
+      int numThreadsPerBlock = 16;
       
       
       int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
@@ -115,13 +115,13 @@ solve( const MeshPointer& mesh,
        }
        std::cout<<std::endl;*/
       unsigned int numWhile = 0;
-      while( IsCalculationDone  )
+      while( IsCalculationDone && numWhile < 1 )
       {      
         IsCalculationDone = 0;
         helpFunc1 = auxPtr;
         auxPtr = helpFunc;
         helpFunc = helpFunc1;
-        this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+        this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
         
         //Reduction      
         for( int i = 0; i < BlockIterHost.getSize(); i++ ){
@@ -394,9 +394,7 @@ solve( const MeshPointer& mesh,
         numIter ++;
       }
       if( numIter == 1 ){
-        helpFunc1 = auxPtr;
         auxPtr = helpFunc;
-        helpFunc = helpFunc1;
       }
       /*cudaFree( BlockIterDevice );
        cudaFree( dBlock );
@@ -535,10 +533,10 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     __syncthreads();
     /**-----------------------------------------*/
     const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
-    __shared__ volatile int dimX;
-    __shared__ volatile int dimY;
-    __shared__ volatile Real hx;
-    __shared__ volatile Real hy;
+    __shared__ int dimX;
+    __shared__ int dimY;
+    __shared__ Real hx;
+    __shared__ Real hy;
     if( thri==0 && thrj == 0)
     {
       dimX = mesh.getDimensions().x();
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 4daf9fc92..65aba5bf5 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -16,511 +16,556 @@
 #include "tnlFastSweepingMethod.h"
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
 FastSweepingMethod()
 : maxIterations( 1 )
 {
-   
+  
 }
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 const Index&
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
 getMaxIterations() const
 {
-   
+  
 }
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 void
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
 setMaxIterations( const IndexType& maxIterations )
 {
-   
+  
 }
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 void
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
 solve( const MeshPointer& mesh,
-       const AnisotropyPointer& anisotropy,
-       MeshFunctionPointer& u )
+        const AnisotropyPointer& anisotropy,
+        MeshFunctionPointer& u )
 {
-   MeshFunctionPointer auxPtr;
-   InterfaceMapPointer interfaceMapPtr;
-   auxPtr->setMesh( mesh );
-   interfaceMapPtr->setMesh( mesh );
-   std::cout << "Initiating the interface cells ..." << std::endl;
-   BaseType::initInterface( u, auxPtr, interfaceMapPtr );
+  MeshFunctionPointer auxPtr;
+  InterfaceMapPointer interfaceMapPtr;
+  auxPtr->setMesh( mesh );
+  interfaceMapPtr->setMesh( mesh );
+  std::cout << "Initiating the interface cells ..." << std::endl;
+  BaseType::initInterface( u, auxPtr, interfaceMapPtr );
 #ifdef HAVE_CUDA
-   cudaDeviceSynchronize();
+  cudaDeviceSynchronize();
 #endif
-   auxPtr->save( "aux-ini.tnl" );   
-   
-   typename MeshType::Cell cell( *mesh );
-   
-   IndexType iteration( 0 );
-   MeshFunctionType aux = *auxPtr;
-   InterfaceMapType interfaceMap = * interfaceMapPtr;
-    while( iteration < this->maxIterations )
+  auxPtr->save( "aux-ini.tnl" );   
+  
+  typename MeshType::Cell cell( *mesh );
+  
+  IndexType iteration( 0 );
+  MeshFunctionType aux = *auxPtr;
+  InterfaceMapType interfaceMap = * interfaceMapPtr;
+  while( iteration < this->maxIterations )
+  {
+    if( std::is_same< DeviceType, Devices::Host >::value )
     {
-        if( std::is_same< DeviceType, Devices::Host >::value )
+      for( cell.getCoordinates().z() = 0;
+              cell.getCoordinates().z() < mesh->getDimensions().z();
+              cell.getCoordinates().z()++ )
+      {
+        for( cell.getCoordinates().y() = 0;
+                cell.getCoordinates().y() < mesh->getDimensions().y();
+                cell.getCoordinates().y()++ )
         {
-           for( cell.getCoordinates().z() = 0;
-                cell.getCoordinates().z() < mesh->getDimensions().z();
-                cell.getCoordinates().z()++ )
-           {
-              for( cell.getCoordinates().y() = 0;
-                   cell.getCoordinates().y() < mesh->getDimensions().y();
-                   cell.getCoordinates().y()++ )
-              {
-                 for( cell.getCoordinates().x() = 0;
-                      cell.getCoordinates().x() < mesh->getDimensions().x();
-                      cell.getCoordinates().x()++ )
-                 {
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }
-           //aux.save( "aux-1.tnl" );
-
-           for( cell.getCoordinates().z() = 0;
-                cell.getCoordinates().z() < mesh->getDimensions().z();
-                cell.getCoordinates().z()++ )
-           {
-              for( cell.getCoordinates().y() = 0;
-                   cell.getCoordinates().y() < mesh->getDimensions().y();
-                   cell.getCoordinates().y()++ )
-              {
-                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                      cell.getCoordinates().x() >= 0 ;
-                      cell.getCoordinates().x()-- )		
-                 {
-                    //std::cerr << "2 -> ";
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )            
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }
-           //aux.save( "aux-2.tnl" );
-           for( cell.getCoordinates().z() = 0;
-                cell.getCoordinates().z() < mesh->getDimensions().z();
-                cell.getCoordinates().z()++ )
-           {
-              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                   cell.getCoordinates().y() >= 0 ;
-                   cell.getCoordinates().y()-- )
-              {
-                 for( cell.getCoordinates().x() = 0;
-                      cell.getCoordinates().x() < mesh->getDimensions().x();
-                      cell.getCoordinates().x()++ )
-                 {
-                    //std::cerr << "3 -> ";
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )            
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }
-           //aux.save( "aux-3.tnl" );
-
-           for( cell.getCoordinates().z() = 0;
-                cell.getCoordinates().z() < mesh->getDimensions().z();
-                cell.getCoordinates().z()++ )
-           {
-              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                   cell.getCoordinates().y() >= 0;
-                   cell.getCoordinates().y()-- )
-              {
-                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                      cell.getCoordinates().x() >= 0 ;
-                      cell.getCoordinates().x()-- )		
-                 {
-                    //std::cerr << "4 -> ";
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )            
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }     
-           //aux.save( "aux-4.tnl" );
-
-           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-                cell.getCoordinates().z() >= 0;
-                cell.getCoordinates().z()-- )
-           {
-              for( cell.getCoordinates().y() = 0;
-                   cell.getCoordinates().y() < mesh->getDimensions().y();
-                   cell.getCoordinates().y()++ )
-              {
-                 for( cell.getCoordinates().x() = 0;
-                      cell.getCoordinates().x() < mesh->getDimensions().x();
-                      cell.getCoordinates().x()++ )
-                 {
-                    //std::cerr << "5 -> ";
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }
-           //aux.save( "aux-5.tnl" );
-
-           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-                cell.getCoordinates().z() >= 0;
-                cell.getCoordinates().z()-- )
-           {
-              for( cell.getCoordinates().y() = 0;
-                   cell.getCoordinates().y() < mesh->getDimensions().y();
-                   cell.getCoordinates().y()++ )
-              {
-                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                      cell.getCoordinates().x() >= 0 ;
-                      cell.getCoordinates().x()-- )		
-                 {
-                    //std::cerr << "6 -> ";
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )            
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }
-           //aux.save( "aux-6.tnl" );
-
-           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-                cell.getCoordinates().z() >= 0;
-                cell.getCoordinates().z()-- )
-           {
-              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                   cell.getCoordinates().y() >= 0 ;
-                   cell.getCoordinates().y()-- )
-              {
-                 for( cell.getCoordinates().x() = 0;
-                      cell.getCoordinates().x() < mesh->getDimensions().x();
-                      cell.getCoordinates().x()++ )
-                 {
-                    //std::cerr << "7 -> ";
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )            
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }
-           //aux.save( "aux-7.tnl" );
-
-           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-                cell.getCoordinates().z() >= 0;
-                cell.getCoordinates().z()-- )
-           {
-              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                   cell.getCoordinates().y() >= 0;
-                   cell.getCoordinates().y()-- )
-              {
-                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                      cell.getCoordinates().x() >= 0 ;
-                      cell.getCoordinates().x()-- )		
-                 {
-                    //std::cerr << "8 -> ";
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )            
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }
+          for( cell.getCoordinates().x() = 0;
+                  cell.getCoordinates().x() < mesh->getDimensions().x();
+                  cell.getCoordinates().x()++ )
+          {
+            cell.refresh();
+            if( ! interfaceMap( cell ) )
+              this->updateCell( aux, cell );
+          }
+        }
       }
-      if( std::is_same< DeviceType, Devices::Cuda >::value )
+      //aux.save( "aux-1.tnl" );
+      
+      for( cell.getCoordinates().z() = 0;
+              cell.getCoordinates().z() < mesh->getDimensions().z();
+              cell.getCoordinates().z()++ )
       {
-         // TODO: CUDA code
-#ifdef HAVE_CUDA
-          const int cudaBlockSize( 8 );
-          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
-          int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
-          int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); 
-          if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
-              std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
-          dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
-          dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
-                 
-          tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
-          
-          
-          int BlockIterD = 1;
-          
-          TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
-          BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ );
-          BlockIterDevice.setValue( 1 );
-          /*int *BlockIterDevice;
-          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/
-          int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
-          
-          TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
-          dBlock.setSize( nBlocks );
-          dBlock.setValue( 0 );
-          /*int *dBlock;
-          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
-          
-          while( BlockIterD )
+        for( cell.getCoordinates().y() = 0;
+                cell.getCoordinates().y() < mesh->getDimensions().y();
+                cell.getCoordinates().y()++ )
+        {
+          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                  cell.getCoordinates().x() >= 0 ;
+                  cell.getCoordinates().x()-- )		
           {
-             CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
-                                                              interfaceMapPtr.template getData< Device >(),
-                                                              auxPtr.template modifyData< Device>(),
-                                                              BlockIterDevice );
-            cudaDeviceSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-            
-            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
-            cudaDeviceSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-            
-            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
-            cudaDeviceSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-            cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
-                                   
-            /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
-                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
-            
+            //std::cerr << "2 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
           }
-          //cudaFree( BlockIterDevice );
-          //cudaFree( dBlock );
-          cudaDeviceSynchronize();
-          TNL_CHECK_CUDA_DEVICE;
-          aux = *auxPtr;
-          interfaceMap = *interfaceMapPtr;
-#endif
+        }
       }
-        
-      //aux.save( "aux-8.tnl" );
-      iteration++;
+      //aux.save( "aux-2.tnl" );
+      for( cell.getCoordinates().z() = 0;
+              cell.getCoordinates().z() < mesh->getDimensions().z();
+              cell.getCoordinates().z()++ )
+      {
+        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                cell.getCoordinates().y() >= 0 ;
+                cell.getCoordinates().y()-- )
+        {
+          for( cell.getCoordinates().x() = 0;
+                  cell.getCoordinates().x() < mesh->getDimensions().x();
+                  cell.getCoordinates().x()++ )
+          {
+            //std::cerr << "3 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
+          }
+        }
+      }
+      //aux.save( "aux-3.tnl" );
+      
+      for( cell.getCoordinates().z() = 0;
+              cell.getCoordinates().z() < mesh->getDimensions().z();
+              cell.getCoordinates().z()++ )
+      {
+        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                cell.getCoordinates().y() >= 0;
+                cell.getCoordinates().y()-- )
+        {
+          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                  cell.getCoordinates().x() >= 0 ;
+                  cell.getCoordinates().x()-- )		
+          {
+            //std::cerr << "4 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
+          }
+        }
+      }     
+      //aux.save( "aux-4.tnl" );
+      
+      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+              cell.getCoordinates().z() >= 0;
+              cell.getCoordinates().z()-- )
+      {
+        for( cell.getCoordinates().y() = 0;
+                cell.getCoordinates().y() < mesh->getDimensions().y();
+                cell.getCoordinates().y()++ )
+        {
+          for( cell.getCoordinates().x() = 0;
+                  cell.getCoordinates().x() < mesh->getDimensions().x();
+                  cell.getCoordinates().x()++ )
+          {
+            //std::cerr << "5 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )
+              this->updateCell( aux, cell );
+          }
+        }
+      }
+      //aux.save( "aux-5.tnl" );
+      
+      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+              cell.getCoordinates().z() >= 0;
+              cell.getCoordinates().z()-- )
+      {
+        for( cell.getCoordinates().y() = 0;
+                cell.getCoordinates().y() < mesh->getDimensions().y();
+                cell.getCoordinates().y()++ )
+        {
+          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                  cell.getCoordinates().x() >= 0 ;
+                  cell.getCoordinates().x()-- )		
+          {
+            //std::cerr << "6 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
+          }
+        }
+      }
+      //aux.save( "aux-6.tnl" );
+      
+      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+              cell.getCoordinates().z() >= 0;
+              cell.getCoordinates().z()-- )
+      {
+        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                cell.getCoordinates().y() >= 0 ;
+                cell.getCoordinates().y()-- )
+        {
+          for( cell.getCoordinates().x() = 0;
+                  cell.getCoordinates().x() < mesh->getDimensions().x();
+                  cell.getCoordinates().x()++ )
+          {
+            //std::cerr << "7 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
+          }
+        }
+      }
+      //aux.save( "aux-7.tnl" );
+      
+      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+              cell.getCoordinates().z() >= 0;
+              cell.getCoordinates().z()-- )
+      {
+        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                cell.getCoordinates().y() >= 0;
+                cell.getCoordinates().y()-- )
+        {
+          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                  cell.getCoordinates().x() >= 0 ;
+                  cell.getCoordinates().x()-- )		
+          {
+            //std::cerr << "8 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
+          }
+        }
+      }
+    }
+    if( std::is_same< DeviceType, Devices::Cuda >::value )
+    {
+      // CUDA path: block-wise FIM (fast iterative method) on the device
+#ifdef HAVE_CUDA
+      const int cudaBlockSize( 8 );
+      int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+      int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
+      int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); 
+      if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
+        std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
+      dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+      dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
+      
+      tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
+      
+      
+      int BlockIterD = 1;
+      
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
+      BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ );
+      BlockIterDevice.setValue( 1 );
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
+      BlockIterPom.setSize( numBlocksX * numBlocksY * numBlocksZ );
+      BlockIterPom.setValue( 0 );
+      /*int *BlockIterDevice;
+       cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/
+      int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
       
-   }
-   aux.save("aux-final.tnl");
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
+      dBlock.setSize( nBlocks );
+      dBlock.setValue( 0 );
+      
+      int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0);
+      /*int *dBlock;
+       cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
+      MeshFunctionPointer helpFunc1( mesh );      
+      MeshFunctionPointer helpFunc( mesh );
+      
+      helpFunc1 = auxPtr;
+      auxPtr = helpFunc;
+      helpFunc = helpFunc1;
+      int numIter = 0;
+      
+      while( BlockIterD )
+      {
+        helpFunc1 = auxPtr;
+        auxPtr = helpFunc;
+        helpFunc = helpFunc1;
+        TNL_CHECK_CUDA_DEVICE;
+        
+        CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
+                interfaceMapPtr.template getData< Device >(),
+                auxPtr.template getData< Device>(),
+                helpFunc.template modifyData< Device>(),
+                BlockIterDevice );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        
+        GetNeighbours3D<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY, numBlocksZ );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        BlockIterDevice = BlockIterPom;
+        
+        CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        
+        CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+        numIter++;
+        /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
+         BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
+        
+      }
+      if( numIter == 1 ){
+        auxPtr = helpFunc;
+      }
+      //cudaFree( BlockIterDevice );
+      //cudaFree( dBlock );
+      cudaDeviceSynchronize();
+      TNL_CHECK_CUDA_DEVICE;
+      aux = *auxPtr;
+      interfaceMap = *interfaceMapPtr;
+#endif
+    }
+    
+    //aux.save( "aux-8.tnl" );
+    iteration++;
+    
+  }
+  aux.save("aux-final.tnl");
 }
 
 #ifdef HAVE_CUDA
-template < typename Real, typename Device, typename Index >
+/** For every block of the 3D block grid, sets BlockIterPom[ i ] to 1 when at least one of its
+ *  six face-neighbours is flagged in BlockIterDevice, otherwise to 0. BlockIterDevice is only read.
+ *  Flags are indexed flat as i = l * numBlockX * numBlockY + k * numBlockX + m (m: x, k: y, l: z).
+ *  Expects a 1D launch providing at least numBlockX * numBlockY * numBlockZ threads. */
+template < typename Index >
+__global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,
+        int numBlockX, int numBlockY, int numBlockZ )
+{
+  // blockDim.x instead of a hard-coded 1024, so any 1D block size yields the right flat index.
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  const int layer = numBlockX * numBlockY;
+  if( i >= layer * numBlockZ )
+    return;
+  
+  // Decompose the flat index into block coordinates.
+  const int l = i / layer;
+  const int k = ( i - l * layer ) / numBlockX;
+  const int m = ( i - l * layer ) % numBlockX;
+  
+  // Each boundary guard is evaluated before its flag, so no out-of-range flag is read.
+  const bool neighbourActive =
+          ( m > 0             && BlockIterDevice[ i - 1 ] ) ||
+          ( m < numBlockX - 1 && BlockIterDevice[ i + 1 ] ) ||
+          ( k > 0             && BlockIterDevice[ i - numBlockX ] ) ||
+          ( k < numBlockY - 1 && BlockIterDevice[ i + numBlockX ] ) ||
+          ( l > 0             && BlockIterDevice[ i - layer ] ) ||
+          ( l < numBlockZ - 1 && BlockIterDevice[ i + layer ] );
+  
+  BlockIterPom[ i ] = neighbourActive ? 1 : 0;
+}
+
+template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
+        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
 {
-    int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
-    int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
-    int i = threadIdx.x + blockDim.x*blockIdx.x;
-    int j = blockDim.y*blockIdx.y + threadIdx.y;
-    int k = blockDim.z*blockIdx.z + threadIdx.z;
-    int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
+  int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
+  int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
+  int i = threadIdx.x + blockDim.x*blockIdx.x;
+  int j = blockDim.y*blockIdx.y + threadIdx.y;
+  int k = blockDim.z*blockIdx.z + threadIdx.z;
+  int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
+  
+  if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] )
+  {
+    __syncthreads();
     
-    __shared__ volatile bool changed[8*8*8];
-    changed[ currentIndex ] = false;
+    __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)];
     
+    changed[ currentIndex ] = false;
     if( thrj == 0 && thri == 0 && thrk == 0 )
-        changed[ 0 ] = true;
+      changed[ 0 ] = true;
     
     const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
-    __shared__ Real hx;
-    __shared__ Real hy;
-    __shared__ Real hz;
+    __shared__ Real hx; __shared__ int dimX;
+    __shared__ Real hy; __shared__ int dimY;
+    __shared__ Real hz; __shared__ int dimZ;
+    
     if( thrj == 1 && thri == 1 && thrk == 1 )
     {
-        hx = mesh.getSpaceSteps().x();
-        hy = mesh.getSpaceSteps().y();
-        hz = mesh.getSpaceSteps().z();
+      hx = mesh.getSpaceSteps().x();
+      hy = mesh.getSpaceSteps().y();
+      hz = mesh.getSpaceSteps().z();
+      dimX = mesh.getDimensions().x();
+      dimY = mesh.getDimensions().y();
+      dimZ = mesh.getDimensions().z();
+      BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0;
     }
-    __shared__ volatile Real sArray[10][10][10];
-    sArray[thrk][thrj][thri] = std::numeric_limits< Real >::max();
-    if(thri == 0 )
-    {
-        sArray[8][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
-        sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
-        sArray[thrk+1][thrj+1][8] = std::numeric_limits< Real >::max();
-        sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max();
-        sArray[thrj+1][8][thrk+1] = std::numeric_limits< Real >::max();
-        sArray[thrj+1][9][thrk+1] = std::numeric_limits< Real >::max();
-    }
-            
+    __shared__ volatile Real sArray[sizeSArray][sizeSArray][sizeSArray];
+    sArray[thrk+1][thrj+1][thri+1] = std::numeric_limits< Real >::max();
+    
     //filling sArray edges
-    int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
-    int dimZ = mesh.getDimensions().z();
-    __shared__ volatile int numOfBlockx;
-    __shared__ volatile int numOfBlocky;
-    __shared__ volatile int numOfBlockz;
-    __shared__ int xkolik;
-    __shared__ int ykolik;
-    __shared__ int zkolik;
-    if( thri == 0 && thrj == 0 && thrk == 0 )
-    {
-        xkolik = blockDim.x + 1;
-        ykolik = blockDim.y + 1;
-        zkolik = blockDim.z + 1;
-        numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
-        numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
-        numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0);
-        
-        if( numOfBlockx - 1 == blIdx )
-            xkolik = dimX - (blIdx)*blockDim.x+1;
-
-        if( numOfBlocky -1 == blIdy )
-            ykolik = dimY - (blIdy)*blockDim.y+1;
-        if( numOfBlockz-1 == blIdz )
-            zkolik = dimZ - (blIdz)*blockDim.z+1;
-        
-        BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0;
-    }
+    int numOfBlockx;
+    int numOfBlocky;
+    int numOfBlockz;
+    int xkolik;
+    int ykolik;
+    int zkolik;
+    xkolik = blockDim.x + 1;
+    ykolik = blockDim.y + 1;
+    zkolik = blockDim.z + 1;
+    numOfBlockx = gridDim.x;
+    numOfBlocky = gridDim.y;
+    numOfBlockz = gridDim.z;
+    
+    if( numOfBlockx - 1 == blIdx )
+      xkolik = dimX - (blIdx)*blockDim.x+1;
+    if( numOfBlocky -1 == blIdy )
+      ykolik = dimY - (blIdy)*blockDim.y+1;
+    if( numOfBlockz-1 == blIdz )
+      zkolik = dimZ - (blIdz)*blockDim.z+1;
     __syncthreads();
     
     if( thri == 0 )
     {        
-        if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
-            sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
-        else
-            sArray[thrk+1][thrj+1][0] = std::numeric_limits< Real >::max();
+      if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
+        sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
+      else
+        sArray[thrk+1][thrj+1][0] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 1 )
     {
-        if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
-            sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
-        else
-            sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max();
+      if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
+        sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
+      else
+        sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max();
     }
     if( thri == 2 )
     {        
-        if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
-            sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
-        else
-            sArray[thrk+1][0][thrj+1] = std::numeric_limits< Real >::max();
+      if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
+        sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
+      else
+        sArray[thrk+1][0][thrj+1] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 3 )
     {
-        if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
-            sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
-        else
-            sArray[thrk+1][9][thrj+1] = std::numeric_limits< Real >::max();
+      if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
+        sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
+      else
+        sArray[thrk+1][9][thrj+1] = std::numeric_limits< Real >::max();
     }
     if( thri == 4 )
     {        
-        if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
-            sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
-        else
-            sArray[0][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
+      if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
+        sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
+      else
+        sArray[0][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 5 )
     {
-        if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
-            sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
-        else
-            sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
+      if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
+        sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
+      else
+        sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
     }
     
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
+    if( i < dimX && j < dimY && k < dimZ )
     {
-        sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
+      sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
     }
-    __shared__ volatile int loopcounter;
-    loopcounter = 0;
     __syncthreads(); 
     while( changed[ 0 ] )
     {
-        __syncthreads();
-        
-        changed[ currentIndex ] = false;
-        
-    //calculation of update cell
-        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ )
-        {
-            if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] )
-            {
-                changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
-            }
-        }
-        __syncthreads();
-        
-    //pyramid reduction
-        if( blockDim.x*blockDim.y*blockDim.z == 1024 )
-        {
-            if( currentIndex < 512 )
-            {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
-            }
-        }
-        __syncthreads();
-        if( blockDim.x*blockDim.y*blockDim.z >= 512 )
+      __syncthreads();
+      
+      changed[ currentIndex ] = false;
+      
+      //calculation of update cell
+      if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ )
+      {
+        if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] )
         {
-            if( currentIndex < 256 )
-            {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
-            }
+          changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
         }
-        __syncthreads();
-        if( blockDim.x*blockDim.y*blockDim.z >= 256 )
+      }
+      __syncthreads();
+      
+      //pyramid reduction
+      if( blockDim.x*blockDim.y*blockDim.z == 1024 )
+      {
+        if( currentIndex < 512 )
         {
-            if( currentIndex < 128 )
-            {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
-            }
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
         }
-        __syncthreads();
-        if( blockDim.x*blockDim.y*blockDim.z >= 128 )
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 512 )
+      {
+        if( currentIndex < 256 )
         {
-            if( currentIndex < 64 )
-            {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
-            }
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
         }
-        __syncthreads();
-        if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 256 )
+      {
+        if( currentIndex < 128 )
         {
-            if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
-            if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
-            if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
-            if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
-            if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
-            if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
         }
-        __syncthreads();
-        
-        /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0)
-        {
-            for(int m = 0; m < 8; m++){
-                for(int n = 0; n<8; n++){
-                    for(int b=0; b<8; b++)
-                        printf(" %i ", changed[m*64 + n*8 + b]);
-                    printf("\n");
-                }
-                printf("\n \n");
-            }
-        }*/
-        if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 128 )
+      {
+        if( currentIndex < 64 )
         {
-            //loopcounter++;
-            BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1;
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
         }
-        __syncthreads();
-        /*if(thri == 0 && thrj==0 && thrk==0)
-            printf("%i \n",loopcounter);
-        if(loopcounter == 500)
-            break;*/
+      }
+      __syncthreads();
+      if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
+      {
+        if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+        if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+        if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+        if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+        if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+        if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+      }
+      __syncthreads();
+      
+      /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0)
+       {
+       for(int m = 0; m < 8; m++){
+       for(int n = 0; n<8; n++){
+       for(int b=0; b<8; b++)
+       printf(" %i ", changed[m*64 + n*8 + b]);
+       printf("\n");
+       }
+       printf("\n \n");
+       }
+       }*/
+      if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
+      {
+        BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1;
+      }
+      __syncthreads();
     }
-  
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ k*dimX*dimY+j * mesh.getDimensions().x() + i ]) )
-        aux[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
-}   
+    
+    if( i < dimX && j < dimY && k < dimZ )
+      helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
+  } 
+}  
 #endif
-- 
GitLab


From 98abe9f62cc46968797cec710434d55f456695e0 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 15 Nov 2018 13:24:51 +0100
Subject: [PATCH 08/20] Enabled computations with single precision.

---
 .../Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h
index f8f9187fa..a2a1d7372 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h
@@ -23,7 +23,7 @@ namespace Solvers {
 /****
  * Turn off support for float and long double.
  */
-template<> struct ConfigTagReal< HamiltonJacobiBuildConfig, float > { enum { enabled = false }; };
+template<> struct ConfigTagReal< HamiltonJacobiBuildConfig, float > { enum { enabled = true }; };
 template<> struct ConfigTagReal< HamiltonJacobiBuildConfig, long double > { enum { enabled = false }; };
 
 /****
-- 
GitLab


From f206b754c5a76ec352ae91e3c3286acc0b27d512 Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Fri, 16 Nov 2018 12:03:40 +0100
Subject: [PATCH 09/20] 3D FSM+FIM implemented; 2D FSM+FIM method picks size of
 rectangular block depending on number of blocks

---
 .../tnlDirectEikonalMethodsBase.h             | 214 ++++----
 .../tnlDirectEikonalMethodsBase_impl.h        | 519 +++++++++++++++---
 .../hamilton-jacobi/tnlFastSweepingMethod.h   | 222 ++++----
 .../tnlFastSweepingMethod2D_impl.h            |  74 ++-
 .../tnlFastSweepingMethod3D_impl.h            | 455 +++++++++------
 5 files changed, 1004 insertions(+), 480 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index 7d990c1bb..f712ce2cc 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -19,102 +19,112 @@ class tnlDirectEikonalMethodsBase
 };
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 class tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >
 {
-   public:
-      
-      typedef Meshes::Grid< 1, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DevcieType;
-      typedef Index IndexType;
-      typedef Functions::MeshFunction< MeshType > MeshFunctionType;
-      typedef Functions::MeshFunction< MeshType, 1, bool > InterfaceMapType;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
-      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
-      
-      void initInterface( const MeshFunctionPointer& input,
-                          MeshFunctionPointer& output,
-                          InterfaceMapPointer& interfaceMap );
-      
-      template< typename MeshEntity >
-      __cuda_callable__ void updateCell( MeshFunctionType& u,
-                                         const MeshEntity& cell,
-                                         const RealType velocity = 1.0  );
-      
-      __cuda_callable__ bool updateCell( volatile Real sArray[18],
-                                         int thri, const Real h,
-                                         const Real velocity = 1.0 );
+  public:
+    
+    typedef Meshes::Grid< 1, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DevcieType;
+    typedef Index IndexType;
+    typedef Functions::MeshFunction< MeshType > MeshFunctionType;
+    typedef Functions::MeshFunction< MeshType, 1, bool > InterfaceMapType;
+    using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
+    using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
+    
+    void initInterface( const MeshFunctionPointer& input,
+            MeshFunctionPointer& output,
+            InterfaceMapPointer& interfaceMap );
+    
+    template< typename MeshEntity >
+    __cuda_callable__ void updateCell( MeshFunctionType& u,
+            const MeshEntity& cell,
+            const RealType velocity = 1.0  );
+    
+    __cuda_callable__ bool updateCell( volatile Real sArray[18],
+            int thri, const Real h,
+            const Real velocity = 1.0 );
 };
 
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
 {
-   public:
-      typedef Meshes::Grid< 2, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DevcieType;
-      typedef Index IndexType;
-      typedef Functions::MeshFunction< MeshType > MeshFunctionType;
-      typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
-      typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
-      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
-
-      void initInterface( const MeshFunctionPointer& input,
-                          MeshFunctionPointer& output,
-                          InterfaceMapPointer& interfaceMap );
-      
-      template< typename MeshEntity >
-      __cuda_callable__ void updateCell( MeshFunctionType& u,
-                                         const MeshEntity& cell,
-                                         const RealType velocity = 1.0 );
-      
-      template< int sizeSArray >
-      __cuda_callable__ bool updateCell( volatile Real *sArray,
-                                         int thri, int thrj, const Real hx, const Real hy,
-                                         const Real velocity = 1.0 );
-      
-      template< int sizeSArray >
-      void updateBlocks( InterfaceMapType interfaceMap,
-                         MeshFunctionType aux,
-                         MeshFunctionType helpFunc,
-                         ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
-      
-      void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
+  public:
+    typedef Meshes::Grid< 2, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DevcieType;
+    typedef Index IndexType;
+    typedef Functions::MeshFunction< MeshType > MeshFunctionType;
+    typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
+    typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+    using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
+    using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
+    
+    void initInterface( const MeshFunctionPointer& input,
+            MeshFunctionPointer& output,
+            InterfaceMapPointer& interfaceMap );
+    
+    template< typename MeshEntity >
+    __cuda_callable__ void updateCell( MeshFunctionType& u,
+            const MeshEntity& cell,
+            const RealType velocity = 1.0 );
+    
+    template< int sizeSArray >
+    __cuda_callable__ bool updateCell( volatile Real *sArray,
+            int thri, int thrj, const Real hx, const Real hy,
+            const Real velocity = 1.0 );
+    
+    template< int sizeSArray >
+    void updateBlocks( InterfaceMapType interfaceMap,
+            MeshFunctionType aux,
+            MeshFunctionType helpFunc,
+            ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
+    
+    void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
 };
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
 {
-   public:
-      typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DevcieType;
-      typedef Index IndexType;
-      typedef Functions::MeshFunction< MeshType > MeshFunctionType;
-      typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
-      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
-
-      void initInterface( const MeshFunctionPointer& input,
-                          MeshFunctionPointer& output,
-                          InterfaceMapPointer& interfaceMap );
-      
-      template< typename MeshEntity >
-      __cuda_callable__ void updateCell( MeshFunctionType& u,
-                                         const MeshEntity& cell,
-                                         const RealType velocity = 1.0);
-      
-      __cuda_callable__ bool updateCell( volatile Real sArray[10][10][10],
-                                         int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz,
-                                         const Real velocity = 1.0 );
+  public:
+    typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DevcieType;
+    typedef Index IndexType;
+    typedef Functions::MeshFunction< MeshType > MeshFunctionType;
+    typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType;
+    typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+    using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
+    using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
+    
+    void initInterface( const MeshFunctionPointer& input,
+            MeshFunctionPointer& output,
+            InterfaceMapPointer& interfaceMap );
+    
+    template< typename MeshEntity >
+    __cuda_callable__ void updateCell( MeshFunctionType& u,
+            const MeshEntity& cell,
+            const RealType velocity = 1.0);
+    
+    template< int sizeSArray >
+    void updateBlocks( const InterfaceMapType interfaceMap,
+            const MeshFunctionType aux,
+            MeshFunctionType& helpFunc,
+            ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
+    
+    void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
+    
+    template< int sizeSArray >
+    __cuda_callable__ bool updateCell3D( volatile Real *sArray,
+            int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz,
+            const Real velocity = 1.0 );
 };
 
 template < typename T1, typename T2 >
@@ -126,46 +136,46 @@ __cuda_callable__ void sortMinims( T1 pom[] );
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
-                                Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
-                                Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap  );
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap  );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux,
-                                      bool *BlockIterDevice );
+        const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap,
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux,
+        bool *BlockIterDevice );
 
 template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock =0);
+        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock =0);
 
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks );
+        TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks );
 
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                               TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
-                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
-                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap );
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
-                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
-                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );
 
 template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
+        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
 
 template < typename Index >
 __global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 5083544e2..8f7937541 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -148,6 +148,7 @@ updateBlocks( InterfaceMapType interfaceMap,
       }
       
       
+      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
       for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
       {        
         if( dimX > (blIdx+1) * numThreadsPerBlock  && thrj+1 < ykolik )
@@ -263,6 +264,370 @@ updateBlocks( InterfaceMapType interfaceMap,
     }
   }
 }
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateBlocks( const InterfaceMapType interfaceMap, // cells with a nonzero entry here are never recomputed
+        const MeshFunctionType aux, // input values: block interiors and their halo layers are read from it
+        MeshFunctionType& helpFunc, // output: receives the swept values of every processed block
+        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) // per-block flags; cells per block edge
+{  // Host-side pass: every flagged block is copied to a scratch cube, swept eight times and written to helpFunc.
+//#pragma omp parallel for schedule( dynamic )
+  for( int i = 0; i < BlockIterHost.getSize(); i++ )
+  {
+    if( BlockIterHost[ i ] )
+    {
+      MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
+      // Grid extents, followed by the number of blocks along each axis (ceiling division).
+      int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+      int dimZ = mesh.getDimensions().z();
+      //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl;
+      int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlockz = dimZ/numThreadsPerBlock + ((dimZ%numThreadsPerBlock != 0) ? 1:0);
+      //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl;
+      int xkolik = numThreadsPerBlock + 1; // sArray index of the far (+x) halo layer; recomputed below for the last block
+      int ykolik = numThreadsPerBlock + 1;
+      int zkolik = numThreadsPerBlock + 1;
+      
+      // Decode the linear block index i into (blIdx, blIdy, blIdz); x varies fastest, z slowest.
+      int blIdz = i/( numOfBlockx * numOfBlocky );
+      int blIdy = (i-blIdz*numOfBlockx * numOfBlocky )/(numOfBlockx );
+      int blIdx = (i-blIdz*numOfBlockx * numOfBlocky )%( numOfBlockx );
+      //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl;
+      // For the last block along an axis the far halo index is (cells remaining on that axis) + 1.
+      if( numOfBlockx - 1 == blIdx )
+        xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
+      if( numOfBlocky -1 == blIdy )
+        ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
+      if( numOfBlockz-1 == blIdz )
+        zkolik = dimZ - (blIdz)*numThreadsPerBlock+1;
+      //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl;
+      
+      
+      /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
+       changed[ 0 ] = 1;*/
+      Real hx = mesh.getSpaceSteps().x();
+      Real hy = mesh.getSpaceSteps().y();
+      Real hz = mesh.getSpaceSteps().z();
+      // Clear this block's flag; it is set again below only if the first sweep reports a change.
+      bool changed = false;
+      BlockIterHost[ i ] = 0;
+      // NOTE(review): the halo indices used below reach numThreadsPerBlock + 1, so sizeSArray must be at least numThreadsPerBlock + 2 - confirm at the call sites.
+      // Scratch cube of sizeSArray^3 values, addressed as z * sizeSArray^2 + y * sizeSArray + x.
+      Real *sArray;
+      sArray = new Real[ sizeSArray * sizeSArray * sizeSArray ];
+      if( sArray == nullptr )
+        std::cout << "Error while allocating memory for sArray." << std::endl;
+      // Pre-fill with the largest Real so that entries which are never loaded keep that value.
+      for( int k = 0; k < sizeSArray; k++ )
+        for( int l = 0; l < sizeSArray; l++ )
+          for( int m = 0; m < sizeSArray; m++ ){
+            sArray[ m * sizeSArray * sizeSArray + k * sizeSArray + l ] = std::numeric_limits< Real >::max();
+          }
+      
+      // Load the six halo faces from aux; a face is skipped when no cells exist beyond it in that direction.
+      for( int thrk = 0; thrk < numThreadsPerBlock; thrk++ ) // (thrj, thrk) run over a face; the axes they stand for differ per face
+        for( int thrj = 0; thrj < numThreadsPerBlock; thrj++ )
+        { // -x face: thrj is the y offset, thrk the z offset
+          if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
+            sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX -1 + thrk*dimX*dimY ];
+          // +x face: thrj is the y offset, thrk the z offset
+          if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy *numThreadsPerBlock*dimX+ blIdx*numThreadsPerBlock + numThreadsPerBlock + thrj * dimX + thrk*dimX*dimY ];
+          // -y face: thrj is the x offset, thrk the z offset
+          if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX + thrj + thrk*dimX*dimY ];
+          // +y face: thrj is the x offset, thrk the z offset
+          if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + (blIdy+1) * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj + thrk*dimX*dimY ];
+          // -z face: thrj is the y offset, thrk the x offset
+          if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
+            sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX * dimY + thrj * dimX + thrk ];
+          // +z face: thrj is the y offset, thrk the x offset
+          if( dimZ > (blIdz+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < xkolik )
+            sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = 
+                    aux[ (blIdz+1)*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX + thrk ];
+        }
+      // Copy the block's own cells into the interior of the scratch cube (shifted by one to leave room for the halo).
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){
+          for( int l = 0; l < numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+              sArray[(m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1] = 
+                      aux[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ];
+          }
+        }
+      }
+      /*string s;
+      int numWhile = 0;
+      for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      // Sweep 1 of 8: m, k and l all ascending. This is the only sweep whose return value feeds 'changed'.
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){ 
+          for( int l = 0; l < numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){
+              // Only cells inside the grid whose interfaceMap entry is zero are recomputed.
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                //printf("In with point m  = %d, k = %d, l = %d\n", m, k, l);
+                changed = this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz) || changed;
+                
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      // Sweep 2 of 8: m descending, k ascending, l ascending.
+      for( int m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){
+          for( int l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      // Sweep 3 of 8: m ascending, k ascending, l descending.
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){
+          for( int l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );
+      */ // Sweep 4 of 8 follows: m descending, k ascending, l descending.
+      for( int m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){
+          for( int l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >(  sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      // Sweep 5 of 8: m ascending, k descending, l ascending.
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( int l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      // Sweep 6 of 8: m descending, k descending, l ascending.
+      for( int m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( int k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( int l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      // Sweep 7 of 8: m ascending, k descending, l descending.
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( int l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      
+      // Sweep 8 of 8: m, k and l all descending.
+      for( int m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( int k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( int l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      // Re-flag the block when the first sweep reported a change.
+      if( changed ){
+        BlockIterHost[ i ] = 1;
+      }
+      
+      // Write the swept interior back; the result goes to helpFunc, aux itself is not modified here.
+      for( int k = 0; k < numThreadsPerBlock; k++ ){ 
+        for( int l = 0; l < numThreadsPerBlock; l++ ) {
+          for( int m = 0; m < numThreadsPerBlock; m++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){      
+              helpFunc[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] = 
+                      sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+              //std::cout << helpFunc[ m*dimX*dimY + k*dimX + l ] << " ";
+            }
+          }
+          //std::cout << std::endl;
+        }
+        //std::cout << std::endl;
+      }
+      //helpFunc.save( "helpF.tnl");
+      delete []sArray;
+    }
+  }
+}
+template< typename Real,
+        typename Device,
+        typename Index >
+void 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
+{
+  // Replaces every block flag by 1 if at least one of its six face neighbours is currently flagged, else by 0.
+  // New flags are collected in a scratch array so that every test below still sees the incoming flags.
+  int* BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ];
+  for( int i = 0; i< BlockIterHost.getSize(); i++)
+  {
+    BlockIterPom[ i ] = 0;
+    // Decode linear index i into block coordinates: m along x (fastest), k along y, l along z.
+    int m=0, l=0, k=0;
+    l = i/( numBlockX * numBlockY );
+    k = (i-l*numBlockX * numBlockY )/(numBlockX );
+    m = (i-l*numBlockX * numBlockY )%( numBlockX );
+    if( m > 0 && BlockIterHost[ i - 1 ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( l > 0 && BlockIterHost[ i - numBlockX*numBlockY ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( l < numBlockZ-1 && BlockIterHost[ i + numBlockX*numBlockY ] ){
+      BlockIterPom[ i ] = 1;
+    }
+  }
+  for( int i = 0; i< BlockIterHost.getSize(); i++)
+  { 
+    BlockIterHost[ i ] = BlockIterPom[ i ];
+  }
+  delete[] BlockIterPom; // release the scratch array; it was previously leaked on every call
+}
+
 
 template< typename Real,
         typename Device,
@@ -619,8 +984,8 @@ initInterface( const MeshFunctionPointer& _input,
         {
           cell.refresh();
           output[ cell.getIndex() ] =
-                  input( cell ) > 0 ? std::numeric_limits< RealType >::max() :
-                    - std::numeric_limits< RealType >::max();
+                  input( cell ) > 0 ? 10://std::numeric_limits< RealType >::max() :
+                    -10;//- std::numeric_limits< RealType >::max();
           interfaceMap[ cell.getIndex() ] = false;
         }
     
@@ -967,6 +1332,82 @@ updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real
   
   return false;
 }
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+__cuda_callable__ 
+bool 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateCell3D( volatile Real *sArray, int thri, int thrj, int thrk, // flattened sizeSArray^3 cube; (thri, thrj, thrk) = x, y, z index of the cell
+        const Real hx, const Real hy, const Real hz, const Real v ) // grid spacings; v enters as 1/(v*v) under the square roots
+{ // Recomputes one cell from its six axis neighbours; returns true when the stored value moved by more than 0.001*hx.
+  const RealType value = sArray[thrk *sizeSArray * sizeSArray + thrj * sizeSArray + thri];
+  
+  RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
+  // One value per axis from the two neighbours (c: z, b: y, a: x); TNL::argAbsMin presumably picks the one of smaller magnitude - confirm.
+  c = TNL::argAbsMin( sArray[ (thrk+1)* sizeSArray*sizeSArray + thrj * sizeSArray + thri ],
+          sArray[ (thrk-1) * sizeSArray *sizeSArray + thrj* sizeSArray + thri ] );
+  
+  b = TNL::argAbsMin( sArray[ thrk* sizeSArray*sizeSArray + (thrj+1) * sizeSArray + thri ],
+          sArray[ thrk* sizeSArray * sizeSArray + (thrj-1)* sizeSArray +thri ] );
+  
+  a = TNL::argAbsMin( sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri+1 ],
+          sArray[ thrk* sizeSArray * sizeSArray + thrj* sizeSArray +thri-1 ] );
+  // NOTE(review): the 3D updateBlocks pads sArray with numeric_limits max, not 10 - confirm the sentinel used below.
+  /*if( thrk == 8 )
+    printf("Calculating a = %f, b = %f, c = %f\n" , a, b, c );*/
+  // Early out when all three magnitudes equal the hard-coded 10 (the numeric_limits max comparison is commented out).
+  if( fabs( a ) == 10&& //std::numeric_limits< RealType >::max() && 
+          fabs( b ) == 10&&//std::numeric_limits< RealType >::max() &&
+          fabs( c ) == 10)//std::numeric_limits< RealType >::max() )
+    return false;
+  // pom = { a, b, c, hx, hy, hz }; sortMinims is defined elsewhere - presumably it orders the values together with their spacings, verify.
+  RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
+  
+  sortMinims( pom );
+  // First candidate uses pom[ 0 ] and pom[ 3 ] only; it is taken when its magnitude is below that of pom[ 1 ].
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+  {
+    sArray[ thrk* sizeSArray* sizeSArray + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
+    tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
+    if ( fabs( tmp ) >  0.001*hx )
+      return true;
+    else
+      return false;
+  }
+  else
+  { // Second candidate combines pom[ 0 ] and pom[ 1 ] with pom[ 3 ] and pom[ 4 ]; taken when below pom[ 2 ] in magnitude.
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
+    {
+      sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
+      tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
+      if ( fabs( tmp ) > 0.001*hx )
+        return true;
+      else
+        return false;
+    }
+    else
+    { // Third candidate uses all three values a, b, c directly with hx, hy, hz (not the pom entries).
+      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
+              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
+              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
+      sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ] = argAbsMin( value, tmp );
+      tmp = value - sArray[ thrk* sizeSArray* sizeSArray  + thrj* sizeSArray + thri ];
+      if ( fabs( tmp ) > 0.001*hx )
+        return true;
+      else
+        return false;
+    }
+  }
+  // Not reached: every branch above returns.
+  return false;
+}
 
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
@@ -1215,78 +1656,4 @@ updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
   else
     return false;
 }
-
-template< typename Real,
-        typename Device,
-        typename Index >
-__cuda_callable__ 
-bool 
-tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
-        const Real hx, const Real hy, const Real hz, const Real v )
-{
-  const RealType value = sArray[thrk][thrj][thri];
-  //std::cout << value << std::endl;
-  RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
-  
-  c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ],
-          sArray[ thrk-1 ][ thrj ][ thri ] );
-  
-  b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ],
-          sArray[ thrk ][ thrj-1 ][ thri ] );
-  
-  a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ],
-          sArray[ thrk ][ thrj ][ thri-1 ] );
-  
-  
-  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-          fabs( b ) == std::numeric_limits< RealType >::max() &&
-          fabs( c ) == std::numeric_limits< RealType >::max() )
-    return false;
-  
-  RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
-  
-  sortMinims( pom );
-  
-  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
-  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-  {
-    sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-    tmp = value - sArray[ thrk ][ thrj ][ thri ];
-    if ( fabs( tmp ) >  0.001*hx )
-      return true;
-    else
-      return false;
-  }
-  else
-  {
-    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
-    {
-      sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-      tmp = value - sArray[ thrk ][ thrj ][ thri ];
-      if ( fabs( tmp ) > 0.001*hx )
-        return true;
-      else
-        return false;
-    }
-    else
-    {
-      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
-              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
-              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
-              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
-      sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-      tmp = value - sArray[ thrk ][ thrj ][ thri ];
-      if ( fabs( tmp ) > 0.001*hx )
-        return true;
-      else
-        return false;
-    }
-  }
-  
-  return false;
-}
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
index 60c690e06..57b1886e8 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
@@ -1,9 +1,9 @@
 /***************************************************************************
-                          FastSweepingMethod.h  -  description
-                             -------------------
-    begin                : Jul 14, 2016
-    copyright            : (C) 2017 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
+ FastSweepingMethod.h  -  description
+ -------------------
+ begin                : Jul 14, 2016
+ copyright            : (C) 2017 by Tomas Oberhuber
+ email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
 /* See Copyright Notice in tnl/Copyright */
@@ -17,132 +17,134 @@
 
 
 template< typename Mesh,
-          typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > >
+        typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > >
 class FastSweepingMethod
 {   
 };
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >
-   : public tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >
+: public tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >
 {
-   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
-   
-   public:
-      
-      typedef Meshes::Grid< 1, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DeviceType;
-      typedef Index IndexType;
-      typedef Anisotropy AnisotropyType;
-      typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > BaseType;
-      using MeshPointer = Pointers::SharedPointer<  MeshType >;
-      using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
-      
-      
-      using typename BaseType::InterfaceMapType;
-      using typename BaseType::MeshFunctionType;
-      using typename BaseType::InterfaceMapPointer;
-      using typename BaseType::MeshFunctionPointer;
-      
-      
-      FastSweepingMethod();
-      
-      const IndexType& getMaxIterations() const;
-      
-      void setMaxIterations( const IndexType& maxIterations );
-      
-      void solve( const MeshPointer& mesh,
-                  const AnisotropyPointer& anisotropy,
-                  MeshFunctionPointer& u );
-      
-      
-   protected:
+  //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
+  
+  public:
+    
+    typedef Meshes::Grid< 1, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DeviceType;
+    typedef Index IndexType;
+    typedef Anisotropy AnisotropyType;
+    typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > BaseType;
+    using MeshPointer = Pointers::SharedPointer<  MeshType >;
+    using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
+    
+    
+    using typename BaseType::InterfaceMapType;
+    using typename BaseType::MeshFunctionType;
+    using typename BaseType::InterfaceMapPointer;
+    using typename BaseType::MeshFunctionPointer;
+    
+    
+    FastSweepingMethod();
+    
+    const IndexType& getMaxIterations() const;
+    
+    void setMaxIterations( const IndexType& maxIterations );
+    
+    void solve( const MeshPointer& mesh,
+            const AnisotropyPointer& anisotropy,
+            MeshFunctionPointer& u );
+    
+    
+    protected:
       
       const IndexType maxIterations;
 };
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
-   : public tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
+: public tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
 {
-   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
-   
-   public:
-      
-      typedef Meshes::Grid< 2, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DeviceType;
-      typedef Index IndexType;
-      typedef Anisotropy AnisotropyType;
-      typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType;
-      using MeshPointer = Pointers::SharedPointer<  MeshType >;
-      using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
-
-      using typename BaseType::InterfaceMapType;
-      using typename BaseType::MeshFunctionType;
-      using typename BaseType::InterfaceMapPointer;
-      using typename BaseType::MeshFunctionPointer;
-      using typename BaseType::ArrayContainer;
-
-      FastSweepingMethod();
-      
-      const IndexType& getMaxIterations() const;
-      
-      void setMaxIterations( const IndexType& maxIterations );
-      
-      void solve( const MeshPointer& mesh,
-                  const AnisotropyPointer& anisotropy,
-                  MeshFunctionPointer& u );
-      
-   protected:
+  //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
+  
+  public:
+    
+    typedef Meshes::Grid< 2, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DeviceType;
+    typedef Index IndexType;
+    typedef Anisotropy AnisotropyType;
+    typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType;
+    using MeshPointer = Pointers::SharedPointer<  MeshType >;
+    using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
+    
+    using typename BaseType::InterfaceMapType;
+    using typename BaseType::MeshFunctionType;
+    using typename BaseType::InterfaceMapPointer;
+    using typename BaseType::MeshFunctionPointer;
+    using typename BaseType::ArrayContainer;
+    
+    FastSweepingMethod();
+    
+    const IndexType& getMaxIterations() const;
+    
+    void setMaxIterations( const IndexType& maxIterations );
+    
+    void solve( const MeshPointer& mesh,
+            const AnisotropyPointer& anisotropy,
+            MeshFunctionPointer& u );
+    
+    protected:
       
       const IndexType maxIterations;
 };
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
-   : public tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
+: public tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
 {
-   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
-   
-   public:
-      
-      typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DeviceType;
-      typedef Index IndexType;
-      typedef Anisotropy AnisotropyType;
-      typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > BaseType;
-      using MeshPointer = Pointers::SharedPointer<  MeshType >;
-      using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
-      
-      using typename BaseType::InterfaceMapType;
-      using typename BaseType::MeshFunctionType;
-      using typename BaseType::InterfaceMapPointer;
-      using typename BaseType::MeshFunctionPointer;      
-      
-      FastSweepingMethod();
-      
-      const IndexType& getMaxIterations() const;
-      
-      void setMaxIterations( const IndexType& maxIterations );
-      
-      void solve( const MeshPointer& mesh,
-                  const AnisotropyPointer& anisotropy,
-                  MeshFunctionPointer& u );
-      
-      
-   protected:
+  //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
+  
+  public:
+    
+    typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DeviceType;
+    typedef Index IndexType;
+    typedef Anisotropy AnisotropyType;
+    typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > BaseType;
+    using MeshPointer = Pointers::SharedPointer<  MeshType >;
+    using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
+    
+    using typename BaseType::InterfaceMapType;
+    using typename BaseType::MeshFunctionType;
+    using typename BaseType::InterfaceMapPointer;
+    using typename BaseType::MeshFunctionPointer;   
+    using typename BaseType::ArrayContainer;
+    
+    
+    FastSweepingMethod();
+    
+    const IndexType& getMaxIterations() const;
+    
+    void setMaxIterations( const IndexType& maxIterations );
+    
+    void solve( const MeshPointer& mesh,
+            const AnisotropyPointer& anisotropy,
+            MeshFunctionPointer& u );
+    
+    
+    protected:
       
       const IndexType maxIterations;
 };
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index fa2716897..d5ce1efe1 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -15,9 +15,12 @@
 
 #include "tnlFastSweepingMethod.h"
 #include <TNL/Devices/Cuda.h>
-#include <string.h>
+#include <TNL/Communicators/MpiDefs.h>
+
 
 
+
+#include <string.h>
 #include <iostream>
 #include <fstream>
 
@@ -80,16 +83,48 @@ solve( const MeshPointer& mesh,
   MeshFunctionType aux = *auxPtr;
   
   
+//#ifdef HAVE_MPI
+  bool a = Communicators::MpiCommunicator::IsInitialized();
+  if( a )
+    printf("Je Init\n");
+  else
+    printf("Neni Init\n");
+//#endif
   
   while( iteration < this->maxIterations )
   {
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
-      int numThreadsPerBlock = 16;
+      int numThreadsPerBlock = -1;
+      
+      numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0));
+      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+      if( numThreadsPerBlock <= 16 )
+        numThreadsPerBlock = 16;
+      else if(numThreadsPerBlock <= 32 )
+        numThreadsPerBlock = 32;
+      else if(numThreadsPerBlock <= 64 )
+        numThreadsPerBlock = 64;
+      else if(numThreadsPerBlock <= 128 )
+        numThreadsPerBlock = 128;
+      else if(numThreadsPerBlock <= 256 )
+        numThreadsPerBlock = 256;
+      else if(numThreadsPerBlock <= 512 )
+        numThreadsPerBlock = 512;
+      else
+        numThreadsPerBlock = 1024;
+      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+      
+      if( numThreadsPerBlock == -1 ){
+        printf("Fail in setting numThreadsPerBlock.\n");
+        break;
+      }
+      
       
       
       int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
       int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
+      
       //std::cout << "numBlocksX = " << numBlocksX << std::endl;
       
       /*Real **sArray = new Real*[numBlocksX*numBlocksY];
@@ -115,13 +150,29 @@ solve( const MeshPointer& mesh,
        }
        std::cout<<std::endl;*/
       unsigned int numWhile = 0;
-      while( IsCalculationDone && numWhile < 1 )
+      while( IsCalculationDone )
       {      
         IsCalculationDone = 0;
         helpFunc1 = auxPtr;
         auxPtr = helpFunc;
         helpFunc = helpFunc1;
-        this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+        switch ( numThreadsPerBlock ){
+          case 16:
+            this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+          case 32:
+            this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+          case 64:
+            this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+          case 128:
+            this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+          case 256:
+            this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+          case 512:
+            this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+          default:
+            this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+        }
+        
         
         //Reduction      
         for( int i = 0; i < BlockIterHost.getSize(); i++ ){
@@ -131,14 +182,14 @@ solve( const MeshPointer& mesh,
           }
         }
         numWhile++;
-        std::cout <<"numWhile = "<< numWhile <<std::endl;
+        /*std::cout <<"numWhile = "<< numWhile <<std::endl;
         
         for( int j = numBlocksY-1; j>-1; j-- ){
           for( int i = 0; i < numBlocksX; i++ )
             std::cout << BlockIterHost[ j * numBlocksX + i ];
           std::cout << std::endl;
         }
-        std::cout << std::endl;
+        std::cout << std::endl;*/
         
         this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY );
         
@@ -150,8 +201,8 @@ solve( const MeshPointer& mesh,
          std::cout << std::endl;*/
         
         //std::cout<<std::endl;
-        string s( "aux-"+ std::to_string(numWhile) + ".tnl");
-        aux.save( s );
+        //string s( "aux-"+ std::to_string(numWhile) + ".tnl");
+        //aux.save( s );
       }
       if( numWhile == 1 ){
         auxPtr = helpFunc;
@@ -266,8 +317,8 @@ solve( const MeshPointer& mesh,
       BlockIterPom.setSize( numBlocksX * numBlocksY  );
       BlockIterPom.setValue( 0 );
       /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
-      BlockIterPom1.setSize( numBlocksX * numBlocksY  );
-      BlockIterPom1.setValue( 0 );*/
+       BlockIterPom1.setSize( numBlocksX * numBlocksY  );
+       BlockIterPom1.setValue( 0 );*/
       /*int *BlockIterDevice;
        cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
       int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
@@ -408,6 +459,7 @@ solve( const MeshPointer& mesh,
     }
     iteration++;
   }
+  //#endif
   aux.save("aux-final.tnl");
 }
 
@@ -527,7 +579,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
   
   
   /** FOR FIM METHOD */
-    
+  
   if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] )
   { 
     __syncthreads();
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 65aba5bf5..5af33cf29 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -64,9 +64,6 @@ solve( const MeshPointer& mesh,
   interfaceMapPtr->setMesh( mesh );
   std::cout << "Initiating the interface cells ..." << std::endl;
   BaseType::initInterface( u, auxPtr, interfaceMapPtr );
-#ifdef HAVE_CUDA
-  cudaDeviceSynchronize();
-#endif
   auxPtr->save( "aux-ini.tnl" );   
   
   typename MeshType::Cell cell( *mesh );
@@ -78,170 +75,259 @@ solve( const MeshPointer& mesh,
   {
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
-      for( cell.getCoordinates().z() = 0;
-              cell.getCoordinates().z() < mesh->getDimensions().z();
-              cell.getCoordinates().z()++ )
-      {
-        for( cell.getCoordinates().y() = 0;
-                cell.getCoordinates().y() < mesh->getDimensions().y();
-                cell.getCoordinates().y()++ )
-        {
-          for( cell.getCoordinates().x() = 0;
-                  cell.getCoordinates().x() < mesh->getDimensions().x();
-                  cell.getCoordinates().x()++ )
-          {
-            cell.refresh();
-            if( ! interfaceMap( cell ) )
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-1.tnl" );
+      int numThreadsPerBlock = 64;
       
-      for( cell.getCoordinates().z() = 0;
-              cell.getCoordinates().z() < mesh->getDimensions().z();
-              cell.getCoordinates().z()++ )
-      {
-        for( cell.getCoordinates().y() = 0;
-                cell.getCoordinates().y() < mesh->getDimensions().y();
-                cell.getCoordinates().y()++ )
-        {
-          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                  cell.getCoordinates().x() >= 0 ;
-                  cell.getCoordinates().x()-- )		
-          {
-            //std::cerr << "2 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-2.tnl" );
-      for( cell.getCoordinates().z() = 0;
-              cell.getCoordinates().z() < mesh->getDimensions().z();
-              cell.getCoordinates().z()++ )
-      {
-        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                cell.getCoordinates().y() >= 0 ;
-                cell.getCoordinates().y()-- )
-        {
-          for( cell.getCoordinates().x() = 0;
-                  cell.getCoordinates().x() < mesh->getDimensions().x();
-                  cell.getCoordinates().x()++ )
-          {
-            //std::cerr << "3 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-3.tnl" );
       
-      for( cell.getCoordinates().z() = 0;
-              cell.getCoordinates().z() < mesh->getDimensions().z();
-              cell.getCoordinates().z()++ )
-      {
-        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                cell.getCoordinates().y() >= 0;
-                cell.getCoordinates().y()-- )
-        {
-          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                  cell.getCoordinates().x() >= 0 ;
-                  cell.getCoordinates().x()-- )		
-          {
-            //std::cerr << "4 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
-      }     
-      //aux.save( "aux-4.tnl" );
+      int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
+      int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
+      int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 1:0);
+      //std::cout << "numBlocksX = " << numBlocksX << std::endl;
       
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-              cell.getCoordinates().z() >= 0;
-              cell.getCoordinates().z()-- )
-      {
-        for( cell.getCoordinates().y() = 0;
-                cell.getCoordinates().y() < mesh->getDimensions().y();
-                cell.getCoordinates().y()++ )
-        {
-          for( cell.getCoordinates().x() = 0;
-                  cell.getCoordinates().x() < mesh->getDimensions().x();
-                  cell.getCoordinates().x()++ )
-          {
-            //std::cerr << "5 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-5.tnl" );
+      /*Real **sArray = new Real*[numBlocksX*numBlocksY];
+       for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+       sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];*/
       
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-              cell.getCoordinates().z() >= 0;
-              cell.getCoordinates().z()-- )
-      {
-        for( cell.getCoordinates().y() = 0;
-                cell.getCoordinates().y() < mesh->getDimensions().y();
-                cell.getCoordinates().y()++ )
-        {
-          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                  cell.getCoordinates().x() >= 0 ;
-                  cell.getCoordinates().x()-- )		
-          {
-            //std::cerr << "6 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-6.tnl" );
+      ArrayContainer BlockIterHost;
+      BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ );
+      BlockIterHost.setValue( 1 );
+      int IsCalculationDone = 1;
       
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-              cell.getCoordinates().z() >= 0;
-              cell.getCoordinates().z()-- )
-      {
-        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                cell.getCoordinates().y() >= 0 ;
-                cell.getCoordinates().y()-- )
-        {
-          for( cell.getCoordinates().x() = 0;
-                  cell.getCoordinates().x() < mesh->getDimensions().x();
-                  cell.getCoordinates().x()++ )
-          {
-            //std::cerr << "7 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
+      MeshFunctionPointer helpFunc( mesh );
+      MeshFunctionPointer helpFunc1( mesh );
+      helpFunc1 = auxPtr;
+      auxPtr = helpFunc;
+      helpFunc = helpFunc1;
+      //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl;
+      /*for( int k = numBlocksX-1; k >-1; k-- ){
+       for( int l = 0; l < numBlocksY; l++ ){
+       std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+       }
+       std::cout<<std::endl;
+       }
+       std::cout<<std::endl;*/
+      unsigned int numWhile = 0;
+      while( IsCalculationDone  )
+      {      
+        IsCalculationDone = 0;
+        helpFunc1 = auxPtr;
+        auxPtr = helpFunc;
+        helpFunc = helpFunc1;
+        this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+        
+        //Reduction      
+        for( int i = 0; i < BlockIterHost.getSize(); i++ ){
+          if( IsCalculationDone == 0 ){
+            IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
+            //break;
           }
         }
-      }
-      //aux.save( "aux-7.tnl" );
-      
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-              cell.getCoordinates().z() >= 0;
-              cell.getCoordinates().z()-- )
-      {
-        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                cell.getCoordinates().y() >= 0;
-                cell.getCoordinates().y()-- )
-        {
-          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                  cell.getCoordinates().x() >= 0 ;
-                  cell.getCoordinates().x()-- )		
-          {
-            //std::cerr << "8 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
+        numWhile++;
+        std::cout <<"numWhile = "<< numWhile <<std::endl;
+        /*for( int k = 0; k < numBlocksZ; k++ ){
+          for( int j = numBlocksY-1; j>-1; j-- ){
+            for( int i = 0; i < numBlocksX; i++ ){
+              //std::cout << (*auxPtr)[ k * numBlocksX * numBlocksY + j * numBlocksX + i ] << " ";
+              std::cout << BlockIterHost[ k * numBlocksX * numBlocksY + j * numBlocksX + i ];
+            }
+            std::cout << std::endl;
           }
+          std::cout << std::endl;
         }
+        std::cout << std::endl;*/
+        
+        this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ );
+        
+        /*for( int k = 0; k < numBlocksZ; k++ ){
+          for( int j = numBlocksY-1; j>-1; j-- ){
+            for( int i = 0; i < numBlocksX; i++ ){
+              //std::cout << (*auxPtr)[ k * numBlocksX * numBlocksY + j * numBlocksX + i ] << " ";
+              std::cout << BlockIterHost[ k * numBlocksX * numBlocksY + j * numBlocksX + i ];
+            }
+            std::cout << std::endl;
+          }
+          std::cout << std::endl;
+        }*/
+        
+        /*for( int j = numBlocksY-1; j>-1; j-- ){
+         for( int i = 0; i < numBlocksX; i++ )
+         std::cout << "BlockIterHost = "<< j*numBlocksX + i<< " ," << BlockIterHost[ j * numBlocksX + i ];
+         std::cout << std::endl;
+         }
+         std::cout << std::endl;*/
+        
+        //std::cout<<std::endl;
+        //string s( "aux-"+ std::to_string(numWhile) + ".tnl");
+        //aux.save( s );
+      }
+      if( numWhile == 1 ){
+        auxPtr = helpFunc;
       }
+      aux = *auxPtr;
+      
+      /*for( cell.getCoordinates().z() = 0;
+       cell.getCoordinates().z() < mesh->getDimensions().z();
+       cell.getCoordinates().z()++ )
+       {
+       for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       cell.refresh();
+       if( ! interfaceMap( cell ) )
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-1.tnl" );
+       
+       for( cell.getCoordinates().z() = 0;
+       cell.getCoordinates().z() < mesh->getDimensions().z();
+       cell.getCoordinates().z()++ )
+       {
+       for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "2 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-2.tnl" );
+       for( cell.getCoordinates().z() = 0;
+       cell.getCoordinates().z() < mesh->getDimensions().z();
+       cell.getCoordinates().z()++ )
+       {
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0 ;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       //std::cerr << "3 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-3.tnl" );
+       
+       for( cell.getCoordinates().z() = 0;
+       cell.getCoordinates().z() < mesh->getDimensions().z();
+       cell.getCoordinates().z()++ )
+       {
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "4 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }     
+       //aux.save( "aux-4.tnl" );
+       
+       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+       cell.getCoordinates().z() >= 0;
+       cell.getCoordinates().z()-- )
+       {
+       for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       //std::cerr << "5 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-5.tnl" );
+       
+       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+       cell.getCoordinates().z() >= 0;
+       cell.getCoordinates().z()-- )
+       {
+       for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "6 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-6.tnl" );
+       
+       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+       cell.getCoordinates().z() >= 0;
+       cell.getCoordinates().z()-- )
+       {
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0 ;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       //std::cerr << "7 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-7.tnl" );
+       
+       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+       cell.getCoordinates().z() >= 0;
+       cell.getCoordinates().z()-- )
+       {
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "8 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }*/
     }
     if( std::is_same< DeviceType, Devices::Cuda >::value )
     {
@@ -389,7 +475,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
   {
     __syncthreads();
     
-    __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)];
+    __shared__ volatile bool changed[ 8*8*8/*(sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)*/];
     
     changed[ currentIndex ] = false;
     if( thrj == 0 && thri == 0 && thrk == 0 )
@@ -402,6 +488,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     
     if( thrj == 1 && thri == 1 && thrk == 1 )
     {
+      //printf( "We are in the calculation. Block = %d.\n",blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x  );
       hx = mesh.getSpaceSteps().x();
       hy = mesh.getSpaceSteps().y();
       hz = mesh.getSpaceSteps().z();
@@ -410,8 +497,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       dimZ = mesh.getDimensions().z();
       BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0;
     }
-    __shared__ volatile Real sArray[sizeSArray][sizeSArray][sizeSArray];
-    sArray[thrk+1][thrj+1][thri+1] = std::numeric_limits< Real >::max();
+    __shared__ volatile Real sArray[ 10*10*10/*sizeSArray * sizeSArray * sizeSArray*/ ];
+    sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = std::numeric_limits< Real >::max();
     
     //filling sArray edges
     int numOfBlockx;
@@ -426,6 +513,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     numOfBlockx = gridDim.x;
     numOfBlocky = gridDim.y;
     numOfBlockz = gridDim.z;
+    __syncthreads();
     
     if( numOfBlockx - 1 == blIdx )
       xkolik = dimX - (blIdx)*blockDim.x+1;
@@ -438,54 +526,55 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     if( thri == 0 )
     {        
       if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
-        sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
+        sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
       else
-        sArray[thrk+1][thrj+1][0] = std::numeric_limits< Real >::max();
+        sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 1 )
     {
       if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
-        sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
       else
-        sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max();
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max();
     }
     if( thri == 2 )
     {        
       if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
-        sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
+        sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
       else
-        sArray[thrk+1][0][thrj+1] = std::numeric_limits< Real >::max();
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + 0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 3 )
     {
       if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
-        sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
       else
-        sArray[thrk+1][9][thrj+1] = std::numeric_limits< Real >::max();
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
     }
     if( thri == 4 )
     {        
       if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
-        sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
+        sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
       else
-        sArray[0][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
+        sArray[0 * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 5 )
     {
       if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
-        sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
+        sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
       else
-        sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
+        sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
     }
     
     if( i < dimX && j < dimY && k < dimZ )
     {
-      sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
+      sArray[(thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
     }
     __syncthreads(); 
+    
     while( changed[ 0 ] )
     {
       __syncthreads();
@@ -493,11 +582,11 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       changed[ currentIndex ] = false;
       
       //calculation of update cell
-      if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ )
+      if( i < dimX && j < dimY && k < dimZ )
       {
-        if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] )
+        if( ! interfaceMap[ k*dimX*dimY + j * dimX + i ] )
         {
-          changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
+          changed[ currentIndex ] = ptr.updateCell3D< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
         }
       }
       __syncthreads();
@@ -535,7 +624,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         }
       }
       __syncthreads();
-      if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
+      if( currentIndex < 32 )
       {
         if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
         if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
@@ -548,7 +637,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       
       /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0)
        {
-       for(int m = 0; m < 8; m++){
+       //for(int m = 0; m < 8; m++){
+       int m = 4;
        for(int n = 0; n<8; n++){
        for(int b=0; b<8; b++)
        printf(" %i ", changed[m*64 + n*8 + b]);
@@ -556,16 +646,19 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
        }
        printf("\n \n");
        }
-       }*/
+       //}*/
+      
       if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
       {
-        BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1;
+        //printf( "Setting block calculation. Block = %d.\n",blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x  );
+        BlockIterDevice[ blIdz * gridDim.x * gridDim.y + blIdy * gridDim.x + blIdx ] = 1;
       }
       __syncthreads();
     }
     
     if( i < dimX && j < dimY && k < dimZ )
-      helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
+      helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thri+1 ];
+    
   } 
 }  
 #endif
-- 
GitLab


From bf5e6fd6c9ffaa78e29645493ab3f42a00fbf1f6 Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Thu, 4 Oct 2018 19:30:14 +0200
Subject: [PATCH 10/20] Chess model implemented in 2D.

---
 .../tnlDirectEikonalMethodsBase.h             |   8 +-
 .../tnlDirectEikonalMethodsBase_impl.h        |  12 +-
 .../tnlFastSweepingMethod2D_impl.h            | 212 ++++++++++--------
 3 files changed, 124 insertions(+), 108 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index b981a92a8..eb7cbd2a5 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -129,12 +129,12 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      Real *aux,
-                                      int *BlockIterDevice);
+                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+                                      int *BlockIterDevice, int oddEvenBlock);
 __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks );
 
-template < typename Real, typename Device, typename Index >
-__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );
+/*template < typename Real, typename Device, typename Index >
+__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );*/
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 649a5ad43..cfea6aca0 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -945,7 +945,7 @@ updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, con
     {
         sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
         tmp = value - sArray[ thrj ][ thri ];
-        if ( fabs( tmp ) >  0.01*hx )
+        if ( fabs( tmp ) >  0.001*hx )
             return true;
         else
             return false;
@@ -957,7 +957,7 @@ updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, con
             ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
         sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
         tmp = value - sArray[ thrj ][ thri ];
-        if ( fabs( tmp ) > 0.01*hx )
+        if ( fabs( tmp ) > 0.001*hx )
             return true;
         else
             return false;
@@ -989,7 +989,7 @@ updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
     sArray[ thri ] = argAbsMin( value, tmp );
     
     tmp = value - sArray[ thri ];
-    if ( fabs( tmp ) >  0.01*h )
+    if ( fabs( tmp ) >  0.001*h )
         return true;
     else
         return false;
@@ -1032,7 +1032,7 @@ updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
     {
         sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
         tmp = value - sArray[ thrk ][ thrj ][ thri ];
-        if ( fabs( tmp ) >  0.01*hx )
+        if ( fabs( tmp ) >  0.001*hx )
             return true;
         else
             return false;
@@ -1046,7 +1046,7 @@ updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
         {
             sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
             tmp = value - sArray[ thrk ][ thrj ][ thri ];
-            if ( fabs( tmp ) > 0.01*hx )
+            if ( fabs( tmp ) > 0.001*hx )
                 return true;
             else
                 return false;
@@ -1059,7 +1059,7 @@ updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
                 hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
             sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
             tmp = value - sArray[ thrk ][ thrj ][ thri ];
-            if ( fabs( tmp ) > 0.01*hx )
+            if ( fabs( tmp ) > 0.001*hx )
                 return true;
             else
                 return false;
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 6703843c1..7e4028fbe 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -26,7 +26,7 @@ template< typename Real,
           typename Anisotropy >
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 FastSweepingMethod()
-: maxIterations( 100 )
+: maxIterations( 1 )
 {
    
 }
@@ -250,7 +250,7 @@ solve( const MeshPointer& mesh,
           
           tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
           
-          aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 );
+          //aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 );
           
           //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) );
 
@@ -261,7 +261,7 @@ solve( const MeshPointer& mesh,
           int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
           int *dBlock;
           cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
-          
+          int oddEvenBlock = 0;
           while( BlockIterD )
           {
            /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
@@ -269,19 +269,30 @@ solve( const MeshPointer& mesh,
                        
             CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
                                                              interfaceMapPtr.template getData< Device >(),
-                                                             dAux,
-                                                             BlockIterDevice );
+                                                             auxPtr.template modifyData< Device>(),
+                                                             BlockIterDevice,
+                                                             oddEvenBlock );
+	    TNL_CHECK_CUDA_DEVICE;
+            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+            CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
+                                                             interfaceMapPtr.template getData< Device >(),
+                                                             auxPtr.template modifyData< Device>(),
+                                                             BlockIterDevice,
+                                                             oddEvenBlock );
+	    TNL_CHECK_CUDA_DEVICE;
+            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
             
             CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+	    TNL_CHECK_CUDA_DEVICE;
             CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
-            
+            TNL_CHECK_CUDA_DEVICE;
             cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
                                    
             /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
                 BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
             
           }
-          aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 );
+          //aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 );
           cudaFree( dAux );
           cudaFree( BlockIterDevice );
           cudaFree( dBlock );
@@ -299,7 +310,7 @@ solve( const MeshPointer& mesh,
 }
 
 #ifdef HAVE_CUDA
-template < typename Real, typename Device, typename Index >
+/*template < typename Real, typename Device, typename Index >
 __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a )
 {
     int i = threadIdx.x + blockDim.x*blockIdx.x;
@@ -314,7 +325,7 @@ __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, In
         aux[ j*mesh.getDimensions().x() + i ] = dAux[ j*mesh.getDimensions().x() + i ];
     }
     
-}
+}*/
 
 __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks )
 {
@@ -366,8 +377,8 @@ __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlock
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      Real *aux,
-                                      int *BlockIterDevice )
+                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+                                      int *BlockIterDevice, int oddEvenBlock )
 {
     int thri = threadIdx.x; int thrj = threadIdx.y;
     int blIdx = blockIdx.x; int blIdy = blockIdx.y;
@@ -417,109 +428,114 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     }
     __syncthreads();
     
-    if( thri == 0 )
-    {        
-        if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik )
-            sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
-        else
-            sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 1 )
-    {
-        if( blIdx != 0 && thrj+1 < ykolik )
-            sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ];
-        else
-            sArray[thrj+1][0] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 2 )
-    {
-        if( dimY > (blIdy+1) * blockDim.y  && thri+1 < xkolik )
-            sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
-        else
-           sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
-    }
-    
-    if( thri == 3 )
+    if( (blIdy%2  + blIdx) % 2 == oddEvenBlock )
     {
-        if( blIdy != 0 && thrj+1 < xkolik )
-            sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ];
-        else
-            sArray[0][thrj+1] = std::numeric_limits< Real >::max();
-    }
     
-        
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
-    {    
-        sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
-    }
-    __syncthreads();  
+        if( thri == 0 )
+        {        
+            if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik )
+                sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
+            else
+                sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
+        }
+
+        if( thri == 1 )
+        {
+            if( blIdx != 0 && thrj+1 < ykolik )
+                sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ];
+            else
+                sArray[thrj+1][0] = std::numeric_limits< Real >::max();
+        }
+
+        if( thri == 2 )
+        {
+            if( dimY > (blIdy+1) * blockDim.y  && thri+1 < xkolik )
+                sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
+            else
+               sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
+        }
+
+        if( thri == 3 )
+        {
+            if( blIdy != 0 && thrj+1 < xkolik )
+                sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ];
+            else
+                sArray[0][thrj+1] = std::numeric_limits< Real >::max();
+        }
+
 
-    while( changed[ 0 ] )
-    {
-        __syncthreads();
-        
-        changed[ currentIndex] = false;
-        
-    //calculation of update cell
         if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+        {    
+            sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
+        }
+        __syncthreads();  
+
+        while( changed[ 0 ] )
         {
-            if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
+            __syncthreads();
+
+            changed[ currentIndex] = false;
+
+        //calculation of update cell
+            if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
             {
-                changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
+                if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
+                {
+                    changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
+                }
             }
-        }
-        __syncthreads();
-        
-    //pyramid reduction
-        if( blockDim.x*blockDim.y == 1024 )
-        {
-            if( currentIndex < 512 )
+            __syncthreads();
+
+        //pyramid reduction
+            if( blockDim.x*blockDim.y == 1024 )
             {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
+                if( currentIndex < 512 )
+                {
+                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
+                }
             }
-        }
-        __syncthreads();
-        if( blockDim.x*blockDim.y >= 512 )
-        {
-            if( currentIndex < 256 )
+            __syncthreads();
+            if( blockDim.x*blockDim.y >= 512 )
             {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+                if( currentIndex < 256 )
+                {
+                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+                }
             }
-        }
-        __syncthreads();
-        if( blockDim.x*blockDim.y >= 256 )
-        {
-            if( currentIndex < 128 )
+            __syncthreads();
+            if( blockDim.x*blockDim.y >= 256 )
             {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
+                if( currentIndex < 128 )
+                {
+                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
+                }
             }
-        }
-        __syncthreads();
-        if( blockDim.x*blockDim.y >= 128 )
-        {
-            if( currentIndex < 64 )
+            __syncthreads();
+            if( blockDim.x*blockDim.y >= 128 )
             {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+                if( currentIndex < 64 )
+                {
+                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+                }
             }
+            __syncthreads();
+            if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
+            {
+                if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+                if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+                if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+                if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+                if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+                if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+            }
+            if( changed[ 0 ] && thri == 0 && thrj == 0 )
+                BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
+            __syncthreads();
         }
-        __syncthreads();
-        if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
-        {
-            if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
-            if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
-            if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
-            if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
-            if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
-            if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
-        }
-        if( changed[ 0 ] && thri == 0 && thrj == 0 )
-            BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
-        __syncthreads();
+        
+        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
+            aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
+
     }
-  
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
-        aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
 }
 #endif
-- 
GitLab


From 970e64480b33f755d2f5b6e441b77d75494ef920 Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Sun, 7 Oct 2018 12:55:16 +0200
Subject: [PATCH 11/20] FIM method implemented. Neighbours are being found on
 CPU. 3D parallel method disabled because of Array changes.

---
 .../tnlDirectEikonalMethodsBase.h             |   9 +-
 .../tnlFastSweepingMethod2D_impl.h            | 199 +++++++++++-------
 .../tnlFastSweepingMethod3D_impl.h            |   4 +-
 3 files changed, 134 insertions(+), 78 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index eb7cbd2a5..c92368deb 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -113,6 +113,8 @@ T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double
 template < typename T1 >
 __cuda_callable__ void sortMinims( T1 pom[] );
 
+template < typename Index >
+void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY );
 
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
@@ -130,8 +132,11 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice, int oddEvenBlock);
-__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks );
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
+
+template < typename Index >
+__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks );
 
 /*template < typename Real, typename Device, typename Index >
 __global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );*/
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 7e4028fbe..817811c84 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -235,13 +235,6 @@ solve( const MeshPointer& mesh,
       {
          // TODO: CUDA code
 #ifdef HAVE_CUDA
-          
-          Real *dAux;
-          cudaMalloc(&dAux, ( mesh->getDimensions().x() * mesh->getDimensions().y() ) * sizeof( Real ) );
-          
-          
-          
-          
           const int cudaBlockSize( 16 );
           int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
           int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
@@ -250,18 +243,30 @@ solve( const MeshPointer& mesh,
           
           tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
           
-          //aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux,1 );
+          TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter;
+          BlockIter.setSize( numBlocksX * numBlocksY );
+          BlockIter.setValue( 0 );
+          /*int* BlockIter = (int*)malloc( ( numBlocksX * numBlocksY ) * sizeof( int ) );
+          for( int i = 0; i < numBlocksX*numBlocksY +1; i++)
+              BlockIter[i] = 1;*/
           
-          //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) );
-
-          int *BlockIterDevice;
           int BlockIterD = 1;
           
+          TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
+          BlockIterDevice.setSize( numBlocksX * numBlocksY );
+          BlockIterDevice.setValue( 1 );
+          /*int *BlockIterDevice;
           cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );
+          cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/
+          
           int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
-          int *dBlock;
-          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
-          int oddEvenBlock = 0;
+          
+          TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
+          dBlock.setSize( nBlocks );
+          dBlock.setValue( 0 );
+          /*int *dBlock;
+          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
+          
           while( BlockIterD )
           {
            /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
@@ -270,89 +275,132 @@ solve( const MeshPointer& mesh,
             CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
                                                              interfaceMapPtr.template getData< Device >(),
                                                              auxPtr.template modifyData< Device>(),
-                                                             BlockIterDevice,
-                                                             oddEvenBlock );
-	    TNL_CHECK_CUDA_DEVICE;
-            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-            CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
-                                                             interfaceMapPtr.template getData< Device >(),
-                                                             auxPtr.template modifyData< Device>(),
-                                                             BlockIterDevice,
-                                                             oddEvenBlock );
-	    TNL_CHECK_CUDA_DEVICE;
-            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+                                                             BlockIterDevice );
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+            
+            BlockIter = BlockIterDevice;
+            //cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost);
+            GetNeighbours( BlockIter, numBlocksX, numBlocksY );
+            
+            BlockIterDevice = BlockIter;
+            //cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+            
+            
+            CudaParallelReduc<<<  nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
             
-            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
-	    TNL_CHECK_CUDA_DEVICE;
             CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+            cudaDeviceSynchronize();
             TNL_CHECK_CUDA_DEVICE;
+            
             cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
                                    
             /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
                 BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
             
           }
-          //aux1<<<gridSize,blockSize>>>( auxPtr.template modifyData< Device>(), dAux, 0 );
-          cudaFree( dAux );
-          cudaFree( BlockIterDevice );
+          /*cudaFree( BlockIterDevice );
           cudaFree( dBlock );
+          delete BlockIter;*/
           cudaDeviceSynchronize();
           
           TNL_CHECK_CUDA_DEVICE;
               
-          //aux = *auxPtr;
-          //interfaceMap = *interfaceMapPtr;
+          aux = *auxPtr;
+          interfaceMap = *interfaceMapPtr;
 #endif
       }
       iteration++;
    }
    aux.save("aux-final.tnl");
 }
-
-#ifdef HAVE_CUDA
-/*template < typename Real, typename Device, typename Index >
-__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a )
+/**
+ * Host-side neighbour propagation for the per-block activity map.
+ *
+ * BlockIter holds one flag per CUDA block, stored row by row:
+ * index = row * numBlockX + column (the update kernel reads the map as
+ * BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ]).
+ * For every flagged block, its lower/upper/right/left neighbours inside the
+ * grid are flagged in a scratch map, which then replaces BlockIter.
+ * NOTE(review): BlockIter is taken by value while the caller uses the result,
+ * so this presumably relies on Array copies sharing their data - confirm.
+ *
+ * \param BlockIter  flags of the blocks to spread from; overwritten with the
+ *                   flags of their neighbours
+ * \param numBlockX  number of blocks along x (row length of the map)
+ * \param numBlockY  number of blocks along y (number of rows)
+ */
+template < typename Index >
+void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY )
 {
-    int i = threadIdx.x + blockDim.x*blockIdx.x;
-    int j = blockDim.y*blockIdx.y + threadIdx.y;
-    const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 1 )
-    {    
-        dAux[ j*mesh.getDimensions().x() + i ] = aux[ j*mesh.getDimensions().x() + i ];
-    }
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 0 )
-    {    
-        aux[ j*mesh.getDimensions().x() + i ] = dAux[ j*mesh.getDimensions().x() + i ];
-    }
-    
-}*/
+    const int numBlocks = numBlockX * numBlockY;
+    TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom;
+    BlockIterPom.setSize( numBlocks );
+    BlockIterPom.setValue( 0 );
+
+    for( int i = 0; i < numBlocks; i++ )
+    {
+        if( ! BlockIter[ i ] )
+            continue;
+
+        // Row-major decomposition: i = k * numBlockX + m.
+        const int m = i % numBlockX;   // column of block i
+        const int k = i / numBlockX;   // row of block i
+        if( k > 0 )
+            BlockIterPom[ i - numBlockX ] = 1;
+        if( k < numBlockY - 1 )
+            BlockIterPom[ i + numBlockX ] = 1;
+        if( m < numBlockX - 1 )
+            BlockIterPom[ i + 1 ] = 1;
+        if( m > 0 )
+            BlockIterPom[ i - 1 ] = 1;
+    }
+
+    // Only the neighbour flags survive; the previous flags are dropped.
+    for( int i = 0; i < numBlocks; i++ )
+        BlockIter[ i ] = BlockIterPom[ i ];
+}
 
-__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks )
+#ifdef HAVE_CUDA
+template < typename Index >
+__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks )
 {
     int i = threadIdx.x;
     int blId = blockIdx.x;
+    /*if ( i == 0 && blId == 0 ){
+            printf( "nBlocks = %d \n", nBlocks );
+        for( int j = nBlocks-1; j > -1 ; j--){
+            printf( "cislo = %d \n", BlockIterDevice[ j ] );
+        }
+    }*/
     __shared__ volatile int sArray[ 512 ];
-    sArray[ i ] = false;
-    if(blId * 1024 + i < nBlocks )
-        sArray[ i ] = BlockIterDevice[ blId * 1024 + i ];
+    sArray[ i ] = 0;
+    if( blId * 512 + i < nBlocks )
+        sArray[ i ] = BlockIterDevice[ blId * 512 + i ];
+    __syncthreads();
     
-    if (blockDim.x * blockDim.y == 1024) {
+    if (blockDim.x == 1024) {
         if (i < 512)
-            sArray[ i ] += sArray[ i ];
+            sArray[ i ] += sArray[ i + 512 ];
     }
     __syncthreads();
-    if (blockDim.x * blockDim.y >= 512) {
+    if (blockDim.x >= 512) {
         if (i < 256) {
-            sArray[ i ] += sArray[ i ];
+            sArray[ i ] += sArray[ i + 256 ];
         }
     }
-    if (blockDim.x * blockDim.y >= 256) {
+    __syncthreads();
+    if (blockDim.x >= 256) {
         if (i < 128) {
             sArray[ i ] += sArray[ i + 128 ];
         }
     }
     __syncthreads();
-    if (blockDim.x * blockDim.y >= 128) {
+    if (blockDim.x >= 128) {
         if (i < 64) {
             sArray[ i ] += sArray[ i + 64 ];
         }
@@ -360,12 +408,12 @@ __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlock
     __syncthreads();
     if (i < 32 )
     {
-        if(  blockDim.x * blockDim.y >= 64 ) sArray[ i ] += sArray[ i + 32 ];
-        if(  blockDim.x * blockDim.y >= 32 )  sArray[ i ] += sArray[ i + 16 ];
-        if(  blockDim.x * blockDim.y >= 16 )  sArray[ i ] += sArray[ i + 8 ];
-        if(  blockDim.x * blockDim.y >= 8 )  sArray[ i ] += sArray[ i + 4 ];
-        if(  blockDim.x * blockDim.y >= 4 )  sArray[ i ] += sArray[ i + 2 ];
-        if(  blockDim.x * blockDim.y >= 2 )  sArray[ i ] += sArray[ i + 1 ];
+        if(  blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ];
+        if(  blockDim.x >= 32 )  sArray[ i ] += sArray[ i + 16 ];
+        if(  blockDim.x >= 16 )  sArray[ i ] += sArray[ i + 8 ];
+        if(  blockDim.x >= 8 )  sArray[ i ] += sArray[ i + 4 ];
+        if(  blockDim.x >= 4 )  sArray[ i ] += sArray[ i + 2 ];
+        if(  blockDim.x >= 2 )  sArray[ i ] += sArray[ i + 1 ];
     }
     
     if( i == 0 )
@@ -378,14 +426,15 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice, int oddEvenBlock )
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
 {
     int thri = threadIdx.x; int thrj = threadIdx.y;
     int blIdx = blockIdx.x; int blIdy = blockIdx.y;
     int i = thri + blockDim.x*blIdx;
     int j = blockDim.y*blIdy + thrj;
     int currentIndex = thrj * blockDim.x + thri;
-    
+    if( BlockIterDevice[ blIdy * gridDim.x + blIdx] )
+    {
     //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
     __shared__ volatile bool changed[16*16];
     changed[ currentIndex ] = false;
@@ -424,13 +473,13 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
 
         if( numOfBlocky -1 == blIdy )
             ykolik = dimY - (blIdy)*blockDim.y+1;
-        BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
+        //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
     }
     __syncthreads();
     
-    if( (blIdy%2  + blIdx) % 2 == oddEvenBlock )
-    {
-    
+        if(thri == 0 && thrj == 0 )
+            BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
+
         if( thri == 0 )
         {        
             if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik )
@@ -528,14 +577,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
                 if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
                 if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
             }
-            if( changed[ 0 ] && thri == 0 && thrj == 0 )
+            if( changed[ 0 ] && thri == 0 && thrj == 0 ){
                 BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
+            }
             __syncthreads();
         }
-        
+
         if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
             aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
-
     }
+    /*if( thri == 0 && thrj == 0 )
+        printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/
 }
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index b024979cc..8c85745cd 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -272,8 +272,8 @@ solve( const MeshPointer& mesh,
                                                               interfaceMapPtr.template getData< Device >(),
                                                               auxPtr.template modifyData< Device>(),
                                                               BlockIterDevice );
-            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
-            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+            //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
+            //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
             
             cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
                                    
-- 
GitLab


From 08ec37be1ec80e299cea295a057a9515e9f04896 Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Mon, 22 Oct 2018 21:13:54 +0200
Subject: [PATCH 12/20] FIM method is now faster than chess method but some
 random error occurs.

---
 .../tnlDirectEikonalMethodsBase.h             |  17 +-
 .../tnlDirectEikonalMethodsBase_impl.h        | 193 ++++
 .../hamilton-jacobi/tnlFastSweepingMethod.h   |   3 +-
 .../tnlFastSweepingMethod2D_impl.h            | 871 ++++++++----------
 .../tnlFastSweepingMethod3D_impl.h            |  33 +-
 5 files changed, 629 insertions(+), 488 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index c92368deb..08ed947ed 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -61,8 +61,9 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
       typedef Index IndexType;
       typedef Functions::MeshFunction< MeshType > MeshFunctionType;
       typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
+      typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
       using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
-      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
+      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
 
       void initInterface( const MeshFunctionPointer& input,
                           MeshFunctionPointer& output,
@@ -76,6 +77,11 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
       __cuda_callable__ bool updateCell( volatile Real sArray[18][18],
                                          int thri, int thrj, const Real hx, const Real hy,
                                          const Real velocity = 1.0 );
+      void updateBlocks( InterfaceMapType interfaceMap,
+                         MeshFunctionType aux,
+                         ArrayContainer BlockIterHost, int numThreadsPerBlock );
+      
+      void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
 };
 
 template< typename Real,
@@ -132,14 +138,15 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 );
 
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
                                    TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks );
 
-/*template < typename Real, typename Device, typename Index >
-__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );*/
+template < typename Index >
+__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+                               /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
@@ -155,7 +162,7 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice );
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
 #endif
 
 #include "tnlDirectEikonalMethodsBase_impl.h"
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index cfea6aca0..1f9fc5eeb 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -89,6 +89,199 @@ initInterface( const MeshFunctionPointer& _input,
     }
 }
 
+// Host version of one block-wise update pass. The mesh is covered by square blocks of
+// numThreadsPerBlock x numThreadsPerBlock cells whose flags are stored row by row
+// ( i = blIdy * numOfBlockx + blIdx ). For every block flagged in BlockIterHost the
+// block and a one-cell halo are copied from aux into a local tile, the non-interface
+// cells are updated by four sweeps with alternating directions and the tile is written
+// back to aux. The flag of a processed block becomes 1 when some updateCell call
+// reported a change and 0 otherwise; flags of blocks that were not flagged stay 0.
+//
+// Blocks in the last block row/column are cut when the mesh size is not a multiple of
+// the block size. Cells outside of the mesh are never read or written; the matching
+// tile entries hold the maximal Real value.
+//
+// interfaceMap        cells marked here are neither updated nor overwritten
+// aux                 values that are updated in place
+// BlockIterHost       per-block flags, non-zero = process this block
+// numThreadsPerBlock  block edge length in cells; has to be in 1..16 because the
+//                     local tile is the Real[18][18] array expected by updateCell
+//
+// NOTE(review): the three containers are taken by value, so the results reach the
+// caller only if these copies share data with the caller's objects (the caller in
+// FastSweepingMethod::solve reads BlockIterHost afterwards) - confirm.
+template< typename Real,
+          typename Device,
+          typename Index >
+void 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+updateBlocks( InterfaceMapType interfaceMap,
+              MeshFunctionType aux,
+              ArrayContainer BlockIterHost, int numThreadsPerBlock )
+{
+  // Mesh properties are the same for all blocks, so they are read only once.
+  MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
+  const int dimX = mesh.getDimensions().x();
+  const int dimY = mesh.getDimensions().y();
+  const Real hx = mesh.getSpaceSteps().x();
+  const Real hy = mesh.getSpaceSteps().y();
+  const Real maxValue = std::numeric_limits< Real >::max();
+
+  // Number of blocks along x. It must be derived from dimX (not dimY) and used for
+  // both the column and the row of a block, otherwise flag i is mapped to a wrong
+  // block as soon as the mesh is not square.
+  const int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
+
+  for( int i = 0; i < BlockIterHost.getSize(); i++ )
+  {
+    // Blocks that are not flagged are skipped.
+    if( ! BlockIterHost[ i ] )
+      continue;
+
+    // Position of the block in the grid of blocks and of its first cell in the mesh.
+    const int blIdx = i % numOfBlockx;
+    const int blIdy = i / numOfBlockx;
+    const int firstX = blIdx * numThreadsPerBlock;
+    const int firstY = blIdy * numThreadsPerBlock;
+
+    // Number of cells of this block inside the mesh in each direction.
+    const int sizeX = ( dimX - firstX < numThreadsPerBlock ) ? dimX - firstX : numThreadsPerBlock;
+    const int sizeY = ( dimY - firstY < numThreadsPerBlock ) ? dimY - firstY : numThreadsPerBlock;
+
+    // Local tile: sArray[ k + 1 ][ l + 1 ] holds the cell ( firstX + l, firstY + k ),
+    // rows 0 and sizeY + 1 and columns 0 and sizeX + 1 hold the halo. Entries that
+    // are not loaded below, i.e. those outside of the mesh, keep the maximal value.
+    Real sArray[18][18];
+    for( int thrj = 0; thrj < numThreadsPerBlock + 2; thrj++ )
+      for( int thri = 0; thri < numThreadsPerBlock + 2; thri++ )
+        sArray[thrj][thri] = maxValue;
+
+    // Halo columns taken from the blocks on the left and on the right.
+    for( int k = 0; k < sizeY; k++ )
+    {
+      if( firstX > 0 )
+        sArray[k+1][0] = aux[ ( firstY + k ) * dimX + firstX - 1 ];
+      if( firstX + sizeX < dimX )
+        sArray[k+1][sizeX+1] = aux[ ( firstY + k ) * dimX + firstX + sizeX ];
+    }
+
+    // Halo rows taken from the blocks with the previous and the next blIdy.
+    for( int l = 0; l < sizeX; l++ )
+    {
+      if( firstY > 0 )
+        sArray[0][l+1] = aux[ ( firstY - 1 ) * dimX + firstX + l ];
+      if( firstY + sizeY < dimY )
+        sArray[sizeY+1][l+1] = aux[ ( firstY + sizeY ) * dimX + firstX + l ];
+    }
+
+    // Cells of the block itself. Only cells inside the mesh are touched here and in
+    // the loops below: testing the flat index against dimX*dimY is not sufficient,
+    // because columns past the right mesh border alias the start of the next row.
+    for( int k = 0; k < sizeY; k++ )
+      for( int l = 0; l < sizeX; l++ )
+        sArray[k+1][l+1] = aux[ ( firstY + k ) * dimX + firstX + l ];
+
+    // Four sweeps over the block, one for each combination of row and column
+    // direction. A change reported in any of them marks the block as changed;
+    // updateCell is always called first so that || cannot skip an update.
+    bool changed = false;
+
+    // Sweep 1: rows ascending, columns ascending.
+    for( int k = 0; k < sizeY; k++ )
+      for( int l = 0; l < sizeX; l++ )
+      {
+        const int cell = ( firstY + k ) * dimX + firstX + l;
+        if( ! interfaceMap[ cell ] )
+          changed = this->updateCell( sArray, l+1, k+1, hx, hy ) || changed;
+      }
+
+    // Sweep 2: rows descending, columns ascending.
+    for( int k = sizeY - 1; k > -1; k-- )
+      for( int l = 0; l < sizeX; l++ )
+      {
+        const int cell = ( firstY + k ) * dimX + firstX + l;
+        if( ! interfaceMap[ cell ] )
+          changed = this->updateCell( sArray, l+1, k+1, hx, hy ) || changed;
+      }
+
+    // Sweep 3: rows ascending, columns descending.
+    for( int k = 0; k < sizeY; k++ )
+      for( int l = sizeX - 1; l > -1; l-- )
+      {
+        const int cell = ( firstY + k ) * dimX + firstX + l;
+        if( ! interfaceMap[ cell ] )
+          changed = this->updateCell( sArray, l+1, k+1, hx, hy ) || changed;
+      }
+
+    // Sweep 4: rows descending, columns descending.
+    for( int k = sizeY - 1; k > -1; k-- )
+      for( int l = sizeX - 1; l > -1; l-- )
+      {
+        const int cell = ( firstY + k ) * dimX + firstX + l;
+        if( ! interfaceMap[ cell ] )
+          changed = this->updateCell( sArray, l+1, k+1, hx, hy ) || changed;
+      }
+
+    // Report the result for this block: 1 when any sweep changed a value, 0 when
+    // the block did not change in this pass.
+    BlockIterHost[ i ] = changed ? 1 : 0;
+
+    // Write the tile back to aux; interface cells keep their original values.
+    for( int k = 0; k < sizeY; k++ )
+      for( int l = 0; l < sizeX; l++ )
+      {
+        const int cell = ( firstY + k ) * dimX + firstX + l;
+        if( ! interfaceMap[ cell ] )
+          aux[ cell ] = sArray[k+1][l+1];
+      }
+  }
+}
+
+// Host variant of the block activation step. The flags are stored row by row
+// ( i = k * numBlockX + m, m = block column, k = block row ). On return a block is
+// flagged exactly when at least one of its four edge neighbours ( i-1, i+1,
+// i-numBlockX, i+numBlockX, where they exist ) was flagged on entry; a flagged
+// block does not keep its own flag unless a neighbour was flagged too.
+//
+// BlockIterHost         per-block flags ( non-zero = flagged ), overwritten
+// numBlockX, numBlockY  number of blocks along x and y
+// NOTE(review): BlockIterHost is taken by value; presumably the copy shares its
+// data with the caller's array, otherwise the result is lost - confirm.
+template< typename Real,
+          typename Device,
+          typename Index >
+void 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )
+{
+  const int numBlocks = numBlockX * numBlockY;
+  // All entries are zeroed before the marking starts. Clearing entry i only when
+  // the loop reaches it would erase the marks that blocks i-1 and i-numBlockX
+  // have already made for their next-column and next-row neighbours.
+  int* BlockIterPom = new int[ numBlocks ]();
+
+  for( int i = 0; i < numBlocks; i++ )
+  {
+    if( ! BlockIterHost[ i ] )
+      continue;
+    const int m = i % numBlockX;   // block column
+    const int k = i / numBlockX;   // block row
+    if( k > 0 )
+      BlockIterPom[ i - numBlockX ] = 1;
+    if( k < numBlockY - 1 )
+      BlockIterPom[ i + numBlockX ] = 1;
+    if( m < numBlockX - 1 )
+      BlockIterPom[ i + 1 ] = 1;
+    if( m > 0 )
+      BlockIterPom[ i - 1 ] = 1;
+  }
+
+  // The flags are replaced only after the marking is complete.
+  for( int i = 0; i < numBlocks; i++ )
+    BlockIterHost[ i ] = BlockIterPom[ i ];
+  delete[] BlockIterPom;
+}
+
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
index fa8077427..60c690e06 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
@@ -88,7 +88,8 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
       using typename BaseType::InterfaceMapType;
       using typename BaseType::MeshFunctionType;
       using typename BaseType::InterfaceMapPointer;
-      using typename BaseType::MeshFunctionPointer;      
+      using typename BaseType::MeshFunctionPointer;
+      using typename BaseType::ArrayContainer;
 
       FastSweepingMethod();
       
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 817811c84..c6cc575d1 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -21,403 +21,348 @@
 #include <fstream>
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 FastSweepingMethod()
 : maxIterations( 1 )
 {
-   
+  
 }
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 const Index&
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 getMaxIterations() const
 {
-   
+  
 }
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 void
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 setMaxIterations( const IndexType& maxIterations )
 {
-   
+  
 }
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 void
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 solve( const MeshPointer& mesh,
-       const AnisotropyPointer& anisotropy,
-       MeshFunctionPointer& u )
-{
-   /*MeshFunctionType v;
-   v.setMesh(mesh);
-   double A[320][320];
-    for (int i = 0; i < 320; i++)
-        for (int j = 0; j < 320; j++)
-            A[i][j] = 0;
-    
-    std::ifstream file("/home/maty/Downloads/mapa2.txt");
-
-    for (int i = 0; i < 320; i++)
-        for (int j = 0; j < 320; j++)
-            file >> A[i][j];
-    file.close();
-    for (int i = 0; i < 320; i++)
-        for (int j = 0; j < 320; j++)
-            v[i*320 + j] = A[i][j];
-   v.save("mapa.tnl");*/
-   
-       
-   MeshFunctionPointer auxPtr;
-   InterfaceMapPointer interfaceMapPtr;
-   auxPtr->setMesh( mesh );
-   interfaceMapPtr->setMesh( mesh );
-   std::cout << "Initiating the interface cells ..." << std::endl;
-   BaseType::initInterface( u, auxPtr, interfaceMapPtr );
+        const AnisotropyPointer& anisotropy,
+        MeshFunctionPointer& u )
+{  
+  MeshFunctionPointer auxPtr;
+  InterfaceMapPointer interfaceMapPtr;
+  auxPtr->setMesh( mesh );
+  interfaceMapPtr->setMesh( mesh );
+  std::cout << "Initiating the interface cells ..." << std::endl;
+  BaseType::initInterface( u, auxPtr, interfaceMapPtr );
+  
+  auxPtr->save( "aux-ini.tnl" );
+  
+  typename MeshType::Cell cell( *mesh );
+  
+  IndexType iteration( 0 );
+  InterfaceMapType interfaceMap = *interfaceMapPtr;
+  MeshFunctionType aux = *auxPtr;
+  
+  
+  
+  
+  while( iteration < this->maxIterations )
+  {
+    if( std::is_same< DeviceType, Devices::Host >::value )
+    {
+      int numThreadsPerBlock = 16;
+      
+      int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
+      int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
+      
+          
+      ArrayContainer BlockIterHost;
+      BlockIterHost.setSize( numBlocksX * numBlocksY );
+      BlockIterHost.setValue( 1 );
+      /*for( int k = numBlocksX-1; k >-1; k-- ){
+        for( int l = 0; l < numBlocksY; l++ ){
+          std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+        }
+        std::cout<<std::endl;
+      }
+      std::cout<<std::endl;*/
+      
+      while( BlockIterHost[ 0 ] )
+      {          
+        this->updateBlocks( interfaceMap, aux, BlockIterHost, numThreadsPerBlock);
         
-   auxPtr->save( "aux-ini.tnl" );
-
-   typename MeshType::Cell cell( *mesh );
-   
-   IndexType iteration( 0 );
-   InterfaceMapType interfaceMap = *interfaceMapPtr;
-   MeshFunctionType aux = *auxPtr;
-   while( iteration < this->maxIterations )
-   {
-      if( std::is_same< DeviceType, Devices::Host >::value )
-      {
-         for( cell.getCoordinates().y() = 0;
+        this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY );
+        
+  //Reduction      
+        for( int k = numBlocksX-1; k >-1; k-- ){
+          for( int l = 0; l < numBlocksY; l++ ){
+            //std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+            BlockIterHost[ 0 ] = BlockIterHost[ 0 ] || BlockIterHost[ l*numBlocksX + k ];
+          }
+          //std::cout<<std::endl;
+        }
+        //std::cout<<std::endl;
+      }
+      /*for( cell.getCoordinates().y() = 0;
               cell.getCoordinates().y() < mesh->getDimensions().y();
               cell.getCoordinates().y()++ )
-         {
-            for( cell.getCoordinates().x() = 0;
-                 cell.getCoordinates().x() < mesh->getDimensions().x();
-                 cell.getCoordinates().x()++ )
-               {
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )
-                     this->updateCell( aux, cell );
-               }
-         }
-
-         //aux.save( "aux-1.tnl" );
-
-         for( cell.getCoordinates().y() = 0;
+      {
+        for( cell.getCoordinates().x() = 0;
+                cell.getCoordinates().x() < mesh->getDimensions().x();
+                cell.getCoordinates().x()++ )
+        {
+          cell.refresh();
+          if( ! interfaceMap( cell ) )
+            this->updateCell( aux, cell );
+        }
+      }
+      
+      //aux.save( "aux-1.tnl" );
+      
+      for( cell.getCoordinates().y() = 0;
               cell.getCoordinates().y() < mesh->getDimensions().y();
               cell.getCoordinates().y()++ )
-         {
-            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                 cell.getCoordinates().x() >= 0 ;
-                 cell.getCoordinates().x()-- )		
-               {
-                  //std::cerr << "2 -> ";
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )            
-                     this->updateCell( aux, cell );
-               }
-         }
-
-         //aux.save( "aux-2.tnl" );
-
-         for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+      {
+        for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                cell.getCoordinates().x() >= 0 ;
+                cell.getCoordinates().x()-- )		
+        {
+          //std::cerr << "2 -> ";
+          cell.refresh();
+          if( ! interfaceMap( cell ) )            
+            this->updateCell( aux, cell );
+        }
+      }
+      
+      //aux.save( "aux-2.tnl" );
+      
+      for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
               cell.getCoordinates().y() >= 0 ;
               cell.getCoordinates().y()-- )
-            {
-            for( cell.getCoordinates().x() = 0;
-                 cell.getCoordinates().x() < mesh->getDimensions().x();
-                 cell.getCoordinates().x()++ )
-               {
-                  //std::cerr << "3 -> ";
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )            
-                     this->updateCell( aux, cell );
-               }
-            }
-
-         //aux.save( "aux-3.tnl" );
-
-         for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+      {
+        for( cell.getCoordinates().x() = 0;
+                cell.getCoordinates().x() < mesh->getDimensions().x();
+                cell.getCoordinates().x()++ )
+        {
+          //std::cerr << "3 -> ";
+          cell.refresh();
+          if( ! interfaceMap( cell ) )            
+            this->updateCell( aux, cell );
+        }
+      }
+      
+      //aux.save( "aux-3.tnl" );
+      
+      for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
               cell.getCoordinates().y() >= 0;
               cell.getCoordinates().y()-- )
-            {
-            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                 cell.getCoordinates().x() >= 0 ;
-                 cell.getCoordinates().x()-- )		
-               {
-                  //std::cerr << "4 -> ";
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )            
-                     this->updateCell( aux, cell );
-               }
-            }
-
-         //aux.save( "aux-4.tnl" );
-
-         /*for( cell.getCoordinates().x() = 0;
-              cell.getCoordinates().x() < mesh->getDimensions().y();
-              cell.getCoordinates().x()++ )
-         {
-            for( cell.getCoordinates().y() = 0;
-                 cell.getCoordinates().y() < mesh->getDimensions().x();
-                 cell.getCoordinates().y()++ )
-               {
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )
-                     this->updateCell( aux, cell );
-               }
-         }     
-
-
-         aux.save( "aux-5.tnl" );
-
-         for( cell.getCoordinates().x() = 0;
-              cell.getCoordinates().x() < mesh->getDimensions().y();
-              cell.getCoordinates().x()++ )
-         {
-            for( cell.getCoordinates().y() = mesh->getDimensions().x() - 1;
-                 cell.getCoordinates().y() >= 0 ;
-                 cell.getCoordinates().y()-- )		
-               {
-                  //std::cerr << "2 -> ";
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )            
-                     this->updateCell( aux, cell );
-               }
-         }
-         aux.save( "aux-6.tnl" );
-
-         for( cell.getCoordinates().x() = mesh->getDimensions().y() - 1;
-              cell.getCoordinates().x() >= 0 ;
-              cell.getCoordinates().x()-- )
-            {
-            for( cell.getCoordinates().y() = 0;
-                 cell.getCoordinates().y() < mesh->getDimensions().x();
-                 cell.getCoordinates().y()++ )
-               {
-                  //std::cerr << "3 -> ";
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )            
-                     this->updateCell( aux, cell );
-               }
-            }
-         aux.save( "aux-7.tnl" );
-
-         for( cell.getCoordinates().x() = mesh->getDimensions().y() - 1;
-              cell.getCoordinates().x() >= 0;
-              cell.getCoordinates().x()-- )
-            {
-            for( cell.getCoordinates().y() = mesh->getDimensions().x() - 1;
-                 cell.getCoordinates().y() >= 0 ;
-                 cell.getCoordinates().y()-- )		
-               {
-                  //std::cerr << "4 -> ";
-                  cell.refresh();
-                  if( ! interfaceMap( cell ) )            
-                     this->updateCell( aux, cell );
-               }
-            }*/
-      }
-      if( std::is_same< DeviceType, Devices::Cuda >::value )
       {
-         // TODO: CUDA code
+        for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                cell.getCoordinates().x() >= 0 ;
+                cell.getCoordinates().x()-- )		
+        {
+          //std::cerr << "4 -> ";
+          cell.refresh();
+          if( ! interfaceMap( cell ) )            
+            this->updateCell( aux, cell );
+        }
+      }*/
+    }
+    if( std::is_same< DeviceType, Devices::Cuda >::value )
+    {
+      // TODO: CUDA code
 #ifdef HAVE_CUDA
-          const int cudaBlockSize( 16 );
-          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
-          int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
-          dim3 blockSize( cudaBlockSize, cudaBlockSize );
-          dim3 gridSize( numBlocksX, numBlocksY );
-          
-          tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
-          
-          TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter;
-          BlockIter.setSize( numBlocksX * numBlocksY );
-          BlockIter.setValue( 0 );
-          /*int* BlockIter = (int*)malloc( ( numBlocksX * numBlocksY ) * sizeof( int ) );
-          for( int i = 0; i < numBlocksX*numBlocksY +1; i++)
-              BlockIter[i] = 1;*/
-          
-          int BlockIterD = 1;
-          
-          TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
-          BlockIterDevice.setSize( numBlocksX * numBlocksY );
-          BlockIterDevice.setValue( 1 );
-          /*int *BlockIterDevice;
-          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );
-          cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/
-          
-          int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
-          
-          TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
-          dBlock.setSize( nBlocks );
-          dBlock.setValue( 0 );
-          /*int *dBlock;
-          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
-          
-          while( BlockIterD )
-          {
-           /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
-                BlockIter[ i ] = false;*/
-                       
-            CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
-                                                             interfaceMapPtr.template getData< Device >(),
-                                                             auxPtr.template modifyData< Device>(),
-                                                             BlockIterDevice );
-            cudaDeviceSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-            
-            BlockIter = BlockIterDevice;
-            //cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost);
-            GetNeighbours( BlockIter, numBlocksX, numBlocksY );
-            
-            BlockIterDevice = BlockIter;
-            //cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);
-            cudaDeviceSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-            
-            
-            CudaParallelReduc<<<  nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
-            cudaDeviceSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-            
-            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
-            cudaDeviceSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-            
-            cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
-                                   
-            /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
-                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
-            
-          }
-          /*cudaFree( BlockIterDevice );
-          cudaFree( dBlock );
-          delete BlockIter;*/
-          cudaDeviceSynchronize();
-          
-          TNL_CHECK_CUDA_DEVICE;
-              
-          aux = *auxPtr;
-          interfaceMap = *interfaceMapPtr;
-#endif
+      TNL_CHECK_CUDA_DEVICE;
+      const int cudaBlockSize( 16 );
+      int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+      int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
+      dim3 blockSize( cudaBlockSize, cudaBlockSize );
+      dim3 gridSize( numBlocksX, numBlocksY );
+      
+      tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
+      
+      int BlockIterD = 1;
+      
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
+      BlockIterDevice.setSize( numBlocksX * numBlocksY );
+      BlockIterDevice.setValue( 1 );
+      TNL_CHECK_CUDA_DEVICE;
+      int ne = 0;
+      CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
+                                                       interfaceMapPtr.template getData< Device >(),
+                                                       auxPtr.template modifyData< Device>(),
+                                                       BlockIterDevice, ne);
+      TNL_CHECK_CUDA_DEVICE;
+      
+      /*TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
+      BlockIterPom.setSize( numBlocksX * numBlocksY  );
+      BlockIterPom.setValue( 0 );*/
+      /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
+      BlockIterPom1.setSize( numBlocksX * numBlocksY  );
+      BlockIterPom1.setValue( 0 );*/
+      /*int *BlockIterDevice;
+       cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
+      int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
+      //std::cout << "nBlocksNeigh = " << nBlocksNeigh << std::endl;
+      //free( BlockIter );
+      /*int *BlockIterPom;
+       cudaMalloc((void**) &BlockIterPom, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
+      
+      int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
+      dBlock.setSize( nBlocks  );
+      TNL_CHECK_CUDA_DEVICE;
+      /*int *dBlock;
+       cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/
+      //int pocIter = 0;
+      while( BlockIterD )
+      {
+        /*BlockIterPom1 = BlockIterDevice;
+        for( int j = numBlocksY-1; j>-1; j-- ){
+          for( int i = 0; i < numBlocksX; i++ )
+            std::cout << BlockIterPom1[ j * numBlocksX + i ];
+          std::cout << std::endl;
+        }
+        std::cout << std::endl;*/
+        
+        CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
+                                                         interfaceMapPtr.template getData< Device >(),
+                                                         auxPtr.template modifyData< Device>(),
+                                                         BlockIterDevice, 1);
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        
+        /*int poc = 0;
+        for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+          if( BlockIterPom1[ i ] )
+            poc = poc+1;
+        std::cout << "pocet bloku, ktere se pocitali = " << poc << std::endl;*/
+        
+        GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, /*BlockIterPom,*/ numBlocksX, numBlocksY );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        
+        CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+        TNL_CHECK_CUDA_DEVICE;
+        
+        CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+        TNL_CHECK_CUDA_DEVICE;
+        
+        BlockIterD = dBlock.getElement( 0 );
+        //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        
+        /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
+         BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
+        //pocIter ++;
       }
-      iteration++;
-   }
-   aux.save("aux-final.tnl");
+      cudaDeviceSynchronize();
+      TNL_CHECK_CUDA_DEVICE;
+      
+      //std::cout<< pocIter << std::endl;
+      
+      aux = *auxPtr;
+      interfaceMap = *interfaceMapPtr;
+#endif
+    }
+    iteration++;
+  }
+  aux.save("aux-final.tnl");
 }
+
+#ifdef HAVE_CUDA
 template < typename Index >
-void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY )
+__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+                               /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY )
 {
-    TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom;
-    BlockIterPom.setSize( numBlockX * numBlockY );
-    BlockIterPom.setValue( 0 );
-  /*int* BlockIterPom; 
-  BlockIterPom = new int[numBlockX * numBlockY];*/
-  /*for(int i = 0; i < numBlockX * numBlockY; i++)
-    BlockIterPom[ i ] = 0;*/
-  for(int i = 0; i < numBlockX * numBlockY; i++)
+  int i = blockIdx.x * 1024 + threadIdx.x; // flat flag index; the fixed 1024 stride assumes blockDim.x == 1024
+  // Each thread handles one flag: entry i becomes 1 iff one of its 4 neighbours in the numBlockX x numBlockY grid is non-zero.
+  if( i < numBlockX * numBlockY ) // threads past the last flag do nothing
   {
-      
-      if( BlockIter[ i ] )
-      {
-          // i = k*numBlockY + m;
-          int m=0, k=0;
-          m = i%numBlockY;
-          k = i/numBlockY;
-          if( k > 0 && numBlockY > 1 )
-            BlockIterPom[i - numBlockX] = 1;
-          if( k < numBlockY-1 && numBlockY > 1 )
-            BlockIterPom[i + numBlockX] = 1;
+    int pom = 0;// new value for flag i (was: BlockIterPom[ i ] = 0;)
+    int m=0, k=0;
+    m = i%numBlockX; // column of flag i (row-major layout, numBlockX flags per row)
+    k = i/numBlockX; // row of flag i
+    if( m > 0 ) // neighbour in the previous column
+      if( BlockIterDevice[ i - 1 ] )
+        pom = 1;//BlockIterPom[ i ] = 1;
+    if( m < numBlockX -1 && pom == 0 ) // neighbour in the next column; skipped once pom is set
+      if( BlockIterDevice[ i + 1 ] )
+        pom = 1;//BlockIterPom[ i ] = 1;
+    if( k > 0 && pom == 0 ) // neighbour in the previous row
+      if( BlockIterDevice[ i - numBlockX ] )
+        pom = 1;// BlockIterPom[ i ] = 1;
+    if( k < numBlockY -1 && pom == 0 ) // neighbour in the next row
+      if( BlockIterDevice[ i + numBlockX ] )
+        pom = 1;//BlockIterPom[ i ] = 1;
+    
           
-          if( m >= 0 && m < numBlockX - 1 && numBlockX > 1 )
-              BlockIterPom[ i+1 ] = 1;
-          if( m <= numBlockX -1 && m > 0 && numBlockX > 1 )
-              BlockIterPom[ i-1 ] = 1;
-      }
-  }
-  for(int i = 0; i < numBlockX * numBlockY; i++ ){
-///      if( !BlockIter[ i ] )
-        BlockIter[ i ] = BlockIterPom[ i ];
-///      else
-///        BlockIter[ i ] = 0;
-  }
-  /*for( int i = numBlockX-1; i > -1; i-- )
-  {
-      for( int j = 0; j< numBlockY; j++ )
-          std::cout << BlockIter[ i*numBlockY + j ];
-      std::cout << std::endl;
+    // NOTE(review): in-place store; other threads read flag i as their neighbour and nothing orders those reads against this write - confirm.
+    BlockIterDevice[ i ] = pom;// flag i's own previous value is discarded (was: BlockIterPom[ i ])
   }
-  std::cout << std::endl;*/
-  //delete[] BlockIterPom;
 }
 
-#ifdef HAVE_CUDA
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
                                    TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks )
 {
-    int i = threadIdx.x;
-    int blId = blockIdx.x;
-    /*if ( i == 0 && blId == 0 ){
-            printf( "nBlocks = %d \n", nBlocks );
-        for( int j = nBlocks-1; j > -1 ; j--){
-            printf( "cislo = %d \n", BlockIterDevice[ j ] );
-        }
-    }*/
-    __shared__ volatile int sArray[ 512 ];
-    sArray[ i ] = 0;
-    if( blId * 512 + i < nBlocks )
-        sArray[ i ] = BlockIterDevice[ blId * 512 + i ];
-    __syncthreads();
-    
-    if (blockDim.x == 1024) {
-        if (i < 512)
-            sArray[ i ] += sArray[ i + 512 ];
-    }
-    __syncthreads();
-    if (blockDim.x >= 512) {
-        if (i < 256) {
-            sArray[ i ] += sArray[ i + 256 ];
-        }
-    }
-    __syncthreads();
-    if (blockDim.x >= 256) {
-        if (i < 128) {
-            sArray[ i ] += sArray[ i + 128 ];
-        }
+  int i = threadIdx.x; // Kernel summary: block blId adds the BlockIterDevice entries it loads below and stores the sum in dBlock[ blId ].
+  int blId = blockIdx.x;
+  __shared__ volatile int sArray[ 512 ]; // NOTE(review): 512 slots only, yet the blockDim.x == 1024 branch below indexes up to i + 512 - looks out of range
+  sArray[ i ] = 0; // threads with no input element contribute 0
+  if(blId * 512 + i < nBlocks ) // nBlocks is used as the input length; block blId starts at entry blId * 512
+    sArray[ i ] = BlockIterDevice[ blId * 512 + i ];
+  __syncthreads(); // barrier between the loads and the first reduction step
+  if (blockDim.x == 1024) {
+    if (i < 512)
+      sArray[ i ] += sArray[ i + 512 ];
+  }
+  __syncthreads();
+  if (blockDim.x  >= 512) {
+    if (i < 256) {
+      sArray[ i ] += sArray[ i + 256 ];
     }
-    __syncthreads();
-    if (blockDim.x >= 128) {
-        if (i < 64) {
-            sArray[ i ] += sArray[ i + 64 ];
-        }
+  }
+  if (blockDim.x >= 256) { // NOTE(review): no __syncthreads() separates this step from the 256-offset step above, unlike the other steps - possible race
+    if (i < 128) {
+      sArray[ i ] += sArray[ i + 128 ];
     }
-    __syncthreads();
-    if (i < 32 )
-    {
-        if(  blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ];
-        if(  blockDim.x >= 32 )  sArray[ i ] += sArray[ i + 16 ];
-        if(  blockDim.x >= 16 )  sArray[ i ] += sArray[ i + 8 ];
-        if(  blockDim.x >= 8 )  sArray[ i ] += sArray[ i + 4 ];
-        if(  blockDim.x >= 4 )  sArray[ i ] += sArray[ i + 2 ];
-        if(  blockDim.x >= 2 )  sArray[ i ] += sArray[ i + 1 ];
+  }
+  __syncthreads();
+  if (blockDim.x >= 128) {
+    if (i < 64) {
+      sArray[ i ] += sArray[ i + 64 ];
     }
-    
-    if( i == 0 )
-        dBlock[ blId ] = sArray[ 0 ];
+  }
+  __syncthreads();
+  if (i < 32 ) // last steps run without barriers - presumably relies on volatile sArray plus lockstep execution within one warp; TODO confirm
+  { // NOTE(review): if blockDim.x is not a power of two, slots at or above the largest power of two below it are never folded into sArray[ 0 ] - confirm launch sizes
+    if(  blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ];
+    if(  blockDim.x >= 32 )  sArray[ i ] += sArray[ i + 16 ];
+    if(  blockDim.x >= 16 )  sArray[ i ] += sArray[ i + 8 ];
+    if(  blockDim.x >= 8 )  sArray[ i ] += sArray[ i + 4 ];
+    if(  blockDim.x >= 4 )  sArray[ i ] += sArray[ i + 2 ];
+    if(  blockDim.x >= 2 )  sArray[ i ] += sArray[ i + 1 ];
+  }
+  // thread 0 writes this block's result
+  if( i == 0 )
+    dBlock[ blId ] = sArray[ 0 ];
 }
 
 
@@ -426,10 +371,40 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne )
 {
-    int thri = threadIdx.x; int thrj = threadIdx.y;
-    int blIdx = blockIdx.x; int blIdy = blockIdx.y;
+  int thri = threadIdx.x; int thrj = threadIdx.y;
+  int blIdx = blockIdx.x; int blIdy = blockIdx.y;
+  int grIdx = gridDim.x;
+  
+  if( BlockIterDevice[ blIdy * grIdx + blIdx] )
+  {
+  
+    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+    
+    int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+    __shared__ volatile int numOfBlockx;
+    __shared__ volatile int numOfBlocky;
+    __shared__ int xkolik;
+    __shared__ int ykolik;
+    __shared__ volatile int NE;
+    if( thri == 0 && thrj == 0 )
+    {
+      xkolik = blockDim.x + 1;
+      ykolik = blockDim.y + 1;
+      numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
+      numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
+      
+      if( numOfBlockx - 1 == blIdx )
+        xkolik = dimX - (blIdx)*blockDim.x+1;
+      
+      if( numOfBlocky -1 == blIdy )
+        ykolik = dimY - (blIdy)*blockDim.y+1;
+        BlockIterDevice[ blIdy * grIdx + blIdx ] = 0;
+        NE = ne;
+    }
+    __syncthreads();
+   
     int i = thri + blockDim.x*blIdx;
     int j = blockDim.y*blIdy + thrj;
     int currentIndex = thrj * blockDim.x + thri;
@@ -438,17 +413,15 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
     __shared__ volatile bool changed[16*16];
     changed[ currentIndex ] = false;
-    
     if( thrj == 0 && thri == 0 )
-        changed[ 0 ] = true;
+      changed[ 0 ] = true;
     
-    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
     __shared__ Real hx;
     __shared__ Real hy;
     if( thrj == 1 && thri == 1 )
     {
-        hx = mesh.getSpaceSteps().x();
-        hy = mesh.getSpaceSteps().y();
+      hx = mesh.getSpaceSteps().x();
+      hy = mesh.getSpaceSteps().y();
     }
     
     //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
@@ -456,137 +429,89 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     sArray[thrj][thri] = std::numeric_limits< Real >::max();
     
     //filling sArray edges
-    int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
-    __shared__ volatile int numOfBlockx;
-    __shared__ volatile int numOfBlocky;
-    __shared__ int xkolik;
-    __shared__ int ykolik;
-    if( thri == 0 && thrj == 0 )
+    if( thri == 0 )
+    {        
+      if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik && NE == 1 )
+        sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
+      else
+        sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
+    }
+    
+    if( thri == 1 )
     {
-        xkolik = blockDim.x + 1;
-        ykolik = blockDim.y + 1;
-        numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
-        numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
+      if( blIdx != 0 && thrj+1 < ykolik && NE == 1 )
+        sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ];
+      else
+        sArray[thrj+1][0] = std::numeric_limits< Real >::max();
+    }
     
-        if( numOfBlockx - 1 == blIdx )
-            xkolik = dimX - (blIdx)*blockDim.x+1;
-
-        if( numOfBlocky -1 == blIdy )
-            ykolik = dimY - (blIdy)*blockDim.y+1;
-        //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
+    if( thri == 2 )
+    {
+      if( dimY > (blIdy+1) * blockDim.y  && thri+1 < xkolik && NE == 1 )
+        sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
+      else
+        sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
     }
-    __syncthreads();
     
-        if(thri == 0 && thrj == 0 )
-            BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
-
-        if( thri == 0 )
-        {        
-            if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik )
-                sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
-            else
-                sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
-        }
-
-        if( thri == 1 )
         {
-            if( blIdx != 0 && thrj+1 < ykolik )
-                sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ];
-            else
-                sArray[thrj+1][0] = std::numeric_limits< Real >::max();
+          changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
         }
-
-        if( thri == 2 )
+      }
+      __syncthreads();
+      
+      //pyramid reduction
+      if( blockDim.x*blockDim.y == 1024 )
+      {
+        if( currentIndex < 512 )
         {
-            if( dimY > (blIdy+1) * blockDim.y  && thri+1 < xkolik )
-                sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
-            else
-               sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
         }
-
-        if( thri == 3 )
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y >= 512 )
+      {
+        if( currentIndex < 256 )
         {
-            if( blIdy != 0 && thrj+1 < xkolik )
-                sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ];
-            else
-                sArray[0][thrj+1] = std::numeric_limits< Real >::max();
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
         }
-
-
-        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
-        {    
-            sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
-        }
-        __syncthreads();  
-
-        while( changed[ 0 ] )
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y >= 256 )
+      {
+        if( currentIndex < 128 )
         {
-            __syncthreads();
-
-            changed[ currentIndex] = false;
-
-        //calculation of update cell
-            if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
-            {
-                if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
-                {
-                    changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
-                }
-            }
-            __syncthreads();
-
-        //pyramid reduction
-            if( blockDim.x*blockDim.y == 1024 )
-            {
-                if( currentIndex < 512 )
-                {
-                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
-                }
-            }
-            __syncthreads();
-            if( blockDim.x*blockDim.y >= 512 )
-            {
-                if( currentIndex < 256 )
-                {
-                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
-                }
-            }
-            __syncthreads();
-            if( blockDim.x*blockDim.y >= 256 )
-            {
-                if( currentIndex < 128 )
-                {
-                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
-                }
-            }
-            __syncthreads();
-            if( blockDim.x*blockDim.y >= 128 )
-            {
-                if( currentIndex < 64 )
-                {
-                    changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
-                }
-            }
-            __syncthreads();
-            if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
-            {
-                if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
-                if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
-                if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
-                if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
-                if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
-                if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
-            }
-            if( changed[ 0 ] && thri == 0 && thrj == 0 ){
-                BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
-            }
-            __syncthreads();
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
         }
-
-        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
-            aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y >= 128 )
+      {
+        if( currentIndex < 64 )
+        {
+    if( thri == 3 )
+    {
+      if( blIdy != 0 && thrj+1 < xkolik && NE == 1 )
+        sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ];
+      else
+        sArray[0][thrj+1] = std::numeric_limits< Real >::max();
+    }
+    
+    
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+    {    
+      sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
     }
-    /*if( thri == 0 && thrj == 0 )
-        printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/
+    __syncthreads();  
+    
+    while( changed[ 0 ] )
+    {
+      __syncthreads();
+      
+      changed[ currentIndex] = false;
+      
+      //calculation of update cell
+      if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+      {
+        if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
 }
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 8c85745cd..4daf9fc92 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -258,13 +258,21 @@ solve( const MeshPointer& mesh,
                  
           tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
           
-          int *BlockIterDevice;
+          
           int BlockIterD = 1;
           
-          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );
+          TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
+          BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ );
+          BlockIterDevice.setValue( 1 );
+          /*int *BlockIterDevice;
+          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/
           int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
-          int *dBlock;
-          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
+          
+          TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
+          dBlock.setSize( nBlocks );
+          dBlock.setValue( 0 );
+          /*int *dBlock;
+          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
           
           while( BlockIterD )
           {
@@ -272,17 +280,24 @@ solve( const MeshPointer& mesh,
                                                               interfaceMapPtr.template getData< Device >(),
                                                               auxPtr.template modifyData< Device>(),
                                                               BlockIterDevice );
-            //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
-            //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+            
+            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
             
+            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+            cudaDeviceSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
             cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
                                    
             /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
                 BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
             
           }
-          cudaFree( BlockIterDevice );
-          cudaFree( dBlock );
+          //cudaFree( BlockIterDevice );
+          //cudaFree( dBlock );
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
           aux = *auxPtr;
@@ -302,7 +317,7 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice )
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
 {
     int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
     int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
-- 
GitLab


From d40a55e3f1e9795f5ca3ef669fe980b8effeb1e3 Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Tue, 30 Oct 2018 18:38:41 +0100
Subject: [PATCH 13/20] FIM method implemented for GPU and FIM-FSM implemented
 for CPU (parallel).

---
 .../tnlDirectEikonalMethodsBase.h             |   18 +-
 .../tnlDirectEikonalMethodsBase_impl.h        | 2045 +++++++++--------
 .../tnlFastSweepingMethod2D_impl.h            |  620 +++--
 3 files changed, 1440 insertions(+), 1243 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index 08ed947ed..cbb1a1ff6 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -74,12 +74,16 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
                                          const MeshEntity& cell,
                                          const RealType velocity = 1.0 );
       
-      __cuda_callable__ bool updateCell( volatile Real sArray[18][18],
+      template< int sizeSArray >
+      __cuda_callable__ bool updateCell( volatile Real *sArray,
                                          int thri, int thrj, const Real hx, const Real hy,
                                          const Real velocity = 1.0 );
+      
+      template< int sizeSArray >
       void updateBlocks( InterfaceMapType interfaceMap,
                          MeshFunctionType aux,
-                         ArrayContainer BlockIterHost, int numThreadsPerBlock );
+                         MeshFunctionType helpFunc,
+                         ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
       
       void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
 };
@@ -119,9 +123,6 @@ T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double
 template < typename T1 >
 __cuda_callable__ void sortMinims( T1 pom[] );
 
-template < typename Index >
-void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY );
-
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
@@ -134,11 +135,12 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
                                       Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux,
                                       bool *BlockIterDevice );
 
-template < typename Real, typename Device, typename Index >
+template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 );
+                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock =0);
 
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 1f9fc5eeb..95971c9b8 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -1,4 +1,4 @@
- /* 
+/* 
  * File:   tnlDirectEikonalMethodsBase_impl.h
  * Author: oberhuber
  *
@@ -13,233 +13,259 @@
 #include "tnlFastSweepingMethod.h"
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
 initInterface( const MeshFunctionPointer& _input,
-               MeshFunctionPointer& _output,
-               InterfaceMapPointer& _interfaceMap  )
+        MeshFunctionPointer& _output,
+        InterfaceMapPointer& _interfaceMap  )
 {
-    if( std::is_same< Device, Devices::Cuda >::value )
-    {
+  if( std::is_same< Device, Devices::Cuda >::value )
+  {
 #ifdef HAVE_CUDA
-        const MeshType& mesh = _input->getMesh();
-        
-        const int cudaBlockSize( 16 );
-        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
-        dim3 blockSize( cudaBlockSize );
-        dim3 gridSize( numBlocksX );
-        Devices::Cuda::synchronizeDevice();
-        CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
-                                                   _output.template modifyData< Device >(),
-                                                   _interfaceMap.template modifyData< Device >() );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+    const MeshType& mesh = _input->getMesh();
+    
+    const int cudaBlockSize( 16 );
+    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    dim3 blockSize( cudaBlockSize );
+    dim3 gridSize( numBlocksX );
+    Devices::Cuda::synchronizeDevice();
+    CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+            _output.template modifyData< Device >(),
+            _interfaceMap.template modifyData< Device >() );
+    cudaDeviceSynchronize();
+    TNL_CHECK_CUDA_DEVICE;
 #endif
-    }
-    if( std::is_same< Device, Devices::Host >::value )
-    {
-        const MeshType& mesh = _input->getMesh();
-        typedef typename MeshType::Cell Cell;
-        const MeshFunctionType& input = _input.getData();
-        MeshFunctionType& output = _output.modifyData();
-        InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
-        Cell cell( mesh );
-        for( cell.getCoordinates().x() = 0;
+  }
+  if( std::is_same< Device, Devices::Host >::value )
+  {
+    const MeshType& mesh = _input->getMesh();
+    typedef typename MeshType::Cell Cell;
+    const MeshFunctionType& input = _input.getData();
+    MeshFunctionType& output = _output.modifyData();
+    InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
+    Cell cell( mesh );
+    for( cell.getCoordinates().x() = 0;
             cell.getCoordinates().x() < mesh.getDimensions().x();
             cell.getCoordinates().x() ++ )
-           {
-               cell.refresh();
-               output[ cell.getIndex() ] =
-               input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
-                                  -std::numeric_limits< RealType >::max();
-               interfaceMap[ cell.getIndex() ] = false;
-           }
-        
-        
-        const RealType& h = mesh.getSpaceSteps().x();
-        for( cell.getCoordinates().x() = 0;
-             cell.getCoordinates().x() < mesh.getDimensions().x() - 1;
-             cell.getCoordinates().x() ++ )
+    {
+      cell.refresh();
+      output[ cell.getIndex() ] =
+              input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
+                -std::numeric_limits< RealType >::max();
+      interfaceMap[ cell.getIndex() ] = false;
+    }
+    
+    
+    const RealType& h = mesh.getSpaceSteps().x();
+    for( cell.getCoordinates().x() = 0;
+            cell.getCoordinates().x() < mesh.getDimensions().x() - 1;
+            cell.getCoordinates().x() ++ )
+    {
+      cell.refresh();
+      const RealType& c = input( cell );      
+      if( ! cell.isBoundaryEntity()  )
+      {
+        const auto& neighbors = cell.getNeighborEntities();
+        Real pom = 0;
+        //const IndexType& c = cell.getIndex();
+        const IndexType e = neighbors.template getEntityIndex<  1 >();
+        if( c * input[ e ] <= 0 )
         {
-           cell.refresh();
-           const RealType& c = input( cell );      
-           if( ! cell.isBoundaryEntity()  )
-           {
-              const auto& neighbors = cell.getNeighborEntities();
-              Real pom = 0;
-              //const IndexType& c = cell.getIndex();
-              const IndexType e = neighbors.template getEntityIndex<  1 >();
-              if( c * input[ e ] <= 0 )
-              {
-                pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
-                if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
-                    output[ cell.getIndex() ] = pom;
-
-                pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-                if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
-                    output[ e ] = pom; 
-
-                interfaceMap[ cell.getIndex() ] = true;
-                interfaceMap[ e ] = true;
-              }
-           }
+          pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
+          if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
+            output[ cell.getIndex() ] = pom;
+          
+          pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+          if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
+            output[ e ] = pom; 
+          
+          interfaceMap[ cell.getIndex() ] = true;
+          interfaceMap[ e ] = true;
         }
+      }
     }
+  }
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
-void 
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 updateBlocks( InterfaceMapType interfaceMap,
-              MeshFunctionType aux,
-              ArrayContainer BlockIterHost, int numThreadsPerBlock )
+        MeshFunctionType aux,
+        MeshFunctionType helpFunc,
+        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
 {
+#pragma omp parallel for schedule( dynamic )
   for( int i = 0; i < BlockIterHost.getSize(); i++ )
   {
     if( BlockIterHost[ i ] )
     {
       MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
-    
+      
       int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
-      int numOfBlockx = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
-      int numOfBlocky = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
+      //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl;
+      int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
+      //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl;
       int xkolik = numThreadsPerBlock + 1;
       int ykolik = numThreadsPerBlock + 1;
       
       int blIdx = i%numOfBlockx;
-      int blIdy = i/numOfBlocky;
+      int blIdy = i/numOfBlockx;
+      //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl;
       
       if( numOfBlockx - 1 == blIdx )
         xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
       
       if( numOfBlocky -1 == blIdy )
         ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
-    
-        
+      //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl;
+      
+      
       /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
-      changed[ 0 ] = 1;*/
+       changed[ 0 ] = 1;*/
       Real hx = mesh.getSpaceSteps().x();
       Real hy = mesh.getSpaceSteps().y();
       
-      Real changed1[ 16*16 ];
-      /*Real changed2[ 16*16 ];
-      Real changed3[ 16*16 ];
-      Real changed4[ 16*16 ];*/
-      Real sArray[18][18];
+      bool changed = false;
+      
+      
+      RealType *sArray;
+      sArray = new Real[ sizeSArray * sizeSArray ];
+      if( sArray == nullptr )
+        std::cout << "Error while allocating memory for sArray." << std::endl;
+      
+      for( int thri = 0; thri < sizeSArray; thri++ ){
+        for( int thrj = 0; thrj < sizeSArray; thrj++ )
+          sArray/*[i]*/[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
+      }
       
-      for( int thri = 0; thri < numThreadsPerBlock + 2; thri++ )
-        for( int thrj = 0; thrj < numThreadsPerBlock + 2; thrj++ )
-          sArray[thrj][thri] = std::numeric_limits< Real >::max();
-    
       BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
-    
+      
       for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
       {        
         if( dimX > (blIdx+1) * numThreadsPerBlock  && thrj+1 < ykolik )
-          sArray[thrj+1][xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
-        else
-         sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
-      
-    
+          sArray/*[i]*/[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
+        
+        
         if( blIdx != 0 && thrj+1 < ykolik )
-          sArray[thrj+1][0] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
-        else
-          sArray[thrj+1][0] = std::numeric_limits< Real >::max();
-    
+          sArray/*[i]*/[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
+        
         if( dimY > (blIdy+1) * numThreadsPerBlock  && thrj+1 < xkolik )
-          sArray[ykolik][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
-        else
-          sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
-      
+          sArray/*[i]*/[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
+        
         if( blIdy != 0 && thrj+1 < xkolik )
-          sArray[0][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
-        else
-          sArray[0][thrj+1] = std::numeric_limits< Real >::max();
+          sArray/*[i]*/[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
       }
-    
-      for( int k = 0; k < numThreadsPerBlock; k++ )
-        for( int l = 0; l < numThreadsPerBlock; l++ ) 
-          sArray[k+1][l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
-
-      for( int k = 0; k < numThreadsPerBlock; k++ ) 
+      
+      for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ )
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
+            sArray/*[i]*/[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
+      }
+      bool pom = false;
+      for( int k = 0; k < numThreadsPerBlock; k++ ){ 
         for( int l = 0; l < numThreadsPerBlock; l++ ){
-          changed1[ k*numThreadsPerBlock + l ] = 0;
-          /*changed2[ k*numThreadsPerBlock + l ] = 0;
-          changed3[ k*numThreadsPerBlock + l ] = 0;
-          changed4[ k*numThreadsPerBlock + l ] = 0;*/
-          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
-          {
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){
+            //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              changed1[ k*numThreadsPerBlock + l ] = this->updateCell( sArray, l+1, k+1, hx,hy);
+              pom = this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
+              changed = changed || pom;
             }
           }
         }
-
-      for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
-        for( int l = 0; l < numThreadsPerBlock; l++ ) { 
-          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
+      }
+      /*aux.save( "aux-1pruch.tnl" );
+      for( int k = 0; k < sizeSArray; k++ ){ 
+        for( int l = 0; l < sizeSArray; l++ ) {
+          std::cout << sArray[ k * sizeSArray + l] << " ";
+        }
+        std::cout << std::endl;
+      }*/
+           
+      for( int k = 0; k < numThreadsPerBlock; k++ ) 
+        for( int l = numThreadsPerBlock-1; l >-1; l-- ) { 
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
           {
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              /*changed2[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
+              this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
             }
           }
         }
-
-      for( int k = 0; k < numThreadsPerBlock; k++ ) 
-        for( int l = numThreadsPerBlock-1; l >-1; l-- ) { 
-          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
+      /*aux.save( "aux-2pruch.tnl" );
+      for( int k = 0; k < sizeSArray; k++ ){ 
+        for( int l = 0; l < sizeSArray; l++ ) {
+          std::cout << sArray[ k * sizeSArray + l] << " ";
+        }
+        std::cout << std::endl;
+      }*/
+      
+      for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
+        for( int l = 0; l < numThreadsPerBlock; l++ ) {
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
           {
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              /*changed3[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
+              this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
             }
           }
         }
-
-      for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
+      /*aux.save( "aux-3pruch.tnl" );
+      for( int k = 0; k < sizeSArray; k++ ){ 
+        for( int l = 0; l < sizeSArray; l++ ) {
+          std::cout << sArray[ k * sizeSArray + l] << " ";
+        }
+        std::cout << std::endl;
+      }*/
+      
+      for( int k = numThreadsPerBlock-1; k > -1; k-- ){
         for( int l = numThreadsPerBlock-1; l >-1; l-- ) { 
-          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY )
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
           {
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              /*changed4[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
+              this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
             }
           }
         }
-
-      for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
-        for( int l = numThreadsPerBlock-1; l >-1; l-- ){
-          changed1[ 0 ] = changed1[ 0 ] || changed1[ k*numThreadsPerBlock + l ];
-          /*changed2[ 0 ] = changed2[ 0 ] || changed2[ k*numThreadsPerBlock + l ];
-          changed3[ 0 ] = changed3[ 0 ] || changed3[ k*numThreadsPerBlock + l ];
-          changed4[ 0 ] = changed4[ 0 ] || changed4[ k*numThreadsPerBlock + l ];*/
+      }
+      /*aux.save( "aux-4pruch.tnl" );
+      for( int k = 0; k < sizeSArray; k++ ){ 
+        for( int l = 0; l < sizeSArray; l++ ) {
+          std::cout << sArray[ k * sizeSArray + l] << " ";
         }
+        std::cout << std::endl;
+      }*/
+      
       
-      if( changed1[ 0 ] /*|| changed2[ 0 ] ||changed3[ 0 ] ||changed4[ 0 ]*/ )
+      if( changed ){
         BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1;
-
+      }
+      
+      
       for( int k = 0; k < numThreadsPerBlock; k++ ){ 
-        for( int l = 0; l < numThreadsPerBlock; l++ ) {       
-          if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l < dimX*dimY &&
-              (!interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ]) )
-            aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray[ k + 1 ][ l + 1 ];
+        for( int l = 0; l < numThreadsPerBlock; l++ ) {
+          if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )      
+            helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray/*[i]*/[ (k + 1)* sizeSArray + l + 1 ];
           //std::cout<< sArray[k+1][l+1];
         }
         //std::cout<<std::endl;
       }
+      //delete []sArray;
     }
   }
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 void 
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )
@@ -249,643 +275,643 @@ getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )
   
   for(int i = 0; i < numBlockX * numBlockY; i++)
   {
-    BlockIterPom[ i ] = 0;  
-    if( BlockIterHost[ i ] )
-    {
-      // i = k*numBlockY + m;
-      int m=0, k=0;
-      m = i%numBlockX;
-      k = i/numBlockX;
-      if( k > 0 )
-        BlockIterPom[i - numBlockX] = 1;
-      if( k < numBlockY - 1 )
-        BlockIterPom[i + numBlockX] = 1;
-      
-      if( m < numBlockX - 1 )
-        BlockIterPom[ i+1 ] = 1;
-      if( m > 0 )
-        BlockIterPom[ i-1 ] = 1;
+    BlockIterPom[ i ] = 0;//BlockIterPom[ i ] = 0;
+    int m=0, k=0;
+    m = i%numBlockX;
+    k = i/numBlockX;
+    if( m > 0 && BlockIterHost[ i - 1 ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){
+      BlockIterPom[ i ] = 1;
+    }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){
+      BlockIterPom[ i ] = 1;
     }
+    //BlockIterPom[ i ];
   }
-  for(int i = 0; i < numBlockX * numBlockY; i++ )
-      //if( !BlockIter[ i ] )
-        BlockIterHost[ i ] = BlockIterPom[ i ];
-      /*else
-        BlockIter[ i ] = 0;*/
-  /*for( int i = numBlockX-1; i > -1; i-- )
+  
+  for(int i = 0; i < numBlockX * numBlockY; i++)
   {
-      for( int j = 0; j< numBlockY; j++ )
-          std::cout << BlockIterHost[ i*numBlockY + j ];
-      std::cout << std::endl;
+    if( !BlockIterHost[ i ] )
+      BlockIterHost[ i ] = BlockIterPom[ i ];
   }
-  std::cout << std::endl;*/
+  /*else
+   BlockIter[ i ] = 0;*/
+  /*for( int i = numBlockX-1; i > -1; i-- )
+   {
+   for( int j = 0; j< numBlockY; j++ )
+   std::cout << BlockIterHost[ i*numBlockY + j ];
+   std::cout << std::endl;
+   }
+   std::cout << std::endl;*/
   delete[] BlockIterPom;
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename MeshEntity >
+        typename Device,
+        typename Index >
+template< typename MeshEntity >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
 updateCell( MeshFunctionType& u,
-            const MeshEntity& cell, 
-            const RealType v )
+        const MeshEntity& cell, 
+        const RealType v )
 {
-    const auto& neighborEntities = cell.template getNeighborEntities< 1 >();
-    const MeshType& mesh = cell.getMesh();
-    const RealType& h = mesh.getSpaceSteps().x();
-    const RealType value = u( cell );
-    RealType a, tmp = std::numeric_limits< RealType >::max();
-
-    if( cell.getCoordinates().x() == 0 )
-       a = u[ neighborEntities.template getEntityIndex< 1 >() ];
-    else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
-       a = u[ neighborEntities.template getEntityIndex< -1 >() ];
-    else
-    {
-       a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ],
-                           u[ neighborEntities.template getEntityIndex<  1 >() ] );
-    }
-
-    if( fabs( a ) == std::numeric_limits< RealType >::max() )
-      return;
-   
-    tmp = a + TNL::sign( value ) * h/v;
-    
-    u[ cell.getIndex() ] = argAbsMin( value, tmp );
+  const auto& neighborEntities = cell.template getNeighborEntities< 1 >();
+  const MeshType& mesh = cell.getMesh();
+  const RealType& h = mesh.getSpaceSteps().x();
+  const RealType value = u( cell );
+  RealType a, tmp = std::numeric_limits< RealType >::max();
+  
+  if( cell.getCoordinates().x() == 0 )
+    a = u[ neighborEntities.template getEntityIndex< 1 >() ];
+  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
+    a = u[ neighborEntities.template getEntityIndex< -1 >() ];
+  else
+  {
+    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ],
+            u[ neighborEntities.template getEntityIndex<  1 >() ] );
+  }
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() )
+    return;
+  
+  tmp = a + TNL::sign( value ) * h/v;
+  
+  u[ cell.getIndex() ] = argAbsMin( value, tmp );
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 initInterface( const MeshFunctionPointer& _input,
-               MeshFunctionPointer& _output,
-               InterfaceMapPointer& _interfaceMap )
+        MeshFunctionPointer& _output,
+        InterfaceMapPointer& _interfaceMap )
 {
-            
-    if( std::is_same< Device, Devices::Cuda >::value )
-    {
+  
+  if( std::is_same< Device, Devices::Cuda >::value )
+  {
 #ifdef HAVE_CUDA
-        const MeshType& mesh = _input->getMesh();
-        
-        const int cudaBlockSize( 16 );
-        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
-        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
-        dim3 blockSize( cudaBlockSize, cudaBlockSize );
-        dim3 gridSize( numBlocksX, numBlocksY );
-        Devices::Cuda::synchronizeDevice();
-        CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
-                                                   _output.template modifyData< Device >(),
-                                                   _interfaceMap.template modifyData< Device >() );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+    const MeshType& mesh = _input->getMesh();
+    
+    const int cudaBlockSize( 16 );
+    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
+    dim3 blockSize( cudaBlockSize, cudaBlockSize );
+    dim3 gridSize( numBlocksX, numBlocksY );
+    Devices::Cuda::synchronizeDevice();
+    CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+            _output.template modifyData< Device >(),
+            _interfaceMap.template modifyData< Device >() );
+    cudaDeviceSynchronize();
+    TNL_CHECK_CUDA_DEVICE;
 #endif
-    }
-    if( std::is_same< Device, Devices::Host >::value )
-    {
-        MeshFunctionType input = _input.getData();
-        
-        /*double A[320][320];
-        std::ifstream fileInit("/home/maty/Downloads/initData.txt");
-
-        for (int i = 0; i < 320; i++)
-            for (int j = 0; j < 320; j++)
-                fileInit >> A[i][j];
-        fileInit.close();
-        for (int i = 0; i < 320; i++)
-            for (int j = 0; j < 320; j++)
-                input[i*320 + j] = A[i][j];*/
-        
-        
-         MeshFunctionType& output = _output.modifyData();
-         InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
-        const MeshType& mesh = input.getMesh();
-        typedef typename MeshType::Cell Cell;
-        Cell cell( mesh );
-        for( cell.getCoordinates().y() = 0;
-             cell.getCoordinates().y() < mesh.getDimensions().y();
-             cell.getCoordinates().y() ++ )
-            for( cell.getCoordinates().x() = 0;
-                 cell.getCoordinates().x() < mesh.getDimensions().x();
-                 cell.getCoordinates().x() ++ )
-                {
-                    cell.refresh();
-                    output[ cell.getIndex() ] =
-                    input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
-                                       - std::numeric_limits< RealType >::max();
-                    interfaceMap[ cell.getIndex() ] = false;
-                }
-
-       const RealType& hx = mesh.getSpaceSteps().x();
-       const RealType& hy = mesh.getSpaceSteps().y();     
-       for( cell.getCoordinates().y() = 0;
+  }
+  if( std::is_same< Device, Devices::Host >::value )
+  {
+    MeshFunctionType input = _input.getData();
+    
+    /*double A[320][320];
+     std::ifstream fileInit("/home/maty/Downloads/initData.txt");
+     
+     for (int i = 0; i < 320; i++)
+     for (int j = 0; j < 320; j++)
+     fileInit >> A[j];
+     fileInit.close();
+     for (int i = 0; i < 320; i++)
+     for (int j = 0; j < 320; j++)
+     input[i*320 + j] = A[j];*/
+    
+    
+    MeshFunctionType& output = _output.modifyData();
+    InterfaceMapType& interfaceMap = _interfaceMap.modifyData();
+    const MeshType& mesh = input.getMesh();
+    typedef typename MeshType::Cell Cell;
+    Cell cell( mesh );
+    for( cell.getCoordinates().y() = 0;
             cell.getCoordinates().y() < mesh.getDimensions().y();
             cell.getCoordinates().y() ++ )
-          for( cell.getCoordinates().x() = 0;
-               cell.getCoordinates().x() < mesh.getDimensions().x();
-               cell.getCoordinates().x() ++ )
+      for( cell.getCoordinates().x() = 0;
+              cell.getCoordinates().x() < mesh.getDimensions().x();
+              cell.getCoordinates().x() ++ )
+      {
+        cell.refresh();
+        output[ cell.getIndex() ] =
+                input( cell ) >= 0 ? std::numeric_limits< RealType >::max() :
+                  - std::numeric_limits< RealType >::max();
+        interfaceMap[ cell.getIndex() ] = false;
+      }
+    
+    const RealType& hx = mesh.getSpaceSteps().x();
+    const RealType& hy = mesh.getSpaceSteps().y();     
+    for( cell.getCoordinates().y() = 0;
+            cell.getCoordinates().y() < mesh.getDimensions().y();
+            cell.getCoordinates().y() ++ )
+      for( cell.getCoordinates().x() = 0;
+              cell.getCoordinates().x() < mesh.getDimensions().x();
+              cell.getCoordinates().x() ++ )
+      {
+        cell.refresh();
+        const RealType& c = input( cell );
+        if( ! cell.isBoundaryEntity()  )
+        {
+          auto neighbors = cell.getNeighborEntities();
+          Real pom = 0;
+          const IndexType e = neighbors.template getEntityIndex<  1,  0 >();
+          const IndexType n = neighbors.template getEntityIndex<  0,  1 >();
+          //Try init with exact data:
+          /*if( c * input[ n ] <= 0 )
+           {
+           output[ cell.getIndex() ] = c;
+           output[ n ] = input[ n ];
+           interfaceMap[ cell.getIndex() ] = true;
+           interfaceMap[ n ] = true;
+           }   
+           if( c * input[ e ] <= 0 )
+           {   
+           output[ cell.getIndex() ] = c;
+           output[ e ] = input[ e ];
+           interfaceMap[ cell.getIndex() ] = true;
+           interfaceMap[ e ] = true;
+           }*/
+          if( c * input[ n ] <= 0 )
           {
-             cell.refresh();
-             const RealType& c = input( cell );
-             if( ! cell.isBoundaryEntity()  )
+            /*if( c >= 0 )
+             {*/
+            pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+            if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
+              output[ cell.getIndex() ] = pom;
+            pom = pom - TNL::sign( c )*hy;
+            if( TNL::abs( output[ n ] ) > TNL::abs( pom ) )
+              output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy;
+            /*}else
              {
-                auto neighbors = cell.getNeighborEntities();
-                Real pom = 0;
-                const IndexType e = neighbors.template getEntityIndex<  1,  0 >();
-                const IndexType n = neighbors.template getEntityIndex<  0,  1 >();
-                //Try init with exact data:
-                /*if( c * input[ n ] <= 0 )
-                {
-                    output[ cell.getIndex() ] = c;
-                    output[ n ] = input[ n ];
-                    interfaceMap[ cell.getIndex() ] = true;
-                    interfaceMap[ n ] = true;
-                }   
-                if( c * input[ e ] <= 0 )
-                {   
-                    output[ cell.getIndex() ] = c;
-                    output[ e ] = input[ e ];
-                    interfaceMap[ cell.getIndex() ] = true;
-                    interfaceMap[ e ] = true;
-                }*/
-                if( c * input[ n ] <= 0 )
-                {
-                    /*if( c >= 0 )
-                    {*/
-                        pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-                        if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
-                            output[ cell.getIndex() ] = pom;
-                        pom = pom - TNL::sign( c )*hy;
-                        if( TNL::abs( output[ n ] ) > TNL::abs( pom ) )
-                            output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy;
-                    /*}else
-                    {
-                        pom = - ( hy * c )/( c - input[ n ]);
-                        if( output[ cell.getIndex() ] < pom )
-                            output[ cell.getIndex() ] = pom;
-                        if( output[ n ] > hy + pom )
-                            output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
-                    }*/
-                    interfaceMap[ cell.getIndex() ] = true;
-                    interfaceMap[ n ] = true;
-                }
-                if( c * input[ e ] <= 0 )
-                {
-                    /*if( c >= 0 )
-                    {*/
-                        pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-                        if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
-                            output[ cell.getIndex() ] = pom;
-
-                        pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-                        if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
-                            output[ e ] = pom; 
-                    /*}else
-                    {
-                        pom = - (hx * c)/( c - input[ e ]);
-                        if( output[ cell.getIndex() ] < pom )
-                            output[ cell.getIndex() ] = pom;
-
-                        pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
-                        if( output[ e ] > pom )
-                            output[ e ] = pom;
-                    }*/
-                    interfaceMap[ cell.getIndex() ] = true;
-                    interfaceMap[ e ] = true;
-                }
-             }
+             pom = - ( hy * c )/( c - input[ n ]);
+             if( output[ cell.getIndex() ] < pom )
+             output[ cell.getIndex() ] = pom;
+             if( output[ n ] > hy + pom )
+             output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
+             }*/
+            interfaceMap[ cell.getIndex() ] = true;
+            interfaceMap[ n ] = true;
           }
+          if( c * input[ e ] <= 0 )
+          {
+            /*if( c >= 0 )
+             {*/
+            pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+            if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
+              output[ cell.getIndex() ] = pom;
+            
+            pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+            if( TNL::abs( output[ e ] ) > TNL::abs( pom ) )
+              output[ e ] = pom; 
+            /*}else
+             {
+             pom = - (hx * c)/( c - input[ e ]);
+             if( output[ cell.getIndex() ] < pom )
+             output[ cell.getIndex() ] = pom;
+             
+             pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
+             if( output[ e ] > pom )
+             output[ e ] = pom;
+             }*/
+            interfaceMap[ cell.getIndex() ] = true;
+            interfaceMap[ e ] = true;
+          }
+        }
       }
+  }
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename MeshEntity >
+        typename Device,
+        typename Index >
+template< typename MeshEntity >
 __cuda_callable__
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 updateCell( MeshFunctionType& u,
-            const MeshEntity& cell,   
-            const RealType v)
+        const MeshEntity& cell,   
+        const RealType v)
 {
-   const auto& neighborEntities = cell.template getNeighborEntities< 2 >();
-   const MeshType& mesh = cell.getMesh();
-   const RealType& hx = mesh.getSpaceSteps().x();
-   const RealType& hy = mesh.getSpaceSteps().y();
-   const RealType value = u( cell );
-   RealType a, b, tmp = std::numeric_limits< RealType >::max();
-   
-   if( cell.getCoordinates().x() == 0 )
-      a = u[ neighborEntities.template getEntityIndex< 1,  0 >() ];
-   else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
-      a = u[ neighborEntities.template getEntityIndex< -1,  0 >() ];
-   else
-   {
-      a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1,  0 >() ],
-                          u[ neighborEntities.template getEntityIndex<  1,  0 >() ] );
-   }
-
-   if( cell.getCoordinates().y() == 0 )
-      b = u[ neighborEntities.template getEntityIndex< 0,  1 >()];
-   else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
-      b = u[ neighborEntities.template getEntityIndex< 0,  -1 >() ];
-   else
-   {
-      b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0,  -1 >() ],
-                          u[ neighborEntities.template getEntityIndex< 0,   1 >() ] );
-   }
-   if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-       fabs( b ) == std::numeric_limits< RealType >::max() )
-      return;
-   /*if( fabs( a ) == TypeInfo< Real >::getMaxValue() ||
-       fabs( b ) == TypeInfo< Real >::getMaxValue() ||
-       fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) )
+  const auto& neighborEntities = cell.template getNeighborEntities< 2 >();
+  const MeshType& mesh = cell.getMesh();
+  const RealType& hx = mesh.getSpaceSteps().x();
+  const RealType& hy = mesh.getSpaceSteps().y();
+  const RealType value = u( cell ); // current value of this cell; its sign is reused for the candidates below
+  RealType a, b, tmp = std::numeric_limits< RealType >::max(); // a: chosen x-neighbour value, b: chosen y-neighbour value
+  
+  if( cell.getCoordinates().x() == 0 ) // first column: only the +x neighbour is read
+    a = u[ neighborEntities.template getEntityIndex< 1,  0 >() ];
+  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) // last column: only the -x neighbour is read
+    a = u[ neighborEntities.template getEntityIndex< -1,  0 >() ];
+  else // interior: TNL::argAbsMin picks between the two x-neighbours (presumably the smaller magnitude - confirm)
+  {
+    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1,  0 >() ],
+            u[ neighborEntities.template getEntityIndex<  1,  0 >() ] );
+  }
+  
+  if( cell.getCoordinates().y() == 0 ) // first row: only the +y neighbour is read
+    b = u[ neighborEntities.template getEntityIndex< 0,  1 >()];
+  else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) // last row: only the -y neighbour is read
+    b = u[ neighborEntities.template getEntityIndex< 0,  -1 >() ];
+  else // interior: same selection along y
+  {
+    b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0,  -1 >() ],
+            u[ neighborEntities.template getEntityIndex< 0,   1 >() ] );
+  }
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() )
+    return; // both chosen neighbours still hold the +/-max sentinel: leave u untouched
+  /*if( fabs( a ) == TypeInfo< Real >::getMaxValue() ||
+   fabs( b ) == TypeInfo< Real >::getMaxValue() ||
+   fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) )
    {
-      tmp = 
-        fabs( a ) >= fabs( b ) ? b + TNL::sign( value ) * hy :
-                                 a + TNL::sign( value ) * hx;
+   tmp = 
+   fabs( a ) >= fabs( b ) ? b + TNL::sign( value ) * hy :
+   a + TNL::sign( value ) * hx;
    }*/
-   /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
-       fabs( b ) != TypeInfo< Real >::getMaxValue() &&
-       fabs( a - b ) < TNL::sqrt( (hx * hx + hy * hy)/v ) )
+  /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( b ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( a - b ) < TNL::sqrt( (hx * hx + hy * hy)/v ) )
    {
-       tmp = ( hx * hx * b + hy * hy * a + 
-            sign( value ) * hx * hy * TNL::sqrt( ( hx * hx + hy * hy )/v - 
-            ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy );
-       u[ cell.getIndex() ] =  tmp;
+   tmp = ( hx * hx * b + hy * hy * a + 
+   sign( value ) * hx * hy * TNL::sqrt( ( hx * hx + hy * hy )/v - 
+   ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy );
+   u[ cell.getIndex() ] =  tmp;
    }
    else
    {
-       tmp = 
-          fabs( a ) > fabs( b ) ? b + TNL::sign( value ) * hy/v :
-                                   a + TNL::sign( value ) * hx/v;
-       u[ cell.getIndex() ] = argAbsMin( value, tmp );
-       //tmp = TypeInfo< RealType >::getMaxValue();
+   tmp = 
+   fabs( a ) > fabs( b ) ? b + TNL::sign( value ) * hy/v :
+   a + TNL::sign( value ) * hx/v;
+   u[ cell.getIndex() ] = argAbsMin( value, tmp );
+   //tmp = TypeInfo< RealType >::getMaxValue();
    }*/
-    RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
-    sortMinims( pom );
-    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
-    
-                                
-    if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-        u[ cell.getIndex() ] = argAbsMin( value, tmp );
-    else
-    {
-        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; // slots 0-2: neighbour values (third is the max sentinel in 2D), slots 3-5: matching space steps (third is 0)
+  sortMinims( pom ); // NOTE(review): presumably orders the values by magnitude and permutes the steps alongside - confirm against sortMinims
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; // one-sided candidate: pom[0] plus its step scaled by 1/v, signed like the current value
+  
+  
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+    u[ cell.getIndex() ] = argAbsMin( value, tmp ); // candidate is smaller in magnitude than pom[1]: store argAbsMin of old value and candidate
+  else
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
             TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
             ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-        u[ cell.getIndex() ] = argAbsMin( value, tmp );
-    }
+    u[ cell.getIndex() ] = argAbsMin( value, tmp );
+  }
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
 initInterface( const MeshFunctionPointer& _input,
-               MeshFunctionPointer& _output,
-               InterfaceMapPointer& _interfaceMap  )
+        MeshFunctionPointer& _output,
+        InterfaceMapPointer& _interfaceMap  )
 {
-    if( std::is_same< Device, Devices::Cuda >::value )
-    {
+  if( std::is_same< Device, Devices::Cuda >::value )
+  {
 #ifdef HAVE_CUDA
-        const MeshType& mesh = _input->getMesh();
-        
-        const int cudaBlockSize( 8 );
-        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
-        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
-        int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize );
-        if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
-            std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
-        dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
-        dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
-        Devices::Cuda::synchronizeDevice();
-        CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
-                                                     _output.template modifyData< Device >(),
-                                                     _interfaceMap.template modifyData< Device >() );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+    const MeshType& mesh = _input->getMesh();
+    
+    const int cudaBlockSize( 8 ); // block edge length; blockSize below is 8 x 8 x 8 = 512 threads
+    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
+    int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize );
+    if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
+      std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; // NOTE(review): this only prints; the kernel below is still launched
+    dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+    dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
+    Devices::Cuda::synchronizeDevice();
+    CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+            _output.template modifyData< Device >(),
+            _interfaceMap.template modifyData< Device >() );
+    cudaDeviceSynchronize(); // wait for the init kernel to finish before the status check on the next line
+    TNL_CHECK_CUDA_DEVICE;
 #endif
-    }
-    if( std::is_same< Device, Devices::Host >::value )
-    {
-        const MeshFunctionType& input =  _input.getData();
-        MeshFunctionType& output =  _output.modifyData();
-        InterfaceMapType& interfaceMap =  _interfaceMap.modifyData();
-        const MeshType& mesh = input.getMesh();
-        typedef typename MeshType::Cell Cell;
-        Cell cell( mesh );
-        for( cell.getCoordinates().z() = 0;
-             cell.getCoordinates().z() < mesh.getDimensions().z();
-             cell.getCoordinates().z() ++ )
-             for( cell.getCoordinates().y() = 0;
-                  cell.getCoordinates().y() < mesh.getDimensions().y();
-                  cell.getCoordinates().y() ++ )
-                 for( cell.getCoordinates().x() = 0;
-                      cell.getCoordinates().x() < mesh.getDimensions().x();
-                      cell.getCoordinates().x() ++ )
-                 {
-                     cell.refresh();
-                     output[ cell.getIndex() ] =
-                     input( cell ) > 0 ? std::numeric_limits< RealType >::max() :
-                                        - std::numeric_limits< RealType >::max();
-                     interfaceMap[ cell.getIndex() ] = false;
-                 }
-
-        const RealType& hx = mesh.getSpaceSteps().x();
-        const RealType& hy = mesh.getSpaceSteps().y();
-        const RealType& hz = mesh.getSpaceSteps().z();
-        for( cell.getCoordinates().z() = 0;
-             cell.getCoordinates().z() < mesh.getDimensions().z();
-             cell.getCoordinates().z() ++ )   
-           for( cell.getCoordinates().y() = 0;
-                cell.getCoordinates().y() < mesh.getDimensions().y();
-                cell.getCoordinates().y() ++ )
-              for( cell.getCoordinates().x() = 0;
-                   cell.getCoordinates().x() < mesh.getDimensions().x();
-                   cell.getCoordinates().x() ++ )
+  }
+  if( std::is_same< Device, Devices::Host >::value )
+  {
+    const MeshFunctionType& input =  _input.getData();
+    MeshFunctionType& output =  _output.modifyData();
+    InterfaceMapType& interfaceMap =  _interfaceMap.modifyData();
+    const MeshType& mesh = input.getMesh();
+    typedef typename MeshType::Cell Cell;
+    Cell cell( mesh ); // one cell object reused as the cursor of every loop in this branch
+    for( cell.getCoordinates().z() = 0;
+            cell.getCoordinates().z() < mesh.getDimensions().z();
+            cell.getCoordinates().z() ++ )
+      for( cell.getCoordinates().y() = 0;
+              cell.getCoordinates().y() < mesh.getDimensions().y();
+              cell.getCoordinates().y() ++ )
+        for( cell.getCoordinates().x() = 0;
+                cell.getCoordinates().x() < mesh.getDimensions().x();
+                cell.getCoordinates().x() ++ )
+        {
+          cell.refresh(); // NOTE(review): presumably re-derives the cell index after the coordinates were changed - confirm
+          output[ cell.getIndex() ] =
+                  input( cell ) > 0 ? std::numeric_limits< RealType >::max() :
+                    - std::numeric_limits< RealType >::max(); // seed with +max where input > 0, -max otherwise
+          interfaceMap[ cell.getIndex() ] = false; // nothing is marked as interface before the second pass
+        }
+    
+    const RealType& hx = mesh.getSpaceSteps().x();
+    const RealType& hy = mesh.getSpaceSteps().y();
+    const RealType& hz = mesh.getSpaceSteps().z();
+    for( cell.getCoordinates().z() = 0;
+            cell.getCoordinates().z() < mesh.getDimensions().z();
+            cell.getCoordinates().z() ++ )   
+      for( cell.getCoordinates().y() = 0;
+              cell.getCoordinates().y() < mesh.getDimensions().y();
+              cell.getCoordinates().y() ++ )
+        for( cell.getCoordinates().x() = 0;
+                cell.getCoordinates().x() < mesh.getDimensions().x();
+                cell.getCoordinates().x() ++ )
+        {
+          cell.refresh();
+          const RealType& c = input( cell );
+          if( ! cell.isBoundaryEntity() )
+          {
+            auto neighbors = cell.getNeighborEntities();
+            Real pom = 0;
+            const IndexType e = neighbors.template getEntityIndex<  1,  0,  0 >();
+            const IndexType n = neighbors.template getEntityIndex<  0,  1,  0 >();
+            const IndexType t = neighbors.template getEntityIndex<  0,  0,  1 >();
+            //Try exact initiation
+            /*const IndexType w = neighbors.template getEntityIndex< -1,  0,  0 >();
+             const IndexType s = neighbors.template getEntityIndex<  0, -1,  0 >();
+             const IndexType b = neighbors.template getEntityIndex<  0,  0, -1 >();
+             if( c * input[ e ] <= 0 )
+             {
+             output[ cell.getIndex() ] = c;
+             output[ e ] = input[ e ];
+             interfaceMap[ e ] = true;   
+             interfaceMap[ cell.getIndex() ] = true;
+             }
+             else if( c * input[ n ] <= 0 )
+             {
+             output[ cell.getIndex() ] = c;
+             output[ n ] = input[ n ];
+             interfaceMap[ n ] = true;   
+             interfaceMap[ cell.getIndex() ] = true;
+             }
+             else if( c * input[ t ] <= 0 )
+             {
+             output[ cell.getIndex() ] = c;
+             output[ t ] = input[ t ];
+             interfaceMap[ t ] = true;   
+             interfaceMap[ cell.getIndex() ] = true;
+             }*/
+            if( c * input[ n ] <= 0 )
+            {
+              if( c >= 0 )
               {
-                 cell.refresh();
-                 const RealType& c = input( cell );
-                 if( ! cell.isBoundaryEntity() )
-                 {
-                    auto neighbors = cell.getNeighborEntities();
-                    Real pom = 0;
-                    const IndexType e = neighbors.template getEntityIndex<  1,  0,  0 >();
-                    const IndexType n = neighbors.template getEntityIndex<  0,  1,  0 >();
-                    const IndexType t = neighbors.template getEntityIndex<  0,  0,  1 >();
-                    //Try exact initiation
-                    /*const IndexType w = neighbors.template getEntityIndex< -1,  0,  0 >();
-                    const IndexType s = neighbors.template getEntityIndex<  0, -1,  0 >();
-                    const IndexType b = neighbors.template getEntityIndex<  0,  0, -1 >();
-                    if( c * input[ e ] <= 0 )
-                    {
-                       output[ cell.getIndex() ] = c;
-                       output[ e ] = input[ e ];
-                       interfaceMap[ e ] = true;   
-                       interfaceMap[ cell.getIndex() ] = true;
-                    }
-                    else if( c * input[ n ] <= 0 )
-                    {
-                       output[ cell.getIndex() ] = c;
-                       output[ n ] = input[ n ];
-                       interfaceMap[ n ] = true;   
-                       interfaceMap[ cell.getIndex() ] = true;
-                    }
-                    else if( c * input[ t ] <= 0 )
-                    {
-                       output[ cell.getIndex() ] = c;
-                       output[ t ] = input[ t ];
-                       interfaceMap[ t ] = true;   
-                       interfaceMap[ cell.getIndex() ] = true;
-                    }*/
-                    if( c * input[ n ] <= 0 )
-                    {
-                        if( c >= 0 )
-                        {
-                        pom = ( hy * c )/( c - input[ n ]);
-                        if( output[ cell.getIndex() ] > pom ) 
-                            output[ cell.getIndex() ] = pom;
-
-                        if ( output[ n ] < pom - hy)
-                             output[ n ] = pom - hy; // ( hy * c )/( c - input[ n ]) - hy;
-
-                        }else
-                        {
-                          pom = - ( hy * c )/( c - input[ n ]);
-                          if( output[ cell.getIndex() ] < pom )
-                              output[ cell.getIndex() ] = pom;
-                          if( output[ n ] > hy + pom )
-                              output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
-
-                        }
-                    interfaceMap[ cell.getIndex() ] = true;
-                    interfaceMap[ n ] = true;
-                    }
-                    if( c * input[ e ] <= 0 )
-                    {
-                        if( c >= 0 )
-                        {
-                            pom = ( hx * c )/( c - input[ e ]);
-                            if( output[ cell.getIndex() ] > pom )
-                                output[ cell.getIndex() ] = pom;
-
-                            pom = pom - hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-                            if( output[ e ] < pom )
-                                output[ e ] = pom;      
-
-                        }else
-                        {
-                            pom = - (hx * c)/( c - input[ e ]);
-                            if( output[ cell.getIndex() ] < pom )
-                                output[ cell.getIndex() ] = pom;
-
-                            pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
-                            if( output[ e ] > pom )
-                                output[ e ] = pom;
-                        }
-                    interfaceMap[ cell.getIndex() ] = true;
-                    interfaceMap[ e ] = true;
-                    }
-                    if( c * input[ t ] <= 0 )
-                    {
-                        if( c >= 0 )
-                        {
-                            pom = ( hz * c )/( c - input[ t ]);
-                            if( output[ cell.getIndex() ] > pom )
-                                output[ cell.getIndex() ] = pom;
-
-                            pom = pom - hz; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-                            if( output[ t ] < pom )
-                                output[ t ] = pom; 
-
-                        }else
-                        {
-                            pom = - (hz * c)/( c - input[ t ]);
-                            if( output[ cell.getIndex() ] < pom )
-                                output[ cell.getIndex() ] = pom;
-
-                            pom = pom + hz; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
-                            if( output[ t ] > pom )
-                                output[ t ] = pom;
-
-                        }
-                    interfaceMap[ cell.getIndex() ] = true;
-                    interfaceMap[ t ] = true;
-                    }    
-                 }
-                 /*output[ cell.getIndex() ] =
-                    c > 0 ? TypeInfo< RealType >::getMaxValue() :
-                           -TypeInfo< RealType >::getMaxValue();
-                 interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245
+                pom = ( hy * c )/( c - input[ n ]);
+                if( output[ cell.getIndex() ] > pom ) 
+                  output[ cell.getIndex() ] = pom;
+                
+                if ( output[ n ] < pom - hy)
+                  output[ n ] = pom - hy; // ( hy * c )/( c - input[ n ]) - hy;
+                
+              }else
+              {
+                pom = - ( hy * c )/( c - input[ n ]);
+                if( output[ cell.getIndex() ] < pom )
+                  output[ cell.getIndex() ] = pom;
+                if( output[ n ] > hy + pom )
+                  output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
+                
               }
-    }
+              interfaceMap[ cell.getIndex() ] = true;
+              interfaceMap[ n ] = true;
+            }
+            if( c * input[ e ] <= 0 )
+            {
+              if( c >= 0 )
+              {
+                pom = ( hx * c )/( c - input[ e ]);
+                if( output[ cell.getIndex() ] > pom )
+                  output[ cell.getIndex() ] = pom;
+                
+                pom = pom - hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+                if( output[ e ] < pom )
+                  output[ e ] = pom;      
+                
+              }else
+              {
+                pom = - (hx * c)/( c - input[ e ]);
+                if( output[ cell.getIndex() ] < pom )
+                  output[ cell.getIndex() ] = pom;
+                
+                pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
+                if( output[ e ] > pom )
+                  output[ e ] = pom;
+              }
+              interfaceMap[ cell.getIndex() ] = true;
+              interfaceMap[ e ] = true;
+            }
+            if( c * input[ t ] <= 0 ) // input changes sign (or is zero) between this cell and its +z neighbour t
+            {
+              if( c >= 0 )
+              {
+                pom = ( hz * c )/( c - input[ t ]); // hz scaled by c/(c - input[t]): linear interpolation of the zero crossing along z
+                if( output[ cell.getIndex() ] > pom )
+                  output[ cell.getIndex() ] = pom;
+                
+                pom = pom - hz; //output[ t ] = (hz * c)/( c - input[ t ]) - hz;
+                if( output[ t ] < pom )
+                  output[ t ] = pom; 
+                
+              }else
+              {
+                pom = - (hz * c)/( c - input[ t ]); // same interpolation with the sign flipped for c < 0
+                if( output[ cell.getIndex() ] < pom )
+                  output[ cell.getIndex() ] = pom;
+                
+                pom = pom + hz; //output[ t ] = hz - (hz * c)/( c - input[ t ]);
+                if( output[ t ] > pom )
+                  output[ t ] = pom;
+                
+              }
+              interfaceMap[ cell.getIndex() ] = true; // both cells adjacent to the crossing are flagged
+              interfaceMap[ t ] = true;
+            }    
+          }
+          /*output[ cell.getIndex() ] =
+           c > 0 ? TypeInfo< RealType >::getMaxValue() :
+           -TypeInfo< RealType >::getMaxValue();
+           interfaceMap[ cell.getIndex() ] = false;*/ //already done by the first loop of this function
+        }
+  }
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename MeshEntity >
+        typename Device,
+        typename Index >
+template< typename MeshEntity >
 __cuda_callable__
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
 updateCell( MeshFunctionType& u,
-            const MeshEntity& cell, 
-            const RealType v )
+        const MeshEntity& cell, 
+        const RealType v )
 {
-   const auto& neighborEntities = cell.template getNeighborEntities< 3 >();
-   const MeshType& mesh = cell.getMesh();
+  const auto& neighborEntities = cell.template getNeighborEntities< 3 >();
+  const MeshType& mesh = cell.getMesh();
   
-   const RealType& hx = mesh.getSpaceSteps().x();
-   const RealType& hy = mesh.getSpaceSteps().y();
-   const RealType& hz = mesh.getSpaceSteps().z();
-   const RealType value = u( cell );
-   //std::cout << value << std::endl;
-   RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
-   
-   
-   if( cell.getCoordinates().x() == 0 )
-      a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ];
-   else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 )
-      a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ];
-   else
+  const RealType& hx = mesh.getSpaceSteps().x();
+  const RealType& hy = mesh.getSpaceSteps().y();
+  const RealType& hz = mesh.getSpaceSteps().z();
+  const RealType value = u( cell ); // current value of this cell; its sign is reused for the candidates below
+  //std::cout << value << std::endl;
+  RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); // a, b, c: chosen neighbour value along x, y, z
+  
+  
+  if( cell.getCoordinates().x() == 0 ) // first x-layer: only the +x neighbour is read
+    a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ];
+  else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) // last x-layer: only the -x neighbour is read
+    a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ];
+  else // interior: TNL::argAbsMin picks between the two x-neighbours (presumably the smaller magnitude - confirm)
+  {
+    a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ],
+            u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] );
+  }
+  if( cell.getCoordinates().y() == 0 ) // same selection along y
+    b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ];
+  else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
+    b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ];
+  else
+  {
+    b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ],
+            u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] );
+  }if( cell.getCoordinates().z() == 0 ) // same selection along z (a new if-chain, despite sharing the line with the closing brace)
+    c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ];
+  else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 )
+    c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ];
+  else
+  {
+    c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ],
+            u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] );
+  }
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() &&
+          fabs( c ) == std::numeric_limits< RealType >::max() )
+    return; // all three chosen neighbours still hold the +/-max sentinel: leave u untouched
+  
+  
+  /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( b ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) )
    {
-      a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ],
-                        u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] );
+   tmp = ( hx * hx * a + hy * hy * b + 
+   sign( value ) * hx * hy * sqrt( ( hx * hx + hy * hy )/v - 
+   ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy );
    }
-   if( cell.getCoordinates().y() == 0 )
-      b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ];
-   else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 )
-      b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ];
-   else
-   {
-      b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ],
-                        u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] );
-   }if( cell.getCoordinates().z() == 0 )
-      c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ];
-   else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 )
-      c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ];
-   else
+   if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( c ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( a - c ) >= TNL::sqrt( (hx * hx + hz * hz)/v ) )
    {
-      c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ],
-                         u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] );
+   tmp = ( hx * hx * a + hz * hz * c + 
+   sign( value ) * hx * hz * sqrt( ( hx * hx + hz * hz )/v - 
+   ( a - c ) * ( a - c ) ) )/( hx * hx + hz * hz );
    }
-   if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-       fabs( b ) == std::numeric_limits< RealType >::max() &&
-       fabs( c ) == std::numeric_limits< RealType >::max() )
-      return;
-   
-   
-       /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
-           fabs( b ) != TypeInfo< Real >::getMaxValue() &&
-           fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) )
-       {
-           tmp = ( hx * hx * a + hy * hy * b + 
-                sign( value ) * hx * hy * sqrt( ( hx * hx + hy * hy )/v - 
-                ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy );
-       }
-       if( fabs( a ) != TypeInfo< Real >::getMaxValue() &&
-           fabs( c ) != TypeInfo< Real >::getMaxValue() &&
-           fabs( a - c ) >= TNL::sqrt( (hx * hx + hz * hz)/v ) )
-       {
-           tmp = ( hx * hx * a + hz * hz * c + 
-                sign( value ) * hx * hz * sqrt( ( hx * hx + hz * hz )/v - 
-                ( a - c ) * ( a - c ) ) )/( hx * hx + hz * hz );
-       }
-       if( fabs( b ) != TypeInfo< Real >::getMaxValue() &&
-           fabs( c ) != TypeInfo< Real >::getMaxValue() &&
-           fabs( b - c ) >= TNL::sqrt( (hy * hy + hz * hz)/v ) )
-       {
-           tmp = ( hy * hy * b + hz * hz * c + 
-                sign( value ) * hy * hz * sqrt( ( hy * hy + hz * hz )/v - 
-                ( b - c ) * ( b - c ) ) )/( hy * hy + hz * hz );
-       }*/
-    RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
-    sortMinims( pom );   
-    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
-    if( fabs( tmp ) < fabs( pom[ 1 ] ) )
+   if( fabs( b ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( c ) != TypeInfo< Real >::getMaxValue() &&
+   fabs( b - c ) >= TNL::sqrt( (hy * hy + hz * hz)/v ) )
+   {
+   tmp = ( hy * hy * b + hz * hz * c + 
+   sign( value ) * hy * hz * sqrt( ( hy * hy + hz * hz )/v - 
+   ( b - c ) * ( b - c ) ) )/( hy * hy + hz * hz );
+   }*/
+  RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; // slots 0-2: neighbour values along x, y, z; slots 3-5: the matching space steps
+  sortMinims( pom ); // NOTE(review): presumably orders the values by magnitude and permutes the steps alongside - confirm against sortMinims
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; // one-sided candidate; the step is scaled by 1/v, matching the 2D updateCell and the 1/(v*v) terms of the branches below
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) )
+  {
+    u[ cell.getIndex() ] = argAbsMin( value, tmp ); // candidate is smaller in magnitude than pom[1]: store argAbsMin of old value and candidate
+  }
+  else
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
     {
-        u[ cell.getIndex() ] = argAbsMin( value, tmp ); 
+      u[ cell.getIndex() ] = argAbsMin( value, tmp );
     }
     else
     {
-        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-        if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
-        {
-            u[ cell.getIndex() ] = argAbsMin( value, tmp );
-        }
-        else
-        {
-            tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
-                TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
-                hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
-                hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
-            u[ cell.getIndex() ] = argAbsMin( value, tmp );
-        }
+      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
+              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
+              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
+      u[ cell.getIndex() ] = argAbsMin( value, tmp );
     }
+  }
 }
 
 template < typename T1, typename T2 >
 T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double v)
 {
-   T1 tmp;
-   if( fabs( a ) != std::numeric_limits< T1 >::max &&
-       fabs( b ) != std::numeric_limits< T1 >::max &&
-       fabs( a - b ) < ha/v )//TNL::sqrt( (ha * ha + hb * hb)/2 )/v )
-   {
-      tmp = ( ha * ha * b + hb * hb * a + 
+  T1 tmp; // result: two-neighbour update of a and b, or max() when the test below fails
+  if( fabs( a ) != std::numeric_limits< T1 >::max() && // max is a function and has to be called to get the sentinel value
+          fabs( b ) != std::numeric_limits< T1 >::max() &&
+          fabs( a - b ) < ha/v )//TNL::sqrt( (ha * ha + hb * hb)/2 )/v )
+  {
+    tmp = ( ha * ha * b + hb * hb * a + 
             TNL::sign( value ) * ha * hb * TNL::sqrt( ( ha * ha + hb * hb )/( v * v ) - 
             ( a - b ) * ( a - b ) ) )/( ha * ha + hb * hb );
-   }
-   else
-   {
-       tmp = std::numeric_limits< T1 >::max;
-   }
-   
-   return tmp;
+  }
+  else // a or b is still the sentinel, or they differ by at least ha/v
+  {
+    tmp = std::numeric_limits< T1 >::max(); // no usable update: hand back the max() sentinel
+  }
+  
+  return tmp;
 }
 
 template < typename T1 >
 __cuda_callable__ void sortMinims( T1 pom[] )
 {
-    T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; 
-    if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){
-        tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2];
-        tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5];
-        
-    }
-    else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){
-        tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1];
-        tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4];
-    }
-    else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){
-        tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2];
-        tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5];
-    }
-    else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){
-        tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0];
-        tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3];
-    }
-    else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){
-        tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1];
-        tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4];
-    }
-    else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){
-        tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0];
-        tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3];
-    }
+  T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; // scratch: [0..2] = pom[0..2] by ascending |.|, [3..5] = pom[3..5] permuted the same way
+  if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){ // already ordered
+    tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2];
+    tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5];
     
-    for( int i = 0; i < 6; i++ )
-    {
-        pom[ i ] = tmp[ i ];
-    }   
+  }
+  else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){ // order 0, 2, 1
+    tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1];
+    tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4];
+  }
+  else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){ // order 1, 0, 2
+    tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2];
+    tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5];
+  }
+  else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){ // order 1, 2, 0
+    tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0];
+    tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3];
+  }
+  else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){ // order 2, 0, 1
+    tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1];
+    tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4];
+  }
+  else{ // only ordering left for comparable values: 2, 1, 0; unconditional so tmp is never left all-zero (e.g. NaN input)
+    tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0];
+    tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3];
+  }
+  
+  for( int i = 0; i < 6; i++ ) // write the permuted values back into the caller's array
+  {
+    pom[ i ] = tmp[ i ];
+  }   
 }
 
 
@@ -893,372 +919,373 @@ __cuda_callable__ void sortMinims( T1 pom[] )
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
-                                Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
-                                Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap  )
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap  )
 {
-    int i = threadIdx.x + blockDim.x*blockIdx.x;
-    const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  int i = threadIdx.x + blockDim.x*blockIdx.x; // global x index of this thread; one thread handles one cell
+  const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  
+  if( i < mesh.getDimensions().x()  ) // threads past the grid extent do nothing
+  {
+    typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell;
+    Cell cell( mesh );
+    cell.getCoordinates().x() = i;
+    cell.refresh();
+    const Index cind = cell.getIndex();
+    
     
-    if( i < mesh.getDimensions().x()  )
+    output[ cind ] = // start at +max() or -max(), following the sign of input in this cell
+            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+              - std::numeric_limits< Real >::max();
+    interfaceMap[ cind ] = false; 
+    
+    const Real& h = mesh.getSpaceSteps().x();
+    cell.refresh();
+    const Real& c = input( cell );
+    if( ! cell.isBoundaryEntity()  ) // neighbour checks below run only for non-boundary cells
     {
-        typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell;
-        Cell cell( mesh );
-        cell.getCoordinates().x() = i;
-        cell.refresh();
-        const Index cind = cell.getIndex();
-
-
-        output[ cind ] =
-               input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
-                                    - std::numeric_limits< Real >::max();
-        interfaceMap[ cind ] = false; 
-
-        const Real& h = mesh.getSpaceSteps().x();
-        cell.refresh();
-        const Real& c = input( cell );
-        if( ! cell.isBoundaryEntity()  )
-        {
-           auto neighbors = cell.getNeighborEntities();
-           Real pom = 0;
-           const Index e = neighbors.template getEntityIndex< 1 >();
-           const Index w = neighbors.template getEntityIndex< -1 >();
-           if( c * input[ e ] <= 0 )
-           {
-               pom = TNL::sign( c )*( h * c )/( c - input[ e ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
-                   output[ cind ] = pom;                       
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ w ] <= 0 )
-           {
-               pom = TNL::sign( c )*( h * c )/( c - input[ w ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-        }
+      auto neighbors = cell.getNeighborEntities();
+      Real pom = 0;
+      const Index e = neighbors.template getEntityIndex< 1 >(); // neighbour at x+1
+      const Index w = neighbors.template getEntityIndex< -1 >(); // neighbour at x-1
+      if( c * input[ e ] <= 0 ) // input changes sign (or is zero) between this cell and e
+      {
+        pom = TNL::sign( c )*( h * c )/( c - input[ e ]); // h*c/(c - input[e]) carrying the sign of c
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) // keep the candidate of smaller magnitude
+          output[ cind ] = pom;                       
+        
+        interfaceMap[ cind ] = true; // mark the cell as lying on the interface
+      }
+      if( c * input[ w ] <= 0 ) // same test against the x-1 neighbour
+      {
+        pom = TNL::sign( c )*( h * c )/( c - input[ w ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
     }
-           
+  }
+  
 }
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
-                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
-                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ) 
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ) 
 {
-    int i = threadIdx.x + blockDim.x*blockIdx.x;
-    int j = blockDim.y*blockIdx.y + threadIdx.y;
-    const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  int i = threadIdx.x + blockDim.x*blockIdx.x; // global x index of this thread
+  int j = blockDim.y*blockIdx.y + threadIdx.y; // global y index of this thread
+  const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  
+  if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) // threads past the grid extent do nothing
+  {
+    typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
+    Cell cell( mesh );
+    cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
+    cell.refresh();
+    const Index cind = cell.getIndex();
+    
     
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+    output[ cind ] = // start at +max() or -max(), following the sign of input in this cell
+            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+              - std::numeric_limits< Real >::max();
+    interfaceMap[ cind ] = false; 
+    
+    const Real& hx = mesh.getSpaceSteps().x();
+    const Real& hy = mesh.getSpaceSteps().y();
+    cell.refresh();
+    const Real& c = input( cell );
+    if( ! cell.isBoundaryEntity()  ) // neighbour checks below run only for non-boundary cells
     {
-        typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
-        Cell cell( mesh );
-        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
-        cell.refresh();
-        const Index cind = cell.getIndex();
-
-
-        output[ cind ] =
-               input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
-                                    - std::numeric_limits< Real >::max();
-        interfaceMap[ cind ] = false; 
-
-        const Real& hx = mesh.getSpaceSteps().x();
-        const Real& hy = mesh.getSpaceSteps().y();
-        cell.refresh();
-        const Real& c = input( cell );
-        if( ! cell.isBoundaryEntity()  )
-        {
-           auto neighbors = cell.getNeighborEntities();
-           Real pom = 0;
-           const Index e = neighbors.template getEntityIndex<  1,  0 >();
-           const Index w = neighbors.template getEntityIndex<  -1,  0 >();
-           const Index n = neighbors.template getEntityIndex<  0,  1 >();
-           const Index s = neighbors.template getEntityIndex<  0,  -1 >();
-           
-           if( c * input[ n ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cell.getIndex() ] = true;
-           }
-           if( c * input[ e ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
-                   output[ cind ] = pom;                       
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ w ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ s ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-        }
+      auto neighbors = cell.getNeighborEntities();
+      Real pom = 0;
+      const Index e = neighbors.template getEntityIndex<  1,  0 >(); // neighbour at x+1
+      const Index w = neighbors.template getEntityIndex<  -1,  0 >(); // neighbour at x-1
+      const Index n = neighbors.template getEntityIndex<  0,  1 >(); // neighbour at y+1
+      const Index s = neighbors.template getEntityIndex<  0,  -1 >(); // neighbour at y-1
+      
+      if( c * input[ n ] <= 0 ) // input changes sign (or is zero) between this cell and n
+      {
+        pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); // hy*c/(c - input[n]) carrying the sign of c
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) // keep the candidate of smaller magnitude
+          output[ cind ] = pom;
+        
+        interfaceMap[ cell.getIndex() ] = true; // cind was read from cell.getIndex() above
+      }
+      if( c * input[ e ] <= 0 ) // same test in the +x direction, using hx
+      {
+        pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+          output[ cind ] = pom;                       
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ w ] <= 0 ) // -x direction
+      {
+        pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ s ] <= 0 ) // -y direction
+      {
+        pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
     }
+  }
 }
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
-                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
-                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap )
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap )
 {
-    int i = threadIdx.x + blockDim.x*blockIdx.x;
-    int j = blockDim.y*blockIdx.y + threadIdx.y;
-    int k = blockDim.z*blockIdx.z + threadIdx.z;
-    const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  int i = threadIdx.x + blockDim.x*blockIdx.x; // global x index of this thread
+  int j = blockDim.y*blockIdx.y + threadIdx.y; // global y index of this thread
+  int k = blockDim.z*blockIdx.z + threadIdx.z; // global z index of this thread
+  const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
+  
+  if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) // threads past the grid extent do nothing
+  {
+    typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
+    Cell cell( mesh );
+    cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
+    cell.refresh();
+    const Index cind = cell.getIndex();
+    
+    
+    output[ cind ] = // start at +max() or -max(), following the sign of input in this cell
+            input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
+              - std::numeric_limits< Real >::max();
+    interfaceMap[ cind ] = false; 
+    cell.refresh();
     
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
+    const Real& hx = mesh.getSpaceSteps().x();
+    const Real& hy = mesh.getSpaceSteps().y();
+    const Real& hz = mesh.getSpaceSteps().z();
+    const Real& c = input( cell );
+    if( ! cell.isBoundaryEntity()  ) // neighbour checks below run only for non-boundary cells
     {
-        typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
-        Cell cell( mesh );
-        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
-        cell.refresh();
-        const Index cind = cell.getIndex();
-
-
-        output[ cind ] =
-               input( cell ) >= 0 ? std::numeric_limits< Real >::max() :
-                                    - std::numeric_limits< Real >::max();
-        interfaceMap[ cind ] = false; 
-        cell.refresh();
-
-        const Real& hx = mesh.getSpaceSteps().x();
-        const Real& hy = mesh.getSpaceSteps().y();
-        const Real& hz = mesh.getSpaceSteps().z();
-        const Real& c = input( cell );
-        if( ! cell.isBoundaryEntity()  )
-        {
-           auto neighbors = cell.getNeighborEntities();
-           Real pom = 0;
-           const Index e = neighbors.template getEntityIndex<  1, 0, 0 >();
-           const Index w = neighbors.template getEntityIndex<  -1, 0, 0 >();
-           const Index n = neighbors.template getEntityIndex<  0, 1, 0 >();
-           const Index s = neighbors.template getEntityIndex<  0, -1, 0 >();
-           const Index t = neighbors.template getEntityIndex<  0, 0, 1 >();
-           const Index b = neighbors.template getEntityIndex<  0, 0, -1 >();
-           
-           if( c * input[ n ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ e ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
-                   output[ cind ] = pom;                       
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ w ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ s ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ b ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hz * c )/( c - input[ b ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-           if( c * input[ t ] <= 0 )
-           {
-               pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
-               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
-                   output[ cind ] = pom;
-
-               interfaceMap[ cind ] = true;
-           }
-        }
+      auto neighbors = cell.getNeighborEntities();
+      Real pom = 0;
+      const Index e = neighbors.template getEntityIndex<  1, 0, 0 >(); // neighbour at x+1
+      const Index w = neighbors.template getEntityIndex<  -1, 0, 0 >(); // neighbour at x-1
+      const Index n = neighbors.template getEntityIndex<  0, 1, 0 >(); // neighbour at y+1
+      const Index s = neighbors.template getEntityIndex<  0, -1, 0 >(); // neighbour at y-1
+      const Index t = neighbors.template getEntityIndex<  0, 0, 1 >(); // neighbour at z+1
+      const Index b = neighbors.template getEntityIndex<  0, 0, -1 >(); // neighbour at z-1
+      
+      if( c * input[ n ] <= 0 ) // input changes sign (or is zero) between this cell and n
+      {
+        pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); // hy*c/(c - input[n]) carrying the sign of c
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) // keep the candidate of smaller magnitude
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true; // mark the cell as lying on the interface
+      }
+      if( c * input[ e ] <= 0 ) // +x direction, using hx
+      {
+        pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+          output[ cind ] = pom;                       
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ w ] <= 0 ) // -x direction
+      {
+        pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ s ] <= 0 ) // -y direction
+      {
+        pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ b ] <= 0 ) // -z direction, using hz
+      {
+        pom = TNL::sign( c )*( hz * c )/( c - input[ b ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
+      if( c * input[ t ] <= 0 ) // +z direction
+      {
+        pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
+        if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+          output[ cind ] = pom;
+        
+        interfaceMap[ cind ] = true;
+      }
     }
+  }
 }
 
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
+template< int sizeSArray > // row stride used to index the flat sArray tile below
 __cuda_callable__
 bool
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy,
-            const Real v )
+updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy,
+        const Real v )
 {
-   const RealType value = sArray[ thrj ][ thri ];
-   RealType a, b, tmp = std::numeric_limits< RealType >::max();
-      
-   b = TNL::argAbsMin( sArray[ thrj+1 ][ thri ],
-                        sArray[ thrj-1 ][ thri ] );
-    
-   a = TNL::argAbsMin( sArray[ thrj ][ thri+1 ],
-                        sArray[ thrj ][ thri-1 ] );
-
-    if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-        fabs( b ) == std::numeric_limits< RealType >::max() )
-       return false;
-   
-    RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
-    sortMinims( pom );
-    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
-    
-                                
-    if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-    {
-        sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
-        tmp = value - sArray[ thrj ][ thri ];
-        if ( fabs( tmp ) >  0.001*hx )
-            return true;
-        else
-            return false;
-    }
+  const RealType value = sArray[ thrj * sizeSArray + thri ]; // current value at row thrj, column thri
+  RealType a, b, tmp = std::numeric_limits< RealType >::max();
+  
+  b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ], // rows thrj+1 and thrj-1, reduced by argAbsMin
+          sArray[ (thrj-1) * sizeSArray + thri ] );
+  
+  a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ], // columns thri+1 and thri-1, reduced by argAbsMin
+          sArray[ thrj * sizeSArray + thri-1 ] );
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && // both neighbour values still at the max() sentinel:
+          fabs( b ) == std::numeric_limits< RealType >::max() )
+    return false; // nothing to update from, report "unchanged"
+  
+  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; // third slot is filled with the sentinel
+  sortMinims( pom ); // orders pom[0..2] by magnitude and permutes the steps in pom[3..5] alongside
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; // one-neighbour candidate: smallest value plus its step over v
+  
+  
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) // candidate is smaller in magnitude than the second neighbour value
+  {
+    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp ); // store whichever of old value and candidate argAbsMin selects
+    tmp = value - sArray[ thrj * sizeSArray + thri ]; // tmp now holds the change made to the cell
+    if ( fabs( tmp ) >  0.001*hx ) // report a change only when it exceeds 0.001*hx
+      return true;
     else
-    {
-        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+      return false;
+  }
+  else // otherwise use the two-neighbour formula
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
             TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
             ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-        sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
-        tmp = value - sArray[ thrj ][ thri ];
-        if ( fabs( tmp ) > 0.001*hx )
-            return true;
-        else
-            return false;
-    }
-    
-    return false;
+    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
+    tmp = value - sArray[ thrj * sizeSArray + thri ];
+    if ( fabs( tmp ) > 0.001*hx ) // same change threshold as above
+      return true;
+    else
+      return false;
+  }
+  
+  return false; // not reached: both branches above return
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 __cuda_callable__
 bool
 tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >::
 updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
 {
-   const RealType value = sArray[ thri ];
-   RealType a, tmp = std::numeric_limits< RealType >::max();
-      
-   a = TNL::argAbsMin( sArray[ thri+1 ],
-                       sArray[ thri-1 ] );
-
-    if( fabs( a ) == std::numeric_limits< RealType >::max() )
-       return false;
-   
-    tmp = a + TNL::sign( value ) * h/v;
-    
-                                
-    sArray[ thri ] = argAbsMin( value, tmp );
-    
-    tmp = value - sArray[ thri ];
-    if ( fabs( tmp ) >  0.001*h )
-        return true;
-    else
-        return false;
+  const RealType value = sArray[ thri ]; // current value of the cell being updated
+  RealType a, tmp = std::numeric_limits< RealType >::max();
+  
+  a = TNL::argAbsMin( sArray[ thri+1 ], // the two neighbours thri+1 and thri-1, reduced by argAbsMin
+          sArray[ thri-1 ] );
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() ) // neighbour value still at the max() sentinel:
+    return false; // nothing to update from, report "unchanged"
+  
+  tmp = a + TNL::sign( value ) * h/v; // candidate: neighbour value plus h/v, with the sign of the current value
+  
+  
+  sArray[ thri ] = argAbsMin( value, tmp ); // store whichever of old value and candidate argAbsMin selects
+  
+  tmp = value - sArray[ thri ]; // tmp now holds the change made to the cell
+  if ( fabs( tmp ) >  0.001*h ) // report a change only when it exceeds 0.001*h
+    return true;
+  else
+    return false;
 }
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 __cuda_callable__ 
 bool 
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
 updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
         const Real hx, const Real hy, const Real hz, const Real v )
 {
-   const RealType value = sArray[thrk][thrj][thri];
-   //std::cout << value << std::endl;
-   RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
-   
-   c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ],
-                        sArray[ thrk-1 ][ thrj ][ thri ] );
-    
-   b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ],
-                        sArray[ thrk ][ thrj-1 ][ thri ] );
-   
-   a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ],
-                        sArray[ thrk ][ thrj ][ thri-1 ] );
-   
-   
-   if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-       fabs( b ) == std::numeric_limits< RealType >::max() &&
-       fabs( c ) == std::numeric_limits< RealType >::max() )
+  const RealType value = sArray[thrk][thrj][thri]; // current value of the cell being updated
+  //std::cout << value << std::endl;
+  RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
+  
+  c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ], // neighbours along the first (thrk) index
+          sArray[ thrk-1 ][ thrj ][ thri ] );
+  
+  b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ], // neighbours along the second (thrj) index
+          sArray[ thrk ][ thrj-1 ][ thri ] );
+  
+  a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ], // neighbours along the third (thri) index
+          sArray[ thrk ][ thrj ][ thri-1 ] );
+  
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && // all three neighbour values still at the max() sentinel:
+          fabs( b ) == std::numeric_limits< RealType >::max() &&
+          fabs( c ) == std::numeric_limits< RealType >::max() )
+    return false; // nothing to update from, report "unchanged"
+  
+  RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
+  
+  sortMinims( pom ); // orders pom[0..2] by magnitude and permutes the steps in pom[3..5] alongside
+  
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; // one-neighbour candidate: step over v, as in the 1D/2D updateCell
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+  {
+    sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); // store whichever of old value and candidate argAbsMin selects
+    tmp = value - sArray[ thrk ][ thrj ][ thri ]; // tmp now holds the change made to the cell
+    if ( fabs( tmp ) >  0.001*hx ) // report a change only when it exceeds 0.001*hx
+      return true;
+    else
       return false;
-   
-    RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
-    
-    sortMinims( pom );
-    
-    tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
-    if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+  }
+  else // otherwise try the two-neighbour formula on the two smallest values
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
     {
-        sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-        tmp = value - sArray[ thrk ][ thrj ][ thri ];
-        if ( fabs( tmp ) >  0.001*hx )
-            return true;
-        else
-            return false;
+      sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
+      tmp = value - sArray[ thrk ][ thrj ][ thri ];
+      if ( fabs( tmp ) > 0.001*hx ) // same change threshold as above
+        return true;
+      else
+        return false;
     }
     else
     {
-        tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-        if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
-        {
-            sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-            tmp = value - sArray[ thrk ][ thrj ][ thri ];
-            if ( fabs( tmp ) > 0.001*hx )
-                return true;
-            else
-                return false;
-        }
-        else
-        {
-            tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
-                TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
-                hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
-                hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
-            sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-            tmp = value - sArray[ thrk ][ thrj ][ thri ];
-            if ( fabs( tmp ) > 0.001*hx )
-                return true;
-            else
-                return false;
-        }
+      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + // three-neighbour formula on the unsorted a, b, c
+              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
+              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
+      sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
+      tmp = value - sArray[ thrk ][ thrj ][ thri ];
+      if ( fabs( tmp ) > 0.001*hx )
+        return true;
+      else
+        return false;
     }
-    
-    return false;
+  }
+  
+  return false; // not reached: every branch above returns
 }
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index c6cc575d1..0efa38aa1 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -15,6 +15,7 @@
 
 #include "tnlFastSweepingMethod.h"
 #include <TNL/Devices/Cuda.h>
+#include <string.h>
 
 
 #include <iostream>
@@ -80,115 +81,171 @@ solve( const MeshPointer& mesh,
   
   
-  
   while( iteration < this->maxIterations )
   {
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
-      int numThreadsPerBlock = 16;
+      int numThreadsPerBlock = 1024;
+      
       
       int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
       int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
+      //std::cout << "numBlocksX = " << numBlocksX << std::endl;
+      
+      /*Real **sArray = new Real*[numBlocksX*numBlocksY];
+       for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+       sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];*/
       
-          
       ArrayContainer BlockIterHost;
       BlockIterHost.setSize( numBlocksX * numBlocksY );
       BlockIterHost.setValue( 1 );
+      int IsCalculationDone = 1;
+      
+      MeshFunctionPointer helpFunc( mesh );
+      MeshFunctionPointer helpFunc1( mesh );
+      helpFunc1 = auxPtr;
+      auxPtr = helpFunc;
+      helpFunc = helpFunc1;
+      //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl;
       /*for( int k = numBlocksX-1; k >-1; k-- ){
-        for( int l = 0; l < numBlocksY; l++ ){
-          std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+       for( int l = 0; l < numBlocksY; l++ ){
+       std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+       }
+       std::cout<<std::endl;
+       }
+       std::cout<<std::endl;*/
+      unsigned int numWhile = 0;
+      while( IsCalculationDone  )
+      {      
+        IsCalculationDone = 0;
+        helpFunc1 = auxPtr;
+        auxPtr = helpFunc;
+        helpFunc = helpFunc1;
+        this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+        
+        for( int i = 0; i < BlockIterHost.getSize(); i++ ){
+          if( IsCalculationDone == 0 ){
+            IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
+            //break;
+          }
         }
-        std::cout<<std::endl;
-      }
-      std::cout<<std::endl;*/
-      
-      while( BlockIterHost[ 0 ] )
-      {          
-        this->updateBlocks( interfaceMap, aux, BlockIterHost, numThreadsPerBlock);
+        numWhile++;
+        
+        for( int j = numBlocksY-1; j>-1; j-- ){
+          for( int i = 0; i < numBlocksX; i++ )
+            std::cout << BlockIterHost[ j * numBlocksX + i ];
+          std::cout << std::endl;
+        }
+        std::cout << std::endl;
         
         this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY );
         
-  //Reduction      
-        for( int k = numBlocksX-1; k >-1; k-- ){
-          for( int l = 0; l < numBlocksY; l++ ){
-            //std::cout<< BlockIterHost[ l*numBlocksX  + k ];
-            BlockIterHost[ 0 ] = BlockIterHost[ 0 ] || BlockIterHost[ l*numBlocksX + k ];
-          }
-          //std::cout<<std::endl;
-        }
+        /*for( int j = numBlocksY-1; j>-1; j-- ){
+         for( int i = 0; i < numBlocksX; i++ )
+         std::cout << "BlockIterHost = "<< j*numBlocksX + i<< " ," << BlockIterHost[ j * numBlocksX + i ];
+         std::cout << std::endl;
+         }
+         std::cout << std::endl;*/
+        //Reduction      
+        
         //std::cout<<std::endl;
+        string s( "aux-"+ std::to_string(numWhile) + ".tnl");
+        aux.save( s );
       }
-      /*for( cell.getCoordinates().y() = 0;
-              cell.getCoordinates().y() < mesh->getDimensions().y();
-              cell.getCoordinates().y()++ )
-      {
-        for( cell.getCoordinates().x() = 0;
-                cell.getCoordinates().x() < mesh->getDimensions().x();
-                cell.getCoordinates().x()++ )
-        {
-          cell.refresh();
-          if( ! interfaceMap( cell ) )
-            this->updateCell( aux, cell );
-        }
-      }
-      
-      //aux.save( "aux-1.tnl" );
-      
-      for( cell.getCoordinates().y() = 0;
-              cell.getCoordinates().y() < mesh->getDimensions().y();
-              cell.getCoordinates().y()++ )
-      {
-        for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                cell.getCoordinates().x() >= 0 ;
-                cell.getCoordinates().x()-- )		
-        {
-          //std::cerr << "2 -> ";
-          cell.refresh();
-          if( ! interfaceMap( cell ) )            
-            this->updateCell( aux, cell );
-        }
+      if( numWhile == 1 ){
+        auxPtr = helpFunc;
       }
+      /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+       delete []sArray[i];*/
       
-      //aux.save( "aux-2.tnl" );
-      
-      for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-              cell.getCoordinates().y() >= 0 ;
-              cell.getCoordinates().y()-- )
-      {
-        for( cell.getCoordinates().x() = 0;
-                cell.getCoordinates().x() < mesh->getDimensions().x();
-                cell.getCoordinates().x()++ )
-        {
-          //std::cerr << "3 -> ";
-          cell.refresh();
-          if( ! interfaceMap( cell ) )            
-            this->updateCell( aux, cell );
-        }
-      }
       
-      //aux.save( "aux-3.tnl" );
+      /*for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       cell.refresh();
+       if( ! interfaceMap( cell ) )
+       this->updateCell( aux, cell );
+       }
+       }
+       
+       //aux.save( "aux-1.tnl" );
+       
+       for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "2 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       
+       //aux.save( "aux-2.tnl" );
+       
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0 ;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       //std::cerr << "3 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       
+       //aux.save( "aux-3.tnl" );
+       
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "4 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       
+       for( int j = 0;
+       j < mesh->getDimensions().y();
+       j++ )
+       {
+       for( int i = 0;
+       i < mesh->getDimensions().x();
+       i++ )
+       {
+       std::cout << aux[ i * mesh->getDimensions().y() + j ] << " ";
+       }
+       std::cout << std::endl;
+       }*/
       
-      for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-              cell.getCoordinates().y() >= 0;
-              cell.getCoordinates().y()-- )
-      {
-        for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                cell.getCoordinates().x() >= 0 ;
-                cell.getCoordinates().x()-- )		
-        {
-          //std::cerr << "4 -> ";
-          cell.refresh();
-          if( ! interfaceMap( cell ) )            
-            this->updateCell( aux, cell );
-        }
-      }*/
     }
     if( std::is_same< DeviceType, Devices::Cuda >::value )
     {
       // TODO: CUDA code
 #ifdef HAVE_CUDA
       TNL_CHECK_CUDA_DEVICE;
+      // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel.
       const int cudaBlockSize( 16 );
+      
       int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
       int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
       dim3 blockSize( cudaBlockSize, cudaBlockSize );
@@ -202,19 +259,14 @@ solve( const MeshPointer& mesh,
       BlockIterDevice.setSize( numBlocksX * numBlocksY );
       BlockIterDevice.setValue( 1 );
       TNL_CHECK_CUDA_DEVICE;
-      int ne = 0;
-      CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
-                                                       interfaceMapPtr.template getData< Device >(),
-                                                       auxPtr.template modifyData< Device>(),
-                                                       BlockIterDevice, ne);
-      TNL_CHECK_CUDA_DEVICE;
+      
       
       /*TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
-      BlockIterPom.setSize( numBlocksX * numBlocksY  );
-      BlockIterPom.setValue( 0 );*/
+       BlockIterPom.setSize( numBlocksX * numBlocksY  );
+       BlockIterPom.setValue( 0 );*/
       /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
-      BlockIterPom1.setSize( numBlocksX * numBlocksY  );
-      BlockIterPom1.setValue( 0 );*/
+       BlockIterPom1.setSize( numBlocksX * numBlocksY  );
+       BlockIterPom1.setValue( 0 );*/
       /*int *BlockIterDevice;
        cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
       int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
@@ -223,59 +275,125 @@ solve( const MeshPointer& mesh,
       /*int *BlockIterPom;
        cudaMalloc((void**) &BlockIterPom, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
       
-      int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
+      int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
+      
       TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
-      dBlock.setSize( nBlocks  );
+      dBlock.setSize( nBlocks );
       TNL_CHECK_CUDA_DEVICE;
       /*int *dBlock;
        cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/
-      //int pocIter = 0;
+      
+      
+      MeshFunctionPointer helpFunc1;
+      helpFunc1->setMesh(mesh);
+      
+      MeshFunctionPointer helpFunc( mesh );
+      
+      helpFunc1 = auxPtr;
+      auxPtr = helpFunc;
+      helpFunc = helpFunc1;
+      
+      int numIter = 0;
+      
+      //int oddEvenBlock = 0;
       while( BlockIterD )
       {
-        /*BlockIterPom1 = BlockIterDevice;
-        for( int j = numBlocksY-1; j>-1; j-- ){
-          for( int i = 0; i < numBlocksX; i++ )
-            std::cout << BlockIterPom1[ j * numBlocksX + i ];
-          std::cout << std::endl;
-        }
-        std::cout << std::endl;*/
+        /** HERE IS CHESS METHOD **/
+        
+        /*auxPtr = helpFunc;
         
-        CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
-                                                         interfaceMapPtr.template getData< Device >(),
-                                                         auxPtr.template modifyData< Device>(),
-                                                         BlockIterDevice, 1);
+        CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+                interfaceMapPtr.template getData< Device >(),
+                auxPtr.template getData< Device>(),
+                helpFunc.template modifyData< Device>(),
+                BlockIterDevice,
+                oddEvenBlock );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
+        auxPtr = helpFunc;
         
-        /*int poc = 0;
-        for( int i = 0; i < numBlocksX * numBlocksY; i++ )
-          if( BlockIterPom1[ i ] )
-            poc = poc+1;
-        std::cout << "pocet bloku, ktere se pocitali = " << poc << std::endl;*/
+        oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
         
-        GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, /*BlockIterPom,*/ numBlocksX, numBlocksY );
+        CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+                interfaceMapPtr.template getData< Device >(),
+                auxPtr.template getData< Device>(),
+                helpFunc.template modifyData< Device>(),
+                BlockIterDevice,
+                oddEvenBlock );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
+        auxPtr = helpFunc;
         
-        CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
-        TNL_CHECK_CUDA_DEVICE;
+        oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
         
-        CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+        CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+        cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
-        
-        BlockIterD = dBlock.getElement( 0 );
-        //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+        CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         
+        BlockIterD = dBlock.getElement( 0 );*/
+        
+        /**------------------------------------------------------------------------------------------------*/
+        
+        
+        /** HERE IS FIM **/
+        
+         helpFunc1 = auxPtr;
+         auxPtr = helpFunc;
+         helpFunc = helpFunc1;
+         
+         //int pocBloku = 0;
+         Devices::Cuda::synchronizeDevice();
+         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+         interfaceMapPtr.template getData< Device >(),
+         auxPtr.template modifyData< Device>(),
+         helpFunc.template modifyData< Device>(),
+         BlockIterDevice );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+         
+         //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl;
+         
+         GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, numBlocksX, numBlocksY );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+         
+         //std::cout<< "Probehlo" << std::endl;
+         
+         //TNL::swap( auxPtr, helpFunc );
+         
+         
+         CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+         TNL_CHECK_CUDA_DEVICE;
+         
+         CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+         TNL_CHECK_CUDA_DEVICE;
+         
+         
+         BlockIterD = dBlock.getElement( 0 );
+         //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+         
+        
+        /**-----------------------------------------------------------------------------------------------------------*/
         /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
          BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
-        //pocIter ++;
+        numIter ++;
       }
+      if( numIter == 1 ){
+        helpFunc1 = auxPtr;
+        auxPtr = helpFunc;
+        helpFunc = helpFunc1;
+      }
+      /*cudaFree( BlockIterDevice );
+       cudaFree( dBlock );
+       delete BlockIter;*/
       cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
       
-      //std::cout<< pocIter << std::endl;
+      TNL_CHECK_CUDA_DEVICE;
       
       aux = *auxPtr;
       interfaceMap = *interfaceMapPtr;
@@ -286,10 +404,13 @@ solve( const MeshPointer& mesh,
   aux.save("aux-final.tnl");
 }
 
+
 #ifdef HAVE_CUDA
+
+
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                               /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY )
+        /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY )
 {
   int i = blockIdx.x * 1024 + threadIdx.x;
   
@@ -299,53 +420,68 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index
     int m=0, k=0;
     m = i%numBlockX;
     k = i/numBlockX;
-    if( m > 0 )
-      if( BlockIterDevice[ i - 1 ] )
-        pom = 1;//BlockIterPom[ i ] = 1;
-    if( m < numBlockX -1 && pom == 0 )
-      if( BlockIterDevice[ i + 1 ] )
-        pom = 1;//BlockIterPom[ i ] = 1;
-    if( k > 0 && pom == 0 )
-      if( BlockIterDevice[ i - numBlockX ] )
-        pom = 1;// BlockIterPom[ i ] = 1;
-    if( k < numBlockY -1 && pom == 0 )
-      if( BlockIterDevice[ i + numBlockX ] )
-        pom = 1;//BlockIterPom[ i ] = 1;
+    if( m > 0 && BlockIterDevice[ i - 1 ] ){
+      pom = 1;//BlockIterPom[ i ] = 1;
+    }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){
+      pom = 1;//BlockIterPom[ i ] = 1;
+    }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){
+      pom = 1;// BlockIterPom[ i ] = 1;
+    }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){
+      pom = 1;//BlockIterPom[ i ] = 1;
+    }
     
-          
-      
     BlockIterDevice[ i ] = pom;//BlockIterPom[ i ];
   }
 }
 
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks )
+        TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks )
 {
   int i = threadIdx.x;
   int blId = blockIdx.x;
-  __shared__ volatile int sArray[ 512 ];
+  int blockSize = blockDim.x;
+  /*if ( i == 0 && blId == 0 ){
+   printf( "nBlocks = %d \n", nBlocks );
+   for( int j = nBlocks-1; j > -1 ; j--){
+   printf( "cislo = %d \n", BlockIterDevice[ j ] );
+   }
+   }*/
+  __shared__ int sArray[ 1024 ];
   sArray[ i ] = 0;
-  if(blId * 512 + i < nBlocks )
-    sArray[ i ] = BlockIterDevice[ blId * 512 + i ];
+  if( blId * 1024 + i < nBlocks )
+    sArray[ i ] = BlockIterDevice[ blId * 1024 + i ];
   __syncthreads();
-  if (blockDim.x == 1024) {
+  /*extern __shared__ volatile int sArray[];
+   unsigned int i = threadIdx.x;
+   unsigned int gid = blockIdx.x * blockSize * 2 + threadIdx.x;
+   unsigned int gridSize = blockSize * 2 * gridDim.x;
+   sArray[ i ] = 0;
+   while( gid < nBlocks )
+   {
+   sArray[ i ] += BlockIterDevice[ gid ] + BlockIterDevice[ gid + blockSize ];
+   gid += gridSize;
+   }
+   __syncthreads();*/
+  
+  if ( blockSize == 1024) {
     if (i < 512)
       sArray[ i ] += sArray[ i + 512 ];
   }
   __syncthreads();
-  if (blockDim.x  >= 512) {
+  if (blockSize >= 512) {
     if (i < 256) {
       sArray[ i ] += sArray[ i + 256 ];
     }
   }
-  if (blockDim.x >= 256) {
+  __syncthreads();
+  if (blockSize >= 256) {
     if (i < 128) {
       sArray[ i ] += sArray[ i + 128 ];
     }
   }
   __syncthreads();
-  if (blockDim.x >= 128) {
+  if (blockSize >= 128) {
     if (i < 64) {
       sArray[ i ] += sArray[ i + 64 ];
     }
@@ -353,12 +489,12 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I
   __syncthreads();
   if (i < 32 )
   {
-    if(  blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ];
-    if(  blockDim.x >= 32 )  sArray[ i ] += sArray[ i + 16 ];
-    if(  blockDim.x >= 16 )  sArray[ i ] += sArray[ i + 8 ];
-    if(  blockDim.x >= 8 )  sArray[ i ] += sArray[ i + 4 ];
-    if(  blockDim.x >= 4 )  sArray[ i ] += sArray[ i + 2 ];
-    if(  blockDim.x >= 2 )  sArray[ i ] += sArray[ i + 1 ];
+    if(  blockSize >= 64 ) sArray[ i ] += sArray[ i + 32 ];
+    if(  blockSize >= 32 )  sArray[ i ] += sArray[ i + 16 ];
+    if(  blockSize >= 16 )  sArray[ i ] += sArray[ i + 8 ];
+    if(  blockSize >= 8 )  sArray[ i ] += sArray[ i + 4 ];
+    if(  blockSize >= 4 )  sArray[ i ] += sArray[ i + 2 ];
+    if(  blockSize >= 2 )  sArray[ i ] += sArray[ i + 1 ];
   }
   
   if( i == 0 )
@@ -367,94 +503,120 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I
 
 
-template < typename Real, typename Device, typename Index >
+template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne )
+        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock )
 {
   int thri = threadIdx.x; int thrj = threadIdx.y;
-  int blIdx = blockIdx.x; int blIdy = blockIdx.y;
-  int grIdx = gridDim.x;
-  
-  if( BlockIterDevice[ blIdy * grIdx + blIdx] )
+  int i = threadIdx.x + blockDim.x*blockIdx.x;
+  int j = blockDim.y*blockIdx.y + threadIdx.y;
+  /** FOR CHESS METHOD */
+  if( (blockIdx.y%2  + blockIdx.x) % 2 == oddEvenBlock )
   {
-  
-    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+    /**-----------------------------------------*/
+    
     
-    int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
-    __shared__ volatile int numOfBlockx;
-    __shared__ volatile int numOfBlocky;
-    __shared__ int xkolik;
-    __shared__ int ykolik;
-    __shared__ volatile int NE;
-    if( thri == 0 && thrj == 0 )
+    /** FOR FIM METHOD */
+    /*if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x] )
+     {*/ 
+    /**-----------------------------------------*/
+    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+    __shared__ volatile int dimX;
+    __shared__ volatile int dimY;
+    __shared__ volatile Real hx;
+    __shared__ volatile Real hy;
+    if( thri==0 && thrj == 0)
     {
-      xkolik = blockDim.x + 1;
-      ykolik = blockDim.y + 1;
-      numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
-      numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
-      
-      if( numOfBlockx - 1 == blIdx )
-        xkolik = dimX - (blIdx)*blockDim.x+1;
-      
-      if( numOfBlocky -1 == blIdy )
-        ykolik = dimY - (blIdy)*blockDim.y+1;
-        BlockIterDevice[ blIdy * grIdx + blIdx ] = 0;
-        NE = ne;
+      dimX = mesh.getDimensions().x();
+      dimY = mesh.getDimensions().y();
+      hx = mesh.getSpaceSteps().x();
+      hy = mesh.getSpaceSteps().y();
+      BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 0;
     }
     __syncthreads();
-   
-    int i = thri + blockDim.x*blIdx;
-    int j = blockDim.y*blIdy + thrj;
+    int numOfBlockx;
+    int numOfBlocky;
+    int xkolik;
+    int ykolik;
+    
+    xkolik = blockDim.x + 1;
+    ykolik = blockDim.y + 1;
+    numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
+    numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
+    
+    if( numOfBlockx - 1 == blockIdx.x )
+      xkolik = dimX - (blockIdx.x)*blockDim.x+1;
+    
+    if( numOfBlocky -1 == blockIdx.y )
+      ykolik = dimY - (blockIdx.y)*blockDim.y+1;
+    __syncthreads();
+    
     int currentIndex = thrj * blockDim.x + thri;
-    if( BlockIterDevice[ blIdy * gridDim.x + blIdx] )
-    {
     //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
-    __shared__ volatile bool changed[16*16];
+    __shared__ volatile bool changed[ (sizeSArray-2)*(sizeSArray-2)];
     changed[ currentIndex ] = false;
     if( thrj == 0 && thri == 0 )
       changed[ 0 ] = true;
     
-    __shared__ Real hx;
-    __shared__ Real hy;
-    if( thrj == 1 && thri == 1 )
-    {
-      hx = mesh.getSpaceSteps().x();
-      hy = mesh.getSpaceSteps().y();
-    }
     
     //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
-    __shared__ volatile Real sArray[18][18];
-    sArray[thrj][thri] = std::numeric_limits< Real >::max();
+    __shared__ volatile Real sArray[ sizeSArray * sizeSArray ];
+    sArray[ thrj * sizeSArray + thri ] = std::numeric_limits< Real >::max();
     
     //filling sArray edges
     if( thri == 0 )
-    {        
-      if( dimX > (blIdx+1) * blockDim.x  && thrj+1 < ykolik && NE == 1 )
-        sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
+    {      
+      if( dimX > (blockIdx.x+1) * blockDim.x  && thrj+1 < ykolik )
+        sArray[(thrj+1)*sizeSArray + xkolik] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
       else
-        sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
+        sArray[(thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 1 )
     {
-      if( blIdx != 0 && thrj+1 < ykolik && NE == 1 )
-        sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ];
+      if( blockIdx.x != 0 && thrj+1 < ykolik )
+        sArray[(thrj+1)*sizeSArray + 0] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX ];
       else
-        sArray[thrj+1][0] = std::numeric_limits< Real >::max();
+        sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 2 )
     {
-      if( dimY > (blIdy+1) * blockDim.y  && thri+1 < xkolik && NE == 1 )
-        sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
+      if( dimY > (blockIdx.y+1) * blockDim.y  && thrj+1 < xkolik )
+        sArray[ ykolik*sizeSArray + thrj+1 ] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
+      else
+        sArray[ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
+      
+    }
+    
+    if( thri == 3 )
+    {
+      if( blockIdx.y != 0 && thrj+1 < xkolik )
+        sArray[0*sizeSArray + thrj+1] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + thrj+1 ];
       else
-        sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
+        sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
+    }
+    
+    if( i < dimX && j < dimY )
+    {    
+      sArray[(thrj+1)*sizeSArray + thri+1] = aux[ j*dimX + i ];
     }
     
+    while( changed[ 0 ] )
+    {
+      __syncthreads();
+      
+      changed[ currentIndex] = false;
+      
+      //calculation of update cell
+      if( i < dimX && j < dimY )
+      {
+        if( ! interfaceMap[ j * dimX + i ] )
         {
-          changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
+          changed[ currentIndex ] = ptr.updateCell<sizeSArray>( sArray, thri+1, thrj+1, hx,hy);
         }
       }
       __syncthreads();
@@ -488,30 +650,36 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       {
         if( currentIndex < 64 )
         {
-    if( thri == 3 )
-    {
-      if( blIdy != 0 && thrj+1 < xkolik && NE == 1 )
-        sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ];
-      else
-        sArray[0][thrj+1] = std::numeric_limits< Real >::max();
-    }
-    
-    
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
-    {    
-      sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
-    }
-    __syncthreads();  
-    
-    while( changed[ 0 ] )
-    {
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+        }
+      }
       __syncthreads();
-      
-      changed[ currentIndex] = false;
-      
-      //calculation of update cell
-      if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+      if( currentIndex < 32 ) 
       {
-        if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
+        if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+        if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+        if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+        if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+        if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+        if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+      }
+      if( thri == 0 && thrj == 0 && changed[ 0 ] ){
+        BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 1;
+      }
+      /*if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 )
+       {
+       for( int k = 15; k>-1; k-- ){
+       for( int l = 0; l < 16; l++ )
+       printf( "%f\t", sArray[k * 16 + l]);
+       printf( "\n");
+       }
+       printf( "\n");
+       }*/
+      __syncthreads();
+    }
+    if( i < dimX && j < dimY )
+      helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ];
+    
+  } 
 }
 #endif
-- 
GitLab


From a6cfb604446996a5f1296c45b8a5dc28b194ebf1 Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Wed, 31 Oct 2018 06:44:59 +0100
Subject: [PATCH 14/20] Repair of last commit (error for builds without CUDA): FIM
 method implemented for 2D GPU and FIM-FSM implemented for 2D CPU (parallel).

---
 .../tnlDirectEikonalMethodsBase_impl.h        | 119 +++++++++---------
 1 file changed, 60 insertions(+), 59 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 95971c9b8..500d1bf03 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -11,6 +11,7 @@
 
 #include <iostream>
 #include "tnlFastSweepingMethod.h"
+#include "tnlDirectEikonalMethodsBase.h"
 
 template< typename Real,
         typename Device,
@@ -135,7 +136,7 @@ updateBlocks( InterfaceMapType interfaceMap,
       bool changed = false;
       
       
-      RealType *sArray;
+      Real *sArray;
       sArray = new Real[ sizeSArray * sizeSArray ];
       if( sArray == nullptr )
         std::cout << "Error while allocating memory for sArray." << std::endl;
@@ -175,7 +176,7 @@ updateBlocks( InterfaceMapType interfaceMap,
             //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              pom = this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
+              pom = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
               changed = changed || pom;
             }
           }
@@ -195,7 +196,7 @@ updateBlocks( InterfaceMapType interfaceMap,
           {
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
+              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
             }
           }
         }
@@ -213,7 +214,7 @@ updateBlocks( InterfaceMapType interfaceMap,
           {
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
+              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
             }
           }
         }
@@ -231,7 +232,7 @@ updateBlocks( InterfaceMapType interfaceMap,
           {
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy);
+              this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx, hy, 1.0);
             }
           }
         }
@@ -258,7 +259,7 @@ updateBlocks( InterfaceMapType interfaceMap,
         }
         //std::cout<<std::endl;
       }
-      //delete []sArray;
+      delete []sArray;
     }
   }
 }
@@ -914,7 +915,58 @@ __cuda_callable__ void sortMinims( T1 pom[] )
   }   
 }
 
-
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+__cuda_callable__
+bool
+tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
+updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy,
+        const Real v )
+{
+  const RealType value = sArray[ thrj * sizeSArray + thri ];
+  RealType a, b, tmp = std::numeric_limits< RealType >::max();
+  
+  b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ],
+          sArray[ (thrj-1) * sizeSArray + thri ] );
+  
+  a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ],
+          sArray[ thrj * sizeSArray + thri-1 ] );
+  
+  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
+          fabs( b ) == std::numeric_limits< RealType >::max() )
+    return false;
+  
+  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
+  sortMinims( pom );
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
+  
+  
+  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
+  {
+    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
+    tmp = value - sArray[ thrj * sizeSArray + thri ];
+    if ( fabs( tmp ) >  0.001*hx )
+      return true;
+    else
+      return false;
+  }
+  else
+  {
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
+    tmp = value - sArray[ thrj * sizeSArray + thri ];
+    if ( fabs( tmp ) > 0.001*hx )
+      return true;
+    else
+      return false;
+  }
+  
+  return false;
+}
 
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
@@ -1133,58 +1185,7 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
 }
 
 
-template< typename Real,
-        typename Device,
-        typename Index >
-template< int sizeSArray >
-__cuda_callable__
-bool
-tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy,
-        const Real v )
-{
-  const RealType value = sArray[ thrj * sizeSArray + thri ];
-  RealType a, b, tmp = std::numeric_limits< RealType >::max();
-  
-  b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ],
-          sArray[ (thrj-1) * sizeSArray + thri ] );
-  
-  a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ],
-          sArray[ thrj * sizeSArray + thri-1 ] );
-  
-  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-          fabs( b ) == std::numeric_limits< RealType >::max() )
-    return false;
-  
-  RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 };
-  sortMinims( pom );
-  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;
-  
-  
-  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-  {
-    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
-    tmp = value - sArray[ thrj * sizeSArray + thri ];
-    if ( fabs( tmp ) >  0.001*hx )
-      return true;
-    else
-      return false;
-  }
-  else
-  {
-    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-    sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
-    tmp = value - sArray[ thrj * sizeSArray + thri ];
-    if ( fabs( tmp ) > 0.001*hx )
-      return true;
-    else
-      return false;
-  }
-  
-  return false;
-}
+
 
 template< typename Real,
         typename Device,
-- 
GitLab


From 39b4889c9c0205f550e334f646d1c244530d788d Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Thu, 1 Nov 2018 16:26:36 +0100
Subject: [PATCH 15/20] Last repair of FIM for GPU.

---
 .../tnlDirectEikonalMethodsBase.h             |   2 +-
 .../tnlDirectEikonalMethodsBase_impl.h        |  72 ++++----
 .../tnlFastSweepingMethod2D_impl.h            | 165 ++++++++++--------
 3 files changed, 125 insertions(+), 114 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index cbb1a1ff6..ccbae8abe 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -148,7 +148,7 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I
 
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                               /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY );
+                               TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 500d1bf03..5083544e2 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -134,6 +134,7 @@ updateBlocks( InterfaceMapType interfaceMap,
       Real hy = mesh.getSpaceSteps().y();
       
       bool changed = false;
+      BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
       
       
       Real *sArray;
@@ -143,53 +144,52 @@ updateBlocks( InterfaceMapType interfaceMap,
       
       for( int thri = 0; thri < sizeSArray; thri++ ){
         for( int thrj = 0; thrj < sizeSArray; thrj++ )
-          sArray/*[i]*/[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
+          sArray[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
       }
       
-      BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
       
       for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
       {        
         if( dimX > (blIdx+1) * numThreadsPerBlock  && thrj+1 < ykolik )
-          sArray/*[i]*/[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
+          sArray[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
         
         
         if( blIdx != 0 && thrj+1 < ykolik )
-          sArray/*[i]*/[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
+          sArray[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
         
         if( dimY > (blIdy+1) * numThreadsPerBlock  && thrj+1 < xkolik )
-          sArray/*[i]*/[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
+          sArray[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
         
         if( blIdy != 0 && thrj+1 < xkolik )
-          sArray/*[i]*/[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
+          sArray[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
       }
       
       for( int k = 0; k < numThreadsPerBlock; k++ ){
         for( int l = 0; l < numThreadsPerBlock; l++ )
           if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
-            sArray/*[i]*/[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
+            sArray[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ];
       }
-      bool pom = false;
+      
       for( int k = 0; k < numThreadsPerBlock; k++ ){ 
         for( int l = 0; l < numThreadsPerBlock; l++ ){
           if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){
             //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
             if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] )
             {
-              pom = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
-              changed = changed || pom;
+              changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy) || changed;
+              
             }
           }
         }
       }
       /*aux.save( "aux-1pruch.tnl" );
-      for( int k = 0; k < sizeSArray; k++ ){ 
-        for( int l = 0; l < sizeSArray; l++ ) {
-          std::cout << sArray[ k * sizeSArray + l] << " ";
-        }
-        std::cout << std::endl;
-      }*/
-           
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
+      
       for( int k = 0; k < numThreadsPerBlock; k++ ) 
         for( int l = numThreadsPerBlock-1; l >-1; l-- ) { 
           if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
@@ -201,12 +201,12 @@ updateBlocks( InterfaceMapType interfaceMap,
           }
         }
       /*aux.save( "aux-2pruch.tnl" );
-      for( int k = 0; k < sizeSArray; k++ ){ 
-        for( int l = 0; l < sizeSArray; l++ ) {
-          std::cout << sArray[ k * sizeSArray + l] << " ";
-        }
-        std::cout << std::endl;
-      }*/
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
       
       for( int k = numThreadsPerBlock-1; k > -1; k-- ) 
         for( int l = 0; l < numThreadsPerBlock; l++ ) {
@@ -219,12 +219,12 @@ updateBlocks( InterfaceMapType interfaceMap,
           }
         }
       /*aux.save( "aux-3pruch.tnl" );
-      for( int k = 0; k < sizeSArray; k++ ){ 
-        for( int l = 0; l < sizeSArray; l++ ) {
-          std::cout << sArray[ k * sizeSArray + l] << " ";
-        }
-        std::cout << std::endl;
-      }*/
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
       
       for( int k = numThreadsPerBlock-1; k > -1; k-- ){
         for( int l = numThreadsPerBlock-1; l >-1; l-- ) { 
@@ -238,12 +238,12 @@ updateBlocks( InterfaceMapType interfaceMap,
         }
       }
       /*aux.save( "aux-4pruch.tnl" );
-      for( int k = 0; k < sizeSArray; k++ ){ 
-        for( int l = 0; l < sizeSArray; l++ ) {
-          std::cout << sArray[ k * sizeSArray + l] << " ";
-        }
-        std::cout << std::endl;
-      }*/
+       for( int k = 0; k < sizeSArray; k++ ){ 
+       for( int l = 0; l < sizeSArray; l++ ) {
+       std::cout << sArray[ k * sizeSArray + l] << " ";
+       }
+       std::cout << std::endl;
+       }*/
       
       
       if( changed ){
@@ -254,7 +254,7 @@ updateBlocks( InterfaceMapType interfaceMap,
       for( int k = 0; k < numThreadsPerBlock; k++ ){ 
         for( int l = 0; l < numThreadsPerBlock; l++ ) {
           if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )      
-            helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray/*[i]*/[ (k + 1)* sizeSArray + l + 1 ];
+            helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx  + k*dimX + l ] = sArray[ (k + 1)* sizeSArray + l + 1 ];
           //std::cout<< sArray[k+1][l+1];
         }
         //std::cout<<std::endl;
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 0efa38aa1..546cfe9aa 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -123,6 +123,7 @@ solve( const MeshPointer& mesh,
         helpFunc = helpFunc1;
         this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
         
+        //Reduction      
         for( int i = 0; i < BlockIterHost.getSize(); i++ ){
           if( IsCalculationDone == 0 ){
             IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
@@ -130,6 +131,7 @@ solve( const MeshPointer& mesh,
           }
         }
         numWhile++;
+        std::cout <<"numWhile = "<< numWhile <<std::endl;
         
         for( int j = numBlocksY-1; j>-1; j-- ){
           for( int i = 0; i < numBlocksX; i++ )
@@ -146,7 +148,6 @@ solve( const MeshPointer& mesh,
          std::cout << std::endl;
          }
          std::cout << std::endl;*/
-        //Reduction      
         
         //std::cout<<std::endl;
         string s( "aux-"+ std::to_string(numWhile) + ".tnl");
@@ -171,7 +172,7 @@ solve( const MeshPointer& mesh,
        if( ! interfaceMap( cell ) )
        this->updateCell( aux, cell );
        }
-       }
+       } 
        
        //aux.save( "aux-1.tnl" );
        
@@ -261,12 +262,12 @@ solve( const MeshPointer& mesh,
       TNL_CHECK_CUDA_DEVICE;
       
       
-      /*TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
-       BlockIterPom.setSize( numBlocksX * numBlocksY  );
-       BlockIterPom.setValue( 0 );*/
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
+      BlockIterPom.setSize( numBlocksX * numBlocksY  );
+      BlockIterPom.setValue( 0 );
       /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
-       BlockIterPom1.setSize( numBlocksX * numBlocksY  );
-       BlockIterPom1.setValue( 0 );*/
+      BlockIterPom1.setSize( numBlocksX * numBlocksY  );
+      BlockIterPom1.setValue( 0 );*/
       /*int *BlockIterDevice;
        cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
       int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
@@ -284,9 +285,7 @@ solve( const MeshPointer& mesh,
        cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/
       
       
-      MeshFunctionPointer helpFunc1;
-      helpFunc1->setMesh(mesh);
-      
+      MeshFunctionPointer helpFunc1( mesh );      
       MeshFunctionPointer helpFunc( mesh );
       
       helpFunc1 = auxPtr;
@@ -301,83 +300,94 @@ solve( const MeshPointer& mesh,
         /** HERE IS CHESS METHOD **/
         
         /*auxPtr = helpFunc;
+         
+         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+         interfaceMapPtr.template getData< Device >(),
+         auxPtr.template getData< Device>(),
+         helpFunc.template modifyData< Device>(),
+         BlockIterDevice,
+         oddEvenBlock );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+         auxPtr = helpFunc;
+         
+         oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+         
+         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+         interfaceMapPtr.template getData< Device >(),
+         auxPtr.template getData< Device>(),
+         helpFunc.template modifyData< Device>(),
+         BlockIterDevice,
+         oddEvenBlock );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+         auxPtr = helpFunc;
+         
+         oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+         
+         CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+         CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+         
+         BlockIterD = dBlock.getElement( 0 );*/
+        
+        /**------------------------------------------------------------------------------------------------*/
+        
+        
+        /** HERE IS FIM **/
         
+        helpFunc1 = auxPtr;
+        auxPtr = helpFunc;
+        helpFunc = helpFunc1;
+        TNL_CHECK_CUDA_DEVICE;
+        
+        //int pocBloku = 0;
+        Devices::Cuda::synchronizeDevice();
         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
                 interfaceMapPtr.template getData< Device >(),
-                auxPtr.template getData< Device>(),
+                auxPtr.template modifyData< Device>(),
                 helpFunc.template modifyData< Device>(),
-                BlockIterDevice,
-                oddEvenBlock );
+                BlockIterDevice );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
-        auxPtr = helpFunc;
         
-        oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+        //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl;
+        //BlockIterPom1 = BlockIterDevice;
+        ///for( int i =0; i< numBlocksX; i++ ){
+        //  for( int j = 0; j < numBlocksY; j++ )
+        //  {
+        //    std::cout << BlockIterPom1[j*numBlocksX + i];
+        //  }
+        //  std::cout << std::endl;
+        //}
+        //std::cout << std::endl;
         
-        CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-                interfaceMapPtr.template getData< Device >(),
-                auxPtr.template getData< Device>(),
-                helpFunc.template modifyData< Device>(),
-                BlockIterDevice,
-                oddEvenBlock );
+        GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
-        auxPtr = helpFunc;
+        BlockIterDevice = BlockIterPom;
+        
+        //std::cout<< "Probehlo" << std::endl;
+        
+        //TNL::swap( auxPtr, helpFunc );
         
-        oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
         
         CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
-        cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
+        
         CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
-        cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         
-        BlockIterD = dBlock.getElement( 0 );*/
         
-        /**------------------------------------------------------------------------------------------------*/
+        BlockIterD = dBlock.getElement( 0 );
+        //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
         
         
-        /** HERE IS FIM **/
-        
-         helpFunc1 = auxPtr;
-         auxPtr = helpFunc;
-         helpFunc = helpFunc1;
-         
-         //int pocBloku = 0;
-         Devices::Cuda::synchronizeDevice();
-         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-         interfaceMapPtr.template getData< Device >(),
-         auxPtr.template modifyData< Device>(),
-         helpFunc.template modifyData< Device>(),
-         BlockIterDevice );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         
-         //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl;
-         
-         GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, numBlocksX, numBlocksY );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         
-         //std::cout<< "Probehlo" << std::endl;
-         
-         //TNL::swap( auxPtr, helpFunc );
-         
-         
-         CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
-         TNL_CHECK_CUDA_DEVICE;
-         
-         CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
-         TNL_CHECK_CUDA_DEVICE;
-         
-         
-         BlockIterD = dBlock.getElement( 0 );
-         //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         
-        
         /**-----------------------------------------------------------------------------------------------------------*/
         /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
          BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
@@ -392,7 +402,6 @@ solve( const MeshPointer& mesh,
        cudaFree( dBlock );
        delete BlockIter;*/
       cudaDeviceSynchronize();
-      
       TNL_CHECK_CUDA_DEVICE;
       
       aux = *auxPtr;
@@ -410,7 +419,7 @@ solve( const MeshPointer& mesh,
 
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-        /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY )
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY )
 {
   int i = blockIdx.x * 1024 + threadIdx.x;
   
@@ -430,7 +439,7 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index
       pom = 1;//BlockIterPom[ i ] = 1;
     }
     
-    BlockIterDevice[ i ] = pom;//BlockIterPom[ i ];
+    BlockIterPom[ i ] = pom;//BlockIterPom[ i ];
   }
 }
 
@@ -514,14 +523,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
   int i = threadIdx.x + blockDim.x*blockIdx.x;
   int j = blockDim.y*blockIdx.y + threadIdx.y;
   /** FOR CHESS METHOD */
-  if( (blockIdx.y%2  + blockIdx.x) % 2 == oddEvenBlock )
-  {
-    /**-----------------------------------------*/
-    
+  //if( (blockIdx.y%2  + blockIdx.x) % 2 == oddEvenBlock )
+  //{
+  /**------------------------------------------*/
+  
+  
+  /** FOR FIM METHOD */
     
-    /** FOR FIM METHOD */
-    /*if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x] )
-     {*/ 
+  if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] )
+  { 
+    __syncthreads();
     /**-----------------------------------------*/
     const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
     __shared__ volatile int dimX;
-- 
GitLab


From e3c970e1396fadc6b7fb1e97f0cf734037a19fd2 Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Mon, 5 Nov 2018 14:43:21 +0100
Subject: [PATCH 16/20] FIM implemented in 3D

---
 .../tnlDirectEikonalMethodsBase.h             |  10 +-
 .../tnlFastSweepingMethod2D_impl.h            |  16 +-
 .../tnlFastSweepingMethod3D_impl.h            | 881 +++++++++---------
 3 files changed, 478 insertions(+), 429 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index ccbae8abe..7d990c1bb 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -160,11 +160,17 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
                                   Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
                                   Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );
 
-template < typename Real, typename Device, typename Index >
+template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
                                       TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
+
+template < typename Index >
+__global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,
+        int numBlockX, int numBlockY, int numBlockZ );
 #endif
 
 #include "tnlDirectEikonalMethodsBase_impl.h"
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 546cfe9aa..b823fec03 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -85,7 +85,7 @@ solve( const MeshPointer& mesh,
   {
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
-      int numThreadsPerBlock = 1024;
+      int numThreadsPerBlock = 16;
       
       
       int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
@@ -115,13 +115,13 @@ solve( const MeshPointer& mesh,
        }
        std::cout<<std::endl;*/
       unsigned int numWhile = 0;
-      while( IsCalculationDone  )
+      while( IsCalculationDone && numWhile < 1 )
       {      
         IsCalculationDone = 0;
         helpFunc1 = auxPtr;
         auxPtr = helpFunc;
         helpFunc = helpFunc1;
-        this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+        this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
         
         //Reduction      
         for( int i = 0; i < BlockIterHost.getSize(); i++ ){
@@ -394,9 +394,7 @@ solve( const MeshPointer& mesh,
         numIter ++;
       }
       if( numIter == 1 ){
-        helpFunc1 = auxPtr;
         auxPtr = helpFunc;
-        helpFunc = helpFunc1;
       }
       /*cudaFree( BlockIterDevice );
        cudaFree( dBlock );
@@ -535,10 +533,10 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     __syncthreads();
     /**-----------------------------------------*/
     const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
-    __shared__ volatile int dimX;
-    __shared__ volatile int dimY;
-    __shared__ volatile Real hx;
-    __shared__ volatile Real hy;
+    __shared__ int dimX;
+    __shared__ int dimY;
+    __shared__ Real hx;
+    __shared__ Real hy;
     if( thri==0 && thrj == 0)
     {
       dimX = mesh.getDimensions().x();
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 4daf9fc92..65aba5bf5 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -16,511 +16,556 @@
 #include "tnlFastSweepingMethod.h"
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
 FastSweepingMethod()
 : maxIterations( 1 )
 {
-   
+  // Empty body: the only initialization (maxIterations = 1) happens in the initializer list.
 }
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 const Index&
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
 getMaxIterations() const
 {
-   
+  return this->maxIterations;   // limit read by solve(); the body used to be empty, i.e. no return from a non-void function (undefined behavior)
 }
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 void
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
 setMaxIterations( const IndexType& maxIterations )
 {
-   
+  // NOTE(review): the body is empty, so the maxIterations argument is ignored and the member keeps its constructor value - confirm whether this is intended.
 }
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 void
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
 solve( const MeshPointer& mesh,
-       const AnisotropyPointer& anisotropy,
-       MeshFunctionPointer& u )
+        const AnisotropyPointer& anisotropy,
+        MeshFunctionPointer& u )
 {
-   MeshFunctionPointer auxPtr;
-   InterfaceMapPointer interfaceMapPtr;
-   auxPtr->setMesh( mesh );
-   interfaceMapPtr->setMesh( mesh );
-   std::cout << "Initiating the interface cells ..." << std::endl;
-   BaseType::initInterface( u, auxPtr, interfaceMapPtr );
+  MeshFunctionPointer auxPtr;
+  InterfaceMapPointer interfaceMapPtr;
+  auxPtr->setMesh( mesh );
+  interfaceMapPtr->setMesh( mesh );
+  std::cout << "Initiating the interface cells ..." << std::endl;
+  BaseType::initInterface( u, auxPtr, interfaceMapPtr );
 #ifdef HAVE_CUDA
-   cudaDeviceSynchronize();
+  cudaDeviceSynchronize();
 #endif
-   auxPtr->save( "aux-ini.tnl" );   
-   
-   typename MeshType::Cell cell( *mesh );
-   
-   IndexType iteration( 0 );
-   MeshFunctionType aux = *auxPtr;
-   InterfaceMapType interfaceMap = * interfaceMapPtr;
-    while( iteration < this->maxIterations )
+  auxPtr->save( "aux-ini.tnl" );   
+  
+  typename MeshType::Cell cell( *mesh );
+  
+  IndexType iteration( 0 );
+  MeshFunctionType aux = *auxPtr; // working copies used by the host sweeps below (this one and interfaceMap)
+  InterfaceMapType interfaceMap = * interfaceMapPtr;
+  while( iteration < this->maxIterations )
+  {
+    if( std::is_same< DeviceType, Devices::Host >::value ) // host path: eight sequential sweeps, one per combination of z/y/x direction
     {
-        if( std::is_same< DeviceType, Devices::Host >::value )
+      for( cell.getCoordinates().z() = 0; // sweep 1/8: z ascending, y ascending, x ascending
+              cell.getCoordinates().z() < mesh->getDimensions().z();
+              cell.getCoordinates().z()++ )
+      {
+        for( cell.getCoordinates().y() = 0;
+                cell.getCoordinates().y() < mesh->getDimensions().y();
+                cell.getCoordinates().y()++ )
         {
-           for( cell.getCoordinates().z() = 0;
-                cell.getCoordinates().z() < mesh->getDimensions().z();
-                cell.getCoordinates().z()++ )
-           {
-              for( cell.getCoordinates().y() = 0;
-                   cell.getCoordinates().y() < mesh->getDimensions().y();
-                   cell.getCoordinates().y()++ )
-              {
-                 for( cell.getCoordinates().x() = 0;
-                      cell.getCoordinates().x() < mesh->getDimensions().x();
-                      cell.getCoordinates().x()++ )
-                 {
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }
-           //aux.save( "aux-1.tnl" );
-
-           for( cell.getCoordinates().z() = 0;
-                cell.getCoordinates().z() < mesh->getDimensions().z();
-                cell.getCoordinates().z()++ )
-           {
-              for( cell.getCoordinates().y() = 0;
-                   cell.getCoordinates().y() < mesh->getDimensions().y();
-                   cell.getCoordinates().y()++ )
-              {
-                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                      cell.getCoordinates().x() >= 0 ;
-                      cell.getCoordinates().x()-- )		
-                 {
-                    //std::cerr << "2 -> ";
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )            
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }
-           //aux.save( "aux-2.tnl" );
-           for( cell.getCoordinates().z() = 0;
-                cell.getCoordinates().z() < mesh->getDimensions().z();
-                cell.getCoordinates().z()++ )
-           {
-              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                   cell.getCoordinates().y() >= 0 ;
-                   cell.getCoordinates().y()-- )
-              {
-                 for( cell.getCoordinates().x() = 0;
-                      cell.getCoordinates().x() < mesh->getDimensions().x();
-                      cell.getCoordinates().x()++ )
-                 {
-                    //std::cerr << "3 -> ";
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )            
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }
-           //aux.save( "aux-3.tnl" );
-
-           for( cell.getCoordinates().z() = 0;
-                cell.getCoordinates().z() < mesh->getDimensions().z();
-                cell.getCoordinates().z()++ )
-           {
-              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                   cell.getCoordinates().y() >= 0;
-                   cell.getCoordinates().y()-- )
-              {
-                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                      cell.getCoordinates().x() >= 0 ;
-                      cell.getCoordinates().x()-- )		
-                 {
-                    //std::cerr << "4 -> ";
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )            
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }     
-           //aux.save( "aux-4.tnl" );
-
-           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-                cell.getCoordinates().z() >= 0;
-                cell.getCoordinates().z()-- )
-           {
-              for( cell.getCoordinates().y() = 0;
-                   cell.getCoordinates().y() < mesh->getDimensions().y();
-                   cell.getCoordinates().y()++ )
-              {
-                 for( cell.getCoordinates().x() = 0;
-                      cell.getCoordinates().x() < mesh->getDimensions().x();
-                      cell.getCoordinates().x()++ )
-                 {
-                    //std::cerr << "5 -> ";
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }
-           //aux.save( "aux-5.tnl" );
-
-           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-                cell.getCoordinates().z() >= 0;
-                cell.getCoordinates().z()-- )
-           {
-              for( cell.getCoordinates().y() = 0;
-                   cell.getCoordinates().y() < mesh->getDimensions().y();
-                   cell.getCoordinates().y()++ )
-              {
-                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                      cell.getCoordinates().x() >= 0 ;
-                      cell.getCoordinates().x()-- )		
-                 {
-                    //std::cerr << "6 -> ";
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )            
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }
-           //aux.save( "aux-6.tnl" );
-
-           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-                cell.getCoordinates().z() >= 0;
-                cell.getCoordinates().z()-- )
-           {
-              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                   cell.getCoordinates().y() >= 0 ;
-                   cell.getCoordinates().y()-- )
-              {
-                 for( cell.getCoordinates().x() = 0;
-                      cell.getCoordinates().x() < mesh->getDimensions().x();
-                      cell.getCoordinates().x()++ )
-                 {
-                    //std::cerr << "7 -> ";
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )            
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }
-           //aux.save( "aux-7.tnl" );
-
-           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-                cell.getCoordinates().z() >= 0;
-                cell.getCoordinates().z()-- )
-           {
-              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                   cell.getCoordinates().y() >= 0;
-                   cell.getCoordinates().y()-- )
-              {
-                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                      cell.getCoordinates().x() >= 0 ;
-                      cell.getCoordinates().x()-- )		
-                 {
-                    //std::cerr << "8 -> ";
-                    cell.refresh();
-                    if( ! interfaceMap( cell ) )            
-                       this->updateCell( aux, cell );
-                 }
-              }
-           }
+          for( cell.getCoordinates().x() = 0;
+                  cell.getCoordinates().x() < mesh->getDimensions().x();
+                  cell.getCoordinates().x()++ )
+          {
+            cell.refresh();
+            if( ! interfaceMap( cell ) ) // cells flagged in interfaceMap are never passed to updateCell
+              this->updateCell( aux, cell );
+          }
+        }
       }
-      if( std::is_same< DeviceType, Devices::Cuda >::value )
+      //aux.save( "aux-1.tnl" );
+      
+      for( cell.getCoordinates().z() = 0; // sweep 2/8: z ascending, y ascending, x descending
+              cell.getCoordinates().z() < mesh->getDimensions().z();
+              cell.getCoordinates().z()++ )
       {
-         // TODO: CUDA code
-#ifdef HAVE_CUDA
-          const int cudaBlockSize( 8 );
-          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
-          int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
-          int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); 
-          if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
-              std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
-          dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
-          dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
-                 
-          tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
-          
-          
-          int BlockIterD = 1;
-          
-          TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
-          BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ );
-          BlockIterDevice.setValue( 1 );
-          /*int *BlockIterDevice;
-          cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/
-          int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
-          
-          TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
-          dBlock.setSize( nBlocks );
-          dBlock.setValue( 0 );
-          /*int *dBlock;
-          cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
-          
-          while( BlockIterD )
+        for( cell.getCoordinates().y() = 0;
+                cell.getCoordinates().y() < mesh->getDimensions().y();
+                cell.getCoordinates().y()++ )
+        {
+          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                  cell.getCoordinates().x() >= 0 ;
+                  cell.getCoordinates().x()-- )		
           {
-             CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
-                                                              interfaceMapPtr.template getData< Device >(),
-                                                              auxPtr.template modifyData< Device>(),
-                                                              BlockIterDevice );
-            cudaDeviceSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-            
-            CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
-            cudaDeviceSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-            
-            CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
-            cudaDeviceSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-            cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
-                                   
-            /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
-                BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
-            
+            //std::cerr << "2 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
           }
-          //cudaFree( BlockIterDevice );
-          //cudaFree( dBlock );
-          cudaDeviceSynchronize();
-          TNL_CHECK_CUDA_DEVICE;
-          aux = *auxPtr;
-          interfaceMap = *interfaceMapPtr;
-#endif
+        }
       }
-        
-      //aux.save( "aux-8.tnl" );
-      iteration++;
+      //aux.save( "aux-2.tnl" );
+      for( cell.getCoordinates().z() = 0; // sweep 3/8: z ascending, y descending, x ascending
+              cell.getCoordinates().z() < mesh->getDimensions().z();
+              cell.getCoordinates().z()++ )
+      {
+        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                cell.getCoordinates().y() >= 0 ;
+                cell.getCoordinates().y()-- )
+        {
+          for( cell.getCoordinates().x() = 0;
+                  cell.getCoordinates().x() < mesh->getDimensions().x();
+                  cell.getCoordinates().x()++ )
+          {
+            //std::cerr << "3 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
+          }
+        }
+      }
+      //aux.save( "aux-3.tnl" );
+      
+      for( cell.getCoordinates().z() = 0; // sweep 4/8: z ascending, y descending, x descending
+              cell.getCoordinates().z() < mesh->getDimensions().z();
+              cell.getCoordinates().z()++ )
+      {
+        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                cell.getCoordinates().y() >= 0;
+                cell.getCoordinates().y()-- )
+        {
+          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                  cell.getCoordinates().x() >= 0 ;
+                  cell.getCoordinates().x()-- )		
+          {
+            //std::cerr << "4 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
+          }
+        }
+      }     
+      //aux.save( "aux-4.tnl" );
+      
+      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; // sweep 5/8: z descending, y ascending, x ascending
+              cell.getCoordinates().z() >= 0;
+              cell.getCoordinates().z()-- )
+      {
+        for( cell.getCoordinates().y() = 0;
+                cell.getCoordinates().y() < mesh->getDimensions().y();
+                cell.getCoordinates().y()++ )
+        {
+          for( cell.getCoordinates().x() = 0;
+                  cell.getCoordinates().x() < mesh->getDimensions().x();
+                  cell.getCoordinates().x()++ )
+          {
+            //std::cerr << "5 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )
+              this->updateCell( aux, cell );
+          }
+        }
+      }
+      //aux.save( "aux-5.tnl" );
+      
+      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; // sweep 6/8: z descending, y ascending, x descending
+              cell.getCoordinates().z() >= 0;
+              cell.getCoordinates().z()-- )
+      {
+        for( cell.getCoordinates().y() = 0;
+                cell.getCoordinates().y() < mesh->getDimensions().y();
+                cell.getCoordinates().y()++ )
+        {
+          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                  cell.getCoordinates().x() >= 0 ;
+                  cell.getCoordinates().x()-- )		
+          {
+            //std::cerr << "6 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
+          }
+        }
+      }
+      //aux.save( "aux-6.tnl" );
+      
+      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; // sweep 7/8: z descending, y descending, x ascending
+              cell.getCoordinates().z() >= 0;
+              cell.getCoordinates().z()-- )
+      {
+        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                cell.getCoordinates().y() >= 0 ;
+                cell.getCoordinates().y()-- )
+        {
+          for( cell.getCoordinates().x() = 0;
+                  cell.getCoordinates().x() < mesh->getDimensions().x();
+                  cell.getCoordinates().x()++ )
+          {
+            //std::cerr << "7 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
+          }
+        }
+      }
+      //aux.save( "aux-7.tnl" );
+      
+      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; // sweep 8/8: z descending, y descending, x descending
+              cell.getCoordinates().z() >= 0;
+              cell.getCoordinates().z()-- )
+      {
+        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                cell.getCoordinates().y() >= 0;
+                cell.getCoordinates().y()-- )
+        {
+          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                  cell.getCoordinates().x() >= 0 ;
+                  cell.getCoordinates().x()-- )		
+          {
+            //std::cerr << "8 -> ";
+            cell.refresh();
+            if( ! interfaceMap( cell ) )            
+              this->updateCell( aux, cell );
+          }
+        }
+      }
+    }
+    if( std::is_same< DeviceType, Devices::Cuda >::value )
+    {
+      // TODO: CUDA code
+#ifdef HAVE_CUDA
+      const int cudaBlockSize( 8 ); // edge length of a thread block: blockSize below is 8 x 8 x 8 threads
+      int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+      int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
+      int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); 
+      if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) // NOTE(review): only prints a message; the kernels below are still launched
+        std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
+      dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+      dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
+      
+      tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
+      
+      
+      int BlockIterD = 1; // host-side loop flag, refreshed from dBlock[0] at the end of every pass
+      
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
+      BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ );
+      BlockIterDevice.setValue( 1 ); // one flag per CUDA block, all set for the first pass
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
+      BlockIterPom.setSize( numBlocksX * numBlocksY * numBlocksZ );
+      BlockIterPom.setValue( 0 ); // output buffer of GetNeighbours3D, copied back into BlockIterDevice each pass
+      /*int *BlockIterDevice;
+       cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/
+      int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0); // ceil( total blocks / 512 ): grid size of the first reduction launch
       
-   }
-   aux.save("aux-final.tnl");
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
+      dBlock.setSize( nBlocks );
+      dBlock.setValue( 0 );
+      
+      int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0); // ceil( total blocks / 1024 ): grid size of the GetNeighbours3D launch
+      /*int *dBlock;
+       cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
+      MeshFunctionPointer helpFunc1( mesh );      
+      MeshFunctionPointer helpFunc( mesh );
+      
+      helpFunc1 = auxPtr; // swap auxPtr <-> helpFunc; the loop swaps them again on entry
+      auxPtr = helpFunc;
+      helpFunc = helpFunc1;
+      int numIter = 0;
+      
+      while( BlockIterD )
+      {
+        helpFunc1 = auxPtr; // swap auxPtr <-> helpFunc: the kernel below reads auxPtr and writes helpFunc
+        auxPtr = helpFunc;
+        helpFunc = helpFunc1;
+        TNL_CHECK_CUDA_DEVICE;
+        
+        CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
+                interfaceMapPtr.template getData< Device >(),
+                auxPtr.template getData< Device>(),
+                helpFunc.template modifyData< Device>(),
+                BlockIterDevice );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        
+        GetNeighbours3D<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY, numBlocksZ );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        BlockIterDevice = BlockIterPom; // the neighbour flags become the block flags of the next pass
+        
+        CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        
+        CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); // NOTE(review): the cudaMemcpy return code is not checked
+        numIter++;
+        /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
+         BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
+        
+      }
+      if( numIter == 1 ){
+        auxPtr = helpFunc;
+      }
+      //cudaFree( BlockIterDevice );
+      //cudaFree( dBlock );
+      cudaDeviceSynchronize();
+      TNL_CHECK_CUDA_DEVICE;
+      aux = *auxPtr;
+      interfaceMap = *interfaceMapPtr;
+#endif
+    }
+    
+    //aux.save( "aux-8.tnl" );
+    iteration++;
+    
+  }
+  aux.save("aux-final.tnl");
 }
 
 #ifdef HAVE_CUDA
-template < typename Real, typename Device, typename Index >
+/**
+ * Flags every CUDA block that has at least one flagged face-neighbour.
+ *
+ * Thread i handles block i of the numBlockX x numBlockY x numBlockZ block grid (x runs
+ * fastest, z slowest) and stores 1 in BlockIterPom[ i ] when a neighbour in -x/+x/-y/+y/-z/+z
+ * is non-zero in BlockIterDevice, 0 otherwise. Needs a 1D launch covering all blocks.
+ */
+template < typename Index >
+__global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,
+        int numBlockX, int numBlockY, int numBlockZ )
+{
+  // blockDim.x instead of a literal 1024: the indexing no longer depends on the launch size.
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  const int layer = numBlockX * numBlockY;   // number of blocks in one z-layer
+  if( i >= layer * numBlockZ )
+    return;
+  const int l = i / layer;                   // block z-index
+  const int k = ( i % layer ) / numBlockX;   // block y-index
+  const int m = ( i % layer ) % numBlockX;   // block x-index
+  
+  // Each bound is tested before the matching read, so nothing outside the block grid is accessed.
+  const bool neighbourFlagged =
+          ( m > 0             && BlockIterDevice[ i - 1 ] )         ||
+          ( m < numBlockX - 1 && BlockIterDevice[ i + 1 ] )         ||
+          ( k > 0             && BlockIterDevice[ i - numBlockX ] ) ||
+          ( k < numBlockY - 1 && BlockIterDevice[ i + numBlockX ] ) ||
+          ( l > 0             && BlockIterDevice[ i - layer ] )     ||
+          ( l < numBlockZ - 1 && BlockIterDevice[ i + layer ] );
+  BlockIterPom[ i ] = neighbourFlagged ? 1 : 0;
+}
+
+template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
+        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
 {
-    int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
-    int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
-    int i = threadIdx.x + blockDim.x*blockIdx.x;
-    int j = blockDim.y*blockIdx.y + threadIdx.y;
-    int k = blockDim.z*blockIdx.z + threadIdx.z;
-    int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
+  int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
+  int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
+  int i = threadIdx.x + blockDim.x*blockIdx.x;
+  int j = blockDim.y*blockIdx.y + threadIdx.y;
+  int k = blockDim.z*blockIdx.z + threadIdx.z;
+  int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
+  
+  if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] )
+  {
+    __syncthreads();
     
-    __shared__ volatile bool changed[8*8*8];
-    changed[ currentIndex ] = false;
+    __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)];
     
+    changed[ currentIndex ] = false;
     if( thrj == 0 && thri == 0 && thrk == 0 )
-        changed[ 0 ] = true;
+      changed[ 0 ] = true;
     
     const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
-    __shared__ Real hx;
-    __shared__ Real hy;
-    __shared__ Real hz;
+    __shared__ Real hx; __shared__ int dimX;
+    __shared__ Real hy; __shared__ int dimY;
+    __shared__ Real hz; __shared__ int dimZ;
+    
     if( thrj == 1 && thri == 1 && thrk == 1 )
     {
-        hx = mesh.getSpaceSteps().x();
-        hy = mesh.getSpaceSteps().y();
-        hz = mesh.getSpaceSteps().z();
+      hx = mesh.getSpaceSteps().x();
+      hy = mesh.getSpaceSteps().y();
+      hz = mesh.getSpaceSteps().z();
+      dimX = mesh.getDimensions().x();
+      dimY = mesh.getDimensions().y();
+      dimZ = mesh.getDimensions().z();
+      BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0;
     }
-    __shared__ volatile Real sArray[10][10][10];
-    sArray[thrk][thrj][thri] = std::numeric_limits< Real >::max();
-    if(thri == 0 )
-    {
-        sArray[8][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
-        sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
-        sArray[thrk+1][thrj+1][8] = std::numeric_limits< Real >::max();
-        sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max();
-        sArray[thrj+1][8][thrk+1] = std::numeric_limits< Real >::max();
-        sArray[thrj+1][9][thrk+1] = std::numeric_limits< Real >::max();
-    }
-            
+    __shared__ volatile Real sArray[sizeSArray][sizeSArray][sizeSArray];
+    sArray[thrk+1][thrj+1][thri+1] = std::numeric_limits< Real >::max();
+    
     //filling sArray edges
-    int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
-    int dimZ = mesh.getDimensions().z();
-    __shared__ volatile int numOfBlockx;
-    __shared__ volatile int numOfBlocky;
-    __shared__ volatile int numOfBlockz;
-    __shared__ int xkolik;
-    __shared__ int ykolik;
-    __shared__ int zkolik;
-    if( thri == 0 && thrj == 0 && thrk == 0 )
-    {
-        xkolik = blockDim.x + 1;
-        ykolik = blockDim.y + 1;
-        zkolik = blockDim.z + 1;
-        numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
-        numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
-        numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0);
-        
-        if( numOfBlockx - 1 == blIdx )
-            xkolik = dimX - (blIdx)*blockDim.x+1;
-
-        if( numOfBlocky -1 == blIdy )
-            ykolik = dimY - (blIdy)*blockDim.y+1;
-        if( numOfBlockz-1 == blIdz )
-            zkolik = dimZ - (blIdz)*blockDim.z+1;
-        
-        BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0;
-    }
+    int numOfBlockx;
+    int numOfBlocky;
+    int numOfBlockz;
+    int xkolik;
+    int ykolik;
+    int zkolik;
+    xkolik = blockDim.x + 1;
+    ykolik = blockDim.y + 1;
+    zkolik = blockDim.z + 1;
+    numOfBlockx = gridDim.x;
+    numOfBlocky = gridDim.y;
+    numOfBlockz = gridDim.z;
+    
+    if( numOfBlockx - 1 == blIdx )
+      xkolik = dimX - (blIdx)*blockDim.x+1;
+    if( numOfBlocky -1 == blIdy )
+      ykolik = dimY - (blIdy)*blockDim.y+1;
+    if( numOfBlockz-1 == blIdz )
+      zkolik = dimZ - (blIdz)*blockDim.z+1;
     __syncthreads();
     
     if( thri == 0 )
     {        
-        if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
-            sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
-        else
-            sArray[thrk+1][thrj+1][0] = std::numeric_limits< Real >::max();
+      if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
+        sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
+      else
+        sArray[thrk+1][thrj+1][0] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 1 )
     {
-        if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
-            sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
-        else
-            sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max();
+      if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
+        sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
+      else
+        sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max();
     }
     if( thri == 2 )
     {        
-        if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
-            sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
-        else
-            sArray[thrk+1][0][thrj+1] = std::numeric_limits< Real >::max();
+      if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
+        sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
+      else
+        sArray[thrk+1][0][thrj+1] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 3 )
     {
-        if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
-            sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
-        else
-            sArray[thrk+1][9][thrj+1] = std::numeric_limits< Real >::max();
+      if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
+        sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
+      else
+        sArray[thrk+1][9][thrj+1] = std::numeric_limits< Real >::max();
     }
     if( thri == 4 )
     {        
-        if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
-            sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
-        else
-            sArray[0][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
+      if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
+        sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
+      else
+        sArray[0][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 5 )
     {
-        if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
-            sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
-        else
-            sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
+      if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
+        sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
+      else
+        sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
     }
     
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
+    if( i < dimX && j < dimY && k < dimZ )
     {
-        sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
+      sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
     }
-    __shared__ volatile int loopcounter;
-    loopcounter = 0;
     __syncthreads(); 
     while( changed[ 0 ] )
     {
-        __syncthreads();
-        
-        changed[ currentIndex ] = false;
-        
-    //calculation of update cell
-        if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ )
-        {
-            if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] )
-            {
-                changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
-            }
-        }
-        __syncthreads();
-        
-    //pyramid reduction
-        if( blockDim.x*blockDim.y*blockDim.z == 1024 )
-        {
-            if( currentIndex < 512 )
-            {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
-            }
-        }
-        __syncthreads();
-        if( blockDim.x*blockDim.y*blockDim.z >= 512 )
+      __syncthreads();
+      
+      changed[ currentIndex ] = false;
+      
+      //calculation of update cell
+      if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ )
+      {
+        if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] )
         {
-            if( currentIndex < 256 )
-            {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
-            }
+          changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
         }
-        __syncthreads();
-        if( blockDim.x*blockDim.y*blockDim.z >= 256 )
+      }
+      __syncthreads();
+      
+      //pyramid reduction
+      if( blockDim.x*blockDim.y*blockDim.z == 1024 )
+      {
+        if( currentIndex < 512 )
         {
-            if( currentIndex < 128 )
-            {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
-            }
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
         }
-        __syncthreads();
-        if( blockDim.x*blockDim.y*blockDim.z >= 128 )
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 512 )
+      {
+        if( currentIndex < 256 )
         {
-            if( currentIndex < 64 )
-            {
-                changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
-            }
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
         }
-        __syncthreads();
-        if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 256 )
+      {
+        if( currentIndex < 128 )
         {
-            if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
-            if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
-            if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
-            if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
-            if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
-            if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
         }
-        __syncthreads();
-        
-        /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0)
-        {
-            for(int m = 0; m < 8; m++){
-                for(int n = 0; n<8; n++){
-                    for(int b=0; b<8; b++)
-                        printf(" %i ", changed[m*64 + n*8 + b]);
-                    printf("\n");
-                }
-                printf("\n \n");
-            }
-        }*/
-        if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 128 )
+      {
+        if( currentIndex < 64 )
         {
-            //loopcounter++;
-            BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1;
+          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
         }
-        __syncthreads();
-        /*if(thri == 0 && thrj==0 && thrk==0)
-            printf("%i \n",loopcounter);
-        if(loopcounter == 500)
-            break;*/
+      }
+      __syncthreads();
+      if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
+      {
+        if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+        if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+        if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+        if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+        if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+        if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+      }
+      __syncthreads();
+      
+      /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0)
+       {
+       for(int m = 0; m < 8; m++){
+       for(int n = 0; n<8; n++){
+       for(int b=0; b<8; b++)
+       printf(" %i ", changed[m*64 + n*8 + b]);
+       printf("\n");
+       }
+       printf("\n \n");
+       }
+       }*/
+      if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
+      {
+        BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1;
+      }
+      __syncthreads();
     }
-  
-    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ k*dimX*dimY+j * mesh.getDimensions().x() + i ]) )
-        aux[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
-}   
+    
+    if( i < dimX && j < dimY && k < dimZ )
+      helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
+  } 
+}  
 #endif
-- 
GitLab


From 4cefa039f6ffa3b5e30c4249cb3e1f60ba860c3b Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 15 Nov 2018 13:24:51 +0100
Subject: [PATCH 17/20] Enabled computations with single precision.

---
 .../Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h
index f8f9187fa..a2a1d7372 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/MainBuildConfig.h
@@ -23,7 +23,7 @@ namespace Solvers {
 /****
  * Turn off support for float and long double.
  */
-template<> struct ConfigTagReal< HamiltonJacobiBuildConfig, float > { enum { enabled = false }; };
+template<> struct ConfigTagReal< HamiltonJacobiBuildConfig, float > { enum { enabled = true }; };
 template<> struct ConfigTagReal< HamiltonJacobiBuildConfig, long double > { enum { enabled = false }; };
 
 /****
-- 
GitLab


From 852534a88eab1e78b21d3fd41665dc7ca556c878 Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Fri, 16 Nov 2018 12:03:40 +0100
Subject: [PATCH 18/20] 3D FSM+FIM implemented; 2D FSM+FIM method picks size of
 rectangular block depending on number of blocks

---
 .../tnlDirectEikonalMethodsBase.h             | 214 ++++----
 .../tnlDirectEikonalMethodsBase_impl.h        | 519 +++++++++++++++---
 .../hamilton-jacobi/tnlFastSweepingMethod.h   | 222 ++++----
 .../tnlFastSweepingMethod2D_impl.h            |  74 ++-
 .../tnlFastSweepingMethod3D_impl.h            | 455 +++++++++------
 5 files changed, 1004 insertions(+), 480 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index 7d990c1bb..f712ce2cc 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -19,102 +19,112 @@ class tnlDirectEikonalMethodsBase
 };
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 class tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >
 {
-   public:
-      
-      typedef Meshes::Grid< 1, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DevcieType;
-      typedef Index IndexType;
-      typedef Functions::MeshFunction< MeshType > MeshFunctionType;
-      typedef Functions::MeshFunction< MeshType, 1, bool > InterfaceMapType;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
-      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
-      
-      void initInterface( const MeshFunctionPointer& input,
-                          MeshFunctionPointer& output,
-                          InterfaceMapPointer& interfaceMap );
-      
-      template< typename MeshEntity >
-      __cuda_callable__ void updateCell( MeshFunctionType& u,
-                                         const MeshEntity& cell,
-                                         const RealType velocity = 1.0  );
-      
-      __cuda_callable__ bool updateCell( volatile Real sArray[18],
-                                         int thri, const Real h,
-                                         const Real velocity = 1.0 );
+  public:
+    
+    typedef Meshes::Grid< 1, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DevcieType;
+    typedef Index IndexType;
+    typedef Functions::MeshFunction< MeshType > MeshFunctionType;
+    typedef Functions::MeshFunction< MeshType, 1, bool > InterfaceMapType;
+    using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
+    using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
+    
+    void initInterface( const MeshFunctionPointer& input,
+            MeshFunctionPointer& output,
+            InterfaceMapPointer& interfaceMap );
+    
+    template< typename MeshEntity >
+    __cuda_callable__ void updateCell( MeshFunctionType& u,
+            const MeshEntity& cell,
+            const RealType velocity = 1.0  );
+    
+    __cuda_callable__ bool updateCell( volatile Real sArray[18],
+            int thri, const Real h,
+            const Real velocity = 1.0 );
 };
 
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
 {
-   public:
-      typedef Meshes::Grid< 2, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DevcieType;
-      typedef Index IndexType;
-      typedef Functions::MeshFunction< MeshType > MeshFunctionType;
-      typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
-      typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
-      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
-
-      void initInterface( const MeshFunctionPointer& input,
-                          MeshFunctionPointer& output,
-                          InterfaceMapPointer& interfaceMap );
-      
-      template< typename MeshEntity >
-      __cuda_callable__ void updateCell( MeshFunctionType& u,
-                                         const MeshEntity& cell,
-                                         const RealType velocity = 1.0 );
-      
-      template< int sizeSArray >
-      __cuda_callable__ bool updateCell( volatile Real *sArray,
-                                         int thri, int thrj, const Real hx, const Real hy,
-                                         const Real velocity = 1.0 );
-      
-      template< int sizeSArray >
-      void updateBlocks( InterfaceMapType interfaceMap,
-                         MeshFunctionType aux,
-                         MeshFunctionType helpFunc,
-                         ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
-      
-      void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
+  public:
+    typedef Meshes::Grid< 2, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DevcieType;
+    typedef Index IndexType;
+    typedef Functions::MeshFunction< MeshType > MeshFunctionType;
+    typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
+    typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+    using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
+    using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;
+    
+    void initInterface( const MeshFunctionPointer& input,
+            MeshFunctionPointer& output,
+            InterfaceMapPointer& interfaceMap );
+    
+    template< typename MeshEntity >
+    __cuda_callable__ void updateCell( MeshFunctionType& u,
+            const MeshEntity& cell,
+            const RealType velocity = 1.0 );
+    
+    template< int sizeSArray >
+    __cuda_callable__ bool updateCell( volatile Real *sArray,
+            int thri, int thrj, const Real hx, const Real hy,
+            const Real velocity = 1.0 );
+    
+    template< int sizeSArray >
+    void updateBlocks( InterfaceMapType interfaceMap,
+            MeshFunctionType aux,
+            MeshFunctionType helpFunc,
+            ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
+    
+    void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
 };
 
 template< typename Real,
-          typename Device,
-          typename Index >
+        typename Device,
+        typename Index >
 class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
 {
-   public:
-      typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DevcieType;
-      typedef Index IndexType;
-      typedef Functions::MeshFunction< MeshType > MeshFunctionType;
-      typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType;
-      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
-      using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
-
-      void initInterface( const MeshFunctionPointer& input,
-                          MeshFunctionPointer& output,
-                          InterfaceMapPointer& interfaceMap );
-      
-      template< typename MeshEntity >
-      __cuda_callable__ void updateCell( MeshFunctionType& u,
-                                         const MeshEntity& cell,
-                                         const RealType velocity = 1.0);
-      
-      __cuda_callable__ bool updateCell( volatile Real sArray[10][10][10],
-                                         int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz,
-                                         const Real velocity = 1.0 );
+  public:
+    typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DevcieType;
+    typedef Index IndexType;
+    typedef Functions::MeshFunction< MeshType > MeshFunctionType;
+    typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType;
+    typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+    using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
+    using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
+    
+    void initInterface( const MeshFunctionPointer& input,
+            MeshFunctionPointer& output,
+            InterfaceMapPointer& interfaceMap );
+    
+    template< typename MeshEntity >
+    __cuda_callable__ void updateCell( MeshFunctionType& u,
+            const MeshEntity& cell,
+            const RealType velocity = 1.0);
+    
+    template< int sizeSArray >
+    void updateBlocks( const InterfaceMapType interfaceMap,
+            const MeshFunctionType aux,
+            MeshFunctionType& helpFunc,
+            ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
+    
+    void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
+    
+    template< int sizeSArray >
+    __cuda_callable__ bool updateCell3D( volatile Real *sArray,
+            int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz,
+            const Real velocity = 1.0 );
 };
 
 template < typename T1, typename T2 >
@@ -126,46 +136,46 @@ __cuda_callable__ void sortMinims( T1 pom[] );
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, 
-                                Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
-                                Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap  );
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap  );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux,
-                                      bool *BlockIterDevice );
+        const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap,
+        Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux,
+        bool *BlockIterDevice );
 
 template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock =0);
+        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock =0);
 
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                                   TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks );
+        TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks );
 
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                               TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, 
-                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
-                                Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap );
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
-                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
-                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );
 
 template < int sizeSArray, typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
-                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
+        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
+        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
 
 template < typename Index >
 __global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 5083544e2..8f7937541 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -148,6 +148,7 @@ updateBlocks( InterfaceMapType interfaceMap,
       }
       
       
+      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
       for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
       {        
         if( dimX > (blIdx+1) * numThreadsPerBlock  && thrj+1 < ykolik )
@@ -263,6 +264,370 @@ updateBlocks( InterfaceMapType interfaceMap,
     }
   }
 }
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+void
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateBlocks( const InterfaceMapType interfaceMap,
+        const MeshFunctionType aux,
+        MeshFunctionType& helpFunc,
+        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
+{  // For every block flagged in BlockIterHost: copy it (plus a one-cell halo) from aux into a scratch cube, sweep it in eight orderings, write the result to helpFunc.
+//#pragma omp parallel for schedule( dynamic )
+  for( int i = 0; i < BlockIterHost.getSize(); i++ )
+  {
+    if( BlockIterHost[ i ] )
+    {
+      MeshType mesh = interfaceMap.template getMesh< Devices::Host >();
+      // Grid dimensions and the number of blocks needed to cover each axis (rounded up).
+      int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+      int dimZ = mesh.getDimensions().z();
+      //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl;
+      int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
+      int numOfBlockz = dimZ/numThreadsPerBlock + ((dimZ%numThreadsPerBlock != 0) ? 1:0);
+      //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl;
+      int xkolik = numThreadsPerBlock + 1;
+      int ykolik = numThreadsPerBlock + 1;
+      int zkolik = numThreadsPerBlock + 1;
+      // xkolik/ykolik/zkolik = number of block cells along the axis plus one; shortened below for the last block of an axis.
+      // Split the linear block index i into block coordinates (x runs fastest, z slowest).
+      int blIdz = i/( numOfBlockx * numOfBlocky );
+      int blIdy = (i-blIdz*numOfBlockx * numOfBlocky )/(numOfBlockx );
+      int blIdx = (i-blIdz*numOfBlockx * numOfBlocky )%( numOfBlockx );
+      //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl;
+      
+      if( numOfBlockx - 1 == blIdx )
+        xkolik = dimX - (blIdx)*numThreadsPerBlock+1;
+      if( numOfBlocky -1 == blIdy )
+        ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
+      if( numOfBlockz-1 == blIdz )
+        zkolik = dimZ - (blIdz)*numThreadsPerBlock+1;
+      //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl;
+      
+      
+      /*bool changed[numThreadsPerBlock*numThreadsPerBlock];
+       changed[ 0 ] = 1;*/
+      Real hx = mesh.getSpaceSteps().x();
+      Real hy = mesh.getSpaceSteps().y();
+      Real hz = mesh.getSpaceSteps().z();
+      
+      bool changed = false;
+      BlockIterHost[ i ] = 0;
+      // The flag was just cleared; it is set again at the end when the first sweep reported a change.
+      // NOTE(review): the indexing below needs sizeSArray >= numThreadsPerBlock + 2 - confirm at the call sites.
+      Real *sArray;
+      sArray = new Real[ sizeSArray * sizeSArray * sizeSArray ];
+      if( sArray == nullptr )
+        std::cout << "Error while allocating memory for sArray." << std::endl;
+      // Fill the whole scratch cube with the largest representable Real.
+      for( int k = 0; k < sizeSArray; k++ )
+        for( int l = 0; l < sizeSArray; l++ )
+          for( int m = 0; m < sizeSArray; m++ ){
+            sArray[ m * sizeSArray * sizeSArray + k * sizeSArray + l ] = std::numeric_limits< Real >::max();
+          }
+      // Halo: copy the aux cells adjacent to each of the six block faces; faces on the domain boundary are skipped.
+      // -x face -> x index 0 (thrj runs over y, thrk over z)
+      for( int thrk = 0; thrk < numThreadsPerBlock; thrk++ )
+        for( int thrj = 0; thrj < numThreadsPerBlock; thrj++ )
+        {
+          if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
+            sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX -1 + thrk*dimX*dimY ];
+          // +x face -> x index xkolik
+          if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy *numThreadsPerBlock*dimX+ blIdx*numThreadsPerBlock + numThreadsPerBlock + thrj * dimX + thrk*dimX*dimY ];
+          // -y face -> y index 0 (here thrj runs over x, thrk over z)
+          if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX + thrj + thrk*dimX*dimY ];
+          // +y face -> y index ykolik
+          if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik && thrk+1 < zkolik )
+            sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + (blIdy+1) * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj + thrk*dimX*dimY ];
+          // -z face -> z index 0 (here thrj runs over y, thrk over x)
+          if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
+            sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = 
+                    aux[ blIdz*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock - dimX * dimY + thrj * dimX + thrk ];
+          // +z face -> z index zkolik
+          if( dimZ > (blIdz+1) * numThreadsPerBlock && thrj+1 < ykolik && thrk+1 < xkolik )
+            sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = 
+                    aux[ (blIdz+1)*numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock*dimX + blIdx*numThreadsPerBlock + thrj * dimX + thrk ];
+        }
+      // Interior: block cell (l, k, m) is copied to scratch position (l+1, k+1, m+1); cells outside the grid are skipped.
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){
+          for( int l = 0; l < numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+              sArray[(m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1] = 
+                      aux[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ];
+          }
+        }
+      }
+      /*string s;
+      int numWhile = 0;
+      for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      // Sweep 1: m, k and l all ascending. This is the only sweep whose return values feed 'changed'.
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){ 
+          for( int l = 0; l < numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){
+              //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              { // cells flagged in interfaceMap are never recomputed
+                //printf("In with point m  = %d, k = %d, l = %d\n", m, k, l);
+                changed = this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz) || changed;
+                
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      // Sweep 2: m descending; k and l ascending.
+      for( int m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){
+          for( int l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      // Sweep 3: l descending; m and k ascending.
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){
+          for( int l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );
+      */ // Sweep 4: m and l descending; k ascending.
+      for( int m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( int k = 0; k < numThreadsPerBlock; k++ ){
+          for( int l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >(  sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      // Sweep 5: k descending; m and l ascending.
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( int l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      // Sweep 6: m and k descending; l ascending.
+      for( int m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( int k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( int l = 0; l <numThreadsPerBlock; l++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      // Sweep 7: k and l descending; m ascending.
+      for( int m = 0; m < numThreadsPerBlock; m++ ){
+        for( int k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( int l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      
+      // Sweep 8: m, k and l all descending.
+      for( int m = numThreadsPerBlock-1; m >-1; m-- ){
+        for( int k = numThreadsPerBlock-1; k > -1; k-- ){
+          for( int l = numThreadsPerBlock-1; l >-1; l-- ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )
+            {
+              if( ! interfaceMap[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l  ] )
+              {
+                this->template updateCell3D< sizeSArray >( sArray, l+1, k+1, m+1, hx,hy,hz);
+              }
+            }
+          }
+        }
+      }
+      /*for( int k = 0; k < numThreadsPerBlock; k++ ){
+        for( int l = 0; l < numThreadsPerBlock; l++ ) 
+          for( int m = 0; m < numThreadsPerBlock; m++ )
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ )     
+              helpFunc[ m*dimX*dimY + k*dimX + l ] = sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+      } 
+      numWhile++;
+      s = "helpFunc-"+ std::to_string(numWhile) + ".tnl";
+      helpFunc.save( s );*/
+      // Flag the block again when the first sweep reported a change.
+      if( changed ){
+        BlockIterHost[ i ] = 1;
+      }
+      
+      // Write the interior of the scratch cube to this block's cells in helpFunc.
+      for( int k = 0; k < numThreadsPerBlock; k++ ){ 
+        for( int l = 0; l < numThreadsPerBlock; l++ ) {
+          for( int m = 0; m < numThreadsPerBlock; m++ ){
+            if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX && blIdz * numThreadsPerBlock + m < dimZ ){      
+              helpFunc[ blIdz * numThreadsPerBlock * dimX * dimY + blIdy * numThreadsPerBlock * dimX + blIdx*numThreadsPerBlock + m*dimX*dimY + k*dimX + l ] = 
+                      sArray[ (m+1) * sizeSArray * sizeSArray + (k+1) *sizeSArray + l+1 ];
+              //std::cout << helpFunc[ m*dimX*dimY + k*dimX + l ] << " ";
+            }
+          }
+          //std::cout << std::endl;
+        }
+        //std::cout << std::endl;
+      }
+      //helpFunc.save( "helpF.tnl");
+      delete []sArray;
+    }
+  }
+}
+// Recomputes the block flags of the 3D block grid: after the call a block is
+// flagged (1) exactly when at least one of its six face neighbours was flagged
+// before the call, and cleared (0) otherwise.
+// Blocks are numbered i = x + numBlockX * ( y + numBlockY * z ).
+template< typename Real, typename Device, typename Index >
+void 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
+{
+  const int numBlocks = BlockIterHost.getSize();
+  const int slab = numBlockX * numBlockY;  // number of blocks in one z-layer
+  // Scratch flags are sized by the container, so the loops below cannot overrun them.
+  int* BlockIterPom = new int[ numBlocks ];
+  
+  for( int i = 0; i < numBlocks; i++ )
+  {
+    const int l = i / slab;                      // block z index
+    const int k = ( i - l * slab ) / numBlockX;  // block y index
+    const int m = ( i - l * slab ) % numBlockX;  // block x index
+    
+    // A neighbour is read only when it lies inside the block grid.
+    const bool activeNeighbour =
+            ( m > 0 && BlockIterHost[ i - 1 ] ) ||
+            ( m < numBlockX - 1 && BlockIterHost[ i + 1 ] ) ||
+            ( k > 0 && BlockIterHost[ i - numBlockX ] ) ||
+            ( k < numBlockY - 1 && BlockIterHost[ i + numBlockX ] ) ||
+            ( l > 0 && BlockIterHost[ i - slab ] ) ||
+            ( l < numBlockZ - 1 && BlockIterHost[ i + slab ] );
+    BlockIterPom[ i ] = activeNeighbour ? 1 : 0;
+  }
+  
+  // Copy back only after every block has been evaluated against the old flags.
+  for( int i = 0; i < numBlocks; i++ )
+    BlockIterHost[ i ] = BlockIterPom[ i ];
+  
+  // Release the scratch buffer; it was previously leaked on every call.
+  delete[] BlockIterPom;
+}
+
 
 template< typename Real,
         typename Device,
@@ -619,8 +984,8 @@ initInterface( const MeshFunctionPointer& _input,
         {
           cell.refresh();
           output[ cell.getIndex() ] =
-                  input( cell ) > 0 ? std::numeric_limits< RealType >::max() :
-                    - std::numeric_limits< RealType >::max();
+                  input( cell ) > 0 ? 10://std::numeric_limits< RealType >::max() :
+                    -10;//- std::numeric_limits< RealType >::max();
           interfaceMap[ cell.getIndex() ] = false;
         }
     
@@ -967,6 +1332,82 @@ updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real
   
   return false;
 }
+// Recomputes one cell of the flattened scratch cube sArray from its six face
+// neighbours.
+//
+// sizeSArray       - edge length of the cube (template parameter)
+// thri, thrj, thrk - position of the cell along x, y and z inside sArray
+// hx, hy, hz       - grid spacings along x, y and z
+// v                - divides the spacing terms under the square roots below
+//
+// All six neighbours are read unconditionally, so the cell must not lie on
+// the outer layer of sArray.
+// The cell receives argAbsMin( old value, candidate ). Returns true when that
+// changed the stored value by more than 0.001 * hx, false otherwise.
+template< typename Real,
+        typename Device,
+        typename Index >
+template< int sizeSArray >
+__cuda_callable__ 
+bool 
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateCell3D( volatile Real *sArray, int thri, int thrj, int thrk,
+        const Real hx, const Real hy, const Real hz, const Real v )
+{
+  // Strides of the flattened cube and the offset of the cell being updated.
+  const int rowStride = sizeSArray;
+  const int layerStride = sizeSArray * sizeSArray;
+  const int center = thrk * layerStride + thrj * rowStride + thri;
+  
+  const RealType value = sArray[ center ];
+  
+  RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
+  
+  // One representative per axis, picked by argAbsMin from the two neighbours.
+  c = TNL::argAbsMin( sArray[ center + layerStride ],
+          sArray[ center - layerStride ] );
+  
+  b = TNL::argAbsMin( sArray[ center + rowStride ],
+          sArray[ center - rowStride ] );
+  
+  a = TNL::argAbsMin( sArray[ center + 1 ],
+          sArray[ center - 1 ] );
+  
+  // Leave the cell alone while all three representatives have magnitude 10.
+  // NOTE(review): 10 appears to be the initial magnitude written elsewhere in
+  // this file in place of numeric_limits::max() - confirm before changing it.
+  const bool allAtTen = fabs( a ) == 10 && fabs( b ) == 10 && fabs( c ) == 10;
+  if( allAtTen )
+    return false;
+  
+  RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
+  
+  sortMinims( pom );
+  
+  // First candidate: uses pom[ 0 ] and pom[ 3 ] only.
+  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
+  if( ! ( fabs( tmp ) < fabs( pom[ 1 ] ) ) )
+  {
+    // Not accepted: candidate from pom[ 0 ], pom[ 1 ] with pom[ 3 ], pom[ 4 ].
+    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
+            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
+            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+    if( ! ( fabs( tmp ) < fabs( pom[ 2 ] ) ) )
+    {
+      // Still not accepted: candidate from a, b, c with all three spacings.
+      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
+              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
+              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
+    }
+  }
+  
+  // Common tail of all three cases: store, then measure the change.
+  sArray[ center ] = argAbsMin( value, tmp );
+  tmp = value - sArray[ center ];
+  
+  return fabs( tmp ) > 0.001*hx;
+}
 
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
@@ -1215,78 +1656,4 @@ updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
   else
     return false;
 }
-
-template< typename Real,
-        typename Device,
-        typename Index >
-__cuda_callable__ 
-bool 
-tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
-        const Real hx, const Real hy, const Real hz, const Real v )
-{
-  const RealType value = sArray[thrk][thrj][thri];
-  //std::cout << value << std::endl;
-  RealType a, b, c, tmp = std::numeric_limits< RealType >::max();
-  
-  c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ],
-          sArray[ thrk-1 ][ thrj ][ thri ] );
-  
-  b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ],
-          sArray[ thrk ][ thrj-1 ][ thri ] );
-  
-  a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ],
-          sArray[ thrk ][ thrj ][ thri-1 ] );
-  
-  
-  if( fabs( a ) == std::numeric_limits< RealType >::max() && 
-          fabs( b ) == std::numeric_limits< RealType >::max() &&
-          fabs( c ) == std::numeric_limits< RealType >::max() )
-    return false;
-  
-  RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz};
-  
-  sortMinims( pom );
-  
-  tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
-  if( fabs( tmp ) < fabs( pom[ 1 ] ) ) 
-  {
-    sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-    tmp = value - sArray[ thrk ][ thrj ][ thri ];
-    if ( fabs( tmp ) >  0.001*hx )
-      return true;
-    else
-      return false;
-  }
-  else
-  {
-    tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + 
-            TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] +  pom[ 4 ] *  pom[ 4 ] )/( v * v ) - 
-            ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
-    if( fabs( tmp ) < fabs( pom[ 2 ]) ) 
-    {
-      sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-      tmp = value - sArray[ thrk ][ thrj ][ thri ];
-      if ( fabs( tmp ) > 0.001*hx )
-        return true;
-      else
-        return false;
-    }
-    else
-    {
-      tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
-              TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - 
-              hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
-              hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx );
-      sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
-      tmp = value - sArray[ thrk ][ thrj ][ thri ];
-      if ( fabs( tmp ) > 0.001*hx )
-        return true;
-      else
-        return false;
-    }
-  }
-  
-  return false;
-}
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
index 60c690e06..57b1886e8 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
@@ -1,9 +1,9 @@
 /***************************************************************************
-                          FastSweepingMethod.h  -  description
-                             -------------------
-    begin                : Jul 14, 2016
-    copyright            : (C) 2017 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
+ FastSweepingMethod.h  -  description
+ -------------------
+ begin                : Jul 14, 2016
+ copyright            : (C) 2017 by Tomas Oberhuber
+ email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
 /* See Copyright Notice in tnl/Copyright */
@@ -17,132 +17,134 @@
 
 
 template< typename Mesh,
-          typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > >
+        typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > >
 class FastSweepingMethod
 {   
 };
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 class FastSweepingMethod< Meshes::Grid< 1, Real, Device, Index >, Anisotropy >
-   : public tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >
+: public tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >
 {
-   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
-   
-   public:
-      
-      typedef Meshes::Grid< 1, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DeviceType;
-      typedef Index IndexType;
-      typedef Anisotropy AnisotropyType;
-      typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > BaseType;
-      using MeshPointer = Pointers::SharedPointer<  MeshType >;
-      using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
-      
-      
-      using typename BaseType::InterfaceMapType;
-      using typename BaseType::MeshFunctionType;
-      using typename BaseType::InterfaceMapPointer;
-      using typename BaseType::MeshFunctionPointer;
-      
-      
-      FastSweepingMethod();
-      
-      const IndexType& getMaxIterations() const;
-      
-      void setMaxIterations( const IndexType& maxIterations );
-      
-      void solve( const MeshPointer& mesh,
-                  const AnisotropyPointer& anisotropy,
-                  MeshFunctionPointer& u );
-      
-      
-   protected:
+  //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
+  
+  public:
+    
+    typedef Meshes::Grid< 1, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DeviceType;
+    typedef Index IndexType;
+    typedef Anisotropy AnisotropyType;
+    typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > > BaseType;
+    using MeshPointer = Pointers::SharedPointer<  MeshType >;
+    using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
+    
+    
+    using typename BaseType::InterfaceMapType;
+    using typename BaseType::MeshFunctionType;
+    using typename BaseType::InterfaceMapPointer;
+    using typename BaseType::MeshFunctionPointer;
+    
+    
+    FastSweepingMethod();
+    
+    const IndexType& getMaxIterations() const;
+    
+    void setMaxIterations( const IndexType& maxIterations );
+    
+    void solve( const MeshPointer& mesh,
+            const AnisotropyPointer& anisotropy,
+            MeshFunctionPointer& u );
+    
+    
+    protected:
       
       const IndexType maxIterations;
 };
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
-   : public tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
+: public tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
 {
-   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
-   
-   public:
-      
-      typedef Meshes::Grid< 2, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DeviceType;
-      typedef Index IndexType;
-      typedef Anisotropy AnisotropyType;
-      typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType;
-      using MeshPointer = Pointers::SharedPointer<  MeshType >;
-      using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
-
-      using typename BaseType::InterfaceMapType;
-      using typename BaseType::MeshFunctionType;
-      using typename BaseType::InterfaceMapPointer;
-      using typename BaseType::MeshFunctionPointer;
-      using typename BaseType::ArrayContainer;
-
-      FastSweepingMethod();
-      
-      const IndexType& getMaxIterations() const;
-      
-      void setMaxIterations( const IndexType& maxIterations );
-      
-      void solve( const MeshPointer& mesh,
-                  const AnisotropyPointer& anisotropy,
-                  MeshFunctionPointer& u );
-      
-   protected:
+  //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
+  
+  public:
+    
+    typedef Meshes::Grid< 2, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DeviceType;
+    typedef Index IndexType;
+    typedef Anisotropy AnisotropyType;
+    typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > BaseType;
+    using MeshPointer = Pointers::SharedPointer<  MeshType >;
+    using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
+    
+    using typename BaseType::InterfaceMapType;
+    using typename BaseType::MeshFunctionType;
+    using typename BaseType::InterfaceMapPointer;
+    using typename BaseType::MeshFunctionPointer;
+    using typename BaseType::ArrayContainer;
+    
+    FastSweepingMethod();
+    
+    const IndexType& getMaxIterations() const;
+    
+    void setMaxIterations( const IndexType& maxIterations );
+    
+    void solve( const MeshPointer& mesh,
+            const AnisotropyPointer& anisotropy,
+            MeshFunctionPointer& u );
+    
+    protected:
       
       const IndexType maxIterations;
 };
 
 template< typename Real,
-          typename Device,
-          typename Index,
-          typename Anisotropy >
+        typename Device,
+        typename Index,
+        typename Anisotropy >
 class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
-   : public tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
+: public tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
 {
-   //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
-   
-   public:
-      
-      typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
-      typedef Real RealType;
-      typedef Device DeviceType;
-      typedef Index IndexType;
-      typedef Anisotropy AnisotropyType;
-      typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > BaseType;
-      using MeshPointer = Pointers::SharedPointer<  MeshType >;
-      using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
-      
-      using typename BaseType::InterfaceMapType;
-      using typename BaseType::MeshFunctionType;
-      using typename BaseType::InterfaceMapPointer;
-      using typename BaseType::MeshFunctionPointer;      
-      
-      FastSweepingMethod();
-      
-      const IndexType& getMaxIterations() const;
-      
-      void setMaxIterations( const IndexType& maxIterations );
-      
-      void solve( const MeshPointer& mesh,
-                  const AnisotropyPointer& anisotropy,
-                  MeshFunctionPointer& u );
-      
-      
-   protected:
+  //static_assert(  std::is_same< Device, TNL::Devices::Host >::value, "The fast sweeping method works only on CPU." );
+  
+  public:
+    
+    typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
+    typedef Real RealType;
+    typedef Device DeviceType;
+    typedef Index IndexType;
+    typedef Anisotropy AnisotropyType;
+    typedef tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > BaseType;
+    using MeshPointer = Pointers::SharedPointer<  MeshType >;
+    using AnisotropyPointer = Pointers::SharedPointer< AnisotropyType, DeviceType >;
+    
+    using typename BaseType::InterfaceMapType;
+    using typename BaseType::MeshFunctionType;
+    using typename BaseType::InterfaceMapPointer;
+    using typename BaseType::MeshFunctionPointer;   
+    using typename BaseType::ArrayContainer;
+    
+    
+    FastSweepingMethod();
+    
+    const IndexType& getMaxIterations() const;
+    
+    void setMaxIterations( const IndexType& maxIterations );
+    
+    void solve( const MeshPointer& mesh,
+            const AnisotropyPointer& anisotropy,
+            MeshFunctionPointer& u );
+    
+    
+    protected:
       
       const IndexType maxIterations;
 };
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index b823fec03..07be36571 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -15,9 +15,12 @@
 
 #include "tnlFastSweepingMethod.h"
 #include <TNL/Devices/Cuda.h>
-#include <string.h>
+#include <TNL/Communicators/MpiDefs.h>
+
 
 
+
+#include <string.h>
 #include <iostream>
 #include <fstream>
 
@@ -80,16 +83,48 @@ solve( const MeshPointer& mesh,
   MeshFunctionType aux = *auxPtr;
   
   
+//#ifdef HAVE_MPI
+  bool a = Communicators::MpiCommunicator::IsInitialized();
+  if( a )
+    printf("Je Init\n");
+  else
+    printf("Neni Init\n");
+//#endif
   
   while( iteration < this->maxIterations )
   {
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
-      int numThreadsPerBlock = 16;
+      int numThreadsPerBlock = -1;
+      
+      numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 1:0));
+      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+      if( numThreadsPerBlock <= 16 )
+        numThreadsPerBlock = 16;
+      else if(numThreadsPerBlock <= 32 )
+        numThreadsPerBlock = 32;
+      else if(numThreadsPerBlock <= 64 )
+        numThreadsPerBlock = 64;
+      else if(numThreadsPerBlock <= 128 )
+        numThreadsPerBlock = 128;
+      else if(numThreadsPerBlock <= 256 )
+        numThreadsPerBlock = 256;
+      else if(numThreadsPerBlock <= 512 )
+        numThreadsPerBlock = 512;
+      else
+        numThreadsPerBlock = 1024;
+      //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock);
+      
+      if( numThreadsPerBlock == -1 ){
+        printf("Fail in setting numThreadsPerBlock.\n");
+        break;
+      }
+      
       
       
       int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
       int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
+      
       //std::cout << "numBlocksX = " << numBlocksX << std::endl;
       
       /*Real **sArray = new Real*[numBlocksX*numBlocksY];
@@ -115,13 +150,29 @@ solve( const MeshPointer& mesh,
        }
        std::cout<<std::endl;*/
       unsigned int numWhile = 0;
-      while( IsCalculationDone && numWhile < 1 )
+      while( IsCalculationDone )
       {      
         IsCalculationDone = 0;
         helpFunc1 = auxPtr;
         auxPtr = helpFunc;
         helpFunc = helpFunc1;
-        this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+        switch ( numThreadsPerBlock ){ // Dispatch to updateBlocks< numThreadsPerBlock + 2 >. Every case must end with break, otherwise all larger variants run as well.
+          case 16:
+            this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); break;
+          case 32:
+            this->template updateBlocks< 34 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); break;
+          case 64:
+            this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); break;
+          case 128:
+            this->template updateBlocks< 130 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); break;
+          case 256:
+            this->template updateBlocks< 258 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); break;
+          case 512:
+            this->template updateBlocks< 514 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); break;
+          default: // Reached for numThreadsPerBlock == 1024. NOTE(review): 1028 departs from the "+2" pattern of the other cases (1026 expected?) - confirm against updateBlocks.
+            this->template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+        }
+        
         
         //Reduction      
         for( int i = 0; i < BlockIterHost.getSize(); i++ ){
@@ -131,14 +182,14 @@ solve( const MeshPointer& mesh,
           }
         }
         numWhile++;
-        std::cout <<"numWhile = "<< numWhile <<std::endl;
+        /*std::cout <<"numWhile = "<< numWhile <<std::endl;
         
         for( int j = numBlocksY-1; j>-1; j-- ){
           for( int i = 0; i < numBlocksX; i++ )
             std::cout << BlockIterHost[ j * numBlocksX + i ];
           std::cout << std::endl;
         }
-        std::cout << std::endl;
+        std::cout << std::endl;*/
         
         this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY );
         
@@ -150,8 +201,8 @@ solve( const MeshPointer& mesh,
          std::cout << std::endl;*/
         
         //std::cout<<std::endl;
-        string s( "aux-"+ std::to_string(numWhile) + ".tnl");
-        aux.save( s );
+        //string s( "aux-"+ std::to_string(numWhile) + ".tnl");
+        //aux.save( s );
       }
       if( numWhile == 1 ){
         auxPtr = helpFunc;
@@ -266,8 +317,8 @@ solve( const MeshPointer& mesh,
       BlockIterPom.setSize( numBlocksX * numBlocksY  );
       BlockIterPom.setValue( 0 );
       /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
-      BlockIterPom1.setSize( numBlocksX * numBlocksY  );
-      BlockIterPom1.setValue( 0 );*/
+       BlockIterPom1.setSize( numBlocksX * numBlocksY  );
+       BlockIterPom1.setValue( 0 );*/
       /*int *BlockIterDevice;
        cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
       int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
@@ -408,6 +459,7 @@ solve( const MeshPointer& mesh,
     }
     iteration++;
   }
+  //#endif
   aux.save("aux-final.tnl");
 }
 
@@ -527,7 +579,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
   
   
   /** FOR FIM METHOD */
-    
+  
   if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] )
   { 
     __syncthreads();
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 65aba5bf5..5af33cf29 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -64,9 +64,6 @@ solve( const MeshPointer& mesh,
   interfaceMapPtr->setMesh( mesh );
   std::cout << "Initiating the interface cells ..." << std::endl;
   BaseType::initInterface( u, auxPtr, interfaceMapPtr );
-#ifdef HAVE_CUDA
-  cudaDeviceSynchronize();
-#endif
   auxPtr->save( "aux-ini.tnl" );   
   
   typename MeshType::Cell cell( *mesh );
@@ -78,170 +75,259 @@ solve( const MeshPointer& mesh,
   {
     if( std::is_same< DeviceType, Devices::Host >::value )
     {
-      for( cell.getCoordinates().z() = 0;
-              cell.getCoordinates().z() < mesh->getDimensions().z();
-              cell.getCoordinates().z()++ )
-      {
-        for( cell.getCoordinates().y() = 0;
-                cell.getCoordinates().y() < mesh->getDimensions().y();
-                cell.getCoordinates().y()++ )
-        {
-          for( cell.getCoordinates().x() = 0;
-                  cell.getCoordinates().x() < mesh->getDimensions().x();
-                  cell.getCoordinates().x()++ )
-          {
-            cell.refresh();
-            if( ! interfaceMap( cell ) )
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-1.tnl" );
+      int numThreadsPerBlock = 64;
       
-      for( cell.getCoordinates().z() = 0;
-              cell.getCoordinates().z() < mesh->getDimensions().z();
-              cell.getCoordinates().z()++ )
-      {
-        for( cell.getCoordinates().y() = 0;
-                cell.getCoordinates().y() < mesh->getDimensions().y();
-                cell.getCoordinates().y()++ )
-        {
-          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                  cell.getCoordinates().x() >= 0 ;
-                  cell.getCoordinates().x()-- )		
-          {
-            //std::cerr << "2 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-2.tnl" );
-      for( cell.getCoordinates().z() = 0;
-              cell.getCoordinates().z() < mesh->getDimensions().z();
-              cell.getCoordinates().z()++ )
-      {
-        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                cell.getCoordinates().y() >= 0 ;
-                cell.getCoordinates().y()-- )
-        {
-          for( cell.getCoordinates().x() = 0;
-                  cell.getCoordinates().x() < mesh->getDimensions().x();
-                  cell.getCoordinates().x()++ )
-          {
-            //std::cerr << "3 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-3.tnl" );
       
-      for( cell.getCoordinates().z() = 0;
-              cell.getCoordinates().z() < mesh->getDimensions().z();
-              cell.getCoordinates().z()++ )
-      {
-        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                cell.getCoordinates().y() >= 0;
-                cell.getCoordinates().y()-- )
-        {
-          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                  cell.getCoordinates().x() >= 0 ;
-                  cell.getCoordinates().x()-- )		
-          {
-            //std::cerr << "4 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
-      }     
-      //aux.save( "aux-4.tnl" );
+      int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
+      int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0);
+      int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 1:0);
+      //std::cout << "numBlocksX = " << numBlocksX << std::endl;
       
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-              cell.getCoordinates().z() >= 0;
-              cell.getCoordinates().z()-- )
-      {
-        for( cell.getCoordinates().y() = 0;
-                cell.getCoordinates().y() < mesh->getDimensions().y();
-                cell.getCoordinates().y()++ )
-        {
-          for( cell.getCoordinates().x() = 0;
-                  cell.getCoordinates().x() < mesh->getDimensions().x();
-                  cell.getCoordinates().x()++ )
-          {
-            //std::cerr << "5 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-5.tnl" );
+      /*Real **sArray = new Real*[numBlocksX*numBlocksY];
+       for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+       sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];*/
       
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-              cell.getCoordinates().z() >= 0;
-              cell.getCoordinates().z()-- )
-      {
-        for( cell.getCoordinates().y() = 0;
-                cell.getCoordinates().y() < mesh->getDimensions().y();
-                cell.getCoordinates().y()++ )
-        {
-          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                  cell.getCoordinates().x() >= 0 ;
-                  cell.getCoordinates().x()-- )		
-          {
-            //std::cerr << "6 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
-          }
-        }
-      }
-      //aux.save( "aux-6.tnl" );
+      ArrayContainer BlockIterHost;
+      BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ );
+      BlockIterHost.setValue( 1 );
+      int IsCalculationDone = 1;
       
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-              cell.getCoordinates().z() >= 0;
-              cell.getCoordinates().z()-- )
-      {
-        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                cell.getCoordinates().y() >= 0 ;
-                cell.getCoordinates().y()-- )
-        {
-          for( cell.getCoordinates().x() = 0;
-                  cell.getCoordinates().x() < mesh->getDimensions().x();
-                  cell.getCoordinates().x()++ )
-          {
-            //std::cerr << "7 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
+      MeshFunctionPointer helpFunc( mesh );
+      MeshFunctionPointer helpFunc1( mesh );
+      helpFunc1 = auxPtr;
+      auxPtr = helpFunc;
+      helpFunc = helpFunc1;
+      //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl;
+      /*for( int k = numBlocksX-1; k >-1; k-- ){
+       for( int l = 0; l < numBlocksY; l++ ){
+       std::cout<< BlockIterHost[ l*numBlocksX  + k ];
+       }
+       std::cout<<std::endl;
+       }
+       std::cout<<std::endl;*/
+      unsigned int numWhile = 0;
+      while( IsCalculationDone  )
+      {      
+        IsCalculationDone = 0;
+        helpFunc1 = auxPtr;
+        auxPtr = helpFunc;
+        helpFunc = helpFunc1;
+        this->template updateBlocks< 66 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
+        
+        //Reduction      
+        for( int i = 0; i < BlockIterHost.getSize(); i++ ){
+          if( IsCalculationDone == 0 ){
+            IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
+            //break;
           }
         }
-      }
-      //aux.save( "aux-7.tnl" );
-      
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-              cell.getCoordinates().z() >= 0;
-              cell.getCoordinates().z()-- )
-      {
-        for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-                cell.getCoordinates().y() >= 0;
-                cell.getCoordinates().y()-- )
-        {
-          for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                  cell.getCoordinates().x() >= 0 ;
-                  cell.getCoordinates().x()-- )		
-          {
-            //std::cerr << "8 -> ";
-            cell.refresh();
-            if( ! interfaceMap( cell ) )            
-              this->updateCell( aux, cell );
+        numWhile++;
+        std::cout <<"numWhile = "<< numWhile <<std::endl;
+        /*for( int k = 0; k < numBlocksZ; k++ ){
+          for( int j = numBlocksY-1; j>-1; j-- ){
+            for( int i = 0; i < numBlocksX; i++ ){
+              //std::cout << (*auxPtr)[ k * numBlocksX * numBlocksY + j * numBlocksX + i ] << " ";
+              std::cout << BlockIterHost[ k * numBlocksX * numBlocksY + j * numBlocksX + i ];
+            }
+            std::cout << std::endl;
           }
+          std::cout << std::endl;
         }
+        std::cout << std::endl;*/
+        
+        this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ );
+        
+        /*for( int k = 0; k < numBlocksZ; k++ ){
+          for( int j = numBlocksY-1; j>-1; j-- ){
+            for( int i = 0; i < numBlocksX; i++ ){
+              //std::cout << (*auxPtr)[ k * numBlocksX * numBlocksY + j * numBlocksX + i ] << " ";
+              std::cout << BlockIterHost[ k * numBlocksX * numBlocksY + j * numBlocksX + i ];
+            }
+            std::cout << std::endl;
+          }
+          std::cout << std::endl;
+        }*/
+        
+        /*for( int j = numBlocksY-1; j>-1; j-- ){
+         for( int i = 0; i < numBlocksX; i++ )
+         std::cout << "BlockIterHost = "<< j*numBlocksX + i<< " ," << BlockIterHost[ j * numBlocksX + i ];
+         std::cout << std::endl;
+         }
+         std::cout << std::endl;*/
+        
+        //std::cout<<std::endl;
+        //string s( "aux-"+ std::to_string(numWhile) + ".tnl");
+        //aux.save( s );
+      }
+      if( numWhile == 1 ){
+        auxPtr = helpFunc;
       }
+      aux = *auxPtr;
+      
+      /*for( cell.getCoordinates().z() = 0;
+       cell.getCoordinates().z() < mesh->getDimensions().z();
+       cell.getCoordinates().z()++ )
+       {
+       for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       cell.refresh();
+       if( ! interfaceMap( cell ) )
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-1.tnl" );
+       
+       for( cell.getCoordinates().z() = 0;
+       cell.getCoordinates().z() < mesh->getDimensions().z();
+       cell.getCoordinates().z()++ )
+       {
+       for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "2 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-2.tnl" );
+       for( cell.getCoordinates().z() = 0;
+       cell.getCoordinates().z() < mesh->getDimensions().z();
+       cell.getCoordinates().z()++ )
+       {
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0 ;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       //std::cerr << "3 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-3.tnl" );
+       
+       for( cell.getCoordinates().z() = 0;
+       cell.getCoordinates().z() < mesh->getDimensions().z();
+       cell.getCoordinates().z()++ )
+       {
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "4 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }     
+       //aux.save( "aux-4.tnl" );
+       
+       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+       cell.getCoordinates().z() >= 0;
+       cell.getCoordinates().z()-- )
+       {
+       for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       //std::cerr << "5 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-5.tnl" );
+       
+       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+       cell.getCoordinates().z() >= 0;
+       cell.getCoordinates().z()-- )
+       {
+       for( cell.getCoordinates().y() = 0;
+       cell.getCoordinates().y() < mesh->getDimensions().y();
+       cell.getCoordinates().y()++ )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "6 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-6.tnl" );
+       
+       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+       cell.getCoordinates().z() >= 0;
+       cell.getCoordinates().z()-- )
+       {
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0 ;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = 0;
+       cell.getCoordinates().x() < mesh->getDimensions().x();
+       cell.getCoordinates().x()++ )
+       {
+       //std::cerr << "7 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }
+       //aux.save( "aux-7.tnl" );
+       
+       for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+       cell.getCoordinates().z() >= 0;
+       cell.getCoordinates().z()-- )
+       {
+       for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+       cell.getCoordinates().y() >= 0;
+       cell.getCoordinates().y()-- )
+       {
+       for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+       cell.getCoordinates().x() >= 0 ;
+       cell.getCoordinates().x()-- )		
+       {
+       //std::cerr << "8 -> ";
+       cell.refresh();
+       if( ! interfaceMap( cell ) )            
+       this->updateCell( aux, cell );
+       }
+       }
+       }*/
     }
     if( std::is_same< DeviceType, Devices::Cuda >::value )
     {
@@ -389,7 +475,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
   {
     __syncthreads();
     
-    __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)];
+    __shared__ volatile bool changed[ 8*8*8/*(sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)*/];
     
     changed[ currentIndex ] = false;
     if( thrj == 0 && thri == 0 && thrk == 0 )
@@ -402,6 +488,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     
     if( thrj == 1 && thri == 1 && thrk == 1 )
     {
+      //printf( "We are in the calculation. Block = %d.\n",blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x  );
       hx = mesh.getSpaceSteps().x();
       hy = mesh.getSpaceSteps().y();
       hz = mesh.getSpaceSteps().z();
@@ -410,8 +497,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       dimZ = mesh.getDimensions().z();
       BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0;
     }
-    __shared__ volatile Real sArray[sizeSArray][sizeSArray][sizeSArray];
-    sArray[thrk+1][thrj+1][thri+1] = std::numeric_limits< Real >::max();
+    __shared__ volatile Real sArray[ 10*10*10/*sizeSArray * sizeSArray * sizeSArray*/ ];
+    sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = std::numeric_limits< Real >::max();
     
     //filling sArray edges
     int numOfBlockx;
@@ -426,6 +513,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     numOfBlockx = gridDim.x;
     numOfBlocky = gridDim.y;
     numOfBlockz = gridDim.z;
+    __syncthreads();
     
     if( numOfBlockx - 1 == blIdx )
       xkolik = dimX - (blIdx)*blockDim.x+1;
@@ -438,54 +526,55 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
     if( thri == 0 )
     {        
       if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
-        sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
+        sArray[(thrk+1 )* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ];
       else
-        sArray[thrk+1][thrj+1][0] = std::numeric_limits< Real >::max();
+        sArray[(thrk+1)* sizeSArray * sizeSArray + (thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 1 )
     {
       if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
-        sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + xkolik ] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
       else
-        sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max();
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max();
     }
     if( thri == 2 )
     {        
       if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
-        sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
+        sArray[ (thrk+1) * sizeSArray * sizeSArray +0*sizeSArray + thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
       else
-        sArray[thrk+1][0][thrj+1] = std::numeric_limits< Real >::max();
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + 0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 3 )
     {
       if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
-        sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
       else
-        sArray[thrk+1][9][thrj+1] = std::numeric_limits< Real >::max();
+        sArray[ (thrk+1) * sizeSArray * sizeSArray + ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max();
     }
     if( thri == 4 )
     {        
       if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
-        sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
+        sArray[ 0 * sizeSArray * sizeSArray +(thrj+1 )* sizeSArray + thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
       else
-        sArray[0][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
+        sArray[0 * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
     }
     
     if( thri == 5 )
     {
       if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
-        sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
+        sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
       else
-        sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
+        sArray[zkolik * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thrk+1] = std::numeric_limits< Real >::max();
     }
     
     if( i < dimX && j < dimY && k < dimZ )
     {
-      sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
+      sArray[(thrk+1) * sizeSArray * sizeSArray + (thrj+1) *sizeSArray + thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
     }
     __syncthreads(); 
+    
     while( changed[ 0 ] )
     {
       __syncthreads();
@@ -493,11 +582,11 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       changed[ currentIndex ] = false;
       
       //calculation of update cell
-      if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ )
+      if( i < dimX && j < dimY && k < dimZ )
       {
-        if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] )
+        if( ! interfaceMap[ k*dimX*dimY + j * dimX + i ] )
         {
-          changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
+          changed[ currentIndex ] = ptr.updateCell3D< sizeSArray >( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
         }
       }
       __syncthreads();
@@ -535,7 +624,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         }
       }
       __syncthreads();
-      if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
+      if( currentIndex < 32 )
       {
         if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
         if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
@@ -548,7 +637,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       
       /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0)
        {
-       for(int m = 0; m < 8; m++){
+       //for(int m = 0; m < 8; m++){
+       int m = 4;
        for(int n = 0; n<8; n++){
        for(int b=0; b<8; b++)
        printf(" %i ", changed[m*64 + n*8 + b]);
@@ -556,16 +646,19 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
        }
        printf("\n \n");
        }
-       }*/
+       //}*/
+      
       if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
       {
-        BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1;
+        //printf( "Setting block calculation. Block = %d.\n",blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x  );
+        BlockIterDevice[ blIdz * gridDim.x * gridDim.y + blIdy * gridDim.x + blIdx ] = 1;
       }
       __syncthreads();
     }
     
     if( i < dimX && j < dimY && k < dimZ )
-      helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
+      helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[ (thrk+1) * sizeSArray * sizeSArray + (thrj+1) * sizeSArray + thri+1 ];
+    
   } 
 }  
 #endif
-- 
GitLab


From 1163bf19405e8f2622a96540190d55dd284037a6 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 29 Nov 2018 13:51:34 +0100
Subject: [PATCH 19/20] Moving old code to Legacy.

---
 .../fast-sweeping-map/CMakeLists.txt                |   0
 .../Legacy/fast-sweeping-map}/MainBuildConfig.h     |   0
 .../fast-sweeping-map/fastSweepingMapConfig.h       |   0
 .../Legacy/fast-sweeping-map}/main.cpp              |   0
 .../Legacy/fast-sweeping-map}/main.cu               |   0
 .../Solvers => Legacy}/fast-sweeping-map/main.h     |   0
 .../fast-sweeping-map/tnlFastSweepingMap.h          |   0
 .../tnlFastSweepingMap2D_CUDA_v4_impl.h             |   0
 .../fast-sweeping-map/tnlFastSweepingMap2D_impl.h   |   0
 .../fast-sweeping-map/tnlFastSweepingMap_CUDA.h     |   0
 .../Solvers => Legacy}/fast-sweeping/CMakeLists.txt |   0
 .../fast-sweeping}/MainBuildConfig.h                |   0
 .../fast-sweeping/fastSweepingConfig.h              |   0
 .../fast-sweeping}/main.cpp                         |   0
 .../fast-sweeping}/main.cu                          |   0
 .../Solvers => Legacy}/fast-sweeping/main.h         |   0
 .../fast-sweeping/tnlFastSweeping.h                 |   0
 .../fast-sweeping/tnlFastSweeping2D_CUDA_impl.h     |   0
 .../fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h  |   0
 .../fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h  |   0
 .../fast-sweeping/tnlFastSweeping2D_CUDA_v4_impl.h  |   0
 .../fast-sweeping/tnlFastSweeping2D_CUDA_v5_impl.h  |   0
 .../fast-sweeping/tnlFastSweeping2D_impl.h          |   0
 .../fast-sweeping/tnlFastSweeping2D_openMP_impl.h   |   0
 .../fast-sweeping/tnlFastSweeping3D_CUDA_impl.h     |   0
 .../fast-sweeping/tnlFastSweeping3D_impl.h          |   0
 .../fast-sweeping/tnlFastSweepingSolver.h           |   0
 .../fast-sweeping/tnlFastSweeping_CUDA.h            |   0
 .../hamilton-jacobi-parallel-map/CMakeLists.txt     |   0
 .../hamilton-jacobi-parallel-map}/MainBuildConfig.h |   0
 .../hamilton-jacobi-parallel-map/gnuplot.txt        |   0
 .../hamilton-jacobi-parallel-map/main.cpp           |   0
 .../hamilton-jacobi-parallel-map/main.cu            |   0
 .../hamilton-jacobi-parallel-map/main.h             |   0
 .../hamilton-jacobi-parallel-map/mapa_png.png       | Bin
 .../hamilton-jacobi-parallel-map/no-Makefile        |   0
 .../parallelMapConfig.h                             |   0
 .../hamilton-jacobi-parallel-map/run                |   0
 .../hamilton-jacobi-parallel-map/tnl-err2eoc-2.py   |   0
 .../tnlParallelMapSolver.h                          |   0
 .../tnlParallelMapSolver2D_impl.h                   |   0
 .../hamilton-jacobi-parallel/CMakeLists.txt         |   0
 .../hamilton-jacobi-parallel}/MainBuildConfig.h     |   0
 .../hamilton-jacobi-parallel/main.cpp               |   0
 .../hamilton-jacobi-parallel/main.cu                |   0
 .../hamilton-jacobi-parallel/main.h                 |   0
 .../hamilton-jacobi-parallel/no-Makefile            |   0
 .../parallelEikonalConfig.h                         |   0
 .../Solvers => Legacy}/hamilton-jacobi-parallel/run |   0
 .../hamilton-jacobi-parallel/tnl-err2eoc-2.py       |   0
 .../tnlParallelEikonalSolver.h                      |   0
 .../tnlParallelEikonalSolver2D_impl.h               |   0
 .../tnlParallelEikonalSolver3D_impl.h               |   0
 .../Legacy}/narrow-band/CMakeLists.txt              |   0
 .../narrow-band}/MainBuildConfig.h                  |   0
 .../fast-sweeping => Legacy/narrow-band}/main.cpp   |   0
 .../fast-sweeping => Legacy/narrow-band}/main.cu    |   0
 src/{Examples => TNL/Legacy}/narrow-band/main.h     |   0
 .../Legacy}/narrow-band/narrowBandConfig.h          |   0
 .../Legacy}/narrow-band/tnlNarrowBand.h             |   0
 .../narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h      |   0
 .../narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h      |   0
 .../Legacy}/narrow-band/tnlNarrowBand2D_impl.h      |   0
 .../Legacy}/narrow-band/tnlNarrowBand3D_CUDA_impl.h |   0
 .../Legacy}/narrow-band/tnlNarrowBand3D_impl.h      |   0
 .../Legacy}/narrow-band/tnlNarrowBand_CUDA.h        |   0
 66 files changed, 0 insertions(+), 0 deletions(-)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping-map/CMakeLists.txt (100%)
 rename src/{Examples/narrow-band => TNL/Legacy/fast-sweeping-map}/MainBuildConfig.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping-map/fastSweepingMapConfig.h (100%)
 rename src/{Examples/narrow-band => TNL/Legacy/fast-sweeping-map}/main.cpp (100%)
 rename src/{Examples/narrow-band => TNL/Legacy/fast-sweeping-map}/main.cu (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping-map/main.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping-map/tnlFastSweepingMap.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping-map/tnlFastSweepingMap2D_CUDA_v4_impl.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping-map/tnlFastSweepingMap2D_impl.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping-map/tnlFastSweepingMap_CUDA.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/CMakeLists.txt (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map => Legacy/fast-sweeping}/MainBuildConfig.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/fastSweepingConfig.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map => Legacy/fast-sweeping}/main.cpp (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map => Legacy/fast-sweeping}/main.cu (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/main.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/tnlFastSweeping.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/tnlFastSweeping2D_CUDA_v4_impl.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/tnlFastSweeping2D_CUDA_v5_impl.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/tnlFastSweeping2D_impl.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/tnlFastSweeping2D_openMP_impl.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/tnlFastSweeping3D_impl.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/tnlFastSweepingSolver.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/fast-sweeping/tnlFastSweeping_CUDA.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel-map/CMakeLists.txt (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers/fast-sweeping => Legacy/hamilton-jacobi-parallel-map}/MainBuildConfig.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel-map/gnuplot.txt (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel-map/main.cpp (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel-map/main.cu (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel-map/main.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel-map/mapa_png.png (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel-map/no-Makefile (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel-map/parallelMapConfig.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel-map/run (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel-map/tnl-err2eoc-2.py (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel-map/tnlParallelMapSolver.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel-map/tnlParallelMapSolver2D_impl.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel/CMakeLists.txt (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map => Legacy/hamilton-jacobi-parallel}/MainBuildConfig.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel/main.cpp (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel/main.cu (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel/main.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel/no-Makefile (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel/parallelEikonalConfig.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel/run (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel/tnl-err2eoc-2.py (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers => Legacy}/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h (100%)
 rename src/{Examples => TNL/Legacy}/narrow-band/CMakeLists.txt (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel => Legacy/narrow-band}/MainBuildConfig.h (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers/fast-sweeping => Legacy/narrow-band}/main.cpp (100%)
 rename src/TNL/{Experimental/Hamilton-Jacobi/Solvers/fast-sweeping => Legacy/narrow-band}/main.cu (100%)
 rename src/{Examples => TNL/Legacy}/narrow-band/main.h (100%)
 rename src/{Examples => TNL/Legacy}/narrow-band/narrowBandConfig.h (100%)
 rename src/{Examples => TNL/Legacy}/narrow-band/tnlNarrowBand.h (100%)
 rename src/{Examples => TNL/Legacy}/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h (100%)
 rename src/{Examples => TNL/Legacy}/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h (100%)
 rename src/{Examples => TNL/Legacy}/narrow-band/tnlNarrowBand2D_impl.h (100%)
 rename src/{Examples => TNL/Legacy}/narrow-band/tnlNarrowBand3D_CUDA_impl.h (100%)
 rename src/{Examples => TNL/Legacy}/narrow-band/tnlNarrowBand3D_impl.h (100%)
 rename src/{Examples => TNL/Legacy}/narrow-band/tnlNarrowBand_CUDA.h (100%)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/CMakeLists.txt b/src/TNL/Legacy/fast-sweeping-map/CMakeLists.txt
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/CMakeLists.txt
rename to src/TNL/Legacy/fast-sweeping-map/CMakeLists.txt
diff --git a/src/Examples/narrow-band/MainBuildConfig.h b/src/TNL/Legacy/fast-sweeping-map/MainBuildConfig.h
similarity index 100%
rename from src/Examples/narrow-band/MainBuildConfig.h
rename to src/TNL/Legacy/fast-sweeping-map/MainBuildConfig.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/fastSweepingMapConfig.h b/src/TNL/Legacy/fast-sweeping-map/fastSweepingMapConfig.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/fastSweepingMapConfig.h
rename to src/TNL/Legacy/fast-sweeping-map/fastSweepingMapConfig.h
diff --git a/src/Examples/narrow-band/main.cpp b/src/TNL/Legacy/fast-sweeping-map/main.cpp
similarity index 100%
rename from src/Examples/narrow-band/main.cpp
rename to src/TNL/Legacy/fast-sweeping-map/main.cpp
diff --git a/src/Examples/narrow-band/main.cu b/src/TNL/Legacy/fast-sweeping-map/main.cu
similarity index 100%
rename from src/Examples/narrow-band/main.cu
rename to src/TNL/Legacy/fast-sweeping-map/main.cu
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/main.h b/src/TNL/Legacy/fast-sweeping-map/main.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/main.h
rename to src/TNL/Legacy/fast-sweeping-map/main.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/tnlFastSweepingMap.h b/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/tnlFastSweepingMap.h
rename to src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/tnlFastSweepingMap2D_CUDA_v4_impl.h b/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap2D_CUDA_v4_impl.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/tnlFastSweepingMap2D_CUDA_v4_impl.h
rename to src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap2D_CUDA_v4_impl.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/tnlFastSweepingMap2D_impl.h b/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap2D_impl.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/tnlFastSweepingMap2D_impl.h
rename to src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap2D_impl.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/tnlFastSweepingMap_CUDA.h b/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap_CUDA.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/tnlFastSweepingMap_CUDA.h
rename to src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap_CUDA.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/CMakeLists.txt b/src/TNL/Legacy/fast-sweeping/CMakeLists.txt
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/CMakeLists.txt
rename to src/TNL/Legacy/fast-sweeping/CMakeLists.txt
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/MainBuildConfig.h b/src/TNL/Legacy/fast-sweeping/MainBuildConfig.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/MainBuildConfig.h
rename to src/TNL/Legacy/fast-sweeping/MainBuildConfig.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/fastSweepingConfig.h b/src/TNL/Legacy/fast-sweeping/fastSweepingConfig.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/fastSweepingConfig.h
rename to src/TNL/Legacy/fast-sweeping/fastSweepingConfig.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/main.cpp b/src/TNL/Legacy/fast-sweeping/main.cpp
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/main.cpp
rename to src/TNL/Legacy/fast-sweeping/main.cpp
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/main.cu b/src/TNL/Legacy/fast-sweeping/main.cu
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping-map/main.cu
rename to src/TNL/Legacy/fast-sweeping/main.cu
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/main.h b/src/TNL/Legacy/fast-sweeping/main.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/main.h
rename to src/TNL/Legacy/fast-sweeping/main.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping.h
rename to src/TNL/Legacy/fast-sweeping/tnlFastSweeping.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h
rename to src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h
rename to src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h
rename to src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping2D_CUDA_v4_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v4_impl.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping2D_CUDA_v4_impl.h
rename to src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v4_impl.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping2D_CUDA_v5_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v5_impl.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping2D_CUDA_v5_impl.h
rename to src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v5_impl.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping2D_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_impl.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping2D_impl.h
rename to src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_impl.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping2D_openMP_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_openMP_impl.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping2D_openMP_impl.h
rename to src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_openMP_impl.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h
rename to src/TNL/Legacy/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping3D_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping3D_impl.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping3D_impl.h
rename to src/TNL/Legacy/fast-sweeping/tnlFastSweeping3D_impl.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweepingSolver.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweepingSolver.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweepingSolver.h
rename to src/TNL/Legacy/fast-sweeping/tnlFastSweepingSolver.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping_CUDA.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping_CUDA.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/tnlFastSweeping_CUDA.h
rename to src/TNL/Legacy/fast-sweeping/tnlFastSweeping_CUDA.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/CMakeLists.txt b/src/TNL/Legacy/hamilton-jacobi-parallel-map/CMakeLists.txt
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/CMakeLists.txt
rename to src/TNL/Legacy/hamilton-jacobi-parallel-map/CMakeLists.txt
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/MainBuildConfig.h b/src/TNL/Legacy/hamilton-jacobi-parallel-map/MainBuildConfig.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/MainBuildConfig.h
rename to src/TNL/Legacy/hamilton-jacobi-parallel-map/MainBuildConfig.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/gnuplot.txt b/src/TNL/Legacy/hamilton-jacobi-parallel-map/gnuplot.txt
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/gnuplot.txt
rename to src/TNL/Legacy/hamilton-jacobi-parallel-map/gnuplot.txt
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/main.cpp b/src/TNL/Legacy/hamilton-jacobi-parallel-map/main.cpp
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/main.cpp
rename to src/TNL/Legacy/hamilton-jacobi-parallel-map/main.cpp
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/main.cu b/src/TNL/Legacy/hamilton-jacobi-parallel-map/main.cu
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/main.cu
rename to src/TNL/Legacy/hamilton-jacobi-parallel-map/main.cu
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/main.h b/src/TNL/Legacy/hamilton-jacobi-parallel-map/main.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/main.h
rename to src/TNL/Legacy/hamilton-jacobi-parallel-map/main.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/mapa_png.png b/src/TNL/Legacy/hamilton-jacobi-parallel-map/mapa_png.png
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/mapa_png.png
rename to src/TNL/Legacy/hamilton-jacobi-parallel-map/mapa_png.png
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/no-Makefile b/src/TNL/Legacy/hamilton-jacobi-parallel-map/no-Makefile
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/no-Makefile
rename to src/TNL/Legacy/hamilton-jacobi-parallel-map/no-Makefile
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/parallelMapConfig.h b/src/TNL/Legacy/hamilton-jacobi-parallel-map/parallelMapConfig.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/parallelMapConfig.h
rename to src/TNL/Legacy/hamilton-jacobi-parallel-map/parallelMapConfig.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/run b/src/TNL/Legacy/hamilton-jacobi-parallel-map/run
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/run
rename to src/TNL/Legacy/hamilton-jacobi-parallel-map/run
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/tnl-err2eoc-2.py b/src/TNL/Legacy/hamilton-jacobi-parallel-map/tnl-err2eoc-2.py
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/tnl-err2eoc-2.py
rename to src/TNL/Legacy/hamilton-jacobi-parallel-map/tnl-err2eoc-2.py
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/tnlParallelMapSolver.h b/src/TNL/Legacy/hamilton-jacobi-parallel-map/tnlParallelMapSolver.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/tnlParallelMapSolver.h
rename to src/TNL/Legacy/hamilton-jacobi-parallel-map/tnlParallelMapSolver.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/tnlParallelMapSolver2D_impl.h b/src/TNL/Legacy/hamilton-jacobi-parallel-map/tnlParallelMapSolver2D_impl.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/tnlParallelMapSolver2D_impl.h
rename to src/TNL/Legacy/hamilton-jacobi-parallel-map/tnlParallelMapSolver2D_impl.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/CMakeLists.txt b/src/TNL/Legacy/hamilton-jacobi-parallel/CMakeLists.txt
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/CMakeLists.txt
rename to src/TNL/Legacy/hamilton-jacobi-parallel/CMakeLists.txt
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/MainBuildConfig.h b/src/TNL/Legacy/hamilton-jacobi-parallel/MainBuildConfig.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel-map/MainBuildConfig.h
rename to src/TNL/Legacy/hamilton-jacobi-parallel/MainBuildConfig.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/main.cpp b/src/TNL/Legacy/hamilton-jacobi-parallel/main.cpp
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/main.cpp
rename to src/TNL/Legacy/hamilton-jacobi-parallel/main.cpp
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/main.cu b/src/TNL/Legacy/hamilton-jacobi-parallel/main.cu
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/main.cu
rename to src/TNL/Legacy/hamilton-jacobi-parallel/main.cu
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/main.h b/src/TNL/Legacy/hamilton-jacobi-parallel/main.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/main.h
rename to src/TNL/Legacy/hamilton-jacobi-parallel/main.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/no-Makefile b/src/TNL/Legacy/hamilton-jacobi-parallel/no-Makefile
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/no-Makefile
rename to src/TNL/Legacy/hamilton-jacobi-parallel/no-Makefile
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/parallelEikonalConfig.h b/src/TNL/Legacy/hamilton-jacobi-parallel/parallelEikonalConfig.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/parallelEikonalConfig.h
rename to src/TNL/Legacy/hamilton-jacobi-parallel/parallelEikonalConfig.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/run b/src/TNL/Legacy/hamilton-jacobi-parallel/run
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/run
rename to src/TNL/Legacy/hamilton-jacobi-parallel/run
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/tnl-err2eoc-2.py b/src/TNL/Legacy/hamilton-jacobi-parallel/tnl-err2eoc-2.py
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/tnl-err2eoc-2.py
rename to src/TNL/Legacy/hamilton-jacobi-parallel/tnl-err2eoc-2.py
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h b/src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h
rename to src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h b/src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h
rename to src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h b/src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
rename to src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
diff --git a/src/Examples/narrow-band/CMakeLists.txt b/src/TNL/Legacy/narrow-band/CMakeLists.txt
similarity index 100%
rename from src/Examples/narrow-band/CMakeLists.txt
rename to src/TNL/Legacy/narrow-band/CMakeLists.txt
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/MainBuildConfig.h b/src/TNL/Legacy/narrow-band/MainBuildConfig.h
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi-parallel/MainBuildConfig.h
rename to src/TNL/Legacy/narrow-band/MainBuildConfig.h
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/main.cpp b/src/TNL/Legacy/narrow-band/main.cpp
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/main.cpp
rename to src/TNL/Legacy/narrow-band/main.cpp
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/main.cu b/src/TNL/Legacy/narrow-band/main.cu
similarity index 100%
rename from src/TNL/Experimental/Hamilton-Jacobi/Solvers/fast-sweeping/main.cu
rename to src/TNL/Legacy/narrow-band/main.cu
diff --git a/src/Examples/narrow-band/main.h b/src/TNL/Legacy/narrow-band/main.h
similarity index 100%
rename from src/Examples/narrow-band/main.h
rename to src/TNL/Legacy/narrow-band/main.h
diff --git a/src/Examples/narrow-band/narrowBandConfig.h b/src/TNL/Legacy/narrow-band/narrowBandConfig.h
similarity index 100%
rename from src/Examples/narrow-band/narrowBandConfig.h
rename to src/TNL/Legacy/narrow-band/narrowBandConfig.h
diff --git a/src/Examples/narrow-band/tnlNarrowBand.h b/src/TNL/Legacy/narrow-band/tnlNarrowBand.h
similarity index 100%
rename from src/Examples/narrow-band/tnlNarrowBand.h
rename to src/TNL/Legacy/narrow-band/tnlNarrowBand.h
diff --git a/src/Examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h b/src/TNL/Legacy/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
similarity index 100%
rename from src/Examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
rename to src/TNL/Legacy/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
diff --git a/src/Examples/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h b/src/TNL/Legacy/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h
similarity index 100%
rename from src/Examples/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h
rename to src/TNL/Legacy/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h
diff --git a/src/Examples/narrow-band/tnlNarrowBand2D_impl.h b/src/TNL/Legacy/narrow-band/tnlNarrowBand2D_impl.h
similarity index 100%
rename from src/Examples/narrow-band/tnlNarrowBand2D_impl.h
rename to src/TNL/Legacy/narrow-band/tnlNarrowBand2D_impl.h
diff --git a/src/Examples/narrow-band/tnlNarrowBand3D_CUDA_impl.h b/src/TNL/Legacy/narrow-band/tnlNarrowBand3D_CUDA_impl.h
similarity index 100%
rename from src/Examples/narrow-band/tnlNarrowBand3D_CUDA_impl.h
rename to src/TNL/Legacy/narrow-band/tnlNarrowBand3D_CUDA_impl.h
diff --git a/src/Examples/narrow-band/tnlNarrowBand3D_impl.h b/src/TNL/Legacy/narrow-band/tnlNarrowBand3D_impl.h
similarity index 100%
rename from src/Examples/narrow-band/tnlNarrowBand3D_impl.h
rename to src/TNL/Legacy/narrow-band/tnlNarrowBand3D_impl.h
diff --git a/src/Examples/narrow-band/tnlNarrowBand_CUDA.h b/src/TNL/Legacy/narrow-band/tnlNarrowBand_CUDA.h
similarity index 100%
rename from src/Examples/narrow-band/tnlNarrowBand_CUDA.h
rename to src/TNL/Legacy/narrow-band/tnlNarrowBand_CUDA.h
-- 
GitLab


From e7217db4a436509220b84019d4ee6b7d6c20a650 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 29 Nov 2018 14:02:03 +0100
Subject: [PATCH 20/20] Deleting legacy code.

---
 .../Legacy/fast-sweeping-map/CMakeLists.txt   |   22 -
 .../fast-sweeping-map/MainBuildConfig.h       |   64 -
 .../fast-sweeping-map/fastSweepingMapConfig.h |   39 -
 src/TNL/Legacy/fast-sweeping-map/main.cpp     |   17 -
 src/TNL/Legacy/fast-sweeping-map/main.cu      |   17 -
 src/TNL/Legacy/fast-sweeping-map/main.h       |   88 -
 .../fast-sweeping-map/tnlFastSweepingMap.h    |  188 --
 .../tnlFastSweepingMap2D_CUDA_v4_impl.h       | 1051 ---------
 .../tnlFastSweepingMap2D_impl.h               |  823 -------
 .../tnlFastSweepingMap_CUDA.h                 |  196 --
 src/TNL/Legacy/fast-sweeping/CMakeLists.txt   |   22 -
 .../Legacy/fast-sweeping/MainBuildConfig.h    |   64 -
 .../Legacy/fast-sweeping/fastSweepingConfig.h |   38 -
 src/TNL/Legacy/fast-sweeping/main.cpp         |   17 -
 src/TNL/Legacy/fast-sweeping/main.cu          |   17 -
 src/TNL/Legacy/fast-sweeping/main.h           |   88 -
 .../Legacy/fast-sweeping/tnlFastSweeping.h    |  186 --
 .../tnlFastSweeping2D_CUDA_impl.h             |  522 -----
 .../tnlFastSweeping2D_CUDA_v2_impl.h          |  588 -----
 .../tnlFastSweeping2D_CUDA_v3_impl.h          |  920 --------
 .../tnlFastSweeping2D_CUDA_v4_impl.h          | 1003 ---------
 .../tnlFastSweeping2D_CUDA_v5_impl.h          |  697 ------
 .../fast-sweeping/tnlFastSweeping2D_impl.h    |  927 --------
 .../tnlFastSweeping2D_openMP_impl.h           |  399 ----
 .../tnlFastSweeping3D_CUDA_impl.h             |  961 --------
 .../fast-sweeping/tnlFastSweeping3D_impl.h    |  307 ---
 .../fast-sweeping/tnlFastSweepingSolver.h     |   36 -
 .../fast-sweeping/tnlFastSweeping_CUDA.h      |  194 --
 .../CMakeLists.txt                            |   23 -
 .../MainBuildConfig.h                         |   64 -
 .../hamilton-jacobi-parallel-map/gnuplot.txt  |   32 -
 .../hamilton-jacobi-parallel-map/main.cpp     |   17 -
 .../hamilton-jacobi-parallel-map/main.cu      |   17 -
 .../hamilton-jacobi-parallel-map/main.h       |   98 -
 .../hamilton-jacobi-parallel-map/mapa_png.png |  Bin 24841 -> 0 bytes
 .../hamilton-jacobi-parallel-map/no-Makefile  |   41 -
 .../parallelMapConfig.h                       |   47 -
 .../Legacy/hamilton-jacobi-parallel-map/run   |   43 -
 .../tnl-err2eoc-2.py                          |  141 --
 .../tnlParallelMapSolver.h                    |  217 --
 .../tnlParallelMapSolver2D_impl.h             | 1315 -----------
 .../hamilton-jacobi-parallel/CMakeLists.txt   |   23 -
 .../MainBuildConfig.h                         |   64 -
 .../Legacy/hamilton-jacobi-parallel/main.cpp  |   17 -
 .../Legacy/hamilton-jacobi-parallel/main.cu   |   17 -
 .../Legacy/hamilton-jacobi-parallel/main.h    |  142 --
 .../hamilton-jacobi-parallel/no-Makefile      |   41 -
 .../parallelEikonalConfig.h                   |   46 -
 src/TNL/Legacy/hamilton-jacobi-parallel/run   |   64 -
 .../hamilton-jacobi-parallel/tnl-err2eoc-2.py |  141 --
 .../tnlParallelEikonalSolver.h                |  366 ----
 .../tnlParallelEikonalSolver2D_impl.h         | 1928 -----------------
 .../tnlParallelEikonalSolver3D_impl.h         | 1706 ---------------
 src/TNL/Legacy/narrow-band/CMakeLists.txt     |   22 -
 src/TNL/Legacy/narrow-band/MainBuildConfig.h  |   64 -
 src/TNL/Legacy/narrow-band/main.cpp           |   17 -
 src/TNL/Legacy/narrow-band/main.cu            |   17 -
 src/TNL/Legacy/narrow-band/main.h             |   88 -
 src/TNL/Legacy/narrow-band/narrowBandConfig.h |   40 -
 src/TNL/Legacy/narrow-band/tnlNarrowBand.h    |  186 --
 .../tnlNarrowBand2D_CUDA_v4_impl.h            | 1317 -----------
 .../tnlNarrowBand2D_CUDA_v5_impl.h            | 1313 -----------
 .../Legacy/narrow-band/tnlNarrowBand2D_impl.h |  927 --------
 .../narrow-band/tnlNarrowBand3D_CUDA_impl.h   |  961 --------
 .../Legacy/narrow-band/tnlNarrowBand3D_impl.h |  307 ---
 .../Legacy/narrow-band/tnlNarrowBand_CUDA.h   |  203 --
 66 files changed, 21563 deletions(-)
 delete mode 100644 src/TNL/Legacy/fast-sweeping-map/CMakeLists.txt
 delete mode 100644 src/TNL/Legacy/fast-sweeping-map/MainBuildConfig.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping-map/fastSweepingMapConfig.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping-map/main.cpp
 delete mode 100644 src/TNL/Legacy/fast-sweeping-map/main.cu
 delete mode 100644 src/TNL/Legacy/fast-sweeping-map/main.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap2D_CUDA_v4_impl.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap2D_impl.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap_CUDA.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/CMakeLists.txt
 delete mode 100644 src/TNL/Legacy/fast-sweeping/MainBuildConfig.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/fastSweepingConfig.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/main.cpp
 delete mode 100644 src/TNL/Legacy/fast-sweeping/main.cu
 delete mode 100644 src/TNL/Legacy/fast-sweeping/main.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/tnlFastSweeping.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v4_impl.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v5_impl.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_impl.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_openMP_impl.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/tnlFastSweeping3D_impl.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/tnlFastSweepingSolver.h
 delete mode 100644 src/TNL/Legacy/fast-sweeping/tnlFastSweeping_CUDA.h
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel-map/CMakeLists.txt
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel-map/MainBuildConfig.h
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel-map/gnuplot.txt
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel-map/main.cpp
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel-map/main.cu
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel-map/main.h
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel-map/mapa_png.png
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel-map/no-Makefile
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel-map/parallelMapConfig.h
 delete mode 100755 src/TNL/Legacy/hamilton-jacobi-parallel-map/run
 delete mode 100755 src/TNL/Legacy/hamilton-jacobi-parallel-map/tnl-err2eoc-2.py
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel-map/tnlParallelMapSolver.h
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel-map/tnlParallelMapSolver2D_impl.h
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel/CMakeLists.txt
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel/MainBuildConfig.h
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel/main.cpp
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel/main.cu
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel/main.h
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel/no-Makefile
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel/parallelEikonalConfig.h
 delete mode 100755 src/TNL/Legacy/hamilton-jacobi-parallel/run
 delete mode 100755 src/TNL/Legacy/hamilton-jacobi-parallel/tnl-err2eoc-2.py
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h
 delete mode 100644 src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
 delete mode 100644 src/TNL/Legacy/narrow-band/CMakeLists.txt
 delete mode 100644 src/TNL/Legacy/narrow-band/MainBuildConfig.h
 delete mode 100644 src/TNL/Legacy/narrow-band/main.cpp
 delete mode 100644 src/TNL/Legacy/narrow-band/main.cu
 delete mode 100644 src/TNL/Legacy/narrow-band/main.h
 delete mode 100644 src/TNL/Legacy/narrow-band/narrowBandConfig.h
 delete mode 100644 src/TNL/Legacy/narrow-band/tnlNarrowBand.h
 delete mode 100644 src/TNL/Legacy/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
 delete mode 100644 src/TNL/Legacy/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h
 delete mode 100644 src/TNL/Legacy/narrow-band/tnlNarrowBand2D_impl.h
 delete mode 100644 src/TNL/Legacy/narrow-band/tnlNarrowBand3D_CUDA_impl.h
 delete mode 100644 src/TNL/Legacy/narrow-band/tnlNarrowBand3D_impl.h
 delete mode 100644 src/TNL/Legacy/narrow-band/tnlNarrowBand_CUDA.h

diff --git a/src/TNL/Legacy/fast-sweeping-map/CMakeLists.txt b/src/TNL/Legacy/fast-sweeping-map/CMakeLists.txt
deleted file mode 100644
index 3f9db0da0..000000000
--- a/src/TNL/Legacy/fast-sweeping-map/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-set( tnl_fast_sweeping_map_SOURCES
-#     MainBuildConfig.h
-#     tnlFastSweepingMap2D_impl.h
-#     tnlFastSweepingMap.h
-#     fastSweepingMapConfig.h 
-     main.cpp)
-
-
-IF(  BUILD_CUDA ) 
-	CUDA_ADD_EXECUTABLE(fast-sweeping-map main.cu)
-ELSE(  BUILD_CUDA )                
-	ADD_EXECUTABLE(fast-sweeping-map main.cpp)
-ENDIF( BUILD_CUDA )
-target_link_libraries (fast-sweeping-map tnl )
-
-
-INSTALL( TARGETS fast-sweeping-map
-         RUNTIME DESTINATION bin
-         PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
-        
-#INSTALL( FILES ${tnl_fast_sweeping_map_SOURCES}
-#         DESTINATION ${TNL_TARGET_DATA_DIRECTORY}/examples/fast-sweeping-map )
diff --git a/src/TNL/Legacy/fast-sweeping-map/MainBuildConfig.h b/src/TNL/Legacy/fast-sweeping-map/MainBuildConfig.h
deleted file mode 100644
index ed3d686eb..000000000
--- a/src/TNL/Legacy/fast-sweeping-map/MainBuildConfig.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/***************************************************************************
-                          MainBuildConfig.h  -  description
-                             -------------------
-    begin                : Jul 7, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef MAINBUILDCONFIG_H_
-#define MAINBUILDCONFIG_H_
-
-#include <solvers/tnlBuildConfigTags.h>
-
-class MainBuildConfig
-{
-   public:
-
-      static void print() {std::cerr << "MainBuildConfig" <<std::endl; }
-};
-
-/****
- * Turn off support for float and long double.
- */
-template<> struct tnlConfigTagReal< MainBuildConfig, float > { enum { enabled = false }; };
-template<> struct tnlConfigTagReal< MainBuildConfig, long double > { enum { enabled = false }; };
-
-/****
- * Turn off support for short int and long int indexing.
- */
-template<> struct tnlConfigTagIndex< MainBuildConfig, short int >{ enum { enabled = false }; };
-template<> struct tnlConfigTagIndex< MainBuildConfig, long int >{ enum { enabled = false }; };
-
-/****
- * Use of tnlGrid is enabled for allowed dimensions and Real, Device and Index types.
- */
-template< int Dimensions, typename Real, typename Device, typename Index >
-   struct tnlConfigTagMesh< MainBuildConfig, tnlGrid< Dimensions, Real, Device, Index > >
-      { enum { enabled = tnlConfigTagDimensions< MainBuildConfig, Dimensions >::enabled  &&
-                         tnlConfigTagReal< MainBuildConfig, Real >::enabled &&
-                         tnlConfigTagDevice< MainBuildConfig, Device >::enabled &&
-                         tnlConfigTagIndex< MainBuildConfig, Index >::enabled }; };
-
-/****
- * Please, chose your preferred time discretisation  here.
- */
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlExplicitTimeDiscretisationTag >{ enum { enabled = true }; };
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlSemiImplicitTimeDiscretisationTag >{ enum { enabled = false}; };
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlImplicitTimeDiscretisationTag >{ enum { enabled = false }; };
-
-/****
- * Only the Runge-Kutta-Merson solver is enabled by default.
- */
-template<> struct tnlConfigTagExplicitSolver< MainBuildConfig, tnlExplicitEulerSolverTag >{ enum { enabled = false }; };
-
-#endif /* MAINBUILDCONFIG_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping-map/fastSweepingMapConfig.h b/src/TNL/Legacy/fast-sweeping-map/fastSweepingMapConfig.h
deleted file mode 100644
index 9251deca8..000000000
--- a/src/TNL/Legacy/fast-sweeping-map/fastSweepingMapConfig.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/***************************************************************************
-                          fastSweepingConfig.h  -  description
-                             -------------------
-    begin                : Oct 15, 2015
-    copyright            : (C) 2015 by Tomas Sobotik
-    email                :
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef FASTSWEEPINGCONFIG_H_
-#define FASTSWEEPINGCONFIG_H_
-
-#include <config/tnlConfigDescription.h>
-
-template< typename ConfigTag >
-class fastSweepingMapConfig
-{
-   public:
-      static void configSetup( tnlConfigDescription& config )
-      {
-         config.addDelimiter( "Parallel Eikonal solver settings:" );
-         config.addEntry        < String > ( "problem-name", "This defines particular problem.", "fast-sweeping" );
-         config.addRequiredEntry        < String > ( "initial-condition", "Initial condition for solver");
-         config.addRequiredEntry        < int > ( "dim", "Dimension of problem.");
-         config.addEntry       < String > ( "mesh", "Name of mesh.", "mesh.tnl" );
-         config.addEntry       < String > ( "exact-input", "Are the function values near the curve equal to the SDF? (yes/no)", "no" );
-         config.addRequiredEntry        < String > ( "map", "Gradient map for solver");
-      }
-};
-
-#endif /* FASTSWEEPINGCONFIG_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping-map/main.cpp b/src/TNL/Legacy/fast-sweeping-map/main.cpp
deleted file mode 100644
index 8849008ff..000000000
--- a/src/TNL/Legacy/fast-sweeping-map/main.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/***************************************************************************
-                          main.cpp  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "main.h"
diff --git a/src/TNL/Legacy/fast-sweeping-map/main.cu b/src/TNL/Legacy/fast-sweeping-map/main.cu
deleted file mode 100644
index 8849008ff..000000000
--- a/src/TNL/Legacy/fast-sweeping-map/main.cu
+++ /dev/null
@@ -1,17 +0,0 @@
-/***************************************************************************
-                          main.cpp  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "main.h"
diff --git a/src/TNL/Legacy/fast-sweeping-map/main.h b/src/TNL/Legacy/fast-sweeping-map/main.h
deleted file mode 100644
index 6f23851c2..000000000
--- a/src/TNL/Legacy/fast-sweeping-map/main.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/***************************************************************************
-                          main.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-
-#include "MainBuildConfig.h"
-	//for HOST versions:
-#include "tnlFastSweepingMap.h"
-	//for DEVICE versions:
-//#include "tnlFastSweepingMap_CUDA.h"
-#include "fastSweepingMapConfig.h"
-#include <solvers/tnlBuildConfigTags.h>
-
-#include <mesh/tnlGrid.h>
-#include <core/tnlDevice.h>
-#include <time.h>
-#include <ctime>
-
-typedef MainBuildConfig BuildConfig;
-
-int main( int argc, char* argv[] )
-{
-	time_t start;
-	time_t stop;
-	time(&start);
-	std::clock_t start2= std::clock();
-   Config::ParameterContainer parameters;
-   tnlConfigDescription configDescription;
-   fastSweepingMapConfig< BuildConfig >::configSetup( configDescription );
-
-   if( ! parseCommandLine( argc, argv, configDescription, parameters ) )
-      return false;
-
-   const int& dim = parameters.getParameter< int >( "dim" );
-
-   if(dim == 2)
-   {
-		tnlFastSweepingMap<tnlGrid<2,double,TNL::Devices::Host, int>, double, int> solver;
-		if(!solver.init(parameters))
-	   {
-			cerr << "Solver failed to initialize." <<std::endl;
-			return EXIT_FAILURE;
-	   }
-		TNL_CHECK_CUDA_DEVICE;
-	  std::cout << "-------------------------------------------------------------" <<std::endl;
-	  std::cout << "Starting solver..." <<std::endl;
-	   solver.run();
-   }
-//   else if(dim == 3)
-//   {
-//		tnlFastSweepingMap<tnlGrid<3,double,TNL::Devices::Host, int>, double, int> solver;
-//		if(!solver.init(parameters))
-//	   {
-//			cerr << "Solver failed to initialize." <<std::endl;
-//			return EXIT_FAILURE;
-//	   }
-//		TNL_CHECK_CUDA_DEVICE;
-//	  std::cout << "-------------------------------------------------------------" <<std::endl;
-//	  std::cout << "Starting solver..." <<std::endl;
-//	   solver.run();
-//   }
-   else
-   {
-	  std::cerr << "Unsupported number of dimensions: " << dim << "!" <<std::endl;
-	   return EXIT_FAILURE;
-   }
-
-
-   time(&stop);
-  std::cout << "Solver stopped..." <<std::endl;
-  std::cout <<std::endl;
-  std::cout << "Running time was: " << difftime(stop,start) << " .... " << (std::clock() - start2) / (double)(CLOCKS_PER_SEC) <<std::endl;
-   return EXIT_SUCCESS;
-}
-
-
diff --git a/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap.h b/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap.h
deleted file mode 100644
index c568329ba..000000000
--- a/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/***************************************************************************
-                          tnlFastSweepingMap.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING_H_
-#define TNLFASTSWEEPING_H_
-
-#include <TNL/Config/ParameterContainer.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Containers/StaticVector.h>
-#include <functions/tnlMeshFunction.h>
-#include <TNL/Devices/Host.h>
-#include <mesh/tnlGrid.h>
-#include <mesh/grids/tnlGridEntity.h>
-#include <limits.h>
-#include <core/tnlDevice.h>
-#include <ctime>
-#ifdef HAVE_OPENMP
-#include <omp.h>
-#endif
-
-
-
-
-template< typename Mesh,
-		  typename Real,
-		  typename Index >
-class tnlFastSweepingMap
-{};
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-class tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >
-{
-
-public:
-	typedef Real RealType;
-	typedef Device DeviceType;
-	typedef Index IndexType;
-	typedef tnlGrid< 2, Real, Device, Index > MeshType;
-	typedef TNL::Containers::Vector< RealType, DeviceType, IndexType> DofVectorType;
-	typedef typename MeshType::CoordinatesType CoordinatesType;
-
-
-	tnlFastSweepingMap();
-
-	static String getType();
-	bool init( const Config::ParameterContainer& parameters );
-
-	bool initGrid();
-	bool run();
-
-	//for single core version use this implementation:
-	void updateValue(const Index i, const Index j);
-	//for parallel version use this one instead:
-//	void updateValue(const Index i, const Index j, DofVectorType* grid);
-
-
-	void setupSquare1000(Index i, Index j);
-	void setupSquare1100(Index i, Index j);
-	void setupSquare1010(Index i, Index j);
-	void setupSquare1001(Index i, Index j);
-	void setupSquare1110(Index i, Index j);
-	void setupSquare1101(Index i, Index j);
-	void setupSquare1011(Index i, Index j);
-	void setupSquare1111(Index i, Index j);
-	void setupSquare0000(Index i, Index j);
-	void setupSquare0100(Index i, Index j);
-	void setupSquare0010(Index i, Index j);
-	void setupSquare0001(Index i, Index j);
-	void setupSquare0110(Index i, Index j);
-	void setupSquare0101(Index i, Index j);
-	void setupSquare0011(Index i, Index j);
-	void setupSquare0111(Index i, Index j);
-
-	Real fabsMin(const Real x, const Real y);
-
-
-protected:
-
-	MeshType Mesh;
-
-	bool exactInput;
-
-	int something_changed;
-
-	tnlMeshFunction<MeshType> dofVector, dofVector2;
-	DofVectorType data,map;
-
-	RealType h;
-
-	tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage > Entity;
-
-
-#ifdef HAVE_OPENMP
-//	omp_lock_t* gridLock;
-#endif
-
-
-};
-
-
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-class tnlFastSweepingMap< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >
-{
-
-public:
-	typedef Real RealType;
-	typedef Device DeviceType;
-	typedef Index IndexType;
-	typedef tnlGrid< 3, Real, Device, Index > MeshType;
-	typedef TNL::Containers::Vector< RealType, DeviceType, IndexType> DofVectorType;
-	typedef typename MeshType::CoordinatesType CoordinatesType;
-
-	tnlFastSweepingMap();
-
-	static String getType();
-	bool init( const Config::ParameterContainer& parameters );
-
-	bool initGrid();
-	bool run();
-
-	//for single core version use this implementation:
-	void updateValue(const Index i, const Index j, const Index k);
-	//for parallel version use this one instead:
-//	void updateValue(const Index i, const Index j, DofVectorType* grid);
-
-	Real fabsMin(const Real x, const Real y);
-
-
-protected:
-
-	MeshType Mesh;
-
-	bool exactInput;
-
-
-	tnlMeshFunction<MeshType> dofVector, dofVector2;
-	DofVectorType data;
-
-	RealType h;
-
-	tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage > Entity;
-
-#ifdef HAVE_OPENMP
-//	omp_lock_t* gridLock;
-#endif
-
-
-};
-
-
-	//for single core version use this implementation:
-#include "tnlFastSweepingMap2D_impl.h"
-	//for parallel version use this one instead:
-// #include "tnlFastSweepingMap2D_openMP_impl.h"
-
-//											#include "tnlFastSweepingMap3D_impl.h"
-
-#endif /* TNLFASTSWEEPING_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap2D_CUDA_v4_impl.h b/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap2D_CUDA_v4_impl.h
deleted file mode 100644
index d02b8d6c5..000000000
--- a/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap2D_CUDA_v4_impl.h
+++ /dev/null
@@ -1,1051 +0,0 @@
-/***************************************************************************
-                          tnlFastSweepingMap2D_CUDA_v4_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING2D_IMPL_H_
-#define TNLFASTSWEEPING2D_IMPL_H_
-
-#include "tnlFastSweepingMap.h"
-
-#define MAP_SOLVER_MAX_VALUE 3
-
-__device__
-double fabsMin( double x, double y)
-{
-	double fx = abs(x);
-
-	if(Min(fx,abs(y)) == fx)
-		return x;
-	else
-		return y;
-}
-
-__device__
-double atomicFabsMin(double* address, double val)
-{
-	unsigned long long int* address_as_ull =
-						  (unsigned long long int*)address;
-	unsigned long long int old = *address_as_ull, assumed;
-	do {
-		assumed = old;
-			old = atomicCAS(address_as_ull, assumed,__double_as_longlong( fabsMin(__longlong_as_double(assumed),val) ));
-	} while (assumed != old);
-	return __longlong_as_double(old);
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlFastSweepingMap< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: tnlFastSweepingMap()
-:dofVector(Mesh)
-{
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-	const String& mapFile = parameters.getParameter <String>("map");
-	if(! this->map.load( mapFile ))
-		cout << "Failed to load map file : " << mapFile <<std::endl;
-
-	h = Mesh.template getSpaceStepsProducts< 1, 0 >();
-	//Entity.refresh();
-	counter = 0;
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-
-
-#ifdef HAVE_CUDA
-
-	cudaMalloc(&(cudaDofVector), this->dofVector.getData().getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&(cudaDofVector2), this->dofVector.getData().getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector2, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&(map_cuda), this->map.getSize()*sizeof(double));
-	cudaMemcpy(map_cuda, this->map.getData(), this->map.getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&(changed), sizeof(int));
-	//counter == 0 --> setting changed to 0
-	cudaMemcpy(changed, &counter, sizeof(int), cudaMemcpyHostToDevice);
-
-
-	cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >));
-	cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
-
-#endif
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(16, 16);
-	dim3 numBlocks(n/16 + 1 ,n/16 +1);
-
-
-	initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	return true;
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(1, 1024);
-	dim3 numBlocks(4,1);
-
-	int run = 1;
-	int zero = 0;
-	int cntr = 0;
-
-	while(run != 0)
-	{
-		cudaMemcpy(this->changed, &zero, sizeof(int), cudaMemcpyHostToDevice);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-
-		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,0,0, this->changed);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-
-		cudaMemcpy(&run, this->changed,sizeof(int), cudaMemcpyDeviceToHost);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		cntr++;
-		cout << "Finished set of sweeps #" << cntr << "           " << run <<std::endl;
-	}
-
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	//data.setLike(dofVector.getData());
-	//cudaMemcpy(data.getData(), cudaDofVector2, this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-	cudaMemcpy(dofVector.getData().getData(), cudaDofVector2, this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-	cudaDeviceSynchronize();
-	cudaFree(cudaDofVector);
-	cudaFree(cudaDofVector2);
-	cudaFree(cudaSolver);
-	//data.save("u-00001.tnl");
-	dofVector.save("u-00001.tnl");
-	cudaDeviceSynchronize();
-	return true;
-}
-
-
-
-
-#ifdef HAVE_CUDA
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j, Index* something_changed)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-
-	if(map_cuda[Entity.getIndex()] != 0.0)
-	{
-		tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-		Real value = cudaDofVector2[Entity.getIndex()];
-		Real im = abs(1.0/map_cuda[Entity.getIndex()]);
-		Real a,b, tmp;
-
-		if( i == 0 )
-			a = cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()];
-		else if( i == Mesh.getDimensions().x() - 1 )
-			a = cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0 >()];
-		else
-		{
-			a = fabsMin( cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0 >()],
-					 cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()] );
-		}
-
-		if( j == 0 )
-			b = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()];
-		else if( j == Mesh.getDimensions().y() - 1 )
-			b = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()];
-		else
-		{
-			b = fabsMin( cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()],
-					 cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()] );
-		}
-
-
-		if(abs(a-b) >= im*h)
-			tmp = fabsMin(a,b) + sign(value)*im*h;
-		else
-			tmp = 0.5 * (a + b + sign(value)*sqrt(2.0 * im * h * im * h - (a - b) * (a - b) ) );
-
-	//	cudaDofVector2[Entity.getIndex()]  = fabsMin(value, tmp);
-		atomicFabsMin(&(cudaDofVector2[Entity.getIndex()]), tmp);
-
-		if(abs(value)-abs(tmp) > 0.0)
-			atomicMax(something_changed,1);
-	}
-	else
-	{
-		atomicFabsMin(&(cudaDofVector2[Entity.getIndex()]), MAP_SOLVER_MAX_VALUE);
-	}
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-bool tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
-{
-	int i = threadIdx.x + blockDim.x*blockIdx.x;
-	int j = blockDim.y*blockIdx.y + threadIdx.y;
-
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-
-	int gid = Entity.getIndex();
-
-	cudaDofVector2[gid] = INT_MAX*sign(cudaDofVector[gid]);
-
-	if(abs(cudaDofVector[gid]) < 1.01*h)
-	{
-		cudaDofVector2[gid] = cudaDofVector[gid];
-		if(map_cuda[gid] != 0.0)
-			cudaDofVector2[gid] /=map_cuda[gid];
-	}
-
-
-
-
-
-//	if(i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y() )
-//	{
-//		if(cudaDofVector[Entity.getIndex()] > 0)
-//		{
-//			if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()] > 0)
-//			{
-//				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare1111(i,j);
-//					else
-//						setupSquare1110(i,j);
-//				}
-//				else
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare1101(i,j);
-//					else
-//						setupSquare1100(i,j);
-//				}
-//			}
-//			else
-//			{
-//				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare1011(i,j);
-//					else
-//						setupSquare1010(i,j);
-//				}
-//				else
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare1001(i,j);
-//					else
-//						setupSquare1000(i,j);
-//				}
-//			}
-//		}
-//		else
-//		{
-//			if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()] > 0)
-//			{
-//				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare0111(i,j);
-//					else
-//						setupSquare0110(i,j);
-//				}
-//				else
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare0101(i,j);
-//					else
-//						setupSquare0100(i,j);
-//				}
-//			}
-//			else
-//			{
-//				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare0011(i,j);
-//					else
-//						setupSquare0010(i,j);
-//				}
-//				else
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare0001(i,j);
-//					else
-//						setupSquare0000(i,j);
-//				}
-//			}
-//		}
-//
-//	}
-
-	return true;
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-Real tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = abs(x);
-	//Real fy = abs(y);
-
-	//Real tmpMin = Min(fx,abs(y));
-
-	if(Min(fx,abs(y)) == fx)
-		return x;
-	else
-		return y;
-
-
-}
-
-
-
-__global__ void runCUDA(tnlFastSweepingMap< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int i, int* changed)
-{
-
-	__shared__ int something_changed;
-	if(threadIdx.x+threadIdx.y == 0)
-		something_changed = 0;
-
-	int gx = 0;
-	int gy = threadIdx.y;
-	//if(solver->Mesh.getDimensions().x() <= gx || solver->Mesh.getDimensions().y() <= gy)
-	//	return;
-	int n = solver->Mesh.getDimensions().x();
-	int blockCount = n/blockDim.y +1;
-	//int gid = solver->Mesh.getDimensions().x() * gy + gx;
-	//int max = solver->Mesh.getDimensions().x()*solver->Mesh.getDimensions().x();
-
-	//int id1 = gx+gy;
-	//int id2 = (solver->Mesh.getDimensions().x() - gx - 1) + gy;
-
-	__syncthreads();
-	if(blockIdx.x==0)
-	{
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy,&something_changed);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-	else if(blockIdx.x==1)
-	{
-		gx=n-1;
-		gy=threadIdx.y;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy,&something_changed);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-	else if(blockIdx.x==2)
-	{
-		gx=0;
-		gy=n-threadIdx.y-1;
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy,&something_changed);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-	else if(blockIdx.x==3)
-	{
-		gx=n-1;
-		gy=n-threadIdx.y-1;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy,&something_changed);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-
-
-	if(threadIdx.x+threadIdx.y == 0)
-		atomicMax(changed, something_changed);
-
-
-
-
-}
-
-
-__global__ void initCUDA(tnlFastSweepingMap< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver)
-{
-
-
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-
-
-	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy)
-	{
-		solver->initGrid();
-	}
-
-
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(INT_MAX,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-INT_MAX,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1110( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1101( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1011( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0111( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0001( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0010( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0100( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1000( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1100( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-al;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1010( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1001( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(cudaDofVector[Entity.getIndex()],cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0011( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-al;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0101( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0110( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(cudaDofVector[Entity.getIndex()],cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-}
-#endif
-
-
-
-
-#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap2D_impl.h b/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap2D_impl.h
deleted file mode 100644
index 4bd9e17c5..000000000
--- a/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap2D_impl.h
+++ /dev/null
@@ -1,823 +0,0 @@
-/***************************************************************************
-                          tnlFastSweepingMap2D_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING2D_IMPL_H_
-#define TNLFASTSWEEPING2D_IMPL_H_
-
-
-#define MAP_SOLVER_MAX_VALUE 3
-
-
-#include "tnlFastSweepingMap.h"
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlFastSweepingMap< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: tnlFastSweepingMap()
-:Entity(Mesh),
- dofVector(Mesh),
- dofVector2(Mesh)
-{
-}
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-	dofVector2.load(initialCondition);
-
-	const String& mapFile = parameters.getParameter <String>("map");
-	if(! this->map.load( mapFile ))
-		cout << "Failed to load map file : " << mapFile <<std::endl;
-
-	h = Mesh.template getSpaceStepsProducts< 1, 0 >();
-	Entity.refresh();
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-
-	cout << "a" <<std::endl;
-
-	something_changed = 1;
-	return initGrid();
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
-{
-
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	for(int i=0; i< Mesh.getDimensions().x()*Mesh.getDimensions().x();i++)
-	{
-		dofVector2[i]=INT_MAX*sign(dofVector[i]);
-
-		if(abs(dofVector[i]) < 1.01*h)
-		{
-			dofVector2[i] = dofVector[i];
-			if(map[i] != 0.0)
-				dofVector2[i] /= map[i];
-		}
-	}
-
-//	for(int i = 0 ; i < Mesh.getDimensions().x()-1; i++)
-//	{
-//		for(int j = 0 ; j < Mesh.getDimensions().x()-1; j++)
-//			{
-//			this->Entity.setCoordinates(CoordinatesType(i,j));
-//			this->Entity.refresh();
-//			neighborEntities.refresh(Mesh,Entity.getIndex());
-//
-//				if(dofVector[this->Entity.getIndex()] > 0)
-//				{
-//					if(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()] > 0)
-//					{
-//						if(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//						{
-//							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//								setupSquare1111(i,j);
-//							else
-//								setupSquare1110(i,j);
-//						}
-//						else
-//						{
-//							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//								setupSquare1101(i,j);
-//							else
-//								setupSquare1100(i,j);
-//						}
-//					}
-//					else
-//					{
-//						if(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//						{
-//							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//								setupSquare1011(i,j);
-//							else
-//								setupSquare1010(i,j);
-//						}
-//						else
-//						{
-//							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//								setupSquare1001(i,j);
-//							else
-//								setupSquare1000(i,j);
-//						}
-//					}
-//				}
-//				else
-//				{
-//					if(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()] > 0)
-//					{
-//						if(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//						{
-//							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//								setupSquare0111(i,j);
-//							else
-//								setupSquare0110(i,j);
-//						}
-//						else
-//						{
-//							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//								setupSquare0101(i,j);
-//							else
-//								setupSquare0100(i,j);
-//						}
-//					}
-//					else
-//					{
-//						if(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//						{
-//							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//								setupSquare0011(i,j);
-//							else
-//								setupSquare0010(i,j);
-//						}
-//						else
-//						{
-//							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//								setupSquare0001(i,j);
-//							else
-//								setupSquare0000(i,j);
-//						}
-//					}
-//				}
-//
-//			}
-//	}
-	cout << "a" <<std::endl;
-
-	//data.setLike(dofVector2.getData());
-	//data=dofVector2.getData();
-	//cout << data.getType() <<std::endl;
-	dofVector2.save("u-00000.tnl");
-	//dofVector2.getData().save("u-00000.tnl");
-
-	return true;
-}
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-	int cntr = 0;
-	while(something_changed != 0)
-	{
-		something_changed = 0;
-		for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-		{
-			for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-			{
-				updateValue(i,j);
-			}
-		}
-
-	/*---------------------------------------------------------------------------------------------------------------------------*/
-
-		for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-		{
-			for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-			{
-				updateValue(i,j);
-			}
-		}
-
-	/*---------------------------------------------------------------------------------------------------------------------------*/
-
-		for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-		{
-			for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-			{
-				updateValue(i,j);
-			}
-		}
-
-	/*---------------------------------------------------------------------------------------------------------------------------*/
-		for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-		{
-			for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-			{
-				updateValue(i,j);
-			}
-		}
-
-	/*---------------------------------------------------------------------------------------------------------------------------*/
-		cntr++;
-		cout << "Finished set of sweeps #" << cntr << "           " << something_changed <<std::endl;
-	}
-
-
-
-	dofVector2.save("u-00001.tnl");
-
-	return true;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
-{
-
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	if(map[Entity.getIndex()] != 0.0)
-	{
-		tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-
-		Real value = dofVector2[Entity.getIndex()];
-		Real im = abs(1.0/map[Entity.getIndex()]);
-		Real a,b, tmp;
-
-		if( i == 0 )
-			a = dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()];
-		else if( i == Mesh.getDimensions().x() - 1 )
-			a = dofVector2[neighborEntities.template getEntityIndex< -1,  0 >()];
-		else
-		{
-			a = fabsMin( dofVector2[neighborEntities.template getEntityIndex< -1,  0 >()],
-					 dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()] );
-		}
-
-		if( j == 0 )
-			b = dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()];
-		else if( j == Mesh.getDimensions().y() - 1 )
-			b = dofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()];
-		else
-		{
-			b = fabsMin( dofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()],
-					 dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()] );
-		}
-
-
-		if(fabs(a-b) >= im*h)
-			tmp = fabsMin(a,b) + sign(value)*im*h;
-		else
-			tmp = 0.5 * (a + b + sign(value)*sqrt(2.0 * im * h * im * h - (a - b) * (a - b) ) );
-
-		if(abs(value)-abs(tmp) > 0.0)
-			something_changed = 1;
-
-		dofVector2[Entity.getIndex()] = fabsMin(value, tmp);
-
-	}
-	else
-	{
-		dofVector2[Entity.getIndex()] = MAP_SOLVER_MAX_VALUE;
-	}
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-Real tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = fabs(x);
-	Real fy = fabs(y);
-
-	Real tmpMin = Min(fx,fy);
-
-	if(tmpMin == fx)
-		return x;
-	else
-		return y;
-
-}
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j)
-{
-//	this->Entity.setCoordinates(CoordinatesType(i,j));
-//	this->Entity.refresh();
-//	auto neighborEntities =  Entity.getNeighborEntities();
-//	dofVector2[Entity.getIndex()]=fabsMin(INT_MAX,dofVector2[Entity.getIndex()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j)
-{
-//	this->Entity.setCoordinates(CoordinatesType(i,j));
-//	this->Entity.refresh();
-//	auto neighborEntities =  Entity.getNeighborEntities();
-//	dofVector2[Entity.getIndex()]=fabsMin(-INT_MAX,dofVector2[(Entity.getIndex())]);
-//	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1110( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1101( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[Entity.getIndex()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1011( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[Entity.getIndex()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0111( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0001( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0010( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[Entity.getIndex()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0100( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[Entity.getIndex()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1000( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1100( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-al;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1010( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1001( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	dofVector2[Entity.getIndex()]=fabsMin(dofVector[Entity.getIndex()],dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()],dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()],dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()],dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0011( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-al;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0101( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0110( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	dofVector2[Entity.getIndex()]=fabsMin(dofVector[Entity.getIndex()],dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()],dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()],dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()],dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-}
-
-
-
-
-#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap_CUDA.h b/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap_CUDA.h
deleted file mode 100644
index a23057e78..000000000
--- a/src/TNL/Legacy/fast-sweeping-map/tnlFastSweepingMap_CUDA.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/***************************************************************************
-                          tnlFastSweepingMap_CUDA.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING_H_
-#define TNLFASTSWEEPING_H_
-
-#include <TNL/Config/ParameterContainer.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Containers/StaticVector.h>
-#include <TNL/Devices/Host.h>
-#include <mesh/tnlGrid.h>
-#include <mesh/grids/tnlGridEntity.h>
-
-#include <functions/tnlMeshFunction.h>
-#include <limits.h>
-#include <core/tnlDevice.h>
-#include <ctime>
-
-
-
-
-
-template< typename Mesh,
-		  typename Real,
-		  typename Index >
-class tnlFastSweepingMap
-{};
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-class tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >
-{
-
-public:
-	typedef Real RealType;
-	typedef Device DeviceType;
-	typedef Index IndexType;
-	typedef tnlGrid< 2, Real, Device, Index > MeshType;
-	typedef TNL::Containers::Vector< RealType, DeviceType, IndexType> DofVectorType;
-	typedef typename MeshType::CoordinatesType CoordinatesType;
-
-	tnlFastSweepingMap();
-
-	__host__ static String getType();
-	__host__ bool init( const Config::ParameterContainer& parameters );
-	__host__ bool run();
-
-#ifdef HAVE_CUDA
-	__device__ bool initGrid();
-	__device__ void updateValue(const Index i, const Index j, Index* something_changed);
-	__device__ void updateValue(const Index i, const Index j, double** sharedMem, const int k3);
-	__device__ Real fabsMin(const Real x, const Real y);
-
-	tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >* cudaSolver;
-	double* cudaDofVector;
-	double* cudaDofVector2;
-	double* map_cuda;
-	int counter;
-	int* changed;
-	__device__ void setupSquare1000(Index i, Index j);
-	__device__ void setupSquare1100(Index i, Index j);
-	__device__ void setupSquare1010(Index i, Index j);
-	__device__ void setupSquare1001(Index i, Index j);
-	__device__ void setupSquare1110(Index i, Index j);
-	__device__ void setupSquare1101(Index i, Index j);
-	__device__ void setupSquare1011(Index i, Index j);
-	__device__ void setupSquare1111(Index i, Index j);
-	__device__ void setupSquare0000(Index i, Index j);
-	__device__ void setupSquare0100(Index i, Index j);
-	__device__ void setupSquare0010(Index i, Index j);
-	__device__ void setupSquare0001(Index i, Index j);
-	__device__ void setupSquare0110(Index i, Index j);
-	__device__ void setupSquare0101(Index i, Index j);
-	__device__ void setupSquare0011(Index i, Index j);
-	__device__ void setupSquare0111(Index i, Index j);
-#endif
-
-	MeshType Mesh;
-
-protected:
-
-
-
-	bool exactInput;
-
-	tnlMeshFunction<MeshType> dofVector;
-	DofVectorType data, map;
-
-
-	RealType h;
-
-
-};
-
-
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-class tnlFastSweepingMap< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >
-{
-
-public:
-	typedef Real RealType;
-	typedef Device DeviceType;
-	typedef Index IndexType;
-	typedef tnlGrid< 3, Real, Device, Index > MeshType;
-	typedef TNL::Containers::Vector< RealType, DeviceType, IndexType> DofVectorType;
-	typedef typename MeshType::CoordinatesType CoordinatesType;
-
-
-
-	__host__ static String getType();
-	__host__ bool init( const Config::ParameterContainer& parameters );
-	__host__ bool run();
-
-#ifdef HAVE_CUDA
-	__device__ bool initGrid(int i, int j, int k);
-	__device__ void updateValue(const Index i, const Index j, const Index k);
-	__device__ void updateValue(const Index i, const Index j, const Index k, double** sharedMem, const int k3);
-	__device__ Real fabsMin(const Real x, const Real y);
-
-	tnlFastSweepingMap< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >* cudaSolver;
-	double* cudaDofVector;
-	double* cudaDofVector2;
-	int counter;
-#endif
-
-	MeshType Mesh;
-
-protected:
-
-
-
-	bool exactInput;
-
-	tnlMeshFunction<MeshType> dofVector;
-	DofVectorType data;
-
-	RealType h;
-
-
-};
-
-
-
-
-
-
-
-#ifdef HAVE_CUDA
-//template<int sweep_t>
-__global__ void runCUDA(tnlFastSweepingMap< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int i, int* changed);
-//__global__ void runCUDA(tnlFastSweepingMap< tnlGrid< 3,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int i);
-
-__global__ void initCUDA(tnlFastSweepingMap< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver);
-//__global__ void initCUDA(tnlFastSweepingMap< tnlGrid< 3,double, TNL::Devices::Host, int >, double, int >* solver);
-#endif
-
-/*various implementtions.... choose one*/
-//#include "tnlFastSweepingMap2D_CUDA_impl.h"
-//#include "tnlFastSweepingMap2D_CUDA_v2_impl.h"
-//#include "tnlFastSweepingMap2D_CUDA_v3_impl.h"
-#include "tnlFastSweepingMap2D_CUDA_v4_impl.h"
-//#include "tnlFastSweepingMap2D_CUDA_v5_impl.h"
-
-
-//															#include "tnlFastSweepingMap3D_CUDA_impl.h"
-
-#endif /* TNLFASTSWEEPING_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping/CMakeLists.txt b/src/TNL/Legacy/fast-sweeping/CMakeLists.txt
deleted file mode 100644
index 1a23d646a..000000000
--- a/src/TNL/Legacy/fast-sweeping/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-set( tnl_fast_sweeping_SOURCES
-#     MainBuildConfig.h
-#     tnlFastSweeping2D_impl.h
-#     tnlFastSweeping.h
-#     fastSweepingConfig.h 
-     main.cpp)
-
-
-IF(  BUILD_CUDA ) 
-	CUDA_ADD_EXECUTABLE(fast-sweeping main.cu)
-ELSE(  BUILD_CUDA )                
-	ADD_EXECUTABLE(fast-sweeping main.cpp)
-ENDIF( BUILD_CUDA )
-target_link_libraries (fast-sweeping tnl )
-
-
-INSTALL( TARGETS fast-sweeping
-         RUNTIME DESTINATION bin
-         PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
-        
-#INSTALL( FILES ${tnl_fast_sweeping_SOURCES}
-#         DESTINATION ${TNL_TARGET_DATA_DIRECTORY}/examples/fast-sweeping )
diff --git a/src/TNL/Legacy/fast-sweeping/MainBuildConfig.h b/src/TNL/Legacy/fast-sweeping/MainBuildConfig.h
deleted file mode 100644
index ed3d686eb..000000000
--- a/src/TNL/Legacy/fast-sweeping/MainBuildConfig.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/***************************************************************************
-                          MainBuildConfig.h  -  description
-                             -------------------
-    begin                : Jul 7, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef MAINBUILDCONFIG_H_
-#define MAINBUILDCONFIG_H_
-
-#include <solvers/tnlBuildConfigTags.h>
-
-class MainBuildConfig
-{
-   public:
-
-      static void print() {std::cerr << "MainBuildConfig" <<std::endl; }
-};
-
-/****
- * Turn off support for float and long double.
- */
-template<> struct tnlConfigTagReal< MainBuildConfig, float > { enum { enabled = false }; };
-template<> struct tnlConfigTagReal< MainBuildConfig, long double > { enum { enabled = false }; };
-
-/****
- * Turn off support for short int and long int indexing.
- */
-template<> struct tnlConfigTagIndex< MainBuildConfig, short int >{ enum { enabled = false }; };
-template<> struct tnlConfigTagIndex< MainBuildConfig, long int >{ enum { enabled = false }; };
-
-/****
- * Use of tnlGrid is enabled for allowed dimensions and Real, Device and Index types.
- */
-template< int Dimensions, typename Real, typename Device, typename Index >
-   struct tnlConfigTagMesh< MainBuildConfig, tnlGrid< Dimensions, Real, Device, Index > >
-      { enum { enabled = tnlConfigTagDimensions< MainBuildConfig, Dimensions >::enabled  &&
-                         tnlConfigTagReal< MainBuildConfig, Real >::enabled &&
-                         tnlConfigTagDevice< MainBuildConfig, Device >::enabled &&
-                         tnlConfigTagIndex< MainBuildConfig, Index >::enabled }; };
-
-/****
- * Please, chose your preferred time discretisation  here.
- */
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlExplicitTimeDiscretisationTag >{ enum { enabled = true }; };
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlSemiImplicitTimeDiscretisationTag >{ enum { enabled = false}; };
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlImplicitTimeDiscretisationTag >{ enum { enabled = false }; };
-
-/****
- * Only the Runge-Kutta-Merson solver is enabled by default.
- */
-template<> struct tnlConfigTagExplicitSolver< MainBuildConfig, tnlExplicitEulerSolverTag >{ enum { enabled = false }; };
-
-#endif /* MAINBUILDCONFIG_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping/fastSweepingConfig.h b/src/TNL/Legacy/fast-sweeping/fastSweepingConfig.h
deleted file mode 100644
index 3df2c1e88..000000000
--- a/src/TNL/Legacy/fast-sweeping/fastSweepingConfig.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/***************************************************************************
-                          fastSweepingConfig.h  -  description
-                             -------------------
-    begin                : Oct 15, 2015
-    copyright            : (C) 2015 by Tomas Sobotik
-    email                :
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef FASTSWEEPINGCONFIG_H_
-#define FASTSWEEPINGCONFIG_H_
-
-#include <config/tnlConfigDescription.h>
-
-template< typename ConfigTag >
-class fastSweepingConfig
-{
-   public:
-      static void configSetup( tnlConfigDescription& config )
-      {
-         config.addDelimiter( "Parallel Eikonal solver settings:" );
-         config.addEntry        < String > ( "problem-name", "This defines particular problem.", "fast-sweeping" );
-         config.addRequiredEntry        < String > ( "initial-condition", "Initial condition for solver");
-         config.addRequiredEntry        < int > ( "dim", "Dimension of problem.");
-         config.addEntry       < String > ( "mesh", "Name of mesh.", "mesh.tnl" );
-         config.addEntry       < String > ( "exact-input", "Are the function values near the curve equal to the SDF? (yes/no)", "no" );
-      }
-};
-
-#endif /* FASTSWEEPINGCONFIG_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping/main.cpp b/src/TNL/Legacy/fast-sweeping/main.cpp
deleted file mode 100644
index 8849008ff..000000000
--- a/src/TNL/Legacy/fast-sweeping/main.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/***************************************************************************
-                          main.cpp  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "main.h"
diff --git a/src/TNL/Legacy/fast-sweeping/main.cu b/src/TNL/Legacy/fast-sweeping/main.cu
deleted file mode 100644
index 8849008ff..000000000
--- a/src/TNL/Legacy/fast-sweeping/main.cu
+++ /dev/null
@@ -1,17 +0,0 @@
-/***************************************************************************
-                          main.cpp  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "main.h"
diff --git a/src/TNL/Legacy/fast-sweeping/main.h b/src/TNL/Legacy/fast-sweeping/main.h
deleted file mode 100644
index e5ac15fed..000000000
--- a/src/TNL/Legacy/fast-sweeping/main.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/***************************************************************************
-                          main.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-
-#include "MainBuildConfig.h"
-	//for HOST versions:
-#include "tnlFastSweeping.h"
-	//for DEVICE versions:
-//#include "tnlFastSweeping_CUDA.h"
-#include "fastSweepingConfig.h"
-#include <solvers/tnlBuildConfigTags.h>
-
-#include <mesh/tnlGrid.h>
-#include <core/tnlDevice.h>
-#include <time.h>
-#include <ctime>
-
-typedef MainBuildConfig BuildConfig;
-
-int main( int argc, char* argv[] )
-{
-	time_t start;
-	time_t stop;
-	time(&start);
-	std::clock_t start2= std::clock();
-   Config::ParameterContainer parameters;
-   tnlConfigDescription configDescription;
-   fastSweepingConfig< BuildConfig >::configSetup( configDescription );
-
-   if( ! parseCommandLine( argc, argv, configDescription, parameters ) )
-      return false;
-
-   const int& dim = parameters.getParameter< int >( "dim" );
-
-   if(dim == 2)
-   {
-		tnlFastSweeping<tnlGrid<2,double,TNL::Devices::Host, int>, double, int> solver;
-		if(!solver.init(parameters))
-	   {
-			cerr << "Solver failed to initialize." <<std::endl;
-			return EXIT_FAILURE;
-	   }
-		TNL_CHECK_CUDA_DEVICE;
-	  std::cout << "-------------------------------------------------------------" <<std::endl;
-	  std::cout << "Starting solver..." <<std::endl;
-	   solver.run();
-   }
-   else if(dim == 3)
-   {
-		tnlFastSweeping<tnlGrid<3,double,TNL::Devices::Host, int>, double, int> solver;
-		if(!solver.init(parameters))
-	   {
-			cerr << "Solver failed to initialize." <<std::endl;
-			return EXIT_FAILURE;
-	   }
-		TNL_CHECK_CUDA_DEVICE;
-	  std::cout << "-------------------------------------------------------------" <<std::endl;
-	  std::cout << "Starting solver..." <<std::endl;
-	   solver.run();
-   }
-   else
-   {
-	  std::cerr << "Unsupported number of dimensions: " << dim << "!" <<std::endl;
-	   return EXIT_FAILURE;
-   }
-
-
-   time(&stop);
-  std::cout << "Solver stopped..." <<std::endl;
-  std::cout <<std::endl;
-  std::cout << "Running time was: " << difftime(stop,start) << " .... " << (std::clock() - start2) / (double)(CLOCKS_PER_SEC) <<std::endl;
-   return EXIT_SUCCESS;
-}
-
-
diff --git a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping.h
deleted file mode 100644
index 96d26db7b..000000000
--- a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping.h
+++ /dev/null
@@ -1,186 +0,0 @@
-/***************************************************************************
-                          tnlFastSweeping.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING_H_
-#define TNLFASTSWEEPING_H_
-
-#include <TNL/Config/ParameterContainer.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Containers/StaticVector.h>
-#include <functions/tnlMeshFunction.h>
-#include <TNL/Devices/Host.h>
-#include <mesh/tnlGrid.h>
-#include <mesh/grids/tnlGridEntity.h>
-#include <limits.h>
-#include <core/tnlDevice.h>
-#include <ctime>
-#ifdef HAVE_OPENMP
-#include <omp.h>
-#endif
-
-
-
-
-template< typename Mesh,
-		  typename Real,
-		  typename Index >
-class tnlFastSweeping
-{};
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-class tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >
-{
-
-public:
-	typedef Real RealType;
-	typedef Device DeviceType;
-	typedef Index IndexType;
-	typedef tnlGrid< 2, Real, Device, Index > MeshType;
-	typedef TNL::Containers::Vector< RealType, DeviceType, IndexType> DofVectorType;
-	typedef typename MeshType::CoordinatesType CoordinatesType;
-
-
-	tnlFastSweeping();
-
-	static String getType();
-	bool init( const Config::ParameterContainer& parameters );
-
-	bool initGrid();
-	bool run();
-
-	//for single core version use this implementation:
-	void updateValue(const Index i, const Index j);
-	//for parallel version use this one instead:
-//	void updateValue(const Index i, const Index j, DofVectorType* grid);
-
-
-	void setupSquare1000(Index i, Index j);
-	void setupSquare1100(Index i, Index j);
-	void setupSquare1010(Index i, Index j);
-	void setupSquare1001(Index i, Index j);
-	void setupSquare1110(Index i, Index j);
-	void setupSquare1101(Index i, Index j);
-	void setupSquare1011(Index i, Index j);
-	void setupSquare1111(Index i, Index j);
-	void setupSquare0000(Index i, Index j);
-	void setupSquare0100(Index i, Index j);
-	void setupSquare0010(Index i, Index j);
-	void setupSquare0001(Index i, Index j);
-	void setupSquare0110(Index i, Index j);
-	void setupSquare0101(Index i, Index j);
-	void setupSquare0011(Index i, Index j);
-	void setupSquare0111(Index i, Index j);
-
-	Real fabsMin(const Real x, const Real y);
-
-
-protected:
-
-	MeshType Mesh;
-
-	bool exactInput;
-
-	tnlMeshFunction<MeshType> dofVector, dofVector2;
-	DofVectorType data;
-
-	RealType h;
-
-	tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage > Entity;
-
-
-#ifdef HAVE_OPENMP
-//	omp_lock_t* gridLock;
-#endif
-
-
-};
-
-
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-class tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >
-{
-
-public:
-	typedef Real RealType;
-	typedef Device DeviceType;
-	typedef Index IndexType;
-	typedef tnlGrid< 3, Real, Device, Index > MeshType;
-	typedef TNL::Containers::Vector< RealType, DeviceType, IndexType> DofVectorType;
-	typedef typename MeshType::CoordinatesType CoordinatesType;
-
-	tnlFastSweeping();
-
-	static String getType();
-	bool init( const Config::ParameterContainer& parameters );
-
-	bool initGrid();
-	bool run();
-
-	//for single core version use this implementation:
-	void updateValue(const Index i, const Index j, const Index k);
-	//for parallel version use this one instead:
-//	void updateValue(const Index i, const Index j, DofVectorType* grid);
-
-	Real fabsMin(const Real x, const Real y);
-
-
-protected:
-
-	MeshType Mesh;
-
-	bool exactInput;
-
-
-	tnlMeshFunction<MeshType> dofVector, dofVector2;
-	DofVectorType data;
-
-	RealType h;
-
-	tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage > Entity;
-
-#ifdef HAVE_OPENMP
-//	omp_lock_t* gridLock;
-#endif
-
-
-};
-
-
-	//for single core version use this implementation:
-#include "tnlFastSweeping2D_impl.h"
-	//for parallel version use this one instead:
-// #include "tnlFastSweeping2D_openMP_impl.h"
-
-#include "tnlFastSweeping3D_impl.h"
-
-#endif /* TNLFASTSWEEPING_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h
deleted file mode 100644
index bc1da169c..000000000
--- a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h
+++ /dev/null
@@ -1,522 +0,0 @@
-/***************************************************************************
-                          tnlFastSweeping2D_CUDA_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING2D_IMPL_H_
-#define TNLFASTSWEEPING2D_IMPL_H_
-
-#include "tnlFastSweeping.h"
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlFastSweeping< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-	h = Mesh.getSpaceSteps().x();
-	counter = 0;
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-
-
-#ifdef HAVE_CUDA
-
-	cudaMalloc(&(cudaDofVector), this->dofVector.getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&(cudaDofVector2), this->dofVector.getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector2, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-
-	cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >));
-	cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
-
-#endif
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(16, 16);
-	dim3 numBlocks(n/16 + 1 ,n/16 +1);
-
-	initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	return true;
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-//
-//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-//	{
-//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-//	{
-//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-//	{
-//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-//	{
-//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//
-//	dofVector.save("u-00001.tnl");
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(32, 32);
-	dim3 numBlocks(n/32 + 1 ,n/32 +1);
-
-	for(int i = 2*n - 1; i > -1; i--)
-	{
-		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,4,i);
-		cudaDeviceSynchronize();
-	}
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	for(int i = 0; i < 2*n ; i++)
-	{
-		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,1,i);
-		cudaDeviceSynchronize();
-	}
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	for(int i = 0; i < 2*n ; i++)
-	{
-		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,2,i);
-		cudaDeviceSynchronize();
-	}
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	for(int i = 2*n - 1; i > -1; i--)
-	{
-		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,3,i);
-		cudaDeviceSynchronize();
-	}
-
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	cudaMemcpy(this->dofVector.getData(), cudaDofVector, this->dofVector.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-	cudaDeviceSynchronize();
-	cudaFree(cudaDofVector);
-	cudaFree(cudaDofVector2);
-	cudaFree(cudaSolver);
-	dofVector.save("u-00001.tnl");
-	cudaDeviceSynchronize();
-	return true;
-}
-
-
-
-
-#ifdef HAVE_CUDA
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
-{
-	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-	Real value = cudaDofVector[index];
-	Real a,b, tmp;
-
-	if( i == 0 )
-		a = cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)];
-	else if( i == Mesh.getDimensions().x() - 1 )
-		a = cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)];
-	else
-	{
-		a = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)],
-				 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)] );
-	}
-
-	if( j == 0 )
-		b = cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)];
-	else if( j == Mesh.getDimensions().y() - 1 )
-		b = cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)];
-	else
-	{
-		b = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)],
-				 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)] );
-	}
-
-
-	if(abs(a-b) >= h)
-		tmp = fabsMin(a,b) + sign(value)*h;
-	else
-		tmp = 0.5 * (a + b + sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
-
-	cudaDofVector[index]  = fabsMin(value, tmp);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
-{
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-	int gid = Mesh.getCellIndex(CoordinatesType(gx,gy));
-
-	int total = blockDim.x*gridDim.x;
-
-
-
-	Real tmp = 0.0;
-	int flag = 0;
-	counter = 0;
-	tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-
-
-	if(!exactInput)
-	{
-		cudaDofVector[gid]=cudaDofVector[gid]=0.5*h*sign(cudaDofVector[gid]);
-	}
-	__threadfence();
-//	printf("-----------------------------------------------------------------------------------\n");
-
-	__threadfence();
-
-	if(gx > 0 && gx < Mesh.getDimensions().x()-1)
-	{
-		if(gy > 0 && gy < Mesh.getDimensions().y()-1)
-		{
-
-			Index j = gy;
-			Index i = gx;
-//			 tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-			if(tmp == 0.0)
-			{}
-			else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-			{}
-			else
-				flag=1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-		}
-	}
-
-//	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
-//	printf("****************************************************************\n");
-//	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
-	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == 0)
-	{
-//		printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
-		Index j = 0;
-		Index i = gx;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-//	printf("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n");
-	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == Mesh.getDimensions().y() - 1)
-	{
-		Index i = gx;
-		Index j = Mesh.getDimensions().y() - 1;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-//	printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
-	if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == 0)
-	{
-		Index j = gy;
-		Index i = 0;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-//	printf("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n");
-	if(gy > 0 && gy < Mesh.getDimensions().y()-1  && gx == Mesh.getDimensions().x() - 1)
-	{
-		Index j = gy;
-		Index i = Mesh.getDimensions().x() - 1;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-//	printf("##################################################################################################\n");
-	if(gx == Mesh.getDimensions().x() - 1 &&
-	   gy == Mesh.getDimensions().y() - 1)
-	{
-
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-	if(gx == Mesh.getDimensions().x() - 1 &&
-	   gy == 0)
-	{
-
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-//	printf("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n");
-	if(gx == 0 &&
-	   gy == Mesh.getDimensions().y() - 1)
-	{
-
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-	if(gx == 0 &&
-	   gy == 0)
-	{
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-
-	__threadfence();
-
-	if(flag==1)
-		cudaDofVector[gid] =  tmp*3;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = abs(x);
-	Real fy = abs(y);
-
-	Real tmpMin = Min(fx,fy);
-
-	if(tmpMin == fx)
-		return x;
-	else
-		return y;
-
-
-}
-
-
-
-__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int i)
-{
-
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-	if(solver->Mesh.getDimensions().x() <= gx || solver->Mesh.getDimensions().y() <= gy)
-		return;
-	int total = solver->Mesh.getDimensions().x();
-	//int gid = solver->Mesh.getDimensions().x() * gy + gx;
-	int max = solver->Mesh.getDimensions().x()*solver->Mesh.getDimensions().x();
-
-	int id1 = gx+gy;
-	int id2 = (solver->Mesh.getDimensions().x() - gx - 1) + gy;
-
-	/*---------------------------------------------------------------------------------------------------------------------------*/
-	if(sweep == 1)
-//	for(int i = 0; i < 2*total - 1; i++)
-	{
-		if(id1 == i)
-		{
-			solver->updateValue(gx,gy);
-			return;
-		}
-
-	}
-	/*---------------------------------------------------------------------------------------------------------------------------*/
-	else if(sweep == 2)
-//	for(int i = 0; i < 2*total - 1; i++)
-	{
-		if(id2 == i)
-		{
-			solver->updateValue(gx,gy);
-			return;
-		}
-	}
-	/*---------------------------------------------------------------------------------------------------------------------------*/
-	else if(sweep == 3)
-//	for(int i = 2*total - 2; i > -1; i--)
-	{
-		if(id1 == i)
-		{
-			solver->updateValue(gx,gy);
-			return;
-		}
-	}
-	/*---------------------------------------------------------------------------------------------------------------------------*/
-	else if(sweep == 4)
-//	for(int i = 2*total - 2; i > -1; i--)
-	{
-		if(id2 == i)
-		{
-			solver->updateValue(gx,gy);
-			return;
-		}
-	}
-	/*---------------------------------------------------------------------------------------------------------------------------*/
-
-
-
-
-}
-
-
-__global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver)
-{
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-
-	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy)
-	{
-		solver->initGrid();
-	}
-
-
-}
-#endif
-
-
-
-
-#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h
deleted file mode 100644
index 3ad5b7944..000000000
--- a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h
+++ /dev/null
@@ -1,588 +0,0 @@
-/***************************************************************************
-                          tnlFastSweeping2D_CUDA_v2_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING2D_IMPL_H_
-#define TNLFASTSWEEPING2D_IMPL_H_
-
-#include "tnlFastSweeping.h"
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlFastSweeping< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-	h = Mesh.getSpaceSteps().x();
-	counter = 0;
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-
-
-#ifdef HAVE_CUDA
-
-	cudaMalloc(&(cudaDofVector), this->dofVector.getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&(cudaDofVector2), this->dofVector.getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector2, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-
-	cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >));
-	cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
-
-#endif
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(16, 16);
-	dim3 numBlocks(n/16 + 1 ,n/16 +1);
-
-	initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	return true;
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-//
-//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-//	{
-//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-//	{
-//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-//	{
-//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-//	{
-//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//
-//	dofVector.save("u-00001.tnl");
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(27, 27);
-	dim3 numBlocks(1 ,1);
-
-//	for(int i = 2*n - 1; i > -1; i--)
-	{
-		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,4,0);
-		cudaDeviceSynchronize();
-	}
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-////	for(int i = 0; i < 2*n ; i++)
-//	{
-//		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,1,0);
-//		cudaDeviceSynchronize();
-//	}
-//	cudaDeviceSynchronize();
-//	TNL_CHECK_CUDA_DEVICE;
-////	for(int i = 0; i < 2*n ; i++)
-//	{
-//		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,2,0);
-//		cudaDeviceSynchronize();
-//	}
-//	cudaDeviceSynchronize();
-//	TNL_CHECK_CUDA_DEVICE;
-////	for(int i = 2*n - 1; i > -1; i--)
-//	{
-//		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,3,0);
-//		cudaDeviceSynchronize();
-//	}
-//
-//	cudaDeviceSynchronize();
-//	TNL_CHECK_CUDA_DEVICE;
-
-	cudaMemcpy(this->dofVector.getData(), cudaDofVector, this->dofVector.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-	cudaDeviceSynchronize();
-	cudaFree(cudaDofVector);
-	cudaFree(cudaDofVector2);
-	cudaFree(cudaSolver);
-	dofVector.save("u-00001.tnl");
-	cudaDeviceSynchronize();
-	return true;
-}
-
-
-
-
-#ifdef HAVE_CUDA
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
-{
-	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-	Real value = cudaDofVector[index];
-	Real a,b, tmp;
-
-	if( i == 0 )
-		a = cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)];
-	else if( i == Mesh.getDimensions().x() - 1 )
-		a = cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)];
-	else
-	{
-		a = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)],
-				 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)] );
-	}
-
-	if( j == 0 )
-		b = cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)];
-	else if( j == Mesh.getDimensions().y() - 1 )
-		b = cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)];
-	else
-	{
-		b = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)],
-				 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)] );
-	}
-
-
-	if(abs(a-b) >= h)
-		tmp = fabsMin(a,b) + sign(value)*h;
-	else
-		tmp = 0.5 * (a + b + sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
-
-	cudaDofVector[index]  = fabsMin(value, tmp);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
-{
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-	int gid = Mesh.getCellIndex(CoordinatesType(gx,gy));
-
-	int total = blockDim.x*gridDim.x;
-
-
-
-	Real tmp = 0.0;
-	int flag = 0;
-	counter = 0;
-	tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-
-
-	if(!exactInput)
-	{
-		cudaDofVector[gid]=cudaDofVector[gid]=0.5*h*sign(cudaDofVector[gid]);
-	}
-	__threadfence();
-//	printf("-----------------------------------------------------------------------------------\n");
-
-	__threadfence();
-
-	if(gx > 0 && gx < Mesh.getDimensions().x()-1)
-	{
-		if(gy > 0 && gy < Mesh.getDimensions().y()-1)
-		{
-
-			Index j = gy;
-			Index i = gx;
-//			 tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-			if(tmp == 0.0)
-			{}
-			else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-			{}
-			else
-				flag=1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-		}
-	}
-
-//	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
-//	printf("****************************************************************\n");
-//	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
-	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == 0)
-	{
-//		printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
-		Index j = 0;
-		Index i = gx;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-//	printf("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n");
-	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == Mesh.getDimensions().y() - 1)
-	{
-		Index i = gx;
-		Index j = Mesh.getDimensions().y() - 1;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-//	printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
-	if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == 0)
-	{
-		Index j = gy;
-		Index i = 0;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-//	printf("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n");
-	if(gy > 0 && gy < Mesh.getDimensions().y()-1  && gx == Mesh.getDimensions().x() - 1)
-	{
-		Index j = gy;
-		Index i = Mesh.getDimensions().x() - 1;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-//	printf("##################################################################################################\n");
-	if(gx == Mesh.getDimensions().x() - 1 &&
-	   gy == Mesh.getDimensions().y() - 1)
-	{
-
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-	if(gx == Mesh.getDimensions().x() - 1 &&
-	   gy == 0)
-	{
-
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-//	printf("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n");
-	if(gx == 0 &&
-	   gy == Mesh.getDimensions().y() - 1)
-	{
-
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-	if(gx == 0 &&
-	   gy == 0)
-	{
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-
-	__threadfence();
-
-	if(flag==1)
-		cudaDofVector[gid] =  tmp*3;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = abs(x);
-
-	Real tmpMin = Min(fx,abs(y));
-
-	if(tmpMin == fx)
-		return x;
-	else
-		return y;
-
-
-}
-
-
-
-__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int k)
-{
-
-	//int gx = threadIdx.x;
-	//int gy = threadIdx.y;
-	int id1,id2;
-	int nx = solver->Mesh.getDimensions().x()+ threadIdx.x;
-	int ny = solver->Mesh.getDimensions().y()+ threadIdx.y;
-
-	int blockCount = solver->Mesh.getDimensions().x()/blockDim.x + 1;
-
-	for(int gy = threadIdx.y; gy < ny;gy+=blockDim.y)
-	{
-		for(int gx = threadIdx.x; gx < nx;gx+=blockDim.x)
-		{
-//			if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && gy > -1&& gx > -1)
-			{
-				id1 = threadIdx.x+threadIdx.y;
-
-				for(int l = 0; l < 2*blockDim.x - 1; l++)
-				{
-					if(id1 == l)
-					{
-						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-						solver->updateValue(gx,gy);
-					}
-					__syncthreads();
-				}
-
-			}
-			//gx+=blockDim.x;
-			//__syncthreads();
-		}
-		//gx = threadIdx.x;
-		//gy+=blockDim.y;
-		//__syncthreads();
-	}
-			/*---------------------------------------------------------------------------------------------------------------------------*/
-//	gx = blockDim.x*(blockCount-1) + threadIdx.x;
-//	gy = threadIdx.y;
-	for(int gy = threadIdx.y; gy < ny;gy+=blockDim.y)
-	{
-		for(int gx = blockDim.x*(blockCount-1) + threadIdx.x; gx >- 1;gx-=blockDim.x)
-		{
-//			if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && gy > -1&& gx > -1)
-			{
-				id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
-
-				for(int l = 0; l < 2*blockDim.x - 1; l++)
-				{
-					if(id2 == l)
-					{
-						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-						solver->updateValue(gx,gy);
-					}
-					__syncthreads();
-				}
-			}
-			//gx-=blockDim.x;
-			//__syncthreads();
-		}
-		//gx = blockDim.x*(blockCount-1) + threadIdx.x;
-		//gy+=blockDim.y;
-		//__syncthreads();
-	}
-			/*---------------------------------------------------------------------------------------------------------------------------*/
-//	gx = blockDim.x*(blockCount-1) + threadIdx.x;
-//	gy = blockDim.x*(blockCount-1) + threadIdx.y;
-	for(int gy = blockDim.x*(blockCount-1) +threadIdx.y; gy >- 1;gy-=blockDim.y)
-	{
-		for(int gx = blockDim.x*(blockCount-1) + threadIdx.x; gx >- 1;gx-=blockDim.x)
-		{
-//			if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && gy > -1&& gx > -1)
-			{
-				id1 = threadIdx.x+threadIdx.y;
-
-				for(int l = 2*blockDim.x - 2; l > -1; l--)
-				{
-					if(id1 == l)
-					{
-						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-						solver->updateValue(gx,gy);
-					}
-					__syncthreads();
-				}
-			}
-			//gx-=blockDim.x;
-			//__syncthreads();
-		}
-		//gx = blockDim.x*(blockCount-1) + threadIdx.x;
-		//gy-=blockDim.y;
-		//__syncthreads();
-	}
-			/*---------------------------------------------------------------------------------------------------------------------------*/
-	//gx = threadIdx.x;
-	//gy = blockDim.x*(blockCount-1) +threadIdx.y;
-	for(int gy = blockDim.x*(blockCount-1) +threadIdx.y; gy >- 1;gy-=blockDim.y)
-	{
-		for(int gx = threadIdx.x; gx < nx;gx+=blockDim.x)
-		{
-//			if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && gy > -1&& gx > -1)
-			{
-				id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
-
-				for(int l = 2*blockDim.x - 2; l > -1; l--)
-				{
-					if(id2 == l)
-					{
-						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-						solver->updateValue(gx,gy);
-					}
-					__syncthreads();
-				}
-			}
-			//gx+=blockDim.x;
-			//__syncthreads();
-		}
-		//gx = threadIdx.x;
-		//gy-=blockDim.y;
-		///__syncthreads();
-	}
-			/*---------------------------------------------------------------------------------------------------------------------------*/
-
-
-
-
-
-}
-
-
-__global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver)
-{
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-
-	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy)
-	{
-		solver->initGrid();
-	}
-
-
-}
-#endif
-
-
-
-
-#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h
deleted file mode 100644
index ff36d3f8e..000000000
--- a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h
+++ /dev/null
@@ -1,920 +0,0 @@
-/***************************************************************************
-                          tnlFastSweeping2D_CUDA_v3_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING2D_IMPL_H_
-#define TNLFASTSWEEPING2D_IMPL_H_
-
-#include "tnlFastSweeping.h"
-
-
-
-
-__device__ double atomicSet(double* address, double val)
-{
-	unsigned long long int* address_as_ull =
-						  (unsigned long long int*)address;
-	unsigned long long int old = *address_as_ull, assumed;
-	do {
-		assumed = old;
-			old = atomicCAS(address_as_ull, assumed,__double_as_longlong(val ));
-	} while (assumed != old);
-	return __longlong_as_double(old);
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlFastSweeping< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-	h = Mesh.getSpaceSteps().x();
-	counter = 0;
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-
-
-#ifdef HAVE_CUDA
-
-	cudaMalloc(&(cudaDofVector), this->dofVector.getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&(cudaDofVector2), this->dofVector.getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector2, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-
-	cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >));
-	cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
-
-#endif
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(16, 16);
-	dim3 numBlocks(n/16 + 1 ,n/16 +1);
-
-	initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	return true;
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-//
-//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-//	{
-//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-//	{
-//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-//	{
-//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-//	{
-//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//
-//	dofVector.save("u-00001.tnl");
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(16, 16);
-	dim3 numBlocks(n/16 +1 ,n/16 +1);
-	int m =n/16 +1;
-
-	for(int i = 0; i < 2*m -1; i++)
-	{
-		runCUDA<15><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,1,i);
-		//cudaDeviceSynchronize();
-	}
-//	cudaDeviceSynchronize();
-//	TNL_CHECK_CUDA_DEVICE;
-//	for(int i = 0; i < 2*m -1; i++)
-//	{
-//		runCUDA<2><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,2,i);
-//		cudaDeviceSynchronize();
-//	}
-//	cudaDeviceSynchronize();
-//	TNL_CHECK_CUDA_DEVICE;
-//	for(int i = 0; i < 2*m -1; i++)
-//	{
-//		runCUDA<4><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,4,i);
-//		cudaDeviceSynchronize();
-//	}
-//	cudaDeviceSynchronize();
-//	TNL_CHECK_CUDA_DEVICE;
-//	for(int i = 0; i < 2*m -1; i++)
-//	{
-//		runCUDA<8><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,8,i);
-//		cudaDeviceSynchronize();
-//	}
-
-
-
-
-//	for(int i = 0; i < (2*m -1)/4 -1; i++)
-//	{
-//		runCUDA<15><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,15,i);//all
-//		cudaDeviceSynchronize();
-//	}
-//	for(int i = (2*m -1)/4 -1; i < (2*m -1)/2 -1; i++)
-//	{
-//		runCUDA<5><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,5,i); //two
-//		cudaDeviceSynchronize();
-//		runCUDA<10><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,10,i); //two
-//		cudaDeviceSynchronize();
-//	}
-//	for(int i = (2*m -1)/2 -1; i < (2*m -1)/2 +1; i++)
-//	{
-//		runCUDA<1><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,1,i); //separate
-//		cudaDeviceSynchronize();
-//		runCUDA<2><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,2,i); //separate
-//		cudaDeviceSynchronize();
-//		runCUDA<4><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,4,i); //separate
-//		cudaDeviceSynchronize();
-//		runCUDA<8><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,8,i); //separate
-//		cudaDeviceSynchronize();
-//	}
-//	for(int i = (2*m -1)/2 +1; i < (2*m -1/4)*3 +1; i++)
-//	{
-//		runCUDA<5><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,5,i); //two
-//		cudaDeviceSynchronize();
-//		runCUDA<10><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,10,i); //two
-//		cudaDeviceSynchronize();
-//	}
-//	for(int i = (2*m -1/4)*3 +1; i < 2*m -1; i++)
-//	{
-//		runCUDA<15><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,15,i);//all
-//		cudaDeviceSynchronize();
-//	}
-cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	cudaMemcpy(this->dofVector.getData(), cudaDofVector, this->dofVector.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-	cudaDeviceSynchronize();
-	cudaFree(cudaDofVector);
-	cudaFree(cudaDofVector2);
-	cudaFree(cudaSolver);
-	dofVector.save("u-00001.tnl");
-	cudaDeviceSynchronize();
-	return true;
-}
-
-
-
-
-#ifdef HAVE_CUDA
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
-{
-	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-	Real value = cudaDofVector[index];
-	Real a,b, tmp;
-
-	if( i == 0 )
-		a = cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)];
-	else if( i == Mesh.getDimensions().x() - 1 )
-		a = cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)];
-	else
-	{
-		a = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)],
-				 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)] );
-	}
-
-	if( j == 0 )
-		b = cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)];
-	else if( j == Mesh.getDimensions().y() - 1 )
-		b = cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)];
-	else
-	{
-		b = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)],
-				 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)] );
-	}
-
-
-	if(abs(a-b) >= h)
-		tmp = fabsMin(a,b) + sign(value)*h;
-	else
-		tmp = 0.5 * (a + b + sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
-
-	atomicSet(&cudaDofVector[index],fabsMin(value, tmp));
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
-{
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-	int gid = Mesh.getCellIndex(CoordinatesType(gx,gy));
-
-	int total = blockDim.x*gridDim.x;
-
-
-
-	Real tmp = 0.0;
-	int flag = 0;
-	counter = 0;
-	tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-
-
-	if(!exactInput)
-	{
-		cudaDofVector[gid]=cudaDofVector[gid]=0.5*h*sign(cudaDofVector[gid]);
-	}
-	__threadfence();
-//	printf("-----------------------------------------------------------------------------------\n");
-
-	__threadfence();
-
-	if(gx > 0 && gx < Mesh.getDimensions().x()-1)
-	{
-		if(gy > 0 && gy < Mesh.getDimensions().y()-1)
-		{
-
-			Index j = gy;
-			Index i = gx;
-//			 tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-			if(tmp == 0.0)
-			{}
-			else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-			{}
-			else
-				flag=1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-		}
-	}
-
-//	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
-//	printf("****************************************************************\n");
-//	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
-	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == 0)
-	{
-//		printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
-		Index j = 0;
-		Index i = gx;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-//	printf("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n");
-	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == Mesh.getDimensions().y() - 1)
-	{
-		Index i = gx;
-		Index j = Mesh.getDimensions().y() - 1;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-//	printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
-	if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == 0)
-	{
-		Index j = gy;
-		Index i = 0;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-//	printf("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n");
-	if(gy > 0 && gy < Mesh.getDimensions().y()-1  && gx == Mesh.getDimensions().x() - 1)
-	{
-		Index j = gy;
-		Index i = Mesh.getDimensions().x() - 1;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-//	printf("##################################################################################################\n");
-	if(gx == Mesh.getDimensions().x() - 1 &&
-	   gy == Mesh.getDimensions().y() - 1)
-	{
-
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-	if(gx == Mesh.getDimensions().x() - 1 &&
-	   gy == 0)
-	{
-
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-//	printf("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n");
-	if(gx == 0 &&
-	   gy == Mesh.getDimensions().y() - 1)
-	{
-
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-	if(gx == 0 &&
-	   gy == 0)
-	{
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-
-	__threadfence();
-
-	if(flag==1)
-		cudaDofVector[gid] =  tmp*3;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-//	Real fx = abs(x);
-//
-//	Real tmpMin = Min(fx,abs(y));
-
-	if(abs(y) > abs(x))
-		return x;
-	else
-		return y;
-
-
-}
-
-
-template<>
-__global__ void runCUDA<1>(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int k)
-{
-
-	if(blockIdx.x+blockIdx.y == k)
-	{
-		int gx = threadIdx.x + blockDim.x*blockIdx.x;
-		int gy = threadIdx.y + blockDim.y*blockIdx.y;
-
-		int id1 = threadIdx.x+threadIdx.y;
-
-						for(int l = 0; l < 2*blockDim.x - 1; l++)
-						{
-							if(id1 == l)
-							{
-								if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-								solver->updateValue(gx,gy);
-							}
-							__syncthreads();
-						}
-
-	}
-			/*---------------------------------------------------------------------------------------------------------------------------*/
-}
-	template<>
-	__global__ void runCUDA<2>(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int k)
-	{
-	if((gridDim.x - blockIdx.x - 1)+blockIdx.y == k)
-	{
-		int gx = threadIdx.x + blockDim.x*blockIdx.x;
-		int gy = threadIdx.y + blockDim.y*blockIdx.y;
-
-		int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
-
-				for(int l = 0; l < 2*blockDim.x - 1; l++)
-				{
-					if(id2 == l)
-					{
-						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-						solver->updateValue(gx,gy);
-					}
-					__syncthreads();
-				}
-
-	}
-	}			/*---------------------------------------------------------------------------------------------------------------------------*/
-	template<>
-	__global__ void runCUDA<4>(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int k)
-	{
-	if(blockIdx.x+blockIdx.y == gridDim.x+gridDim.y-k-2)
-		{
-		int gx = threadIdx.x + blockDim.x*blockIdx.x;
-		int gy = threadIdx.y + blockDim.y*blockIdx.y;
-
-		int id1 = threadIdx.x+threadIdx.y;
-
-				for(int l = 2*blockDim.x - 2; l > -1; l--)
-				{
-					if(id1 == l)
-					{
-						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-						solver->updateValue(gx,gy);
-						return;
-					}
-					__syncthreads();
-				}
-
-		}
-			/*---------------------------------------------------------------------------------------------------------------------------*/
-
-	}
-
-	template<>
-	__global__ void runCUDA<8>(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int k)
-	{
-	if((gridDim.x - blockIdx.x - 1)+blockIdx.y == gridDim.x+gridDim.y-k-2)
-		{
-		int gx = threadIdx.x + blockDim.x*blockIdx.x;
-		int gy = threadIdx.y + blockDim.y*blockIdx.y;
-
-		int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
-
-				for(int l = 2*blockDim.x - 2; l > -1; l--)
-				{
-					if(id2 == l)
-					{
-						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-						solver->updateValue(gx,gy);
-						return;
-					}
-					__syncthreads();
-				}
-
-		}
-			/*---------------------------------------------------------------------------------------------------------------------------*/
-
-
-
-
-
-}
-
-
-	template<>
-		__global__ void runCUDA<5>(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int k)
-		{
-
-			if(blockIdx.x+blockIdx.y == k)
-			{
-				int gx = threadIdx.x + blockDim.x*blockIdx.x;
-				int gy = threadIdx.y + blockDim.y*blockIdx.y;
-
-				int id1 = threadIdx.x+threadIdx.y;
-
-								for(int l = 0; l < 2*blockDim.x - 1; l++)
-								{
-									if(id1 == l)
-									{
-										if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-										solver->updateValue(gx,gy);
-										return;
-									}
-									__syncthreads();
-								}
-
-			}
-			else if(blockIdx.x+blockIdx.y == gridDim.x+gridDim.y-k-2)
-				{
-				int gx = threadIdx.x + blockDim.x*blockIdx.x;
-				int gy = threadIdx.y + blockDim.y*blockIdx.y;
-
-				int id1 = threadIdx.x+threadIdx.y;
-
-						for(int l = 2*blockDim.x - 2; l > -1; l--)
-						{
-							if(id1 == l)
-							{
-								if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-								solver->updateValue(gx,gy);
-								return;
-							}
-							__syncthreads();
-						}
-
-				}
-		}
-
-
-	template<>
-		__global__ void runCUDA<10>(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int k)
-		{
-			if((gridDim.x - blockIdx.x - 1)+blockIdx.y == k)
-			{
-				int gx = threadIdx.x + blockDim.x*blockIdx.x;
-				int gy = threadIdx.y + blockDim.y*blockIdx.y;
-
-				int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
-
-						for(int l = 0; l < 2*blockDim.x - 1; l++)
-						{
-							if(id2 == l)
-							{
-								if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-								solver->updateValue(gx,gy);
-								return;
-							}
-							__syncthreads();
-						}
-
-			}
-
-			else if((gridDim.x - blockIdx.x - 1)+blockIdx.y == gridDim.x+gridDim.y-k-2)
-				{
-				int gx = threadIdx.x + blockDim.x*blockIdx.x;
-				int gy = threadIdx.y + blockDim.y*blockIdx.y;
-
-				int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
-
-						for(int l = 2*blockDim.x - 2; l > -1; l--)
-						{
-							if(id2 == l)
-							{
-								if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-								solver->updateValue(gx,gy);
-								return;
-							}
-							__syncthreads();
-						}
-
-				}
-
-		}
-
-
-
-	template<>
-	__global__ void runCUDA<15>(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int k)
-	{
-
-		if(blockIdx.x+blockIdx.y == k)
-		{
-			int gx = threadIdx.x + blockDim.x*blockIdx.x;
-			int gy = threadIdx.y + blockDim.y*blockIdx.y;
-
-			int id1 = threadIdx.x+threadIdx.y;
-
-							for(int l = 0; l < 2*blockDim.x - 1; l++)
-							{
-								if(id1 == l)
-								{
-									if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-									solver->updateValue(gx,gy);
-									return;
-								}
-								__syncthreads();
-							}
-
-		}
-				/*---------------------------------------------------------------------------------------------------------------------------*/
-
-		if((gridDim.x - blockIdx.x - 1)+blockIdx.y == k)
-		{
-			int gx = threadIdx.x + blockDim.x*blockIdx.x;
-			int gy = threadIdx.y + blockDim.y*blockIdx.y;
-
-			int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
-
-					for(int l = 0; l < 2*blockDim.x - 1; l++)
-					{
-						if(id2 == l)
-						{
-							if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-							solver->updateValue(gx,gy);
-							return;
-						}
-						__syncthreads();
-					}
-
-		}
-				/*---------------------------------------------------------------------------------------------------------------------------*/
-
-		if(blockIdx.x+blockIdx.y == gridDim.x+gridDim.y-k-2)
-			{
-			int gx = threadIdx.x + blockDim.x*blockIdx.x;
-			int gy = threadIdx.y + blockDim.y*blockIdx.y;
-
-			int id1 = threadIdx.x+threadIdx.y;
-
-					for(int l = 2*blockDim.x - 2; l > -1; l--)
-					{
-						if(id1 == l)
-						{
-							if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-							solver->updateValue(gx,gy);
-							return;
-						}
-						__syncthreads();
-					}
-
-			}
-				/*---------------------------------------------------------------------------------------------------------------------------*/
-
-		if((gridDim.x - blockIdx.x - 1)+blockIdx.y == gridDim.x+gridDim.y-k-2)
-			{
-			int gx = threadIdx.x + blockDim.x*blockIdx.x;
-			int gy = threadIdx.y + blockDim.y*blockIdx.y;
-
-			int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
-
-					for(int l = 2*blockDim.x - 2; l > -1; l--)
-					{
-						if(id2 == l)
-						{
-							if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-							solver->updateValue(gx,gy);
-							return;
-						}
-						__syncthreads();
-					}
-
-			}
-				/*---------------------------------------------------------------------------------------------------------------------------*/
-
-
-
-
-
-	}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-__global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver)
-{
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-
-	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy)
-	{
-		solver->initGrid();
-	}
-
-
-}
-#endif
-
-
-
-
-
-
-//__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int k)
-//{
-//
-//	if(sweep==1 && blockIdx.x+blockIdx.y == k)
-//	{
-//		int gx = threadIdx.x + blockDim.x*blockIdx.x;
-//		int gy = threadIdx.y + blockDim.y*blockIdx.y;
-//
-//		int id1 = threadIdx.x+threadIdx.y;
-//
-//						for(int l = 0; l < 2*blockDim.x - 1; l++)
-//						{
-//							if(id1 == l)
-//							{
-//								if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-//								solver->updateValue(gx,gy);
-//							}
-//							__syncthreads();
-//						}
-//
-//	}
-//			/*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//	else if(sweep==2 && (gridDim.x - blockIdx.x - 1)+blockIdx.y == k)
-//	{
-//		int gx = threadIdx.x + blockDim.x*blockIdx.x;
-//		int gy = threadIdx.y + blockDim.y*blockIdx.y;
-//
-//		int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
-//
-//				for(int l = 0; l < 2*blockDim.x - 1; l++)
-//				{
-//					if(id2 == l)
-//					{
-//						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-//						solver->updateValue(gx,gy);
-//					}
-//					__syncthreads();
-//				}
-//
-//	}
-//			/*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//	else if(sweep==4 && blockIdx.x+blockIdx.y == gridDim.x+gridDim.y-k-2)
-//		{
-//		int gx = threadIdx.x + blockDim.x*blockIdx.x;
-//		int gy = threadIdx.y + blockDim.y*blockIdx.y;
-//
-//		int id1 = threadIdx.x+threadIdx.y;
-//
-//				for(int l = 2*blockDim.x - 2; l > -1; l--)
-//				{
-//					if(id1 == l)
-//					{
-//						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-//						solver->updateValue(gx,gy);
-//						return;
-//					}
-//					__syncthreads();
-//				}
-//
-//		}
-//			/*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//	else if(sweep==8 && (gridDim.x - blockIdx.x - 1)+blockIdx.y == gridDim.x+gridDim.y-k-2)
-//		{
-//		int gx = threadIdx.x + blockDim.x*blockIdx.x;
-//		int gy = threadIdx.y + blockDim.y*blockIdx.y;
-//
-//		int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
-//
-//				for(int l = 2*blockDim.x - 2; l > -1; l--)
-//				{
-//					if(id2 == l)
-//					{
-//						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
-//						solver->updateValue(gx,gy);
-//						return;
-//					}
-//					__syncthreads();
-//				}
-//
-//		}
-//			/*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//
-//
-//
-//
-//}
-
-
-#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v4_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v4_impl.h
deleted file mode 100644
index e0a9697c2..000000000
--- a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v4_impl.h
+++ /dev/null
@@ -1,1003 +0,0 @@
-/***************************************************************************
-                          tnlFastSweeping2D_CUDA_v4_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING2D_IMPL_H_
-#define TNLFASTSWEEPING2D_IMPL_H_
-
-#include "tnlFastSweeping.h"
-
-__device__
-double fabsMin( double x, double y)
-{
-	double fx = abs(x);
-
-	if(Min(fx,abs(y)) == fx)
-		return x;
-	else
-		return y;
-}
-
-__device__
-double atomicFabsMin(double* address, double val)
-{
-	unsigned long long int* address_as_ull =
-						  (unsigned long long int*)address;
-	unsigned long long int old = *address_as_ull, assumed;
-	do {
-		assumed = old;
-			old = atomicCAS(address_as_ull, assumed,__double_as_longlong( fabsMin(__longlong_as_double(assumed),val) ));
-	} while (assumed != old);
-	return __longlong_as_double(old);
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlFastSweeping< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: tnlFastSweeping()
-:dofVector(Mesh)
-{
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-	h = Mesh.template getSpaceStepsProducts< 1, 0 >();
-	//Entity.refresh();
-	counter = 0;
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-
-
-#ifdef HAVE_CUDA
-
-	cudaMalloc(&(cudaDofVector), this->dofVector.getData().getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&(cudaDofVector2), this->dofVector.getData().getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector2, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-
-	cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >));
-	cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
-
-#endif
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(16, 16);
-	dim3 numBlocks(n/16 + 1 ,n/16 +1);
-
-
-	initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	return true;
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(1, 1024);
-	dim3 numBlocks(4,1);
-
-
-	runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,0,0);
-
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	//data.setLike(dofVector.getData());
-	//cudaMemcpy(data.getData(), cudaDofVector2, this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-	cudaMemcpy(dofVector.getData().getData(), cudaDofVector2, this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-	cudaDeviceSynchronize();
-	cudaFree(cudaDofVector);
-	cudaFree(cudaDofVector2);
-	cudaFree(cudaSolver);
-	//data.save("u-00001.tnl");
-	dofVector.save("u-00001.tnl");
-	cudaDeviceSynchronize();
-	return true;
-}
-
-
-
-
-#ifdef HAVE_CUDA
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-
-	Real value = cudaDofVector2[Entity.getIndex()];
-	Real a,b, tmp;
-
-	if( i == 0 )
-		a = cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()];
-	else if( i == Mesh.getDimensions().x() - 1 )
-		a = cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0 >()];
-	else
-	{
-		a = fabsMin( cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0 >()],
-				 cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()] );
-	}
-
-	if( j == 0 )
-		b = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()];
-	else if( j == Mesh.getDimensions().y() - 1 )
-		b = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()];
-	else
-	{
-		b = fabsMin( cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()],
-				 cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()] );
-	}
-
-
-	if(abs(a-b) >= h)
-		tmp = fabsMin(a,b) + sign(value)*h;
-	else
-		tmp = 0.5 * (a + b + sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
-
-//	cudaDofVector2[Entity.getIndex()]  = fabsMin(value, tmp);
-	atomicFabsMin(&(cudaDofVector2[Entity.getIndex()]), tmp);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
-{
-	int i = threadIdx.x + blockDim.x*blockIdx.x;
-	int j = blockDim.y*blockIdx.y + threadIdx.y;
-
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-
-	int gid = Entity.getIndex();
-
-	cudaDofVector2[gid] = INT_MAX*sign(cudaDofVector[gid]);
-//
-//	if(abs(cudaDofVector[gid]) < 1.01*h)
-//		cudaDofVector2[gid] = cudaDofVector[gid];
-
-
-
-
-
-	if(i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y() && !exactInput)
-	{
-		if(cudaDofVector[Entity.getIndex()] > 0)
-		{
-			if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()] > 0)
-			{
-				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-				{
-					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare1111(i,j);
-					else
-						setupSquare1110(i,j);
-				}
-				else
-				{
-					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare1101(i,j);
-					else
-						setupSquare1100(i,j);
-				}
-			}
-			else
-			{
-				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-				{
-					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare1011(i,j);
-					else
-						setupSquare1010(i,j);
-				}
-				else
-				{
-					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare1001(i,j);
-					else
-						setupSquare1000(i,j);
-				}
-			}
-		}
-		else
-		{
-			if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()] > 0)
-			{
-				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-				{
-					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare0111(i,j);
-					else
-						setupSquare0110(i,j);
-				}
-				else
-				{
-					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare0101(i,j);
-					else
-						setupSquare0100(i,j);
-				}
-			}
-			else
-			{
-				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-				{
-					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare0011(i,j);
-					else
-						setupSquare0010(i,j);
-				}
-				else
-				{
-					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare0001(i,j);
-					else
-						setupSquare0000(i,j);
-				}
-			}
-		}
-
-	}
-	if(exactInput)
-	{
-		if(abs(cudaDofVector[gid]) < 1.5*h)
-			cudaDofVector2[gid] = cudaDofVector[gid];
-	}
-
-
-	return true;
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = abs(x);
-	//Real fy = abs(y);
-
-	//Real tmpMin = Min(fx,abs(y));
-
-	if(Min(fx,abs(y)) == fx)
-		return x;
-	else
-		return y;
-
-
-}
-
-
-
-__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int i)
-{
-
-	int gx = 0;
-	int gy = threadIdx.y;
-	//if(solver->Mesh.getDimensions().x() <= gx || solver->Mesh.getDimensions().y() <= gy)
-	//	return;
-	int n = solver->Mesh.getDimensions().x();
-	int blockCount = n/blockDim.y +1;
-	//int gid = solver->Mesh.getDimensions().x() * gy + gx;
-	//int max = solver->Mesh.getDimensions().x()*solver->Mesh.getDimensions().x();
-
-	//int id1 = gx+gy;
-	//int id2 = (solver->Mesh.getDimensions().x() - gx - 1) + gy;
-
-	if(blockIdx.x==0)
-	{
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-	else if(blockIdx.x==1)
-	{
-		gx=n-1;
-		gy=threadIdx.y;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-	else if(blockIdx.x==2)
-	{
-		gx=0;
-		gy=n-threadIdx.y-1;
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-	else if(blockIdx.x==3)
-	{
-		gx=n-1;
-		gy=n-threadIdx.y-1;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-
-
-
-
-
-}
-
-
-__global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver)
-{
-
-
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-
-
-	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy)
-	{
-		solver->initGrid();
-	}
-
-
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j)
-{
-//	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-//	Entity.setCoordinates(CoordinatesType(i,j));
-//	Entity.refresh();
-//	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-//	cudaDofVector2[Entity.getIndex()]=fabsMin(INT_MAX,cudaDofVector2[Entity.getIndex()]);
-//	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-//	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-//	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j)
-{
-//	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-//	Entity.setCoordinates(CoordinatesType(i,j));
-//	Entity.refresh();
-//	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-//	cudaDofVector2[Entity.getIndex()]=fabsMin(-INT_MAX,cudaDofVector2[Entity.getIndex()]);
-//	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-//	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-//	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1110( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=INT_MAX;	//fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=-0.5*h;	//fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=0.5*h;	//fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1101( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=0.5*h;	//fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=0.5*h;	//fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=INT_MAX;	//fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1011( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=0.5*h;	//fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=INT_MAX;	//fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=-0.5*h;	//fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0111( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=-0.5*h;	//fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=INT_MAX;	//fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=0.5*h;	//fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0001( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=-INT_MAX;	//fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=-0.5*h;	//fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=0.5*h;	//fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=-0.5*h;	//fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0010( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=-0.5*h;	//fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=-0.5*h;	//fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=-INT_MAX;	//fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0100( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=-0.5*h;	//fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=INT_MAX;	//fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=-0.5*h;	//fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=0.5*h;	//fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1000( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=0.5*h;	//fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=-0.5*h;	//fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=-INT_MAX;	//fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=-0.5*h;	//fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1100( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-al;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=0.5*h;	//fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=-0.5*h;	//fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=-0.5*h;	//fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=0.5*h;	//fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1010( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=0.5*h;	//fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=-0.5*h;	//fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=-0.5*h;	//fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1001( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=0.5*h;	//fabsMin(cudaDofVector[Entity.getIndex()],cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=-0.5*h;	//fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=0.5*h;	//fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=-0.5*h;	//fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0011( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-al;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=-0.5*h;	//fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=0.5*h;	//fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=-0.5*h;	//fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0101( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = al-be;
-	b = 1.0;
-	c = -be;
-	s = h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=-0.5*h;	//fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=-0.5*h;	//fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=0.5*h;	//fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=0.5*h;	//fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0110( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=-0.5*h;	//fabsMin(cudaDofVector[Entity.getIndex()],cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=-0.5*h;	//fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=0.5*h;	//fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-}
-#endif
-
-
-
-
-#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v5_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v5_impl.h
deleted file mode 100644
index 1591bb613..000000000
--- a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_CUDA_v5_impl.h
+++ /dev/null
@@ -1,697 +0,0 @@
-/***************************************************************************
-                          tnlFastSweeping2D_CUDA_v5_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING2D_IMPL_H_
-#define TNLFASTSWEEPING2D_IMPL_H_
-
-#include "tnlFastSweeping.h"
-
-__device__
-double fabsMin( double x, double y)
-{
-	double fx = abs(x);
-
-	if(Min(fx,abs(y)) == fx)
-		return x;
-	else
-		return y;
-}
-
-__device__
-double atomicFabsMin(double* address, double val)
-{
-	unsigned long long int* address_as_ull =
-						  (unsigned long long int*)address;
-	unsigned long long int old = *address_as_ull, assumed;
-	do {
-		assumed = old;
-			old = atomicCAS(address_as_ull, assumed,__double_as_longlong( fabsMin(assumed,val) ));
-	} while (assumed != old);
-	return __longlong_as_double(old);
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlFastSweeping< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-	h = Mesh.getSpaceSteps().x();
-	counter = 0;
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-
-
-#ifdef HAVE_CUDA
-
-	cudaMalloc(&(cudaDofVector), this->dofVector.getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&(cudaDofVector2), this->dofVector.getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector2, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-
-	cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >));
-	cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
-
-#endif
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(16, 16);
-	dim3 numBlocks(n/16 + 1 ,n/16 +1);
-
-	initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	return true;
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-//
-//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-//	{
-//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-//	{
-//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-//	{
-//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-//	{
-//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-//		{
-//			updateValue(i,j);
-//		}
-//	}
-//
-///*---------------------------------------------------------------------------------------------------------------------------*/
-//
-//
-//	dofVector.save("u-00001.tnl");
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(1, 512);
-	dim3 numBlocks(4,1);
-
-
-	runCUDA<<<numBlocks,threadsPerBlock,3*(512+1)*sizeof(double)>>>(this->cudaSolver,0,0);
-
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	cudaMemcpy(this->dofVector.getData(), cudaDofVector, this->dofVector.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-	cudaDeviceSynchronize();
-	cudaFree(cudaDofVector);
-	cudaFree(cudaDofVector2);
-	cudaFree(cudaSolver);
-	dofVector.save("u-00001.tnl");
-	cudaDeviceSynchronize();
-	return true;
-}
-
-
-
-
-#ifdef HAVE_CUDA
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
-{
-	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-	Real value = cudaDofVector[index];
-	Real a,b, tmp;
-
-	if( i == 0 )
-		a = cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)];
-	else if( i == Mesh.getDimensions().x() - 1 )
-		a = cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)];
-	else
-	{
-		a = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)],
-				 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)] );
-	}
-
-	if( j == 0 )
-		b = cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)];
-	else if( j == Mesh.getDimensions().y() - 1 )
-		b = cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)];
-	else
-	{
-		b = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)],
-				 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)] );
-	}
-
-
-	if(abs(a-b) >= h)
-		tmp = fabsMin(a,b) + sign(value)*h;
-	else
-		tmp = 0.5 * (a + b + sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
-
-	cudaDofVector[index]  = fabsMin(value, tmp);
-
-}
-
-
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j, double** sharedMem, int k3)
-{
-	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-	Real value = sharedMem[k3+1][threadIdx.y];
-	Real a,b, tmp;
-
-	if( i == 0 )
-		a = sharedMem[k3][threadIdx.y];
-	else if( i == Mesh.getDimensions().x() - 1 )
-		a = sharedMem[k3+2][threadIdx.y];
-	else
-	{
-		a = fabsMin( sharedMem[k3][threadIdx.y],
-				sharedMem[k3+2][threadIdx.y] );
-	}
-
-	if( j == 0 )
-		b = sharedMem[k3][threadIdx.y+1];
-	else if( j == Mesh.getDimensions().y() - 1 )
-		b = sharedMem[k3+2][threadIdx.y-1];
-	else
-	{
-		b = fabsMin( sharedMem[k3][threadIdx.y+1],
-				sharedMem[k3+2][threadIdx.y-1] );
-	}
-
-
-	if(abs(a-b) >= h)
-		tmp = fabsMin(a,b) + sign(value)*h;
-	else
-		tmp = 0.5 * (a + b + sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
-
-//	sharedMem[k3+1][threadIdx.y] = this->fabsMin(value, tmp);
-//	atomicFabsMin(&(cudaDofVector[index]), tmp);
-	cudaDofVector[index]  = tmp;
-	this->fabsMin(value, tmp);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
-{
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-	int gid = Mesh.getCellIndex(CoordinatesType(gx,gy));
-
-	int total = blockDim.x*gridDim.x;
-
-
-
-	Real tmp = 0.0;
-	int flag = 0;
-	counter = 0;
-	tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-
-
-	if(!exactInput)
-	{
-		cudaDofVector[gid]=cudaDofVector[gid]=0.5*h*sign(cudaDofVector[gid]);
-	}
-	__threadfence();
-//	printf("-----------------------------------------------------------------------------------\n");
-
-	__threadfence();
-
-	if(gx > 0 && gx < Mesh.getDimensions().x()-1)
-	{
-		if(gy > 0 && gy < Mesh.getDimensions().y()-1)
-		{
-
-			Index j = gy;
-			Index i = gx;
-//			 tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-			if(tmp == 0.0)
-			{}
-			else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-			{}
-			else
-				flag=1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-		}
-	}
-
-//	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
-//	printf("****************************************************************\n");
-//	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
-	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == 0)
-	{
-//		printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
-		Index j = 0;
-		Index i = gx;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-//	printf("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n");
-	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == Mesh.getDimensions().y() - 1)
-	{
-		Index i = gx;
-		Index j = Mesh.getDimensions().y() - 1;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-//	printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
-	if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == 0)
-	{
-		Index j = gy;
-		Index i = 0;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-//	printf("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n");
-	if(gy > 0 && gy < Mesh.getDimensions().y()-1  && gx == Mesh.getDimensions().x() - 1)
-	{
-		Index j = gy;
-		Index i = Mesh.getDimensions().x() - 1;
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-//	printf("##################################################################################################\n");
-	if(gx == Mesh.getDimensions().x() - 1 &&
-	   gy == Mesh.getDimensions().y() - 1)
-	{
-
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-	if(gx == Mesh.getDimensions().x() - 1 &&
-	   gy == 0)
-	{
-
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-//	printf("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n");
-	if(gx == 0 &&
-	   gy == Mesh.getDimensions().y() - 1)
-	{
-
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-	if(gx == 0 &&
-	   gy == 0)
-	{
-//		tmp = sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
-		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
-				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
-
-			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
-	}
-
-	__threadfence();
-
-	if(flag==1)
-		cudaDofVector[gid] =  tmp*3;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = abs(x);
-	//Real fy = abs(y);
-
-	//Real tmpMin = Min(fx,abs(y));
-
-	if(Min(fx,abs(y)) == fx)
-		return x;
-	else
-		return y;
-
-
-}
-
-
-
-__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int i)
-{
-
-	extern __shared__ double u[];
-	double* sharedMem[5];
-	sharedMem[0] = u;
-	sharedMem[1] = &(u[blockDim.y+1]);
-	sharedMem[2] = &(sharedMem[1][blockDim.y+1]);
-	sharedMem[3] = sharedMem[1];
-	sharedMem[4] = sharedMem[2];
-
-	int gx = 0;
-	int gy = threadIdx.y;
-	//if(solver->Mesh.getDimensions().x() <= gx || solver->Mesh.getDimensions().y() <= gy)
-	//	return;
-	int n = solver->Mesh.getDimensions().x();
-	int blockCount = n/blockDim.y +1;
-	//int gid = solver->Mesh.getDimensions().x() * gy + gx;
-	//int max = solver->Mesh.getDimensions().x()*solver->Mesh.getDimensions().x();
-
-	//int id1 = gx+gy;
-	//int id2 = (solver->Mesh.getDimensions().x() - gx - 1) + gy;
-
-
-	if(blockIdx.x==0)
-	{
-		if(threadIdx.y==0)
-			sharedMem[1][0]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(0,0))];
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				int k3=k%3;
-
-				if(threadIdx.y==0)
-				{
-					if(gx==n-1)
-						sharedMem[k3][0]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(0,gy+blockDim.y))];
-					else
-						sharedMem[k3][0]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(gx+1,gy))];
-				}
-//				else
-//					solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(gx,gy-1))]=sharedMem[k3+2][threadIdx.y-1];
-
-				if(gy<n-1)
-					sharedMem[k3][threadIdx.y+1]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(gx,gy+1))];
-
-				solver->updateValue(gx,gy,sharedMem,k3);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-//	else if(blockIdx.x==1)
-//	{
-//		gx=n-1;
-//		gy=threadIdx.y;
-//
-//		if(threadIdx.y==0)
-//					sharedMem[1][0]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(n-1,0))];
-//
-//		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-//		{
-//			if(threadIdx.y  < k+1 && gy < n)
-//			{
-//				int k3=k%3;
-//
-//				if(threadIdx.y==0)
-//					if(gx==0)
-//						sharedMem[k3+2][0]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(n-1,gy+blockDim.y))];
-//					else
-//						sharedMem[k3+2][0]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(gx-1,gy))];
-//				else
-//					solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(gx,gy-1))]=sharedMem[k3][threadIdx.y-1];
-//
-//				if(gy<n-1)
-//					sharedMem[k3+2][threadIdx.y+1]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(gx,gy+1))];
-//
-//
-//				solver->updateValue(gx,gy,sharedMem,k3);
-//				gx--;
-//				if(gx==-1)
-//				{
-//					gx=n-1;
-//					gy+=blockDim.y;
-//				}
-//			}
-//
-//
-//			__syncthreads();
-//		}
-//	}
-//	else if(blockIdx.x==2)
-//	{
-//		gx=0;
-//		gy=n-blockDim.y-1+threadIdx.y;
-//
-//		if(threadIdx.y==0)
-//					sharedMem[1][0]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(0,n-1))];
-//
-//		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-//		{
-//			if(blockDim.y-threadIdx.y  < k+1 && gy > -1)
-//			{
-//				int k3=k%3;
-//
-//				if(threadIdx.y==blockDim.y-1)
-//					if(gx==n-1)
-//						sharedMem[k3][n-1]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(0,gy-blockDim.y))];
-//					else
-//						sharedMem[k3][n-1]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(gx+1,gy))];
-//				else
-//					solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(gx,gy-1))]=sharedMem[k3+2][threadIdx.y-1];
-//
-//				if(gy<n-1)
-//					sharedMem[k3][threadIdx.y+1]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(gx,gy+1))];
-//
-//
-//				solver->updateValue(gx,gy,sharedMem,k3);
-//				gx++;
-//				if(gx==n)
-//				{
-//					gx=0;
-//					gy-=blockDim.y;
-//				}
-//			}
-//
-//
-//			__syncthreads();
-//		}
-//	}
-//	else if(blockIdx.x==3)
-//	{
-//		gx=n-1;
-//		gy=n-blockDim.y-1+threadIdx.y;
-//
-//		if(threadIdx.y==0)
-//					sharedMem[1][0]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(n-1,n-1))];
-//
-//
-//		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-//		{
-//			if(blockDim.y-threadIdx.y  < k+1 && gy > -1)
-//			{
-//				int k3=k%3;
-//
-//				if(threadIdx.y==blockDim.y-1)
-//					if(gx==n-1)
-//						sharedMem[k3+2][n-1]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(n-1,gy-blockDim.y))];
-//					else
-//						sharedMem[k3+2][n-1]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(gx+1,gy))];
-//				else
-//					solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(gx,gy-1))]=sharedMem[k3][threadIdx.y-1];
-//
-//				if(gy<n-1)
-//					sharedMem[k3+2][threadIdx.y+1]=solver->cudaDofVector[solver->Mesh.getCellIndex(Containers::StaticVector<2,int>(gx,gy+1))];
-//
-//
-//				solver->updateValue(gx,gy,sharedMem,k3);
-//				gx--;
-//				if(gx==-1)
-//				{
-//					gx=n-1;
-//					gy-=blockDim.y;
-//				}
-//			}
-//
-//
-//			__syncthreads();
-//		}
-//	}
-
-
-
-
-}
-
-
-__global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver)
-{
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-
-	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy)
-	{
-		solver->initGrid();
-	}
-
-
-}
-#endif
-
-
-
-
-#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_impl.h
deleted file mode 100644
index c4ce8fe6b..000000000
--- a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_impl.h
+++ /dev/null
@@ -1,927 +0,0 @@
-/***************************************************************************
-                          tnlFastSweeping2D_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING2D_IMPL_H_
-#define TNLFASTSWEEPING2D_IMPL_H_
-
-#include "tnlFastSweeping.h"
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlFastSweeping< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: tnlFastSweeping()
-:Entity(Mesh),
- dofVector(Mesh),
- dofVector2(Mesh)
-{
-}
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-	dofVector2.load(initialCondition);
-
-	h = Mesh.template getSpaceStepsProducts< 1, 0 >();
-	Entity.refresh();
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-
-	cout << "a" <<std::endl;
-	return initGrid();
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
-{
-
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	for(int i=0; i< Mesh.getDimensions().x()*Mesh.getDimensions().x();i++)
-	{
-		dofVector2[i]=INT_MAX*sign(dofVector[i]);
-	}
-
-	for(int i = 0 ; i < Mesh.getDimensions().x()-1; i++)
-	{
-		for(int j = 0 ; j < Mesh.getDimensions().x()-1; j++)
-			{
-			this->Entity.setCoordinates(CoordinatesType(i,j));
-			this->Entity.refresh();
-			neighborEntities.refresh(Mesh,Entity.getIndex());
-
-				if(dofVector[this->Entity.getIndex()] > 0)
-				{
-					if(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()] > 0)
-					{
-						if(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare1111(i,j);
-							else
-								setupSquare1110(i,j);
-						}
-						else
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare1101(i,j);
-							else
-								setupSquare1100(i,j);
-						}
-					}
-					else
-					{
-						if(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare1011(i,j);
-							else
-								setupSquare1010(i,j);
-						}
-						else
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare1001(i,j);
-							else
-								setupSquare1000(i,j);
-						}
-					}
-				}
-				else
-				{
-					if(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()] > 0)
-					{
-						if(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare0111(i,j);
-							else
-								setupSquare0110(i,j);
-						}
-						else
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare0101(i,j);
-							else
-								setupSquare0100(i,j);
-						}
-					}
-					else
-					{
-						if(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare0011(i,j);
-							else
-								setupSquare0010(i,j);
-						}
-						else
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare0001(i,j);
-							else
-								setupSquare0000(i,j);
-						}
-					}
-				}
-
-			}
-	}
-	cout << "a" <<std::endl;
-
-//	Real tmp = 0.0;
-//	Real ax=0.5/sqrt(2.0);
-//
-//	if(!exactInput)
-//	{
-//		for(Index i = 0; i < Mesh.getDimensions().x()*Mesh.getDimensions().y(); i++)
-//				dofVector[i]=0.5*h*sign(dofVector[i]);
-//	}
-//
-//
-//	for(Index i = 1; i < Mesh.getDimensions().x()-1; i++)
-//	{
-//		for(Index j = 1; j < Mesh.getDimensions().y()-1; j++)
-//		{
-//			 tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//
-//			if(tmp == 0.0)
-//			{}
-//			else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-//					dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-//					dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-//					dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-//			{}
-//			else
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//		}
-//	}
-//
-//
-//
-//	for(int i = 1; i < Mesh.getDimensions().x()-1; i++)
-//	{
-//		Index j = 0;
-//		tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//
-//
-//		if(tmp == 0.0)
-//		{}
-//		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 )
-//		{}
-//		else
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//	}
-//
-//	for(int i = 1; i < Mesh.getDimensions().x()-1; i++)
-//	{
-//		Index j = Mesh.getDimensions().y() - 1;
-//		tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//
-//
-//		if(tmp == 0.0)
-//		{}
-//		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-//		{}
-//		else
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//	}
-//
-//	for(int j = 1; j < Mesh.getDimensions().y()-1; j++)
-//	{
-//		Index i = 0;
-//		tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//
-//
-//		if(tmp == 0.0)
-//		{}
-//		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-//		{}
-//		else
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//	}
-//
-//	for(int j = 1; j < Mesh.getDimensions().y()-1; j++)
-//	{
-//		Index i = Mesh.getDimensions().x() - 1;
-//		tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//
-//
-//		if(tmp == 0.0)
-//		{}
-//		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-//		{}
-//		else
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//	}
-//
-//
-//	Index i = Mesh.getDimensions().x() - 1;
-//	Index j = Mesh.getDimensions().y() - 1;
-//
-//	tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//	if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp > 0.0 &&
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp > 0.0)
-//
-//		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//
-//
-//
-//	j = 0;
-//	tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//	if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp > 0.0 &&
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp > 0.0)
-//
-//		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//
-//
-//
-//	i = 0;
-//	j = Mesh.getDimensions().y() -1;
-//	tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//	if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp > 0.0 &&
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp > 0.0)
-//
-//		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//
-//
-//
-//	j = 0;
-//	tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//	if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp > 0.0 &&
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp > 0.0)
-//
-//		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-
-	//data.setLike(dofVector2.getData());
-	//data=dofVector2.getData();
-	//cout << data.getType() <<std::endl;
-	dofVector2.save("u-00000.tnl");
-	//dofVector2.getData().save("u-00000.tnl");
-
-	return true;
-}
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-
-	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-	{
-		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-		{
-			updateValue(i,j);
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-	{
-		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-		{
-			updateValue(i,j);
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-	{
-		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-		{
-			updateValue(i,j);
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-	{
-		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-		{
-			updateValue(i,j);
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-
-//	data.setLike(dofVector2.getData());
-//	data = dofVector2.getData();
-//	cout << data.getType() <<std::endl;
-	dofVector2.save("u-00001.tnl");
-	//dofVector2.getData().save("u-00001.tnl");
-
-	return true;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
-{
-
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-
-	Real value = dofVector2[Entity.getIndex()];
-	Real a,b, tmp;
-
-	if( i == 0 )
-		a = dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()];
-	else if( i == Mesh.getDimensions().x() - 1 )
-		a = dofVector2[neighborEntities.template getEntityIndex< -1,  0 >()];
-	else
-	{
-		a = fabsMin( dofVector2[neighborEntities.template getEntityIndex< -1,  0 >()],
-				 dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()] );
-	}
-
-	if( j == 0 )
-		b = dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()];
-	else if( j == Mesh.getDimensions().y() - 1 )
-		b = dofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()];
-	else
-	{
-		b = fabsMin( dofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()],
-				 dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()] );
-	}
-
-
-	if(fabs(a-b) >= h)
-		tmp = fabsMin(a,b) + sign(value)*h;
-	else
-		tmp = 0.5 * (a + b + sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
-
-
-	dofVector2[Entity.getIndex()] = fabsMin(value, tmp);
-
-//	if(dofVector2[Entity.getIndex()] > 1.0)
-//		cout << value << "    " << tmp << " " << dofVector2[Entity.getIndex()] <<std::endl;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = fabs(x);
-	Real fy = fabs(y);
-
-	Real tmpMin = Min(fx,fy);
-
-	if(tmpMin == fx)
-		return x;
-	else
-		return y;
-
-}
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j)
-{
-//	this->Entity.setCoordinates(CoordinatesType(i,j));
-//	this->Entity.refresh();
-//	auto neighborEntities =  Entity.getNeighborEntities();
-//	dofVector2[Entity.getIndex()]=fabsMin(INT_MAX,dofVector2[Entity.getIndex()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j)
-{
-//	this->Entity.setCoordinates(CoordinatesType(i,j));
-//	this->Entity.refresh();
-//	auto neighborEntities =  Entity.getNeighborEntities();
-//	dofVector2[Entity.getIndex()]=fabsMin(-INT_MAX,dofVector2[(Entity.getIndex())]);
-//	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1110( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1101( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[Entity.getIndex()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1011( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[Entity.getIndex()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0111( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0001( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0010( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[Entity.getIndex()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0100( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[Entity.getIndex()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1000( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1100( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-al;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1010( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1001( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	dofVector2[Entity.getIndex()]=fabsMin(dofVector[Entity.getIndex()],dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()],dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()],dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()],dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0011( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-al;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0101( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0110( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	dofVector2[Entity.getIndex()]=fabsMin(dofVector[Entity.getIndex()],dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()],dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()],dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()],dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-}
-
-
-
-
-#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_openMP_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_openMP_impl.h
deleted file mode 100644
index 54bbe931e..000000000
--- a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping2D_openMP_impl.h
+++ /dev/null
@@ -1,399 +0,0 @@
-/***************************************************************************
-                          tnlFastSweeping_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING2D_IMPL_H_
-#define TNLFASTSWEEPING2D_IMPL_H_
-
-#include "tnlFastSweeping.h"
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlFastSweeping< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-	h = Mesh.getSpaceSteps().x();
-
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-
-#ifdef HAVE_OPENMP
-//	gridLock = (omp_lock_t*) malloc(sizeof(omp_lock_t)*Mesh.getDimensions().x()*Mesh.getDimensions().y());
-//
-//	for(int i = 0; i < Mesh.getDimensions().x()*Mesh.getDimensions().y(); i++)
-//			omp_init_lock(&gridLock[i]);
-#endif
-
-	return initGrid();
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
-{
-
-	Real tmp = 0.0;
-
-	if(!exactInput)
-	{
-		for(Index i = 0; i < Mesh.getDimensions().x()*Mesh.getDimensions().y(); i++)
-				dofVector[i]=0.5*h*sign(dofVector[i]);
-	}
-
-
-	for(Index i = 1; i < Mesh.getDimensions().x()-1; i++)
-	{
-		for(Index j = 1; j < Mesh.getDimensions().y()-1; j++)
-		{
-			 tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-			if(tmp == 0.0)
-			{}
-			else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-					dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-					dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-					dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-			{}
-			else
-				dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-		}
-	}
-
-
-
-	for(int i = 1; i < Mesh.getDimensions().x()-1; i++)
-	{
-		Index j = 0;
-		tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 )
-		{}
-		else
-			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-	for(int i = 1; i < Mesh.getDimensions().x()-1; i++)
-	{
-		Index j = Mesh.getDimensions().y() - 1;
-		tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-	for(int j = 1; j < Mesh.getDimensions().y()-1; j++)
-	{
-		Index i = 0;
-		tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-	for(int j = 1; j < Mesh.getDimensions().y()-1; j++)
-	{
-		Index i = Mesh.getDimensions().x() - 1;
-		tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-
-
-		if(tmp == 0.0)
-		{}
-		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-		{}
-		else
-			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-	}
-
-
-	Index i = Mesh.getDimensions().x() - 1;
-	Index j = Mesh.getDimensions().y() - 1;
-
-	tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-	if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp > 0.0 &&
-			dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp > 0.0)
-
-		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-
-
-
-	j = 0;
-	tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-	if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp > 0.0 &&
-			dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp > 0.0)
-
-		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-
-
-
-	i = 0;
-	j = Mesh.getDimensions().y() -1;
-	tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-	if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp > 0.0 &&
-			dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp > 0.0)
-
-		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-
-
-
-	j = 0;
-	tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-	if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp > 0.0 &&
-			dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp > 0.0)
-
-		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-
-
-	dofVector.save("u-00000.tnl");
-
-	return true;
-}
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-
-	DofVectorType d2,d3,d4;
-	d2.setLike(dofVector);
-	d2=dofVector;
-	d3.setLike(dofVector);
-	d3=dofVector;
-	d4.setLike(dofVector);
-	d4=dofVector;
-
-
-#ifdef HAVE_OPENMP
-#pragma omp parallel sections num_threads(4)
-	{
-	{
-#endif
-
-	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-	{
-		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-		{
-			updateValue(i,j,&dofVector);
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-#ifdef HAVE_OPENMP
-	}
-#pragma omp section
-	{
-#endif
-	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-	{
-		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-		{
-			updateValue(i,j,&d2);
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-#ifdef HAVE_OPENMP
-	}
-#pragma omp section
-	{
-#endif
-	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-	{
-		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-		{
-			updateValue(i,j, &d3);
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-#ifdef HAVE_OPENMP
-	}
-#pragma omp section
-	{
-#endif
-	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-	{
-		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-		{
-			updateValue(i,j, &d4);
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-#ifdef HAVE_OPENMP
-	}
-	}
-#endif
-
-
-#ifdef HAVE_OPENMP
-#pragma omp parallel for num_threads(4) schedule(dynamic)
-#endif
-	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-	{
-		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-		{
-			int index = Mesh.getCellIndex(CoordinatesType(i,j));
-			dofVector[index] = fabsMin(dofVector[index], d2[index]);
-			dofVector[index] = fabsMin(dofVector[index], d3[index]);
-			dofVector[index] = fabsMin(dofVector[index], d4[index]);
-		}
-	}
-
-	dofVector.save("u-00001.tnl");
-
-	return true;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j, DofVectorType* grid)
-{
-	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-	Real value = (*grid)[index];
-	Real a,b, tmp;
-
-	if( i == 0 )
-		a = (*grid)[Mesh.template getCellNextToCell<1,0>(index)];
-	else if( i == Mesh.getDimensions().x() - 1 )
-		a = (*grid)[Mesh.template getCellNextToCell<-1,0>(index)];
-	else
-	{
-		a = fabsMin( (*grid)[Mesh.template getCellNextToCell<-1,0>(index)],
-				 (*grid)[Mesh.template getCellNextToCell<1,0>(index)] );
-	}
-
-	if( j == 0 )
-		b = (*grid)[Mesh.template getCellNextToCell<0,1>(index)];
-	else if( j == Mesh.getDimensions().y() - 1 )
-		b = (*grid)[Mesh.template getCellNextToCell<0,-1>(index)];
-	else
-	{
-		b = fabsMin( (*grid)[Mesh.template getCellNextToCell<0,-1>(index)],
-				 (*grid)[Mesh.template getCellNextToCell<0,1>(index)] );
-	}
-
-
-	if(fabs(a-b) >= h)
-		tmp = fabsMin(a,b) + sign(value)*h;
-	else
-		tmp = 0.5 * (a + b + sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
-
-#ifdef HAVE_OPENMP
-//	omp_set_lock(&gridLock[index]);
-#endif
-	(*grid)[index]  = fabsMin(value, tmp);
-#ifdef HAVE_OPENMP
-//	omp_unset_lock(&gridLock[index]);
-#endif
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = fabs(x);
-	Real fy = fabs(y);
-
-	Real tmpMin = Min(fx,fy);
-
-	if(tmpMin == fx)
-		return x;
-	else
-		return y;
-
-
-}
-
-
-
-
-#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h
deleted file mode 100644
index 6a5195cfe..000000000
--- a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h
+++ /dev/null
@@ -1,961 +0,0 @@
-/***************************************************************************
-                          tnlFastSweeping2D_CUDA_v4_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING3D_IMPL_H_
-#define TNLFASTSWEEPING3D_IMPL_H_
-
-#include "tnlFastSweeping.h"
-
-//__device__
-//double fabsMin( double x, double y)
-//{
-//	double fx = abs(x);
-//
-//	if(Min(fx,abs(y)) == fx)
-//		return x;
-//	else
-//		return y;
-//}
-//
-//__device__
-//double atomicFabsMin(double* address, double val)
-//{
-//	unsigned long long int* address_as_ull =
-//						  (unsigned long long int*)address;
-//	unsigned long long int old = *address_as_ull, assumed;
-//	do {
-//		assumed = old;
-//			old = atomicCAS(address_as_ull, assumed,__double_as_longlong( fabsMin(assumed,val) ));
-//	} while (assumed != old);
-//	return __longlong_as_double(old);
-//}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlFastSweeping< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-	this->h = Mesh.template getSpaceStepsProducts< 1, 0, 0 >();
-	counter = 0;
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-
-
-#ifdef HAVE_CUDA
-
-	cudaMalloc(&(cudaDofVector), this->dofVector.getData().getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&(cudaDofVector2), this->dofVector.getData().getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector2, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-
-	cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >));
-	cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
-
-#endif
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(8, 8,8);
-	dim3 numBlocks(n/8 + 1, n/8 +1, n/8 +1);
-
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	return true;
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(1, 1024);
-	dim3 numBlocks(8,1);
-
-
-	runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,0,0);
-
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	cudaMemcpy(this->dofVector.getData().getData(), cudaDofVector2, this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-	cudaDeviceSynchronize();
-	cudaFree(cudaDofVector);
-	cudaFree(cudaDofVector2);
-	cudaFree(cudaSolver);
-	dofVector.save("u-00001.tnl");
-	cudaDeviceSynchronize();
-	return true;
-}
-
-
-
-
-#ifdef HAVE_CUDA
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j, Index k)
-{
-	tnlGridEntity< tnlGrid< 3,double, TNL::Devices::Host, int >, 3, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j,k));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage >,3> neighborEntities(Entity);
-	Real value = cudaDofVector2[Entity.getIndex()];
-	Real a,b,c, tmp;
-
-	if( i == 0 )
-		a = cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0,  0 >()];
-	else if( i == Mesh.getDimensions().x() - 1 )
-		a = cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0,  0 >()];
-	else
-	{
-		a = fabsMin( cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0,  0 >()],
-				 cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0,  0 >()] );
-	}
-
-	if( j == 0 )
-		b = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1,  0 >()];
-	else if( j == Mesh.getDimensions().y() - 1 )
-		b = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1,  0 >()];
-	else
-	{
-		b = fabsMin( cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1,  0 >()],
-				 cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1,  0 >()] );
-	}
-
-	if( k == 0 )
-		c = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  0,  1 >()];
-	else if( k == Mesh.getDimensions().z() - 1 )
-		c = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  0,  -1 >()];
-	else
-	{
-		c = fabsMin( cudaDofVector2[neighborEntities.template getEntityIndex< 0,  0,  -1 >()],
-				 cudaDofVector2[neighborEntities.template getEntityIndex< 0,  0,  1 >()] );
-	}
-
-	Real hD = 3.0*h*h - 2.0*(a*a + b*b + c*c - a*b - a*c - b*c);
-
-	if(hD < 0.0)
-		tmp = fabsMin(a,fabsMin(b,c)) + sign(value)*h;
-	else
-		tmp = (1.0/3.0) * ( a + b + c + sign(value)*sqrt(hD) );
-
-	atomicFabsMin(&cudaDofVector2[Entity.getIndex()],tmp);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid(int i, int j, int k)
-{
-	tnlGridEntity< tnlGrid< 3,double, TNL::Devices::Host, int >, 3, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j,k));
-	Entity.refresh();
-	int gid = Entity.getIndex();
-
-	if(abs(cudaDofVector[gid]) < 1.0*h)
-		cudaDofVector2[gid] = 0.5*h;//cudaDofVector[gid];
-	else
-		cudaDofVector2[gid] = INT_MAX*sign(cudaDofVector[gid]);
-
-	return true;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-Real tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = abs(x);
-	if(Min(fx,abs(y)) == fx)
-		return x;
-	else
-		return y;
-
-
-}
-
-
-
-__global__ void runCUDA(tnlFastSweeping< tnlGrid< 3,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int i)
-{
-
-	int gx = 0;
-	int gy = threadIdx.y;
-
-	int n = solver->Mesh.getDimensions().x();
-	int blockCount = n/blockDim.y +1;
-
-	if(blockIdx.x==0)
-	{
-		for(int gz = 0; gz < n;gz++)
-		{
-		gx = 0;
-		gy = threadIdx.y;
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		__syncthreads();
-		}
-	}
-	else if(blockIdx.x==1)
-	{
-		for(int gz = 0; gz < n;gz++)
-		{
-		gx=n-1;
-		gy=threadIdx.y;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		}
-	}
-	else if(blockIdx.x==2)
-	{
-
-		for(int gz = 0; gz < n;gz++)
-		{
-		gx=0;
-		gy=n-threadIdx.y-1;
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		}
-	}
-	else if(blockIdx.x==3)
-	{
-		for(int gz = 0; gz < n;gz++)
-		{
-		gx=n-1;
-		gy=n-threadIdx.y-1;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		}
-	}
-
-
-
-
-	else if(blockIdx.x==4)
-	{
-		for(int gz = n-1; gz > -1;gz--)
-		{
-		gx = 0;
-		gy = threadIdx.y;
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		}
-	}
-	else if(blockIdx.x==5)
-	{
-		for(int gz = n-1; gz > -1;gz--)
-		{
-		gx=n-1;
-		gy=threadIdx.y;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		}
-	}
-	else if(blockIdx.x==6)
-	{
-
-		for(int gz = n-1; gz > -1;gz--)
-		{
-		gx=0;
-		gy=n-threadIdx.y-1;
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		}
-	}
-	else if(blockIdx.x==7)
-	{
-		for(int gz = n-1; gz > -1;gz--)
-		{
-		gx=n-1;
-		gy=n-threadIdx.y-1;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		}
-	}
-
-
-
-
-}
-
-
-__global__ void initCUDA(tnlFastSweeping< tnlGrid< 3,double, TNL::Devices::Host, int >, double, int >* solver)
-{
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-	int gz = blockDim.z*blockIdx.z + threadIdx.z;
-
-	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && solver->Mesh.getDimensions().z() > gz)
-	{
-		solver->initGrid(gx,gy,gz);
-	}
-
-
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	cudaDofVector2[index]=fabsMin(INT_MAX,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	cudaDofVector2[index]=fabsMin(-INT_MAX,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1110( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1101( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1011( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0111( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0001( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0010( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0100( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1000( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//
-//
-//
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1100( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
-//
-//	a = al-be;
-//	b=1.0;
-//	c=-al;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1010( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
-//
-//	a = al-be;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1001( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	cudaDofVector2[index]=fabsMin(cudaDofVector[index],cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//
-//
-//
-//
-//
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0011( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
-//
-//	a = al-be;
-//	b=1.0;
-//	c=-al;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0101( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
-//
-//	a = al-be;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0110( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	cudaDofVector2[index]=fabsMin(cudaDofVector[index],cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//}
-#endif
-
-
-
-
-#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping3D_impl.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping3D_impl.h
deleted file mode 100644
index e22de0ab8..000000000
--- a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping3D_impl.h
+++ /dev/null
@@ -1,307 +0,0 @@
-/***************************************************************************
-                          tnlFastSweeping2D_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING3D_IMPL_H_
-#define TNLFASTSWEEPING3D_IMPL_H_
-
-#include "tnlFastSweeping.h"
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlFastSweeping< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: tnlFastSweeping()
-:Entity(Mesh),
- dofVector(Mesh),
- dofVector2(Mesh)
-{
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-	dofVector2.load(initialCondition);
-
-	h = Mesh.template getSpaceStepsProducts< 1, 0, 0 >();
-	Entity.refresh();
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-//	cout << "bla "<<endl;
-	return initGrid();
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
-{
-	for(int i=0; i< Mesh.getDimensions().x()*Mesh.getDimensions().y()*Mesh.getDimensions().z();i++)
-	{
-
-		if (abs(dofVector[i]) < 1.8*h)
-			dofVector2[i]=dofVector[i];
-		else
-			dofVector2[i]=INT_MAX*sign(dofVector[i]);
-	}
-
-	return true;
-}
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-
-	for(Index k = 0; k < Mesh.getDimensions().z(); k++)
-	{
-		for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-		{
-			for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-	for(Index k = 0; k < Mesh.getDimensions().z(); k++)
-	{
-		for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-		{
-			for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-	for(Index k = 0; k < Mesh.getDimensions().z(); k++)
-	{
-		for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-		{
-			for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-	for(Index k = 0; k < Mesh.getDimensions().z(); k++)
-	{
-		for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-		{
-			for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-
-
-
-
-
-
-
-	for(Index k = Mesh.getDimensions().z() -1; k > -1; k--)
-	{
-		for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-		{
-			for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-	for(Index k = Mesh.getDimensions().z() -1; k > -1; k--)
-	{
-		for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-		{
-			for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-	for(Index k = Mesh.getDimensions().z() -1; k > -1; k--)
-	{
-		for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-		{
-			for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-	for(Index k = Mesh.getDimensions().z() -1; k > -1; k--)
-	{
-		for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-		{
-			for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-
-	dofVector2.save("u-00001.tnl");
-
-	cout << "bla 3"<<endl;
-	return true;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j, Index k)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j,k));
-	this->Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage >,3> neighborEntities(Entity);
-	Real value = dofVector2[Entity.getIndex()];
-	Real a,b,c, tmp;
-
-	if( i == 0 )
-		a = dofVector2[neighborEntities.template getEntityIndex< 1,  0,  0>()];
-	else if( i == Mesh.getDimensions().x() - 1 )
-		a = dofVector2[neighborEntities.template getEntityIndex< -1,  0,  0 >()];
-	else
-	{
-		a = fabsMin( dofVector2[neighborEntities.template getEntityIndex< -1,  0,  0>()],
-				 dofVector2[neighborEntities.template getEntityIndex< 1,  0,  0>()] );
-	}
-
-	if( j == 0 )
-		b = dofVector2[neighborEntities.template getEntityIndex< 0,  1,  0>()];
-	else if( j == Mesh.getDimensions().y() - 1 )
-		b = dofVector2[neighborEntities.template getEntityIndex< 0,  -1,  0>()];
-	else
-	{
-		b = fabsMin( dofVector2[neighborEntities.template getEntityIndex< 0,  -1,  0>()],
-				 dofVector2[neighborEntities.template getEntityIndex< 0,  1,  0>()] );
-	}
-
-	if( k == 0 )
-		c = dofVector2[neighborEntities.template getEntityIndex< 0,  0,  1>()];
-	else if( k == Mesh.getDimensions().z() - 1 )
-		c = dofVector2[neighborEntities.template getEntityIndex< 0,  0,  -1>()];
-	else
-	{
-		c = fabsMin( dofVector2[neighborEntities.template getEntityIndex< 0,  0,  -1>()],
-				 dofVector2[neighborEntities.template getEntityIndex< 0,  0,  1>()] );
-	}
-
-	Real hD = 3.0*h*h - 2.0*(a*a+b*b+c*c-a*b-a*c-b*c);
-
-	if(hD < 0.0)
-		tmp = fabsMin(a,fabsMin(b,c)) + sign(value)*h;
-	else
-		tmp = (1.0/3.0) * ( a + b + c + sign(value)*sqrt(hD) );
-
-
-	dofVector2[Entity.getIndex()]  = fabsMin(value, tmp);
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-Real tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = fabs(x);
-	Real fy = fabs(y);
-
-	Real tmpMin = Min(fx,fy);
-
-	if(tmpMin == fx)
-		return x;
-	else
-		return y;
-
-}
-
-
-
-#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/src/TNL/Legacy/fast-sweeping/tnlFastSweepingSolver.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweepingSolver.h
deleted file mode 100644
index fc9eb5459..000000000
--- a/src/TNL/Legacy/fast-sweeping/tnlFastSweepingSolver.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* 
- * File:   tnlFastSweepingSolver.h
- * Author: oberhuber
- *
- * Created on July 12, 2016, 6:04 PM
- */
-
-#pragma once
-
-#include <functions/tnlConstantFunction.h>
-#include <problems/tnlPDEProblem.h>
-
-template< typename Mesh,
-          typename Communicator,
-          typename Anisotropy = tnlConstanstFunction< Mesh > >
-class tnlFastSweepingSolver  : public tnlPDEProblem< Mesh,
-                                                     Communicator,
-                                                     typename Mesh::RealType,
-                                                     typename Mesh::DeviceType,
-                                                     typename Mesh::IndexType  >
-{
-   public:
-
-      typedef typename DifferentialOperator::RealType RealType;
-      typedef typename Mesh::DeviceType DeviceType;
-      typedef typename DifferentialOperator::IndexType IndexType;
-
-      typedef tnlMeshFunction< Mesh > MeshFunctionType;
-      typedef tnlPDEProblem< Mesh, TimeDependentProblem, RealType, DeviceType, IndexType > BaseType;
-
-      using typename BaseType::MeshType;
-      using typename BaseType::DofVectorType;
-      using typename BaseType::MeshDependentDataType;
-};
-
-
diff --git a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping_CUDA.h b/src/TNL/Legacy/fast-sweeping/tnlFastSweeping_CUDA.h
deleted file mode 100644
index f531da431..000000000
--- a/src/TNL/Legacy/fast-sweeping/tnlFastSweeping_CUDA.h
+++ /dev/null
@@ -1,194 +0,0 @@
-/***************************************************************************
-                          tnlFastSweeping_CUDA.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLFASTSWEEPING_H_
-#define TNLFASTSWEEPING_H_
-
-#include <TNL/Config/ParameterContainer.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Containers/StaticVector.h>
-#include <TNL/Devices/Host.h>
-#include <mesh/tnlGrid.h>
-#include <mesh/grids/tnlGridEntity.h>
-
-#include <functions/tnlMeshFunction.h>
-#include <limits.h>
-#include <core/tnlDevice.h>
-#include <ctime>
-
-
-
-
-
-template< typename Mesh,
-		  typename Real,
-		  typename Index >
-class tnlFastSweeping
-{};
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-class tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >
-{
-
-public:
-	typedef Real RealType;
-	typedef Device DeviceType;
-	typedef Index IndexType;
-	typedef tnlGrid< 2, Real, Device, Index > MeshType;
-	typedef TNL::Containers::Vector< RealType, DeviceType, IndexType> DofVectorType;
-	typedef typename MeshType::CoordinatesType CoordinatesType;
-
-	tnlFastSweeping();
-
-	__host__ static String getType();
-	__host__ bool init( const Config::ParameterContainer& parameters );
-	__host__ bool run();
-
-#ifdef HAVE_CUDA
-	__device__ bool initGrid();
-	__device__ void updateValue(const Index i, const Index j);
-	__device__ void updateValue(const Index i, const Index j, double** sharedMem, const int k3);
-	__device__ Real fabsMin(const Real x, const Real y);
-
-	tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >* cudaSolver;
-	double* cudaDofVector;
-	double* cudaDofVector2;
-	int counter;
-	__device__ void setupSquare1000(Index i, Index j);
-	__device__ void setupSquare1100(Index i, Index j);
-	__device__ void setupSquare1010(Index i, Index j);
-	__device__ void setupSquare1001(Index i, Index j);
-	__device__ void setupSquare1110(Index i, Index j);
-	__device__ void setupSquare1101(Index i, Index j);
-	__device__ void setupSquare1011(Index i, Index j);
-	__device__ void setupSquare1111(Index i, Index j);
-	__device__ void setupSquare0000(Index i, Index j);
-	__device__ void setupSquare0100(Index i, Index j);
-	__device__ void setupSquare0010(Index i, Index j);
-	__device__ void setupSquare0001(Index i, Index j);
-	__device__ void setupSquare0110(Index i, Index j);
-	__device__ void setupSquare0101(Index i, Index j);
-	__device__ void setupSquare0011(Index i, Index j);
-	__device__ void setupSquare0111(Index i, Index j);
-#endif
-
-	MeshType Mesh;
-
-protected:
-
-
-
-	bool exactInput;
-
-	tnlMeshFunction<MeshType> dofVector;
-	DofVectorType data;
-
-
-	RealType h;
-
-
-};
-
-
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-class tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >
-{
-
-public:
-	typedef Real RealType;
-	typedef Device DeviceType;
-	typedef Index IndexType;
-	typedef tnlGrid< 3, Real, Device, Index > MeshType;
-	typedef TNL::Containers::Vector< RealType, DeviceType, IndexType> DofVectorType;
-	typedef typename MeshType::CoordinatesType CoordinatesType;
-
-
-
-	__host__ static String getType();
-	__host__ bool init( const Config::ParameterContainer& parameters );
-	__host__ bool run();
-
-#ifdef HAVE_CUDA
-	__device__ bool initGrid(int i, int j, int k);
-	__device__ void updateValue(const Index i, const Index j, const Index k);
-	__device__ void updateValue(const Index i, const Index j, const Index k, double** sharedMem, const int k3);
-	__device__ Real fabsMin(const Real x, const Real y);
-
-	tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >* cudaSolver;
-	double* cudaDofVector;
-	double* cudaDofVector2;
-	int counter;
-#endif
-
-	MeshType Mesh;
-
-protected:
-
-
-
-	bool exactInput;
-
-	tnlMeshFunction<MeshType> dofVector;
-	DofVectorType data;
-
-	RealType h;
-
-
-};
-
-
-
-
-
-
-
-#ifdef HAVE_CUDA
-//template<int sweep_t>
-__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int i);
-__global__ void runCUDA(tnlFastSweeping< tnlGrid< 3,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int i);
-
-__global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver);
-__global__ void initCUDA(tnlFastSweeping< tnlGrid< 3,double, TNL::Devices::Host, int >, double, int >* solver);
-#endif
-
-/*various implementtions.... choose one*/
-//#include "tnlFastSweeping2D_CUDA_impl.h"
-//#include "tnlFastSweeping2D_CUDA_v2_impl.h"
-//#include "tnlFastSweeping2D_CUDA_v3_impl.h"
-#include "tnlFastSweeping2D_CUDA_v4_impl.h"
-//#include "tnlFastSweeping2D_CUDA_v5_impl.h"
-
-
-#include "tnlFastSweeping3D_CUDA_impl.h"
-
-#endif /* TNLFASTSWEEPING_H_ */
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel-map/CMakeLists.txt b/src/TNL/Legacy/hamilton-jacobi-parallel-map/CMakeLists.txt
deleted file mode 100644
index 48382df82..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel-map/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-set( tnl_hamilton_jacobi_parallel_map_SOURCES
-#     MainBuildConfig.h
-#     tnlParallelMapSolver2D_impl.h
-#     tnlParallelMapSolver.h
-#     parallelMapConfig.h 
-#	  main.cu
-     main.cpp)
-
-
-IF(  BUILD_CUDA ) 
-	CUDA_ADD_EXECUTABLE(hamilton-jacobi-parallel-map main.cu)
-ELSE(  BUILD_CUDA )                
-	ADD_EXECUTABLE(hamilton-jacobi-parallel-map main.cpp)
-ENDIF( BUILD_CUDA )
-target_link_libraries (hamilton-jacobi-parallel-map tnl )
-
-
-INSTALL( TARGETS hamilton-jacobi-parallel-map
-         RUNTIME DESTINATION bin
-         PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
-        
-#INSTALL( FILES ${tnl_hamilton_jacobi_parallel_map_SOURCES}
-#         DESTINATION ${TNL_TARGET_DATA_DIRECTORY}/examples/hamilton-jacobi-parallel-map )
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel-map/MainBuildConfig.h b/src/TNL/Legacy/hamilton-jacobi-parallel-map/MainBuildConfig.h
deleted file mode 100644
index ed3d686eb..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel-map/MainBuildConfig.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/***************************************************************************
-                          MainBuildConfig.h  -  description
-                             -------------------
-    begin                : Jul 7, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef MAINBUILDCONFIG_H_
-#define MAINBUILDCONFIG_H_
-
-#include <solvers/tnlBuildConfigTags.h>
-
-class MainBuildConfig
-{
-   public:
-
-      static void print() {std::cerr << "MainBuildConfig" <<std::endl; }
-};
-
-/****
- * Turn off support for float and long double.
- */
-template<> struct tnlConfigTagReal< MainBuildConfig, float > { enum { enabled = false }; };
-template<> struct tnlConfigTagReal< MainBuildConfig, long double > { enum { enabled = false }; };
-
-/****
- * Turn off support for short int and long int indexing.
- */
-template<> struct tnlConfigTagIndex< MainBuildConfig, short int >{ enum { enabled = false }; };
-template<> struct tnlConfigTagIndex< MainBuildConfig, long int >{ enum { enabled = false }; };
-
-/****
- * Use of tnlGrid is enabled for allowed dimensions and Real, Device and Index types.
- */
-template< int Dimensions, typename Real, typename Device, typename Index >
-   struct tnlConfigTagMesh< MainBuildConfig, tnlGrid< Dimensions, Real, Device, Index > >
-      { enum { enabled = tnlConfigTagDimensions< MainBuildConfig, Dimensions >::enabled  &&
-                         tnlConfigTagReal< MainBuildConfig, Real >::enabled &&
-                         tnlConfigTagDevice< MainBuildConfig, Device >::enabled &&
-                         tnlConfigTagIndex< MainBuildConfig, Index >::enabled }; };
-
-/****
- * Please, chose your preferred time discretisation  here.
- */
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlExplicitTimeDiscretisationTag >{ enum { enabled = true }; };
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlSemiImplicitTimeDiscretisationTag >{ enum { enabled = false}; };
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlImplicitTimeDiscretisationTag >{ enum { enabled = false }; };
-
-/****
- * Only the Runge-Kutta-Merson solver is enabled by default.
- */
-template<> struct tnlConfigTagExplicitSolver< MainBuildConfig, tnlExplicitEulerSolverTag >{ enum { enabled = false }; };
-
-#endif /* MAINBUILDCONFIG_H_ */
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel-map/gnuplot.txt b/src/TNL/Legacy/hamilton-jacobi-parallel-map/gnuplot.txt
deleted file mode 100644
index d4ae61983..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel-map/gnuplot.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-tomas@tomas-linux:~/Desktop/VU_CPU_MAPA/work_dir$ gnuplot
-
-	G N U P L O T
-	Version 4.6 patchlevel 4    last modified 2013-10-02 
-	Build System: Linux x86_64
-
-	Copyright (C) 1986-1993, 1998, 2004, 2007-2013
-	Thomas Williams, Colin Kelley and many others
-
-	gnuplot home:     http://www.gnuplot.info
-	faq, bugs, etc:   type "help FAQ"
-	immediate help:   type "help"  (plot window: hit 'h')
-
-Terminal type set to 'wxt'
-gnuplot> set cntrparam levels 15
-gnuplot> set cntrparam bspline
-gnuplot> set contour
-gnuplot> splot 'u-00001.gplt'
-
-gnuplot> unset surface
-gnuplot> splot 'u-00001.gplt'
-
-gnuplot> set table "test.gplt"
-gnuplot> splot 'u-00001.gplt'
-gnuplot> unset table
-
-gnuplot> set table "test2.gplt"
-gnuplot> plot 'test.gplt' index 10
-gnuplot> unset table
-
-gnuplot> plot 'test2.gplt' 
-
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel-map/main.cpp b/src/TNL/Legacy/hamilton-jacobi-parallel-map/main.cpp
deleted file mode 100644
index b13498e17..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel-map/main.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/***************************************************************************
-                          main.cpp  -  description
-                             -------------------
-    begin                : Jul 8 , 2014
-    copyright            : (C) 2014 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "main.h"
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel-map/main.cu b/src/TNL/Legacy/hamilton-jacobi-parallel-map/main.cu
deleted file mode 100644
index 710197671..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel-map/main.cu
+++ /dev/null
@@ -1,17 +0,0 @@
-/***************************************************************************
-                          main.cu  -  description
-                             -------------------
-    begin                : Mar 30 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "main.h"
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel-map/main.h b/src/TNL/Legacy/hamilton-jacobi-parallel-map/main.h
deleted file mode 100644
index fff21c77e..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel-map/main.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/***************************************************************************
-                          main.h  -  description
-                             -------------------
-    begin                : Mar 22 , 2016
-    copyright            : (C) 2016 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "tnlParallelMapSolver.h"
-#include "parallelMapConfig.h"
-#include "MainBuildConfig.h"
-#include <solvers/tnlBuildConfigTags.h>
-#include <operators/hamilton-jacobi/godunov-eikonal/parallelGodunovMap.h>
-#include <mesh/tnlGrid.h>
-#include <core/tnlDevice.h>
-#include <time.h>
-#include <ctime>
-
-typedef MainBuildConfig BuildConfig;
-
-int main( int argc, char* argv[] )
-{
-	time_t start;
-	time_t stop;
-	time(&start);
-	std::clock_t start2= std::clock();
-	Config::ParameterContainer parameters;
-	tnlConfigDescription configDescription;
-	parallelMapConfig< BuildConfig >::configSetup( configDescription );
-
-	if( ! parseCommandLine( argc, argv, configDescription, parameters ) )
-	  return false;
-
-
-	tnlDeviceEnum device;
-	device = TNL::Devices::HostDevice;
-
-	const int& dim = parameters.getParameter< int >( "dim" );
-
-	if(dim == 2)
-	{
-
-	   typedef parallelGodunovMapScheme< tnlGrid<2,double,TNL::Devices::Host, int>, double, int > SchemeTypeHost;
-/*#ifdef HAVE_CUDA
-		   typedef parallelGodunovMapScheme< tnlGrid<2,double,tnlCuda, int>, double, int > SchemeTypeDevice;
-#endif
-#ifndef HAVE_CUDA*/
-	   typedef parallelGodunovMapScheme< tnlGrid<2,double,TNL::Devices::Host, int>, double, int > SchemeTypeDevice;
-/*#endif*/
-
-	   if(device==TNL::Devices::HostDevice)
-	   {
-		   typedef TNL::Devices::Host Device;
-
-
-		   tnlParallelMapSolver<2,SchemeTypeHost,SchemeTypeDevice, Device> solver;
-		   if(!solver.init(parameters))
-		   {
-			  std::cerr << "Solver failed to initialize." <<std::endl;
-			   return EXIT_FAILURE;
-		   }
-		  std::cout << "-------------------------------------------------------------" <<std::endl;
-		  std::cout << "Starting solver loop..." <<std::endl;
-		   solver.run();
-	   }
-	   else if(device==tnlCudaDevice )
-	   {
-		   typedef tnlCuda Device;
-//typedef parallelGodunovMapScheme< tnlGrid<2,double,Device, int>, double, int > SchemeType;
-
-		   tnlParallelMapSolver<2,SchemeTypeHost,SchemeTypeDevice, Device> solver;
-		   if(!solver.init(parameters))
-		   {
-			  std::cerr << "Solver failed to initialize." <<std::endl;
-			   return EXIT_FAILURE;
-		   }
-		  std::cout << "-------------------------------------------------------------" <<std::endl;
-		  std::cout << "Starting solver loop..." <<std::endl;
-		   solver.run();
-	   }
-	}
-
-
-	time(&stop);
-	cout <<std::endl;
-	cout << "Running time was: " << difftime(stop,start) << " .... " << (std::clock() - start2) / (double)(CLOCKS_PER_SEC) <<std::endl;
-	return EXIT_SUCCESS;
-}
-
-
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel-map/mapa_png.png b/src/TNL/Legacy/hamilton-jacobi-parallel-map/mapa_png.png
deleted file mode 100644
index 668b6fe24b17b2fec486db28505b41e3beb2091a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24841
zcmeAS@N?(olHy`uVBq!ia0y~yU~~Xs4kiW$2B%L(e;F7Un2Vh}LpV4%Za?&Yz`&ru
z;OXKRQZeUEZDfx0&)WYna~@7`=JTJZ<m!1v_w?~!9oM7t!|qw+?&7_)z(GjF_ma&P
z)>{f0jf*t8*mNiD+mi3vZ1Z*H3GePjng4w+s!6ZeJp1jfIQdPfk=H!u?m04hj%1wk
zxikMC-cA+qo~AMH@Z--LB5q$f6Ru+a`R~7d$DeoCecyfGbr<W(kXP*XKMuZLzrXJB
zvEH)Vm)uteYE6B2XXoY(8wwsCYSpiK$gR<}?e?Nw3Z6k5BXq>N`{itJ-M@eSY0>p{
zvDJUSUca07{mq*<S67FZTFEvkxa@wLqf%le+m?7SW6EQT#TRen75~c6G~l`XvP7y^
zZT49)F4kGgZs!KZMnpy3x^?T)>#vU=KMr3XS86pk{@<tR>S}6nRgvL>GgM9<Jb3W!
z+uZ)+#V;?Z@*KXE8z{e)xh*j`IC!FmiUiNSz182>ZojAGx#_OkT*gT%CpYccyLa!-
zoik5AHIeGQ{Blq2?{DV&KJqTUxZ+}liPYJP^IvH(E!L>|{q61lAN~JNq!?||`ObMK
z*dg#r-gfJ`=l1U18=|G!mBcP+A=Uft&6_2cU*?!)8_l%)`_lja)A)Z&uV&ePW#8R(
zfTJlv!tHX2icshA$7Q=^kAEzSzT*CxVZL48<D*F%L$tQuo^yGk*U~?A^;UEFTAdbY
zM6JE1U-z*){@11HZf<ToY|f_PFU**NMNCahwrtw8Y3tThqnW>0-f1q-u(Gz6?mqgb
zuHHc6%ImMMN)8=OGL-9Qu73P+ht6s9ygNUxZQu9wYn5bMV~F3>>95R~vbWyaxpQZR
z$*IpZD?=`2ZGF3KMgKIX_1AB|{kC=M)}u#{9z1w3Yte;ijv_}>j81;8(Ga<H^QL29
zNnYN&^Z(wQUl+IcRng9@>T4y^je;zS98SwGKmGi(k7LoV42uIFyB0-gh}_EMZc^x2
zr17&RPM{?~<IAI?+<lMj{{K8*|9STPl*nsOi+cO|oE%q}l_)lDjaqyCb!=2r)yGG!
zOJlCNzd!K$Yuvt?%H(8cfflo8Mxw5Qt{X$NmR^3@+S(eSQ}*}QSNp%0?PcZU_7y$t
zI{Yx>MC`5!2UI-!dU|s5^V^*ketv%bJLjEj1<y+<MpM0(UcDN+`fAqZn^8FqD?_xz
zx@C^r*8i)?&p-dL;LQ2+{Jgwp&!3OqTeWrL_g79#${dR>W@OCjJDT*gXy@H+UoK`X
z(BO)dSs0)(*Khlb15b;79_<#7j*b@BkK0r7@=|pE-ml+o=l{R8{%_Reyz6sWjuqZX
zG1Ar7|Nkc4{`aTp`|s#;d>082F_r4&YfrAPx1WFB{C8Q`B9BQ^yq1cIi(B?B&Wu`o
z@yyw?fA8M^D|>p=`m6&PBD*FZ(CG5>@nLB``1bAFI(K*eLs?tn_Se~paJec?{P^+X
zGug#vW@bh+dlWdV=K2LrS-5cF`}+UC<JNEY+-}|VPQ{=}f#>jzoNZgTZp|^vHj%R4
zyLE5>=B7m&GRJG}<{y7NbJnb^H{uF1eCk4+jsjw0VyB*dI&k1X{Qs}<@w-YoQzMsO
zzIi};Zr3_hgRVt7r!SfO1xd~S|L3`{TJZeCwybmg%nc1UZrL)$Z+U8Do`<*pH({IF
zUnY}J?)m%e_WytH|G)cKq0zM|YAwq}HnURQMnM)ydHMdMNwa6qzL_&E^`>28HJ6BM
z?9QU2&p)>wPV9SpCaw7F^iye@ZO%V0+IhTR-Z}H}_xk_ej~_pto|e|u-hTV`+v{gp
z+KQOE7VV7j3!G9m`?CA$g%>k4x^CUPDJd(P8o8}4q(jB?X3n>po73&)=O-jQ@Q(j=
z>B*BPU%q@f+ASV>Y01sAEN%ivGE9O*N(%}M3=9&CX5Py6lh<VxQC@U0Lr3iN^Uqdu
z*Ui%KnWPdZqUsqH9bNtH%}k9hVPWC;|G%!^uXx-$+dN;2hb^?VDAw=6#EBDKls0bK
zl(aG8S76C{##5FvebP!xujZJ|e`O}>swvi8x4*u$)O4oLtCC&j_iKu4YW_SsJ3D{h
z&t-Az^JhN2y7)9_&!QD^>)iuQW}n@5dzxJoFVEpOWxH?Y^c{a}bKZMW$)RrH&Lf-C
z&p-OOLxF?m@SR<yuZ`#bJY)TS&*D{u9gANSF)7b~@rH$|apOkA-FNSS%S;og-isM3
z97>Z<rWi@C4Xd``nw&cG@s6`+&o(zVyC^AkJ-Tt@hIIa(!q?Z<p0E41Ic#<4-^VNN
z`X@|Fy_@&_)TvYd{?*Mtu$QfC(Yd+S(|y#oZ{J>Ob$4y_c8^I@QZ42B*_#i>=uP)f
zN!mE$_S=1BZ*TQVo7X)ruUme!^^VoWzbs}_y_YXvzJ2@lUyWVtjfpcZW~4-Ji_t5V
z5$|;2YGta3-GBeQLC@{C|L#1u`@Vbs@7>16#ut9x$eVn=?VY+o*CMOAeJUsC&Ye3E
z3_zu%x1&JI%^Wq)qUvgE8NSygR!s^#Y=`gvdwaj`sdl^~htuk-SzB-Mz1{w{4wQc{
zztoH0_vilqzxS)Fs}CPO{5PiLy@NnYh!$&AxwN$O<HwKdKZ@5sEZTV|Z~N`H?@C@q
zE^+7Z5jgVZ&70rv_s469oO)Vxyw$VzN<+}fDJN6n_Ec=#xN+k7=Z<rmQ*Yk?bM}5n
zy5kD7FD#9MERKOw5@)PkyY}G0gT>F!t+d~L!<~16%E|zZjzv3m?%cUygMz1!4@Ylr
zuOZLpM@PFuw5Iwk|IPBw!(#5auV23wSlF1FhVnM1Mov@VbiKIEWaallHHK2=#si<9
zpATL+MS(+YGUt&<b~d&J?Z*=gRD?RWZZ%zf^_FQ?yX}{&dkP*pX-#d?n6+rdV*LrJ
zmb1@J^I96TQl%?tXUw$J$o1DZ>--M-yMh1ff)Fj+sxKNce9dAezL(|tk0%&d$ndSc
zT4iDCdU=Zi2dG|qQMP;e^5wf9Grv`2n(d=D`|PvNKgGnw_2c*PIN#f(^ZfJA8aw^2
zMH*c{Yve*Iv<~yPIw{USe>-QJR_nIk8H^&Xnp3^(=Ie`aEe%@fp|a>=M#QwQ6*iM6
zPv$wiK%;8P{ItzCGfeVwb6cGjo;-Q-orT+1fyS*-Z{NJhv6}0rBGl^S7&s+HZ@S;|
z@bGZQ9fz~F_Vx8i^&UGczgER_XH4Jo%2~^PZ!Ed3c)G|y!p6elfd!wP>E1YdGqbXH
zcXkG|v(5E;e&)=XckkYPetup*cGr?mawoMeI^Xx;IUF&~bf(Y7h@9G5TM3>|Ht{-Q
z#XDovCOZnS)actB;{U%u=k&{xsK6<CR@S9rBCejxFJH_snd`TH{d)JsfnAHvq<ubf
z=FEc!3a){ncF~$$r&5gU|2*VhAG^Ek^Ru&Qv+pu}5oi=-xtL*+wz)DjRaJsVtlKqE
zg#UlG+3eO_@xzA?w+m10S~TVK(-fnb=by8)v8{@a(K>C!v+UqPDMx|LH}B{t&E>pc
z!McZ+g^8mnW9u#d`d^n9FJ8>z_`srW*<u};#Yr0@3?y=9ojlvBb9#Yk?fZLs|Ni}Z
zPszY@QgCqa!-5|d7rP(dr!duP>gm)#5&hU*UoP1<?yEDAY7LtgewWFF>-5u2DxM#H
zb#2l0oK&@UpZ|kk#nVO_2~Uc4u3fu!>QqtJz&U=;w`O0Dn3gKhw(R9CspS#VWaZ`K
zBO^PHCh?l@W)*P#`111d*RNlHtdN=Vy1M(og|Ny`DfJs8o>cz3QQuT~(L*I^`|W<e
zY12+WjnE0x5mWVS;!-i#`Kf98zZ=Il28gIm^{V}Fko{ze(56%2>*GvkpJi)4m|=45
z=7)|&E5cTbx_)ftx2yPk*1X?t*;P)@Ni}xzHs$l|>*x8XY5%_x&EjS^Q*dk4+D?~8
zH^dM4%?<Co=rw7Ij~aJt)8&^c9E*hHnkDutaHwZ|cW}8oeWxu~1wVJ|p}V`w|JK#7
z3~Bl*^}#bzXPVpIWQ9@-Bco3b4mMYRf46eZfuA-19z0MG>U3Frv7KN3-P^Y(H~qPk
zUDbC$L9(`>;KN0C`Mo7<>lnPe)}IR3?3#2o?efbl3LJ|x!q#tFs^Mic^Gep%zg<@n
zMY@k3J$lqZVZzxo<C#1YuH~5Z_w<-Jmv=3D`SIh&-rimhl|_fzH7hzwtj-qRdG+cQ
zyL?T-CbQb}f)b7bGRMDG+4^2~=R9N3b2O>Z>Cs7X&!s_Ki{^xCuMg3hdOFoLaM$AQ
z1#%i)oA&MFo3-#`Ma=Tckvd}QuXE@2`|dUMoTSn9?}@to%=sV?&flIIdGG%H{~!D7
z_iWxg`E1%k+lNo&U%E0Wb0lqzS{tTal6T<#1eHt^DRy>t$7kQK@PExP2<BbDyl^@v
zPwSn(fB%MPiT1jwdM?s9_4=!yuW#hqFcH^i2lYuR`tkc}{{Q>?`}_O*dnz~g{+gt)
zK)vhH-QDH(|9(7<&|%wq=W6^ycg{Cot0tdR3275xDYTlabDEEb=SI%9wED&a%;sOk
zv|JVjeE9vhNg-u(<PZ5tD(dR$>S}5kv+_PZ<~eMz{r1|cS&J`bEDX4iVUk-pPpfOw
zrcIxIe0&_Cvu*!={=IDJdp-JvBpe0y#`z1doZghSxw53>N{Z3eW!w3p*6d!~|32iz
zra325F1;+#5Q&P5duG9R{Nu!l6K9*}8}Tr+?%Xz^UFBqFXD6r~WB==7|Gqy@_2q9%
z-?w@5_ARJ{6ybXN_U+N6jgzl_^K*MIbR%o4)!cI~N)L;6>WF!3?|0+<azMp1Yim@K
z!h-bQq2X=!9!T-M-Rd|k)lq<})yYQg|BuK0td0i?@8oUY|Mgn5+3a40j&1VRoJ|sv
zlAJ6|{_|{p_TK-|drd$>q_6qlgJZ{zy?d9pF=EaCS$n+`cHgzLw*LL&V{)KK^x-&<
zd8SJwxm^Rh7HM=X%9K)D8X(fj_CEPN_adjAG3z39u9a1-7rIb8(Qo;Dn@S<RcGo}_
z$F$vd^Gu{9+cK?7wJ&6tM6ERw>QoWpoP6?%&0!YD4?k-zW?XsxSyxwg@2b-a%)aO~
z3bHh|%`<+=lf~v5$ku!?X=8v$==sl03LF`Kuf8m4*Kd7n(Uzz`QTVXI-njDyJ#LE+
z8t@!1<aqG<mv3CJ)4~rA4mQu7E2~lN@9WFU$7jToY&JX8f}QF1c{6kK|8MXAd&~dt
z0l(Bf_B}Cr?u##asBGH0_3MX+hdop-^_a<B-Me;e%h4oBIk|iH?)4o_IvYIas6o%N
z%-Ltd&QA;GIn2>?A;ZLKE?-#7o)S$B{bCt*cC8FcBdOl@r?(#DUwT<$+rI8_gWa^G
zjXU=2*|Tex7DtoLX@{PJ2OW3cz4vJI)0qweEtg+@`Th6w>C-mn&ASdvQgIYDm*DyL
zWx4&`yLUrHmR9ngou_7SdXuoQu-*Lr=Reg1XDrfKd@*D5%`EQOfg)EkOaz!;fBhvK
z6fe={SSjo%@ZmgL^TVZowAvCMbV)lUxCU<9z8zFw#ahlk-{0FCxi)O^MGF&?C6n&P
zt2Zywn0`7`L*(1{@2~F|uh8J)c;UMEV!z$5jLOQNPuP0g7B1)HS$k2Ut?^OhVST=f
z$!BHvNVGBXE50uN<hr1VOC&WiZ24u`zgCMhs;+&UyI8>W<H^bDGRF&LrhBLat(<cD
zsg$&Ia%5Rf&YI07a)mPGyYHT#Z~uRJ{jcTP;p?tkR-AtN>AmXrd;9JGS<FBG+2;PN
zV#!PD8!l$lJnl8WSNlD-EwQb=UAi>2YtaIS(rXez7Hom$IY-va%eUy0=4lIm85j~5
zeEYIX;FPmx&z4xt^;<43F7CZ_(#@Q0QETP;)ARE9{MhH-n7im=hS$<dj~*pyh|HQd
z&+hX1XP+w_1>|fh3Z9>nospbf+q!=_XO9AhqkyrovHj1J{+n**EVfEsD8Je4IJfX=
z70-FEiu(WF3_iWebc@y7w{Ht&rk{Rl_veGNqW~Kl+nqdf*FbUoxDpGQ^j%Z98XxZn
zTYdND&7bY{Kic2dzON2dyPc5#fnC1l!<(C%Jya%nEuEG7eD#IbEN*t4rHM`3{g{MV
z5;sO<73^v6RrK8SY|h)2b{ndgo`3%7qjtFP&fVSR#xs2u23*M6>g?>idGqGD_0v^O
zzLUst3!Jj-WsP0^|9^jT%ua85v(mok|G&TecE3CXTAG`g9R;Eio^ZYpmT(j}^043y
z81(e_U!QieL2CWkw8fdb=EZPi+%#MNf=O%Vg)4<`xEHkV3(?y9{a*D%50>MFlT<Qi
z?W_3ssFhp%LAteJNS$bah^&M}frZQlw;!*ru6CFAxo~jb&YeGBTwL7T-0Yxm!EO3)
zp<gU+0xUbLzG|IeyLRn~$0U^%xmVZQX`e0%mev(uX|%cK&7wEeYiEq!{PX&Hdhg!9
zPv0ErIcZDO+S5;G&75f{!K40qLW#+nvfnpu+_=2lfB*l#@9V$szAq~+ZT&~F&1vD%
zrAsqRc5U6tYE+eL@>OHkXJt$61)dr+eYzG^TzeO@HtcnQ#e&Vd>bxC}FaGy>PQ&8P
zjSdPQK0I{pd+fDzk^;xhm~#&c_I$gQ?Y1~@{dMO>`&O!Ll{NUy`!^_Vy|`=Oyyxa-
zW((dHrlu}c%<VtxF#iC@Cr^ewd-w7Net8(h@r_fV<Va!6V$i6I1<Rp&X^WW)G`P-x
zR`E1j-^X>WYtalJwdvEQDNW=!UTAavaK`bgtHT|4te^d@e-2ms;fEh9Hr;&l=#kQ4
zgPk#Xxw&uu{r#=(y=DbtEt4`wQ-+Du_S>>NY}%8|O?-nzSWAu;%IsQ@HNnwFTg0`I
z@!~VV9RV6Ty1EDdL|Mq3OEF5@Tv=9jEp78go!2E+yOpg=)f_&be0;ng)Uo|{-2Pu7
zyXUempj5r+Vup>Jzlu;~bhNez*9A6?;#F(?Z#X=UJ>34U`{<)vw{D#~=Vu{v@6Mf`
z<B!>z6AKDHJUZH)wplWTN2+3XU-QY7O%XaLQ-TCq>>9247o3=QGDT_PiR|@z!#EYR
z^)`lRDRLy4NX?#mx=};wmip`G8&o_W7F6Wt>x*#N*8Vb)>V0I9w=qI5c2|ng%wxxo
zCw>ff)_L#Zz@4HX;GrVaq~MTUzPs0oxg%~`xh2nwRi7(tcE-FbSQaSI^7QG`zkmP6
z=($gNV*P$k^2Uf+xjs8%+Mc`SHmo%~m>{rAR^GmD&%S+rla}1fnKxs`h1XvvPoC_z
zd~+jPxlBrA-|@#QL$s!!Zaill()OjQR!6MaZcf?l2+?11KmIo)&N!PUEiYgH<wc<0
z^v^$QF1*v`aI%`qrW)6M^it2dmy1`V*SStw5~B6>&CSiA+IVLS-!$D@Z_Anw8km|s
z{aPhGyY#iil+&qwj~{)sII*~U(TuZc+W)z@s?%@nzZmz$_Je*y;*6U)Vy+(__uI$C
z#o7I0idb@m$y+P1#I)D(g!(DdpEY&)`S089|0G(?Jytj;QKCs9$L#jn=<N?`Zf}-1
zDY2X8we;=Vx9t3K2jcf--CGc%)ws>meNU{*-@nawS=tXj{90vhX4d!jdXCxdO`9&w
zI=)%w^wl%<hYw8;(D?Fh_xody3+FwroOk_YiQD3jr>1HL@U9AWTCiC0bkW8L9=2wK
z`_oT9^_wd!<Z39@Yc==WWb=v@EG#DPzP@4qBh=&e_{EDC@7}$8xZ#z-wd>c<&$quX
zl_`<2+D7q?D9b~Tqb+3G4)8lpIC;`@{dI0r1BtY(Z4+0yt_+Eah_JA*(3pO@)k$%#
zU%2HXb{0Ri$+8pgDGRW?oRYE7^uo&$^)~`jRX9WRqYm`O=(Ruo;a%jrNMqf)b%$on
zaZwWNKmN7q?}ZB&o;*=Gy{T-stl0GLYAKO@Zj0;o%O5icS}CH_XRGZJ_@q+KY>&yE
z6UUky1eV0;t#;H1P;Jnx>3baav?VcSH|t%#2aOAbJp!lv{rmUo)vF;|TX*i1<YD_-
zwfFMN605m<?a3)AOQwGg+ThILvb_0Vf{E1HcwS|(?t>GWPgQ=bmy2liPP)aq`>vh2
zxp@Eae$@iI`SJ1b<}-aZZ{Dn%@vePJk%B;jn(+Md)6YNWWjp-g!-oeC9z1!nByRot
zoBV#$p2q00zY5;4=;@1d&u0nty2a{+bGQ6^eMEm>(_aUv1s5|MtUrb7Kd7{m>vz6!
z$;Ozgm5Iam?fdukm7iLkS57{8WYe0{Pd`=IY~H+i@5`cx4&0qd0S9vK?M(<?b4uF#
z|9NKbNi1`Jz1`+6df{bB+3vZ|f0kI)eu-E0tXg?<abv^N*D}WsibR{uZY>dMIraS9
z+{>@O?)^EZb>W@8d-q<wdevzmgVQ(Py~$>?&!%l&(0A#I^wsA0PX}_hIENZY@H{^^
z*IPyCxKPBjr!{u>KP<F=ag&FI>HGWp=TnT92CWRxkl<<ap03w;LfOaht5kL3%-5$*
zd8r5qx(Z4yk1BrcqbAJ3QeriC*3@6W=0+T6I9|B<X3dvm=G3wia;92Qn$u4=DX0i_
zR)2pdD<gB@RO`Zp3s<hx+~9b&|Nk?_PbVK1%y_)RM@_i<Xx3@<>&zLKvf8AonLpOE
z<XvL%dG56QGA}Q$tJ1_AGj8X%ReSGcZH?Pqwl<Lav8tz*2p2m$J2yA?g3O9{cPtfh
z&(5lt+gD>J-+gq`O`Y?ffB*i?&d#oTIo+1abJC}uHCA)OikE#5m)mhO%V#d@tYzot
z+wb4EFHUQ!UhFQGqJRyaYtEe)GV4j*eDnMLdi#kU9+Of`q}0{a&ZK?*@L|KLU^U++
zozsgnie$F$-@kw3#y|~`uA@oWEf@V8G`gA;-n@NlIrki&f<e&hORv_kc(}bTu@Wd-
zegCyqSCHQH%{Sj%UheO_aDoEI0tIeSP1E_CX0dQMD9o?<{_gJIzkkc3OL!DIS3LW2
z;q<?4@dE1@O3DOSG)1_c7VVt3OvZCkSa>*dql4LO-uA-_{u`K^r<=`|PS00&oc46>
z+O=oT`kqi<?f&GWM%U4#x%2CSKW<`gs|xzB!n335YnF{%`?5Wajg7{8bp^E8nHcxq
zpYNmg{rmUSNHYnZxb@;fA-1JajOxce|9SQ-O-JnX)1oE?hyUA7O+1@sZf16FuJv+1
zzWsk>gNnCYsM-;uxBBX?ojXq&oJqI)JX6cw$u%%&Wr&50pW0-*`TEmOXKuaK*x1M|
zuD4>p>H3EgQZ2>0k3RnBqc#~dKzC!~QA^Xu4<00>q`1sA-IF}W!CpL;v1`%f$&+XL
zuvLBWQ!P-N-08Mh=6G&io}7%#fq5IX!`D^V%uC+hTWa{}XN}R!GijS=`lO|&v+p`}
zD>tM!KxAoz&Z^1c$14&dr~M8zVQ{_p@L}Tq`~0&`uCJQ8NaMm!j<=^;ofr>qIC=8q
z%a@t6mTis7?RPmnfu;53&)(kNw{LSJK3(|sSGjaifX175@0!#m-l{0HX{<gKK8L5*
z?XZE5f<Q!k{QiQ6hZ47@NGv;_Vq|lE{rdI!`T376{4RgFbLY;PGd)(y>(U%1sl2Fc
z*dtP+u<iB=H>LUK50}j1n%sB1{lcprv0F^NZpKztQk^c}zI{7({CMMmNh+L82MT4j
zJo{wRpE%?A<C*j3UCY|)wm5JuXMbDs^Ph9)&JEF;dj9#tNVZ3bciA<nR$kaH>;Irm
zZx@ry@wa8WZ@-O;kJlIBnxr!Ibn5>5{~sJ=p7&g{OG;jTd(>L1xqXQ<;v|-Zi?Ft^
zRLtvlTYNDi<gDugw}k;3KmOJ=A6#H&dj9eGw7=U<9cE~>;1GO&cei-|amlvEyVu0J
zk3KD0`F7=<0*^^YKG!S^Xn3|{>(;Gz@7^twz8du8bLAwJ(25!LsYOfU{hjOr|F-3<
zTBI@4$8GghKY#!J{{HO_vQfMZk=+$Gd`SwePL8edkFG6<(c8Y!@OP-iyaNS1c^frX
zIV}umcr{UhL*a(Kkx|jpQ=$=*uH4-k<Du5QXvd~ak3Ro&NR^w`zwP#xvPB_U;3jxT
zX+YIdcD?DRH%whW_1YHMr$v%i&RFzW&*@k66uKg}`#0~zRLhATYu2neU^791W3Bw7
zU|pBM7PA}0&(CeWJuUvhl3ywBEfNK8Hgu($NZFRHU=GVK*KWLi?OMgW_Lr9P&tK0g
z4(Y0~i`Nkg{c38_c_1M}eEFt^$p@kZYF4apkaOJ`bM9%;+Q61P$%KG;&wqkC0r7vg
zNir-{{aR8o$0Z@a{@p&2?42>wR6HlC?7y$i;WYJ;g+O@r)?0q-KZ<d&_C0n9)Dhw8
zm$UU!5z;zyc7<4@Lqlq0^T7p^PQDF}`m;;FYnqCtRPP~2aj(x;|Lgc<o1WjcjA_Z9
zqMdi%z0*5!Ic#(Kl723aNlP-Nwy<AW99C)7xO;&D&wf{<nLa8)8yLj6?xyeMW4W#3
zS>_cI^wShRQm^8<G-#!Gl>RGr(0JRuyzPh0beaz~B;L4mDM)K-*U==u<-xo%c5@GC
z<;bs=+whE^YsQ6$cz++pNm6Xhj(_txcrLto_l}K;@kYh8vuV|P_(FKURP8;Oupo1m
z0>@mx^o<dF;__2HgI0zlfl3rP8JP&3X?zL>K5C^&THH~^+^tS})3X=!7(dx7RkTt~
zajxI*zjYc+;9172{gYQSmK=&&du`@S$=!GB-tb7aCF-1hRkAB+<&sH<ue?mon_J4$
zrXl{x<f{6LRqQ(E%$?7U#qE?kv5Dtq;JoL~3j+)!&ZKRwy*H0%!Ywx0`1RMVjg6fH
zr?@S)blfg*c*Dboi4sYIu7SMEPp{%wQ@7Pi<B8MqJfoR5iDldU9JE%Lo<8fx#&OWu
zZszl!OP4NnP`D5`*^0x&_jb;<ix&e|U(GU*h?ur^*^UO#z+h^m+3d4<`T5yfquLUq
z65evY$YDDCE6w7*noy_H!V66mj+2&rbGx`9g~jb>LWW$F-R+DZ_XZWu*=L^>Sd>`F
z&dHbMJ07|JhDO&cAGPL#46QwCGox%*ZH?o-cqq}qReknZHzh%fN7trBtb2RyY0=K3
z`5b@BF1*^YWy=&FwT0XFJ^r#ZihtI!%vos?#}W*De0+HL`5#-v6`S+5C6?{3Ei3!>
z@NoOlq>USwU29my!87YvF27Wx<%0hLy>6FZmPDU>{vmJ6r@o~@nc#_w&o=z&zw1t4
zIjP_|DN$l=*y@WJZ_0l6N}EgFsAHa8?(pdMx3{<7=Dx9PX3twM946BGct?f_*I~m_
zv6fdXJ<l$f+`VGXVkVlhIWohhNCLJNz)|4P!vYhj+<SX>ajJMm=}fzM^X9>Wjs^R9
zzuPW&BirfHbu_7Z>*NsMgdLfB)2%Hm7-Bs-Vxuz;Z)|uKVth^Wz%|hq{(NlBzkmPM
zy|MV_%Z3`GwnV+@*;!d#sgY~LR$qQOYx<6hufG}_8*kv)wzq0=`VZddGjozQO7tIp
z_<HvF=T{HhKCChQbZ=H+<1%iIKT9sZJbLtKfkn*!)M^&B$(boBEWakdE!%xECF|f8
z&aOo~hXX{u{`pzl>z2Iv<}K4lMQr>$PLXHLw#4YEPxidA<KX8#UZ<YAPdxve)v@8*
zuBvs)r;Uym+Re`|FArDs-g||Ak_tN;n~oTFPLYhfeEwfYk!g1ix?gj9zetQ<=d_Ut
zXk5F*YVAKYmOzouKWliq)wO><iO~7>=qUG#DX*SS_GdjK1e%H8axr7hym{X$Y$o{U
zoff>3wDHWDGdlYE+r!s>%4u(D%iecbFkIwl(ng8yqgJPXUJ2NtHPy@7+WKOK$PqoG
znLhjLs*NPt6g`DpuAZN7zkZS+)04>>4U1N|C?%TBUK^GjJx@9P^8o?Z>;E@2>M5P@
zRu$r$_dN7oLXD1syr004lk;VsoNqpupf~+-rJVrBk)(~&ByaBAaQp4#$&=M4b6zug
zU1GJhcjAGVC~*H$tlRZ1i<^|7t249P%8(`}y+47uoU60~HAF7GG;s|)Ubi|WvMqU!
z0>|&)zi;2Z?NIS`KHn|Y87xizrFhr`-i3DVw-sY+e)#BUxBq;*v(G=jy>!#Ec-~&K
zO^lhRpB8CNeI>=o(V^@TxaP{_2l=0V+7$f$`1ttWM{hYBjs{BgvNb(vY;2S^&nuab
zB*>%gC?X4PaKFyAFmBB5Sd<!hF2!hL1kWG;?%OkFaRrJ5ubdLL+BHxlOm%O?$3ySG
zhnrluox5Pw!xF3K&z_Z7%?(?<RO3~Pp~BUsMH;h~ZN8bKBlenyDg1HcLPq9CDH{3H
zPCw0il*{UHtGT)P>C;pRdp)u4TFq<Wjmt8-7A;%0Oh>HyV8Vt79jm#1p|;=BTf<h@
zzTJ9#(#e$Fi(W)Uh5Ci9o{}mVCV7B!gYf3y^d?ZB`NsS2-(NeZA9!09y*3Oqn#r>!
zFK4@JFJsptx5X2ey)3ZUvwgd{mDR0B&jWi|?4o`&omIY>BNj73N^NZvv&XsBTGp%B
z4p=y*KEM6r$Bzpa0^F@{hckLzKAL1W(SwJVck7RvA>54%`5EUJyxG>4=RRTHtE%bM
zA}o#>Tcc(zvtHeIm&bsI`Rre}#UEc?T@4x;<`7VS=k<+=jh&sjkwN2Y8ABuE(WH$L
zI@7#VZ<yR%xO72V!?KqP7b>2x-+p`Aw+0nY&`|KK88aj#B@HEbVwW}48!cPD+<D=J
zdtmV8ONq8+jJKE+mlJ4^@cO-tc2gOqgzb73Q?54S?6ZXn6_;P;oS70a_0G|A-$F0E
zELq?arZx47M}o!orX|r@Q=1~^$n{L?XS(4n&~i1a)$>4E^p%nWFZvHA7)bSsbsv3O
zCe70(r}8$DhmQ}`Qe0A{{l0pao=HbS%I2H%Y#;68vgpg+8a4Z@nZ3Qeo!vYiwf&#x
zzQ41(eEr=rQ&tmSX&$y4I>uI3R+g4GdkSBiYd!t6>-gi@vuD37iCTVn>lx1ZP7j<G
z3iKa;xMuaF#Hy(c%U*(}%*)EArABVQ&3lUf?yj%1QzIvzG%+`SUSMIh<?*7M4Gumc
zt`YN6jAlj^-=EIOBbk<Aa4K!HLc-nWx=jiO+`fTVTDir~&9}EVFfg#Osj0ZW*>Cyg
zn>o2#HZ(REdUh>hV`tZo+rzQSM8x%LWwlO&Y1-zSFH4R+{ut(ValgxhqMbZjV<P@6
zy_NV&jM+6XBRW0uTGrNcXU|&BJ-1l@a@JP!{Cj_XJnp}K?OIfR%3_B}OHQUp$;xiM
znNzh_P9@}a?vz!Er;D_vM#aX8vMA2=bH0#eBBj}tb?`3dlgX@3O$t-IRBsgDZIUi$
z(dYsV(ak=)c=6)TpFgYrmhW}@yn6qy)&GC*|G#_t_UoNrRZghiep|Nr=9{9ORmLmh
zY`vCFy8qr^Md)UZ*<3$oSqVphNh-{Z40BAQ64)54UR?Njqv>}kOMuAMn>jkCt8C`2
zuX(C%(HFi_#b?rzt68d^n}RufCM~&`VKUQ4=k%<z&o*u}oa=YJf3knWA`OE;(7gY)
zZ42^BDt7VRvW$Ah+L>T*W0rL(ON^fQm)HL;%Kr`I^$+NEGd3|fV$k!e+tY0J*$k6c
zCA(g}d^vIA#Io6!89kN;$sB)Lv~%gwrIRBMf8KNdz4`33x&g0(9v_-=joCGDu3x&*
z%zeMEu6NRQ%KgaSwP<_({kVvTALsx7@h{C<|N3iHM6iLCm6e^{y^yz8-!bHDJejsx
zKW-00<>M^7o4vigA3uH+^KkqArJeouivpSd$ISN`T3M~SnDHv#LqbaG*CFwJ8}@;~
zy0*%Mh*EEXBR6i`IC6yLumNalm2ZE=?zO?1DUofzPXGJ&@A7hg$MPVNqsF#VSRD<d
zde7EPuAXv0s@HA#<-5B|xzmE#+1R9BbUb_hT;8T)!~XsE<CyyU`eym4eb~MsbKiV_
zJJAh#`cl2?)~#C>w6eyIe^Iw-jFtq?tFKj?Z{7(}zthc9_xJ1dg=!{NR;LVn4&RUG
zIjke*{d#Jb=cFt0ht8ZibKro3y>h6n(aba5;`#}*Z0Gu2ujKGjn=EsDqt5LR^*h2W
z#|+M-7{%_XC@d(rkhRs%$7h4i?LGNsZHd>{$H&LS{JD9)?%Es8`U^V_81Ni6SbeqX
z=_%38H+kycyyM*0#;_!r#qq|?o0b+9CsK@Fmh1{FoUGz`G->0_Nf%@N686T~+u7x9
zzr8o^{nqU3QJ431C*HVo=g#^+r`F%uTfM!qypeO?!i5VrZZs4qzViC(-=2`E4X$f9
zM)ZKkMG6Wgc&JEKb61!=`TTQP*p=O%KdW&ZdRt~K!`FYj`17-~*RsC}u_SK4J^SNH
zNglTL!w(BAzEtfETfH?x=d4|7sc=Kc#|oRde(&GDbyb@9OqAQ<v&=Uk^Y_fJw3w15
z*pl<|w*S6f|Mz;Vo_OEm<y`B{nJxPG`S@m?edeGL@c*jA)V5`rd*kxAM!CDY>xy~r
z4(lq>IQ7&h=kSet_wL=l-@cT|cd9vCBm0G2FNA-nG&FSI%rTpNcK+Wt>FMd|x8oEJ
z8oar)v-tj>XXgL^oc}-P`A?~J43e5NCT+i&vwrWlsQPEy*g{tAsM}wkoUA<4XWNf=
z8}8lAFfn^6ChMOdbKF<$^4r_n_2c%excqX~+%K!;qSlt><@GIld5dGs_1F4wdn`;$
zSh`&us&+keD>HOhd-nbJ_3PGsvbjHPYyTpRG^3e!@80be*9RHrnILmK@xa?&^LsmL
zf17n=uQsSpHk-}M=4^UeGQrjK^rqzG<gnFOGfbL(zZGvh$KpBZ)YGDc0iY$3_wTo-
z_0-tiKjy2@q);Gp-R5M*)!LW-Y+Z{)T+g06x9{h(*=mzB3km|(U*CRvMR-%{P4jyd
z&Ch=dTf8Z-$T5@ly=*o2T-MfYanrYCD|9`2_H5bx_w8Eis%~BU3>pS#+2VRw|DGYs
zwU;IN`~QA>@Ic|Pf#34WFH2V0FIkfLD?9%8ty8B?UF<z>z_8@{>%DR7ufNW9yqvS$
zX<F+3`}@H&(ZZcBHgfsfUG<eGpM3N7tt3xd8jr-Pxd&7{=h@fSWn`RqS`@phq;uIz
zzF&(LYh-~Jjk+yvl+6(MnQbDq{<^ifx%(s)lh`e{FP&o+DPnG92r0b$a!Z!GMdE<~
zk)v`-25pJL!omV^=gyy>efC*dbcphnsI^zKTDd0N%$YZLE-ULbyUc9ol_6FZ7AMj+
z-^?+S;AuOYSn5~1w<ShTy#M&@i7Nxe8v{kQMy=&*KMa|}RaREk^6{O^DdJjbGjDbH
z`lz*GYr|G=6TLc@X+q31(D35=wQCDymbZDXe<j#`G-KAb+i!Uen|wQ!wUGU=!G+DA
z&RM@dbM|ba#5LLDPMdH3sowv+TE6zn#aFMgcIUpdIq=C{<Cu<)j*3v{DZgiBv&+&t
z7I8!iWc>(BmT;S-0xHu*xmph^P0m#_@V%@y*>nB%`+F)sAM2H#TY6+In?ze<n(uDj
zmj(_u!@xuO1r|KNx8KSQiPaF{n&`n&Hf#P%cF##AJ7bEAi^aO1K6vop-QC?gW7d6I
z|5nAob5i*FxZ2-uw@bD$R!%pQx@;rBl4vy3W6~R!{TIcioPPTJ{QURt-W}qp&7EtI
z#jJAsO<DC+mTOsCgM)(?TwswlFyNV-`g5ss>S|7&TGz{$FTdaOneUI$+WE6{4GU!E
z&z{|VG^t<SK5x5j7E6bg_oOKvDq`Z|mGjO&Eeh4TULxkO_U99gSqdEs7cM+_@L*$O
z<HCgtr~gXt@3P(6cR-@ekf-?HpP&A<uY&WoS0^MaIKAkpu!8{0iq&71ZtGW?e_mf-
z|L_t=1Bo|}ES3gnv>Z;{e6z;nM$0=ki$Fhr|J!fBC9h|Aym7-Eo`1_kS=<6ka&mI&
z>h6J-e3h-fdMn#iM1y0q&TOxxzq0iho)%T+=ITy9xg<z)|NZ+f-S1=__-x}Surp?z
zQ-aL#UN_~#2ExL^&(F_q*R52Kaa?^Bv_3U8we_*ZkJ)L060U(V$LE6A(AdcF`#k?9
zm9O349-$+)yZn7z|LbSkT|!)~K`TSFrrx@7BO`xt(QC!7B)#d)b#hG#Hgf0x{IU7+
zIV3pVB~av6<1Ful#TP-NtJ6=L8W|Z`Ti=%IteV=az|odiAoJdriMwfnkDB84JsUPm
z@LGB+H?Vrb*|hTV^8MeouJ`rzHRK6ycHya+xagurOHr@NhOpJLva+&LQc|+Atv|1_
z#qHr{IDa?pcEiN;&o|$EbL-ZvcklF43ab>O>=+vz8WcFXj|N41^S332uZuaEwwaxs
z-NTuKCsnBZu%VqDXrXswW22V%#4l1Ze8<n6IrHdI(sk>^7v>tEwO{#_m6Bzbzggt-
zE6earKmGI;N6(@amtSs)(7BiqGVQ4pXU`%HlkZYIY|AfKetvd#<Hn74nf`32H^r?l
z&&^%i%^3q88qpAG%B|fQv(9?ej^3lUZryr%db+Kgf9{lng)(6gS3MF$TqWBoANQKa
z)&DJ>e3E7BO1tXSjJ}s+W4`;T2>H*qYh`tESJ(Kx;=#YazlDW`nVFflo?H^f(Gw!?
z)Xc%=8u<MD{Qb4Rza2Qhu;gOaj#~Z0NgHGK)%-lz%zpRb^pwbHpp}wWuU@UNIae5?
zHu>e}Zy_m0GXpeOgqNBAvFLlOH~n<pc6X(Tt0X)ofy%kU$Hy*Q5O{O<P5+fEY-_`;
zEi5872TWU~(Dmrsw=x~E(<w$5Gq&6m`gXUaXy=}-AW-`B)KsZDnd2WHK79D{Wu~Km
zpOV3vleM4Enjb%QETJ(#yuE%c<CTwA-*a1HKP(L3_;P`{(c!#UOBMr%`bCpi9kJ=t
zr{Ayt|M&EC{p;8IIk-+q^*)neTC8zu)0?H<(~liH*4EZ`C$BtPOzI1hi0eX)Q#oes
zhaaBHTCF42-RaWwc*owod+*+jeG&K0{j@r>+%=xVKK}mIHgeNaO{IEOJmu^EeC!t2
zZ#tN8cG5+Kr8!13>pBmpc>4MJ_V)Df@bMk`5!75a(N2KHQS1JX&(F`dA5IiqWiIQw
zGNkMH<DD^jB3!!q`ro}Bmou$07VBR6C_i#qsfCQFzQ$>zo{w+dy|b(P^W)do*IVzN
zNZY*k>$PZ>CIyMMoiTd-#~&B%gsdyCN<XuJwL$Vr(atp*TotiDVs7VN*PM_lnUX!v
zu9nOAOYOILJd;lzG3dFOGjHb1!rgaOJmqF<9Wuzt$<YzvTDxY=pZ)*8|6dokH!9lA
zbM;jt9_OC^etsUFGw09y`}!`_xV4_mlncBfFK8vp%3Ftxc~m@|xB_z(H8c{Yc&YY1
zes^c5a@V3o8h6dFT^E?3!nw9|@#4kC#>Sw<nBN;+)OS0q4a+u~X~5HcH0fu}zO=cr
zo|8KA>wn+=JEbUD(NpL{*d&$6Y5n^c4sSR;UEh82#kL&3EDrDz$tDGd&eZ`LTPnCc
z*$=+{+B9W*-1>BhV|m**>r^k(PLN33e*0|NW(AIyB~k0IUl;qV)#dc%=4}m+Ni03O
z(@!r~Tft|3dCRdq!W*`2+qQW#^Y48^oi6qvc3cq;vJ518+#a7f({uXis?D5985tb6
zKK*+d|L>{v{U66xvo-j;?iXGh=ddE?VeZt&`y|>dW%@EqW}QsAWu&MBT2y)G{{82b
zc4FN}lQv$vy&&qqyVNEH8NTC3yT#Xrt&UsYep&Sw=c_%eIhPt2zHm{xs9vy|(aGRI
zbbz$Y<^IPOc7}!%ebg%F<)_+e&IcumzQ><_))+{%7)P9bx2|Z$*=LQ7j1kYc3}&A-
zo4wXEQ0qms2v=)!bMwiQCvUyGr2b*%LQq@>io~Dto6Bj@w>{_i0u@i0<BjWXefzde
zfn#ZqXRhX!vc_eZ%?A${^ne#MG-l;;+f8q5Y`l2!qPTwCpBL`-v*yex*?reEW{LlU
zEXF;pjyv0AVmO-$cHdnXpdrI|y?$#l<I)8&dg9$jr%j(8yfS3}?<N(`8?2coQqP|~
zbFkn$UYL-hz1rY{ud3&zckl9S<lYPKh-?Um6mhk9WcKO=m+nis{^bu@rlnfWKmWYQ
za>Ck97p1++Vv0pTi|x(2k21Md%=Jq*k;*l@9T_@Hn)`;kWcSgs^77@&mSqXP$Y_g<
zWp)khI;!*~!DPZN(FBQQmtO{m<UW|H>KPRufB(jfhSn&Bj*Bb0%Vj(!nMm-kH6P5{
z>gVroEyLHf=*s2gDMm9FtbAPm?|J>PW5@Jjb`)IG{=)gks9}YI5~KT_XQ`cmBBi_U
zDl04h{{7pu{i(nW@$RDz3J*U2oOE)D0*CGFS(jdao!~4dFF$|YytGI$H+QK?R(Tc<
z*^-A1_TQK9bos@yPJHSVql&v?cP?g_@bU8>F3I4vU#H=vH1WgFnwKS3Q@vJJo=*Gh
z0O~=?N=RII{Z&BD`9Mr@v&(4)W6sT+H=mzp``BV#+s!sR(BkNn$hLc$A9k(XW7@SS
zYVEc*A*S=6Z{NQCvBGAi&$6JE=WboQuHo=|<H?jyReO_@ljrW(K6Tm4nKNg$g2DCp
z`oC*du8C9MpyK(||AH8EfJo`in9G+hpFVy1>J5diBm;?_<BvB+=*aM;8_g7xSSi$?
z#&dep^K)~HEo9UtuPj;+x^ua$WYpTQ)mQ6&JZ#V3^U-bofzXAQR@`xW6}I5|YgxYI
zw{G3yoHph3(+rbYUP~vPeDdvEnbO3JrLDOvEQfVYH!axdx7>K9kKgjm#jb*VO$r^}
z)AelY{#eNMpWYPXE5N9ncQk3^a}FNUg+VJroPURWb_l$(ani*Afu^I^q@|@-hO8>*
za@wGO>tVr}vuAI<{r2u!g0!S$WNhr-^7r??zP_HGmge`=Am^vY(xBMieKKrYY?U~e
z-nzefsH?M}F?hlvjrV)M%Pm=u!05g(prJ{iM{>*U8_$@6j&D%SsHl)&d!+sL_}gvk
zUN&5NF?H(H<^J=l_TD?pni%n0jRUkYylfX}k&{e~>aIBldb6r?EDU&>%PM=_n0u?u
zX7{R`^ii99`sugt-?wkuRx$7WJ1srm%L;LfXGIO(F`s-Ar|>rMfY}4^M50Te3I|iv
zi%&oQyn6NO*|TS>#i~k-8~Hzn@dj5v$meW2VBoX(;+`uUzJVb&a`o@`es|!voqxW(
z(9vW4_19mkR$sNUw6rubVmiMmPWf%30H`O(%XWBg^>?n;rn`CTWxjB=9^yPXah;NZ
zMMTE;+N%b&(G0uO3S_Ro{wme`?8bEqnZDcG^Y8C0cDD`w*UPj;{<6ti@Ot$`tM%Rd
zTedo`zaGD%;NiW!)!f|NSJ$(=nP)CIFXBM-RF~5ZQ9Oqi_!VqwKUi3kkZ>TwWRl8F
zR`s|^DhqGkNHB0<x|v`Qp(D2W=AFW&x2`oPa9G%hbRT_H>wjGMN5SpIUKQL8e7qCx
zu2|O3?|Sj%$&>bf4%`1(z5kc?{`>c(l%@G3zA$RK^&dC3wqAYtWt8#U>#Pl&myLTn
z$_*O*r_Rf=1@**vmNaC4Ft(b@_eI{S`#{y+d5bhApDcNJh;{j8&Zft2%~mNdyqJ+=
zcKh79bC;L<+y8y(AO2Z+uKNS8!^x(Xf_0fQ1ShGiy!`Uj&2?EOQn7m1&z<vg-x9G=
zhx^ihgSqU|n{VZYurs?}eEIUFhl&ta>#8>|UPv%GtL?n>bH%&>NyiCYr&+HQYwo`L
zZ*6|9_fwW_x8L%$AI>m|(h=)Fo@_MpR&L<;hD9392LnKp<~oNBmdn{*@k@B1d!?xR
zXwvSpKUo7rT7xFIEzW%4RNSPHmX`LW?DwlzS<ZSl94c(&EbdqBy(h(X|JMpOi@w`$
z%eofLv#Bil`|E44`=3y+2d5ZpKZJ$;6EorsxR~qazRqv??0NIzqNBUr7W3}^5y{Av
zf1$<3-kv{oL!H~IN{7Ibq$DM&-d{EQvcC1NnsVR*^Boo0o>`6pEUtkUw%^&KzcFt8
z|5xGrcWvCrI5)y~*J;j%<wAXr7cX8sT|d5Umae0K?G1&iVh=3N8T9C!p6SC@^h145
zgf0u)v{Rd6G(=W~hv)GwDD+x<(V=(A&G;mq!wSY%euc5$$YY<GGA&j6wNq4%oBRT|
z4ZC-rHaK(Anl02ZVZyv7r*&0ArzP41SsEW(%qfk&S-0TwOTE}#CIA2Zz4nGtUP7Y4
zYA(;=3nsR<wz(!!znA3d+sK{&@x$WG#yNjZl?h2y86>Rn&$X2(*mpH+YwpX7rV>2y
ze;$dyzrTO~=FOLvM2Cc3H{vliGBPqYb`A`2josS6joEMi@^4SJwjT)KYKmI7s)GAx
zipKM88z1wFt*@||cPwsNqJ#zOi?6@B#r6C8`^zs!)J$(ol#n^zd{ALdYTVaN(}W~e
z?U)c9=D&i+Rzkqwb&1u~o}(Eivl<&2XDySSdR)m<D2=nnO?l1L*PFx7Zn&AVVY{P1
zi<$6`55LxLdwpGoRpqaO_0oBE)grF>cXxd~$S!YF^JBxd$p$<OH*?JTk00Ko)3N{R
zwZ}KzUoX7;GDh$E%Ht2zgm#?!8t?0R^yxf<``63nSeNJ3)&0A<IXzv;z+>j@vuV>(
zBd2|>+FQ0eme<f#KAlM??qb%~>gsBq!xl1q$??}Jn4F)AeL892fB!@DmFBK4Ue8Hi
z%k9iWSsWwQUdwvsbnvFran{2I0$SHMEMLqYdN9EtW>?9}W`4T|7U%M|%m0qeHE4{F
z`oC0eJvVc$iLZ0wX5~w3Hyn>$etB<}(~h9&*D_3|>&4ERH;=7X?nnM>-gk>PDsUWn
z{84~~(e+xI;EqQ{iM;`T#dCBVYP(cCGt`6Z^R*8cyt%eEy8h4c`akCNpUw4kAGEP5
znazHCX{mSG=9!5zzUjSuwb+xvaPr^3wQY%R?(Y5j7Q`{h$NgWrEkm-dQs(%>{cp{d
zi@o?-wf%OjMOT|H3tv04$iq0%$gdZZCaAppSn;K5@A>D<t(|Qx^EVtSDlRsjep+^S
z#Zot>=5-~l+h1``6=Yf2GEpFL+wD13i5nw))Px-cF5Ju57~$sTCefC;J+}6b(6M93
z;;P@8&RV9fuKxIAg@Qnb+v1%u>&_Kvb}bUui}6sKtnj&P%kFPySk}F`zG6eNWK&9j
z#JdFnKSLXeci(>d?^O7{NvEIgdSL!4On}AE$cX8zJ$Qt~-_P%mT5p1ZMwil=rB}kb
z8yD3Y&3s~UF2!iCNWsF76?<l_cHno5zE(e%N9Op#TCc?y53IXe79FCmwfd1`Ls;OH
z1cM*z|NmOQceCHp%?*!#eE9HTx!qUI=7S$DUH6});yEcmr1$vaj}<mDeCm@u1%9oY
zck*abVQy}2X6DHZlXGXz?3wb$;wZCo7^pNeo6haDHY<*yI_$t3mZBHeSE`)9of{>8
zyg6~k#*G`3uNoiNt-xVqVUe-@_TIRBtGRkpy-ubWN$`BK;otmD{c@Lz=lbi?eC_?B
zyQ@_j6+PI#t=jNbu;RAt&)HqS-mH!~xZv3Bs>8o5L?zlb>QrC(F30L9U|`jxz~M3L
zNYcjU=4KC-NheeMZtQPdq|r0C#c837(#K0ix2AK(2v)B7qI>=0ZBq$_4Lqz1`!t@+
z<M|<~)xSizPPW%Ad1J&bXZ>BL6U=6(r=^`q`y9Ey_rk2a?YEB~KmPdRkIm`lFTMP-
z`o%KPQquGH?!_5MY`K}!KL6Q%g%94j{aOq4YL048uAO`C_Nkv@YGSK<6ilSfF1P!-
z(nV?G)~!n?onOk&;us*(+L~i3uDS41rh&wls=uF~pLgu}l+nl8<=Nca-01LNX~B#J
z`_%zeTv9y=9-=LeZ{-H|u8>L9u3Ra#?!YFUHzon+3zr>lT(sg~!i?Fots@N=OFgi#
zv#`i8k^27q`{Ii)?%lh0F@Rg?LyVp{51YEWx^{`eg13R8ov9P%G2JT*dw<2dAw8h=
zz}F==O{93Ygl=<NytqG@k)>>BOz*A*aqH*Lo%^-IhOIfVwDjvz!ypmYuT^^&X*4<<
zc>Ohuqi5xE9TwaEfUiu3NBV>v*1VB#xxqN?T_{)V*FSm}-oE)-C7qEg|FvP!ii;O7
zw!NMP85x3X#hj$VU!Q*L*s*hSt=X*)Xw~1W^VKN`UeJ6o>E+5_g*SF9aWK8fz0@nw
zW;pxo*&P+~>8x#u%F4=*A3r`@V4&jJ*Vnh}Zk|V8>)}KT^USSL-A5OJ79E@THVfxT
zHy(_1oD;Kg+2#PtYu6;@!t1S<U(VbbHTmR{RhvX@IY28)ql2YOO{LC00PPR!?(POp
zOCEBbv+U*FyRoyDIWA-VKiU50<Y$tFI$GH+JZ#KoPL)o(?rN90ogrjr%?3pSsa_jU
zcWLUJsw%O#_22L8EY`iCvMBRb)>a{wiKm~Qdb-JPd9cXQ_umh5NOdjJ{d8bzmU_;q
zw=VVFe~Q$0>b4hm%~Ir#g{=ixbSBN%Q9w)P_8Zv;7It=a_l)+kEWTLr@K9@KXXoF)
ze|vj-12kF^C5{(fkh;Ld<g6(E=n308!7Np+g+;qpa61Hwq<p`X>LAb2bmQ$eG5==A
zHSA3-O$s_XIxb3r(^A)lWp8$R;l<3hWJ>0&wy3=H)Txcz1J16ppL|DPg38R(Pn9Nm
z{Fk^R_Q2v@*=~0?w<!CSU5|GhOeolW_h{0^=bsNMZqYm^@FlVrv_7hZZ`HX^yDq4)
z_V48?y`3x4q`=X&C~S3UQqm&sXLGCtCE9rT`22i*LzP3WsW9i5-R_k(kC^t<V%@>?
z1dS(Q`+nt?hnX@orEq)=&9+bZ$JOezoA;$<f<)TZsN1`%*lvB&==${g@8rpoc?)y`
zPF#IiBA|PC&V;P}f7dg85Hp*xa?jt>Z_D=o`}Ml^;z`i#UiVQW6O$`?-=iB3Kb$dt
z{`IV_A1n6g@yBEuzq)hg)WfYjS!!<+Gq&D(_%N|+(XCB2ThbR^%y8&k^Y*^*wAA~R
z&*y%>SKa@3M^prRy3>Q)_O??UTv<+E8CDg(-L|NmVH1~Fx2x(+ffI(gd3o~=&N}ro
zVEy&kXVVs6ywSO^D3wKgOWM_%Yqg>&pczKC=7r9uU(V&2q_TYZ^7yTC3%?pj^@2`0
zNKZ@i`+i`m!sJCOzP)?1TG-*P*ZYv&XDy8#7`DXdrEk6&w6f&qr>7xWp-Y!U%9l1R
zdwKAn<JURDrc!7B{MmE<x%E*6iO6?NiB$o6tEZ;*{!;V|(wokl8+d`4_2b#H+gF7f
zF0a3sk+WTKKg)~bc5Md^1bOI`f0_Po!)aH$pL?x17IaTK|D3z^P+*|rB$c;i)h1H5
zHZj-SjTc;heLLcJt!RPF{Q2|k7YX|cJjl0>WntbFEGwm}%%OC;$Zo#ASohMPml2|T
zbN#;G%3klPG_k<qPSxisc8Ru~ck@Kw1+~505-%`Afg_{h&7C7ftaC%fsyL@+><rPG
zYg5W1d_~uIjrqd@i?sCgP>+NI|JNF=DEU+NV){STl0}bMIp0CIr(WG+CCp-IVj|M%
zQnWLMTjz~$fXLFc>25E+R^8ZfGtXP9t!q{p`@#?{(Qa3c8LQ`iJ!+6sRb?f@m7Aac
zJzaVhW3OAVj@aXm7GF5F-=34~P!j1LclK0&Zlj9lhTDQn&!jv$5+$yD`;|47mwico
z@y-|p0RyRCozts6R4=t^e#Q7?j`g9xPL|;UEPanz99I~n*>QC&S{s&ael$SKap@O{
z?xSJh;mI3ka2=L0bz|N7&?o3+F3XSTKoRMpf>xPK3mHEZp<4Aj4l-qIPm3&tIv-f@
zZMJ%A_ICP$Z|keSGPrt}S}zJ(IR&)S(8p?SUcxQT<At-9UH#KtuCyR~$ES6tHp%q5
zUB7<avTyaX<D#ybB3!x=t(R3<!Yc2s2+?|4WZC!FqA&YLJJ-C0_K)sII5>p=T)ar*
zXp*4>kK5vlS(|!Zmi#)*Z-3|h{rs{kcX?Ut?&K{s6X`w*-kW>*Wr^u-m28H##KVUV
zU%q^KRc&3VjDzXd7r!;T9vwS&?Bw6)Pm3%ic<k!_%y|BD@7}#7R(E~8cR%9cslD}f
z+oS0VG`v=ZG|kw_$~8&B?$&u%bFK?wg<sw)8r-{a<HF05s=fbis6Tu6?%k_bS}Cf*
zE3e$UC#SK(<mJazZtF|c9R7RmUi*W!VOPPmrxxo{f32_D8@K%O*SQ6M8>TX^{}moN
zjh~NC&aS2+#Mg1s5)YL}pMUbb=e%?Crl*R~*~=e~?f?5W-~8nzT?QM?s~^^eiY$%M
zn|}Ifjoti{DG8}D+Pn=}1u8->tsh!3@~%>^UwP`OQCp&kZ+J}h#)vz2?|%JRqo2An
zL5B5;?Cg)Op%dR*X@^WIV|(Q^YgO;;M;uKCQoSsWBHc%~Nn9{p@yt6^Yif<%d=<~j
zmoJOB9=vp*x3@Rhwa;j#kAlDgjZ+B*CiiZ&#c!ONk^G43Qs4%yzUP%QF6K@*JfZ$t
zJ+XBS*Kb}cRs->Xv)bNEgET~vBBv$Jc>DHkfX0_qkAuMTR|YeEtmdwJcv8w__F1!U
zU27RjPHs^xXANdAxUhEZ|BapjBB|x&-~WEU|Ngw$6}FD9^C1(YB_u9nm~4!A^W>F6
z#=(8xZe{bnpZVH@|IV?t^cC#SiY#YX1@Ko-^l!PIQ0lxp_oc^`1rO$Xs0grFn!aD4
z(WSsqU0ofucH746NUNy_bgzA1SrI$$d1_?X+aiZaDxuapR~0?5;8SW(|F!Z}aNE-&
zOR?@q9WjA37P}jdFaFHI^Xav2@_H4|sm<$d#k@<@=$hoC79JL+G|^*c%scb6xuOq5
zXH|9QnC;%Wb?Nojpvm^qvwX@0+R<u?d(RZg?1;HKEq&9?9PsAv8+Y!+#KlcZz4^B6
zJ7<lW!`tmQ?`v}`skNJbH!uInUaQUnYiEkJd@0%(A;Q9VCqFnpoS&=JNkb$mHg<3I
zcfI3<3K1(WX8gL{e7<X%Wn0nCIUXuqN0nRyd3VOD3+`ZzW=f6J6XE(<voA((^|hx;
z9FzQ(fB*LF+V$)9e?A<Z{%QUsl~mBd3b*Zf!Z~j=-`-Sb7bp@Q9uC^YF+pQxGRK{!
zr?+nAh_S?kddIp3f>(=H{`&Ir&><$3!|_$UI;WcyX80_#Yg~Qg<lf1=?Ta)P2AoLS
zZ0e%9FPVd<Hk^0A4$GM)g@!G?S;Zj^rz3Q}eV+fnrm|8}&P08W#HyHAb1nplCX_mM
zOLKkt`RCm`y~74-lUMr9zarTy!{TVN_`~e;&lA&Dq`A&ca}E^gdpzl6iqga#<?rvk
zdX;5*kr8yj!nvnKr86Hy8Z+$P7xjP1VbF%v^JmV?@L5)66U<(baN@oB2i>;jyp<uR
zo`23d7udOIhKGurUCoXiJ2W_)^ro*i+7>RtI%Vl^7CX_mS<BlmE)7~~VrKTN#40=P
z_kuE}W&@|iN;004xLTbyMu@lu&b`vgQeij$eA;Hv=*E2edcLNqTVt2YyUyonQjnU;
zyN<DIk%~}fjotaeJ2!4b<X&E|O3h%yc18F21$iq&vi>fJpW3j~c;0i+j-(A64t%z`
zvN3C4g4*PTnX?jSY~#;aDXbOwYwGEz+qP}vYCZJefx_uc>2o83zDn*;e7x^z(as2+
zrb9EV_;)qb*wsHjH~0DZ`RbET`lto}XEo$0wvbtHCGh{%UHnr`6W*G!IND`CNs0V+
zXJ@fk_tIbTA+2BC10xT|@>eTbFO7QDROY@h;!KKB`MW!jk&%Kd3pcK2yE-R4Uw=W#
z5;;}18>@mfM2dFaIdbI4g$n^wLmf<oB<|g;x)FUdWRc%1C5}ZJTrQ_quU>sJ#YnDS
z-kLk^x$Xns3olE0_ZEpW_@!rA$2eSkc(hww=J?q&XF@jb^mAodmohusZca#zRKWI!
zVrE=NlRn-pzyI}T&BKBniWfg^3=mPBeRkR9%(Y>+`xnJKal46UxowQNbMvO=^wX0k
zPj*}U@yT+Z-<&lwwwTtRZ`id+!%2Jz*WPtHr;jETCMG84=IYM$`StgoSjK$^clXWj
zFBKjz=vky;YJWw=^VaR#r&ElKjg6a|o7HFBGGhwgmVaIHq(&E0tJs7kSqoZuU3SLo
zD}O)l{`>f-D6geKy1VmT>lSQuelnL&A~s{Y=yHaT%=vb;Q5qs`ZEbO(8hKyW2Xgkm
zUjOdUcHuqitaeXNjf{(nyLb0)@6n|5^X>Dy1KdE}+sO+eZ~of7TEO7)vTc{&uMW@<
ziQ7|g@bjM=H)b46u*fQV%@Ol)``7ul3Pw6py_ytElCL}dJ@Ndr9cYE<_lJkuH<Yc3
zbvqQYS;e!KiP=QTH@&yAW6_Sid(Rf$`SQi&^d=Qgxqlk7{Sq`d81rA(ygBF?K1t=n
zF0D7mHKuwsIw**6wK^?4dh}?8&APUBZ_pVB_fm|$THbkdaIqU>*lJNO*2NcJ6zz=B
z3xCRS;f`Bh*Dsk1JD$A`*)lybxKk?PqmWA=Py1oJ`R`9oR-dpSXzE0^=7Vp`%BMZg
zWYg%%nslHlo8kAJwPCLh9B4S2v`}N!LAf2Dt~C^0eYk%mufyG>jWf=sg=%=JdI}vc
z{8(Wlz;S1Py}ba(mRN;f5i25vR-|V1=J^FP1}|+=`0=-H)-qi&?`F=JiE1k^KULyT
z@)UY=sI56~u}%)?5EBa-KDK6lIh%;s*w93qdI^F3C0jnPl>d4~^D0~9w9Pkj?B?h5
zG%9Y7>74d@zk-hBsi#F+Uv@1Bc=`Re{f`ID4hjql9e4Lyu`UhLoaz-Q68iZ@)N<xK
z`PT&`6iyfIWywqE<FZuQufx%F>)bBT{tAKq<M(U7#}@3&+Wdj#07uErn0Jv(ho2-&
z7nE3O%pH4k&D{y#T_>rWn7-=O7B=lfR!5Fjr|<9Y-@kQ>sqsbG?b0*{t;S_9-~AQ}
zZ_ewQ&%fA>sd+;DVl}OYJGgC~vmU#d&8~gF_j}x)iixM68qMsP6}`s4P3QE=5U<r&
zzdgut?w-!LMJQd_U=OQ7%Ka7FnY)|~x))FC5DBranD_r;f8CSMKSO>*tYo~f<686D
zi@IUGxrXOHL?4XYe(7+`^WKHCSAmAHtS?459H@DIPWDBurG>?dnCaCbn{=Lku53%R
z+BkDz+6tG-#hxps9k_7d>I!ovmb^_j*Q{E#?ECxs@ljEaKr4Amo;KM>PP_baOWgYW
zW6Ls2nL?6xS6n=!$<nr5JY|(sk-dc6skF^nFGHJ_d~cVp+p&H7^Upsoz1$+TCQEL?
zmUS9l;c^`Pi+11DFEVP>-rXVfHF5gs)QFfxlO{))&943PqfkMh;@20=<Ara{mWwv9
zF*F~%z@0nwLEY}JU$`rtzGs^)Aa!*Hqm$$!ja5fJDsgPuy!rFJ>i28ctxMY+Id#Sr
zsREgF6RF*%+SL-TuAI%(iBl^uPD-2bDyZ2;^o((;5oqp3-GAPj_wUOctA4JTkh;@-
z(F?Y^-q7xNC4()_W(H}~R5YibZam}btQVnf(H9*N5fK@A^ZxzwDMq>V3$}2}UU3dH
z-jz_YP^!i#^hU;U?z3kCDsq08FAdOGvu@p?hIa=KGM@kZcI)!k1BG|qmYqL$uCJ%3
zK<4@9pSxCid=rw|al!el!UTVjUs9z}4cnq`)ZC1F)ui+I@^XK3GqY{mwh48*+?H3(
zZ46soY9S-motlyH;?>pF&(F_4URaZuc<}xAa_z>0(qCB`rFZ}EpS0xEw7YFF8`3`5
z?BB9w%ig_vw`{p$^D^uzlWXAOiy2#^B->`rne*q)b31bji-<~x)0^Iu?cN*r{?n(T
z@HEB{)m!&M1Z>Vnuf6t5pQZcZ!u_jy`}o2Vny=rzapT3!jGGAtf3D^Kz4rC>b<M6M
ziMC}g4?h%bOKfg#zI^$zwzl?N_XM899+TdPGS&Zm9smE{_kHGuhJl||7p#4CBlqK4
zmJ<bbH9rc<%gb|fb?2Y|?stHP)&D~G&eGS{tl#fByy?x!bS2R0vdM0XGppBJDJXgU
z;NnIHU9r<&zI-X)_j&I2dsVAvh4QsCKHk0I-oD<yXMWby6&4zTc6<C&_n%iX-`i5u
zZ~5oB<@XA$=DIB|tf-igdNZreOQ6NN{GCnJmld<`8S)&?nDy?_y_3@$;&zvvef~Le
z?X~yy|9{W_edqbA^T`Kgn0QpSoWB0iX8+EeJ9qD{et*x_QDE20?g-u<x5u~h_rLvG
z)!x>orlyvaVR`GujUUhJ|2-Fb(~%*`(ey%7Fy|0>Ev<*jr)K#-2VQ^m^YNKeRnf3;
znwg*ALC5m^{QS(!mlqeizk2yDi*bR7iJjfM5-WE%H@*0MYp%b}-Z$%Df`J52){Bjq
z91p(SwtnNP!2R}PMc2_s{&k-wzrMcy|F``A+YUdhm@3aED|w^&hS)c)uA+j15C3ml
zZ=bR4ltsh$EziHyS?|4{p(!G9l(!(~j?AW=8H++^o|Ek5_x`rwi2BX$^>#Bqyht*d
z>#x^ss3x^tb#mh`gEkAP=PonuESz#lTDNuD<!Rr({P=LaD`0`ai>dbyPulqI%j4hY
z{{Q}W|G$;3t*wm>&)wI?_HAn%mR}a-YCV_`urg%r+O_U|GB>AR7difM*REahbw8IH
zY+C<qwQP;tL5sRS6-pBqTz<JQAfu*cj*4gV!3m#)MEj2~%>8rgR@Aepp<Rpa<b6MK
z#3epnK4h}C&1-M30Fk4hQ|0!4x#TS?E357|CqQfJ@6r<sG`s?@oH^s8_r218a>}E+
zgYNNb*`I#?xq9{Lq>UC5JhyVT%{I@UHf745pU-B?@Uh!D28Jk2RNzpGek$%csbu$E
zx&Gzbx0e?cZ8Dg$_w7N0p7YN?SL}K9>ec7x=d(?uZUu+mxW}b8z5B7nmwL;N=jlC4
zopbLnaBVfanX}DnDHmv$(d*Ug_gPha$;i#s?R9J3X7#YZLOX6|hRLe}3&>L00F_R5
zj;4a0F^ex&e0vkQ_@c&Cui$N4Zn>)ndFJM>J^uJ)%Izby?0xr@KK@<D?ptQ{>|=%2
z)U2|yvW$!u&(6+Pn&`3kV#8d^&qrrI{<!1(^VxIf?ydj-@6aJ96(P-vmq!iOtX;cy
z=gvqKAx;*iR;P(Gr^KpFUKzH!(k3pB+u(?w*zJ$ri!TQ790n;ccx)lp=_12-yx;y`
zhS}`%b1WxU)<;>$l<mG-TwJ_8|GwT2m1-8pCq<Rs-rY_M16GDSefspbkm&a49Wm>C
z)QWfCwG7Hg+%DpL?|s;U*QfI|rh0+Tp!)WFetq4yo9SwkSMJ$kWBho52A9Z>jT<-a
z+h^Bt>dcw&bk>Cd5%KZ*6Fq#?f}^9O%i`_i`orVn-+%u3rmR}5ePcS;){JVa1zYNs
z_p1t>EIx00{lbL@MLX^Oe!1MvFCVwJ>g!~GyOReLJSN4&#QgdD{XRPz+qCJ^xlOr*
z90mB=7YD6OF`9WYrRwR_@Cutay&bkMN_K^4)k@VDl~w+7i}`ul{$-0rU;M5T&!s^Z
zGfZ~I{QGu0pP!%q`}gnncNQ<-yLayS=kga=SeY2-pa1@P{r+#?zl(2(_RtY~Y;o>s
zk$&v1kjTiHZyzpcIQ;O*N(N)l8CRiOn9gM6<+17B$}!_^b&~3BQ|O3WA0HF*=k50U
za&|Qm-ABE=y}yUI33s;Ke_w7Pvv)6O(tGZlIcLh{-MkEV4j0C(4V!OW{_apKxBk8#
zi@cVeIvdXGIHAIZ@7tlA{?u}VnP(oE#54YXkzc^-d(G1+@XMp4-Er%`Kc8R!@6&1h
zuT_7u*YDjHe@f1Cl3f4t-Mg!cij3CJp1OoZO#pPR)}LRm*K1GpQk$$f`DEE_jY)|C
zBDO_OPE1z!|Fn8T-bJbTlmDnszI^$z*U}&rp@StECbQP9TX)|6|DW0U`*zm-wd!@V
zuKl$oLg(J+l%tP7E?v6x=g*(_?%i9ta-~g`_oWuU<(ngP_Qu`6cQ4Oq<`l1`5jy9T
zPjk&a+xGSB^rf0@t2SDHy033<XUDgysK{-GPnx@X``NVLFPG0>7q>TS^UZVf?blz;
zD*qkilsS9l%9Sb3cWWwLFSQ(gc;(8KkgzZ>6`{3Z(Sd;zA3uK^=5SrKYtg5q#xAY*
z<=X#*6S@|0G%0u%{hhWfXr*4<o)alXyLa!-zq8}v*DCuT513n>CjJw5SlS0(Q^~mg
zdbGzht&I_Du4Y}$+Ildd;@i#iQmfc~?o3y&T|0OFe1Bixv!h0;$rHLH-HMBgKj{Z}
zcq#~}n47URFAUJg%FY(<JQ833_vz=KN0K%Mt$b3n^Y-gY6F!=oo0q@8w^!OcucxO+
z#An|+2L%mHO+%?(Sy|cld%xdX8N7Ve>eZjUqwjH<&EC6X$BP1sqWFe~hk~B`GR{5B
z`KH85majc(?Y#{fJyj+JFZc8H@v*7+@Zi_i*JZo^uHAmmNUB$J{tOS5u&^*2x&EtJ
zvQkn}v9V__|Csj`JZ0yiG%-SF-RjlZxw)aMud+sdYdroqQG)GmT(P8xD|gYq^3(s#
z3{FJfcyn>#<(H+UrQz#h0yRXo=iUAF?RI{v)5E8yr<=<kQt|v%v+v!zyo|_6<#wHC
z9eQTDDD5nG=;Z6mJNcvt2TP9GZiaKWr#N)g_pSC0IJRd-?|Dx%t_^qd=Fga6QTFCW
z+j+y8XHK0ub?)4`3Y)s$-*P7;b{}P0_qf95-S^*f{nGvY{TWmKG$zkEo7U~JXmk4c
zc@~9EYLi7hlYe)0b$O^1y}!424zEl5Rn4|%IWIrhGq(NPShl-%GXryt-TkDE6?^YB
zHa2EoUk6&i{r}(Z0M*Cs@^v37Yz`(E++RA~Z~5zzUDM<1A{7KWjy}3{>5@tI5mB}h
ztGTD2KKpT3s3>8xZP%QH$5yIBonHlIr4Jaa*|;%q`Q_N%Wp8)C-?zK;b(pvJ>A%0f
zb2f3XFzsAZ@|I(+U;F2<g#lBhP7MtWO|5ph$tX0b_-9SCp3+S@*L5H4wM_*0+67rI
zzAVxF&wPdVa@JN`TU)vQ<;$0|XPtWbvbE)4!j$RLXP<rc_;~+(%i?8=7cXA4$VqK-
zYIV#_MbO$Z!$~TOFTVKmr-n}}c!7dURsW<VH%=yPoN@SJ$I+zOYil}Plq7iUs=j0t
z6kJF!xR7Bo|Gc@SrKdnk)Y@~4XB4qG9%$8`ep*+oJ8u2?ZylT59>qWUSaB&e!DuEM
zJA3hlJ8u@g&0<~|I&te((<X%$r^KR79+Ol&H!WZO{NiHw)2B}_4De793JnW8ckW!(
z-niX&`#`%)&wf1F`fp{7UiiX*Js*!rr){2jHtl)ll`Fwa%d>UFq9Y?4yR-xT@ca{<
zW%DFq=S^qEdk$HQWxMZ&t=_tFW#{?l$CC{A-{(L3X8N>gll)>&<h4Gr054|@5Se;1
zW#_ZG&4rsHbo8d*zHlL7W5l<OtCH_I241;!OKMAPYq0s!4kL-4U%#qWU-c4jd3VA<
z<()j&hpHg%0voybxHvZbcVE7g)Yt$2@uT9@)|RTHDxSao*3CXE#@B8t<?A;$)1oh0
zN38jv0%&l3rHhiL)63RuUC&9KE=p3p?<6u<HJ2`4ym)7@zSOH{A`frno}YGYkB18A
zNVVhr^3^u;0yMT<o#Z@$x%=oLgEMc-s`th%4482@S7^_l8oT{#*RI{QD{A%CS0z?r
z-BZ_oOyLgbd^n@`Xp*LL$+pIW2_GtKtme*BUU^I4XHCVQyaaE?ZxuFXv(LUPd6#k0
zL7?UQ^K6q@1r{<KEX$THTXb<njNbV!ZKw6u!^6Y1MY!hrg$D;uKGUZobnSi`XH&(j
zM=m*LyLauHrNFWL^3MIucbHEudV5pO<J`ulJq@<XwY{yUpH7=H#Xy1wycIya|G2l8
zm(j8h8dJR{pH!LZHG9^qGik+z>r6Z+u{b*1=XsF1Zhe57uZGC3n00I7o4Yq>R9oeI
zUt}!7!*f_^qKDOM_Sd}67PYprURXMR8N-1-hZBr_kBw%oSqTE33IYidVIkL-e09_O
zAo6DM23707so~vix8E;(Z?1Zq)1YgSP$$be|Ccu0Dy#PFk#Q8*eb>&;E-ol&QGmv`
z?=~$$HBJiy)`so>{ciWtrAvSR_^_v8k%qu^7bV7PE16Gye^g+Rw%JmE<II^eGDkjN
zX0d9Qk=w6%({al;ffBB=mjxDYzW+{7PgkDk;pusD-;Zev$~OnC+!CQPb?VfDL-v1@
zKeGy4PdVbs7F4F#IqCd!`7h$R=NEq3!)wi9kW=@tz~Zk(`U<Xq$Vf|L<Hu#Y=lU&o
zQM$R=kn@VVh--Fsc6{yEQ1CF;(x6N|_Sf(97YMm84$L^=^Vd6VpRDV}H`8Xj27dYQ
z<8Iqr)(EYs@60xfI|`W1jtvWQ6KJ{rzW&n_&&bG`AzF9eTJ|J@x_>V(FAvq48n=G>
z>8D3GPe^FxYCr7xKQ~Or^THoj6(L=H{q%pgGuw~9_j^-g-D+IG-=AhQbLrBh5^cJA
zdVapXYuB!QS+c7{q{wc5{hJ$xi!XBAtO*SbJ(@hd$vW3Yj=k0L^}(JA*MdSrQ@2JL
z+fPc$tD2&g>8a-PDDOIdl8uE7Ut6Nz^5VimL8F}Db?h4VvbIJo4VpT2>Rg-3qT<x~
zPO6f5j+0V!#7=Jw>)~Nz7HTkx*LZ#5#~$7dN0+M}a9U^})w}Gae-!_XrKv`d8X{aQ
zOp_;1j#?Y0Hd!@)gTi*MR;J^HD{3cZFTBF;V&tPW(PPcJbxvMs+t|0N8qM@s8l<bE
zlTwu4xT*B$;+mzNHgf$gN*6P(<ZbWo?Om#RAjfR>(WJLORVSZJ;bBV#cT&%vJ==OX
zaju`b0LKr|IJ14_r<CB}$uWA*3oKk0>8J{Ep7`Dr)0A^J|1x{j{3@GyKHDx@==|O@
zGr_>Z#3W_&O{Iw^PMzX%6nGhzr`T7MFKy7IV9~cZLZ@rdp1Ae5-<r)nJ9q9}x5X2W
zCVi~f(`0Vkbhn~bD!}NO;caJc@fDjV8#1ODNcgC2-f+LGWPyg3T)*|)b1z?JuD`x}
z%a$kKe_!98<nP@e$RfzWGQoP+?ee~Z2^l6*%?A~PI++>|+}&N?-rnx+1_G;Aty&V4
z$*3>n5_90Gm`>2n`20%kB|$6wmd~C$_iPW3^RqI>{nFM>0xhSXM#aT_`&nZ)J2x>=
zQG{#le}?(zj~CX6s~f!Oc*$Wf*H4^-MW~bIc;U{Nd$(>yg@?N@zs%W`k&*G^QMZ0o
zb@lFDyXN>kZ<nt-@vz|Ar{7Nv5+*j;JNlpd*71#TV?>UDM2ucI8_SN2rPCL!NHEy3
zXO9dI+uJhhEkQR79{Q*SuME)=;p%lO{_^5t39Cd_W!`$%WC=D$@u;<8TKk=6C0-Ia
z__XNu?c13qv#w^9TF8{YyHokFRa{D1np;7O`J3sE3vGuJPs}~1uUhGDA>;RJ_K$gt
z*8>Azo=*^FEKZA$zklWo&x`Df877*0ZRGeLU21iDX#IZA=Rcp%Zw8%z-x{;CNcHS`
zMNcCE4jnzcydRcJsyJH@O`A6D*fF=AG5czlgqTSE{rCI*?p?do)YZMcz3qR!Q0_iD
zX}J-P(kv4mrz5W0*h>RioEBcZcoB5uro(SD2ay27C;7o5qMa^>9)9@qN2lvU5(5Ln
zTu&FrkUs|wI2cIqv^q&Yn|Ami2Md$l^lYP<dhz>ork^g9iO;`%+qH$QC3M|?PVf;6
z*R!@dEeuc*dinKN$**}EBi0l@Kc_kMRKNYd8(CXHdl%Ivznqyq@1*`@<&upaS*JHf
zteL=fdrHC181P9eRRv5jHm<ihn>IwP)tk=!=H%zkpJ&@2PBBsy@=Q-(?x8a2^wXzL
epU$@bY%bTB&}OFb@d^V21B0ilpUXO@geCwy&rk;d

diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel-map/no-Makefile b/src/TNL/Legacy/hamilton-jacobi-parallel-map/no-Makefile
deleted file mode 100644
index bfdc1ef23..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel-map/no-Makefile
+++ /dev/null
@@ -1,41 +0,0 @@
-TNL_VERSION=0.1
-TNL_INSTALL_DIR=${HOME}/local/lib
-TNL_INCLUDE_DIR=${HOME}/local/include/tnl-${TNL_VERSION}
-
-TARGET = hamiltonJacobiParallelSolver
-#CONFIG_FILE = $(TARGET).cfg.desc
-INSTALL_DIR = ${HOME}/local
-CXX = g++
-CUDA_CXX = nvcc
-OMP_FLAGS = -DHAVE_OPENMP -fopenmp
-CXX_FLAGS = -std=gnu++0x -I$(TNL_INCLUDE_DIR) -O3 $(OMP_FLAGS) -DDEBUG
-LD_FLAGS = -L$(TNL_INSTALL_DIR) -ltnl-0.1 -lgomp
-
-SOURCES = main.cpp
-HEADERS = 
-OBJECTS = main.o
-DIST = $(SOURCES) Makefile
-
-all: $(TARGET)
-clean: 
-	rm -f $(OBJECTS)
-	rm -f $(TARGET)-conf.h	
-
-dist: $(DIST)
-	tar zcvf $(TARGET).tgz $(DIST) 
-
-install: $(TARGET)
-	cp $(TARGET) $(INSTALL_DIR)/bin
-	cp $(CONFIG_FILE) $(INSTALL_DIR)/share
-
-uninstall: $(TARGET)
-	rm -f $(INSTALL_DIR)/bin/$(TARGET) 
-	rm -f $(CONFIG_FILE) $(INSTALL_DIR)/share
-
-$(TARGET): $(OBJECTS)
-	$(CXX) -o $(TARGET) $(OBJECTS) $(LD_FLAGS)
-
-%.o: %.cpp $(HEADERS)
-	$(CXX) -c -o $@ $(CXX_FLAGS) $<
-
-
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel-map/parallelMapConfig.h b/src/TNL/Legacy/hamilton-jacobi-parallel-map/parallelMapConfig.h
deleted file mode 100644
index c07ee95aa..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel-map/parallelMapConfig.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/***************************************************************************
-                          parallelMapConfig.h  -  description
-                             -------------------
-    begin                : Mar 22 , 2016
-    copyright            : (C) 2016 by Tomas Sobotik
-    email                :
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef HAMILTONJACOBIPARALLELMAPPROBLEMCONFIG_H_
-#define HAMILTONJACOBIPARALLELMAPPROBLEMCONFIG_H_
-
-#include <config/tnlConfigDescription.h>
-
-template< typename ConfigTag >
-class parallelMapConfig
-{
-   public:
-      static void configSetup( tnlConfigDescription& config )
-      {
-         config.addDelimiter( "Parallel Eikonal solver settings:" );
-         config.addEntry        < String > ( "problem-name", "This defines particular problem.", "hamilton-jacobi-parallel" );
-         config.addEntry       < String > ( "scheme", "This defines scheme used for discretization.", "godunov" );
-         config.addEntryEnum( "godunov" );
-         config.addEntryEnum( "upwind" );
-         config.addRequiredEntry        < String > ( "initial-condition", "Initial condition for solver");
-         config.addRequiredEntry        < String > ( "map", "Gradient map for solver");
-         config.addEntry       < String > ( "mesh", "Name of mesh.", "mesh.tnl" );
-         config.addEntry        < double > ( "epsilon", "This defines epsilon for smoothening of sign().", 0.0 );
-         config.addEntry        < double > ( "delta", " Allowed difference on subgrid boundaries", 0.0 );
-         config.addRequiredEntry        < double > ( "stop-time", " Final time for solver");
-         config.addRequiredEntry        < double > ( "initial-tau", " initial tau for solver" );
-         config.addEntry        < double > ( "cfl-condition", " CFL condition", 0.0 );
-         config.addEntry        < int > ( "subgrid-size", "Subgrid size.", 16 );
-         config.addRequiredEntry        < int > ( "dim", "Dimension of problem.");
-      }
-};
-
-#endif /* HAMILTONJACOBIPARALLELMAPPROBLEMCONFIG_H_ */
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel-map/run b/src/TNL/Legacy/hamilton-jacobi-parallel-map/run
deleted file mode 100755
index 484419962..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel-map/run
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-dimensions=2
-
-size=2
-
-time=50
-
-rm -r work_dir
-mkdir work_dir
-cp mapa_png.png work_dir/mapa_png.png
-cd  work_dir
-
-tnl-image-converter 		--image-format png\
-		    		--input-images mapa_png.png
-
-
-tnl-init 			--test-function sdf-para \
-	     			--x-centre 0.5 \
-	    			--y-centre 1.0 \
- 	   			--offset 0.05 \
-           			--output-file init.tnl \
-	     			--final-time 0.0 \
-	     			--snapshot-period 0.1
-
-hamilton-jacobi-parallel-map-dbg 	--initial-condition init.tnl \
-				--map mapa_png.tnl \
-              			--cfl-condition 50 \
-	      	  		--mesh mesh.tnl \
-	     	  		--initial-tau 1.0e-3 \
-	      	  		--epsilon 4.0 \
-        	  		--delta 0.0 \
-       	      			--stop-time $time \
-	          		--scheme godunov \
-	          		--subgrid-size 8 \
-		  		--dim $dimensions
-
-	
-#cp ../template.dat1 template.dat1
-#cp ../template.dat2 template.dat2
-#cp ../gplt2eps.py gplt2eps.py
-cd ..
-
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel-map/tnl-err2eoc-2.py b/src/TNL/Legacy/hamilton-jacobi-parallel-map/tnl-err2eoc-2.py
deleted file mode 100755
index f8cde3768..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel-map/tnl-err2eoc-2.py
+++ /dev/null
@@ -1,141 +0,0 @@
-#!/usr/bin/env python
-
-import sys, string, math
-
-arguments = sys. argv[1:]
-format = "txt"
-output_file_name = "eoc-table.txt"
-input_files = []
-verbose = 1
-size = 1.0
-
-i = 0
-while i < len( arguments ):
-   if arguments[ i ] == "--format":
-      format = arguments[ i + 1 ]
-      i = i + 2
-      continue
-   if arguments[ i ] == "--output-file":
-      output_file_name = arguments[ i + 1 ]
-      i = i + 2
-      continue
-   if arguments[ i ] == "--verbose":
-       verbose = float( arguments[ i + 1 ] )
-       i = i +2
-       continue
-   if arguments[ i ] == "--size":
-       size = float( arguments[ i + 1 ] )
-       i = i +2
-       continue
-   input_files. append( arguments[ i ] )
-   i = i + 1
-
-if not verbose == 0:
-   print "Writing to " + output_file_name + " in " + format + "."
-
-h_list = []
-l1_norm_list = []
-l2_norm_list = []
-max_norm_list = []
-items = 0
-
-for file_name in input_files:
-   if not verbose == 0:
-       print "Processing file " + file_name
-   file = open( file_name, "r" )
-   
-   l1_max = 0.0
-   l_max_max = 0.0
-   file.readline();
-   file.readline();
-   for line in file. readlines():
-         data = string. split( line )
-         h_list. append( size/(float(file_name[0:len(file_name)-5] ) - 1.0) )
-         l1_norm_list. append( float( data[ 1 ] ) )
-         l2_norm_list. append( float( data[ 2 ] ) )
-         max_norm_list. append( float( data[ 3 ] ) )
-         items = items + 1
-         if not verbose == 0:
-            print line
-   file. close()
-
-h_width = 12
-err_width = 15
-file = open( output_file_name, "w" )
-if format == "latex":
-      file. write( "\\begin{tabular}{|r|l|l|l|l|l|l|}\\hline\n" )
-      file. write( "\\raisebox{-1ex}[0ex]{$h$}& \n" )
-      file. write( "\\multicolumn{2}{|c|}{\\raisebox{1ex}[3.5ex]{$\\left\| \\cdot \\right\\|_{L_1\\left(\\omega_h;\\left[0,T\\right]\\right)}^{h,\\tau}$}}& \n" )
-      file. write( "\\multicolumn{2}{|c|}{\\raisebox{1ex}[3.5ex]{$\\left\| \\cdot \\right\\|_{L_2\\left(\\omega_h;\left[0,T\\right]\\right)}^{h,\\tau}$}}& \n" )
-      file. write( "\\multicolumn{2}{|c|}{\\raisebox{1ex}[3.5ex]{$\\left\| \\cdot \\right\\|_{L_\\infty\\left(\\omega_h;\\left[0,T\\right]\\right)}^{h,\\tau}$}}\\\\ \\cline{2-7} \n" )
-      file. write( " " + string. rjust( " ", h_width ) + "&" +
-                string. rjust( "Error", err_width ) + "&" +
-                string. rjust( "{\\bf EOC}", err_width ) + "&" +
-                string. rjust( "Error", err_width ) + "&" +
-                string. rjust( "{\\bf EOC}", err_width ) + "&" +
-                string. rjust( "Error.", err_width ) + "&" +
-                string. rjust( "{\\bf EOC}", err_width ) +
-                "\\\\ \\hline \\hline \n")
-if format == "txt":
-    file. write( "+--------------+----------------+----------------+----------------+----------------+----------------+----------------+\n" )
-    file. write( "|       h      |     L1 Err.    |     L1 EOC.    |     L2 Err.    |      L2 EOC    |    MAX Err.    |     MAX EOC    |\n" )
-    file. write( "+==============+================+================+================+================+================+================+\n" )
-                  
-
-i = 0
-while i < items:
-   if i == 0:
-      if format == "latex":
-         file. write( " " + string. ljust( str( h_list[ i ] ), h_width ) + "&" +
-                      string. rjust( "%.2g" % l1_norm_list[ i ], err_width ) + "&" + 
-                      string. rjust( " ", err_width ) + "&"+ 
-                      string. rjust( "%.2g" % l2_norm_list[ i ], err_width ) + "&" +
-                      string. rjust( " ", err_width ) + "&" +
-                      string. rjust( "%.2g" % max_norm_list[ i ], err_width ) + "&" +
-                      string. rjust( " ", err_width ) + "\\\\\n" )
-      if format == "txt":
-         file. write( "| " + string. ljust( str( h_list[ i ] ), h_width ) + " |" + 
-                      string. rjust( "%.2g" % l1_norm_list[ i ], err_width ) + " |" +
-                      string. rjust( " ", err_width ) + " |" +
-                      string. rjust( "%.2g" % l2_norm_list[ i ], err_width ) + " |" +
-                      string. rjust( " ", err_width ) + " |" +
-                      string. rjust( "%.2g" % max_norm_list[ i ], err_width ) + " |" +
-                      string. rjust( " ", err_width ) + " |\n" )
-         file. write( "+--------------+----------------+----------------+----------------+----------------+----------------+----------------+\n" )
-      i = i + 1;
-      continue
-   if h_list[ i ] == h_list[ i - 1 ]:
-      print "Unable to count eoc since h[ " + \
-      str( i ) + " ] = h[ " + str( i - 1 ) + \
-      " ] = " + str( h_list[ i ] ) + ". \n"
-      file. write( " eoc error:  h[ " + \
-      str( i ) + " ] = h[ " + str( i - 1 ) + \
-      " ] = " + str( h_list[ i ] ) + ". \n" )
-   else:
-      h_ratio = math. log( h_list[ i ] / h_list[ i - 1 ] )
-      l1_ratio = math. log( l1_norm_list[ i ] / l1_norm_list[ i - 1 ] )
-      l2_ratio = math. log( l2_norm_list[ i ] / l2_norm_list[ i - 1 ] )
-      max_ratio = math. log( max_norm_list[ i ] / max_norm_list[ i - 1 ] )
-      if format == "latex":
-         file. write( " " + string. ljust( str( h_list[ i ] ), h_width ) + "&" +
-                      string. rjust( "%.2g" % l1_norm_list[ i ], err_width ) + "&" +
-                      string. rjust( "{\\bf " + "%.2g" % ( l1_ratio / h_ratio ) + "}", err_width ) + "&" +
-                      string. rjust( "%.2g" % l2_norm_list[ i ], err_width ) + "&" +
-                      string. rjust( "{\\bf " + "%.2g" % ( l2_ratio / h_ratio ) + "}", err_width ) + "&" +
-                      string. rjust( "%.2g" % max_norm_list[ i ], err_width ) + "&" +
-                      string. rjust( "{\\bf " + "%.2g" % ( max_ratio / h_ratio ) + "}", err_width ) + "\\\\\n" )
-      if format == "txt":
-         file. write( "| " + string. ljust( str( h_list[ i ] ), h_width ) + " |" +
-                      string. rjust( "%.2g" % l1_norm_list[ i ], err_width ) + " |" +
-                      string. rjust( "**" + "%.2g" % ( l1_ratio / h_ratio ) + "**", err_width ) + " |" +
-                      string. rjust( "%.2g" % l2_norm_list[ i ], err_width ) + " |" +
-                      string. rjust( "**" + "%.2g" % ( l2_ratio / h_ratio ) + "**", err_width ) + " |" +
-                      string. rjust( "%.2g" % max_norm_list[ i ], err_width ) + " |" +
-                      string. rjust( "**" + "%.2g" % ( max_ratio / h_ratio ) + "**", err_width ) + " |\n" )
-         file. write( "+--------------+----------------+----------------+----------------+----------------+----------------+----------------+\n" )
-   i = i + 1
-
-if format == "latex":
-   file. write( "\\hline \n" )
-   file. write( "\\end{tabular} \n" )
-    
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel-map/tnlParallelMapSolver.h b/src/TNL/Legacy/hamilton-jacobi-parallel-map/tnlParallelMapSolver.h
deleted file mode 100644
index 400e163c9..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel-map/tnlParallelMapSolver.h
+++ /dev/null
@@ -1,217 +0,0 @@
-/***************************************************************************
-                          tnlParallelMapSolver.h  -  description
-                             -------------------
-    begin                : Mar 22 , 2016
-    copyright            : (C) 2016 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef TNLPARALLELMAPSOLVER_H_
-#define TNLPARALLELMAPSOLVER_H_
-
-#include <TNL/Config/ParameterContainer.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Containers/StaticVector.h>
-#include <functions/tnlMeshFunction.h>
-#include <TNL/Devices/Host.h>
-#include <mesh/tnlGrid.h>
-#include <mesh/grids/tnlGridEntity.h>
-#include <limits.h>
-#include <core/tnlDevice.h>
-
-
-#include <ctime>
-
-#ifdef HAVE_CUDA
-#include <core/tnlCuda.h>
-#endif
-
-
-template< int Dimension,
-		  typename SchemeHost,
-		  typename SchemeDevice,
-		  typename Device,
-		  typename RealType = double,
-          typename IndexType = int >
-class tnlParallelMapSolver
-{};
-
-template<typename SchemeHost, typename SchemeDevice, typename Device>
-class tnlParallelMapSolver<2, SchemeHost, SchemeDevice, Device, double, int >
-{
-public:
-
-	typedef SchemeDevice SchemeTypeDevice;
-	typedef SchemeHost SchemeTypeHost;
-	typedef Device DeviceType;
-	typedef TNL::Containers::Vector< double, TNL::Devices::Host, int > VectorType;
-	typedef TNL::Containers::Vector< int, TNL::Devices::Host, int > IntVectorType;
-	typedef tnlGrid< 2, double, TNL::Devices::Host, int > MeshType;
-#ifdef HAVE_CUDA
-	typedef TNL::Containers::Vector< double, TNL::Devices::Host, int > VectorTypeCUDA;
-	typedef TNL::Containers::Vector< int, TNL::Devices::Host, int > IntVectorTypeCUDA;
-	typedef tnlGrid< 2, double, TNL::Devices::Host, int > MeshTypeCUDA;
-#endif
-	tnlParallelMapSolver();
-	bool init( const Config::ParameterContainer& parameters );
-	void run();
-
-	void test();
-
-/*private:*/
-
-
-	void synchronize();
-
-	int getOwner( int i) const;
-
-	int getSubgridValue( int i ) const;
-
-	void setSubgridValue( int i, int value );
-
-	int getBoundaryCondition( int i ) const;
-
-	void setBoundaryCondition( int i, int value );
-
-	void stretchGrid();
-
-	void contractGrid();
-
-	VectorType getSubgrid( const int i ) const;
-
-	void insertSubgrid( VectorType u, const int i );
-
-	VectorType runSubgrid( int boundaryCondition, VectorType u, int subGridID,VectorType map);
-
-
-	tnlMeshFunction<MeshType> u0;
-	VectorType work_u, map_stretched, map;
-	IntVectorType subgridValues, boundaryConditions, unusedCell, calculationsCount;
-	MeshType mesh, subMesh;
-
-//	tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage > Entity;
-
-	SchemeHost schemeHost;
-	SchemeDevice schemeDevice;
-	double delta, tau0, stopTime,cflCondition;
-	int gridRows, gridCols, gridLevels, currentStep, n;
-
-	std::clock_t start;
-	double time_diff;
-
-
-	tnlDeviceEnum device;
-
-	tnlParallelMapSolver<2, SchemeHost, SchemeDevice, Device, double, int >* getSelf()
-	{
-		return this;
-	};
-
-#ifdef HAVE_CUDA
-
-	tnlParallelMapSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver;
-
-	double* work_u_cuda;
-	double* map_stretched_cuda;
-
-	int* subgridValues_cuda;
-	int* boundaryConditions_cuda;
-	int* unusedCell_cuda;
-	int* calculationsCount_cuda;
-	double* tmpw;
-	double* tmp_map;
-
-
-	int* runcuda;
-	int run_host;
-
-
-	__device__ void getSubgridCUDA2D( const int i, tnlParallelMapSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller, double* a);
-
-	__device__ void updateSubgridCUDA2D( const int i, tnlParallelMapSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller, double* a);
-
-	__device__ void insertSubgridCUDA2D( double u, const int i );
-
-	__device__ void runSubgridCUDA2D( int boundaryCondition, double* u, int subGridID);
-
-	__device__ int getOwnerCUDA2D( int i) const;
-
-	__device__ int getSubgridValueCUDA2D( int i ) const;
-
-	__device__ void setSubgridValueCUDA2D( int i, int value );
-
-	__device__ int getBoundaryConditionCUDA2D( int i ) const;
-
-	__device__ void setBoundaryConditionCUDA2D( int i, int value );
-
-#endif
-
-};
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-#ifdef HAVE_CUDA
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void runCUDA2D(tnlParallelMapSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller);
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void initRunCUDA2D(tnlParallelMapSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller);
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void initCUDA2D( tnlParallelMapSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr, int * ptr2, int* ptr3, double* tmp_map_ptr);
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void synchronizeCUDA2D(tnlParallelMapSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void synchronize2CUDA2D(tnlParallelMapSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
-
-
-
-__device__
-double fabsMin( double x, double y)
-{
-	double fx = abs(x);
-
-	if(Min(fx,abs(y)) == fx)
-		return x;
-	else
-		return y;
-}
-
-__device__
-double atomicFabsMin(double* address, double val)
-{
-	unsigned long long int* address_as_ull =
-						  (unsigned long long int*)address;
-	unsigned long long int old = *address_as_ull, assumed;
-	do {
-		assumed = old;
-			old = atomicCAS(address_as_ull, assumed,__double_as_longlong( fabsMin(__longlong_as_double(assumed),val) ));
-	} while (assumed != old);
-	return __longlong_as_double(old);
-}
-
-#endif
-
-#include "tnlParallelMapSolver2D_impl.h"
-#endif /* TNLPARALLELMAPSOLVER_H_ */
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel-map/tnlParallelMapSolver2D_impl.h b/src/TNL/Legacy/hamilton-jacobi-parallel-map/tnlParallelMapSolver2D_impl.h
deleted file mode 100644
index e8cbc6fc1..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel-map/tnlParallelMapSolver2D_impl.h
+++ /dev/null
@@ -1,1315 +0,0 @@
-/***************************************************************************
-                          tnlParallelMapSolver2D_impl.h  -  description
-                             -------------------
-    begin                : Mar 22 , 2016
-    copyright            : (C) 2016 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef TNLPARALLELMAPSOLVER2D_IMPL_H_
-#define TNLPARALLELMAPSOLVER2D_IMPL_H_
-
-
-#include "tnlParallelMapSolver.h"
-#include <core/mfilename.h>
-
-
-
-
-#define MAP_SOLVER_MAX_VALUE 3
-
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::tnlParallelMapSolver()
-{
-	this->device = TNL::Devices::HostDevice;  /////////////// tnlCuda Device --- vypocet na GPU, TNL::Devices::HostDevice   ---    vypocet na CPU
-
-#ifdef HAVE_CUDA
-	if(this->device == tnlCudaDevice)
-	{
-	run_host = 1;
-	}
-#endif
-
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::test()
-{
-/*
-	for(int i =0; i < this->subgridValues.getSize(); i++ )
-	{
-		insertSubgrid(getSubgrid(i), i);
-	}
-*/
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-
-bool tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::init( const Config::ParameterContainer& parameters )
-{
-	cout << "Initializating solver..." <<std::endl;
-	const String& meshLocation = parameters.getParameter <String>("mesh");
-	this->mesh.load( meshLocation );
-
-	this->n = parameters.getParameter <int>("subgrid-size");
-	cout << "Setting N to " << this->n <<std::endl;
-
-	this->subMesh.setDimensions( this->n, this->n );
-	this->subMesh.setDomain( Containers::StaticVector<2,double>(0.0, 0.0),
-							 Containers::StaticVector<2,double>(mesh.template getSpaceStepsProducts< 1, 0 >()*(double)(this->n), mesh.template getSpaceStepsProducts< 0, 1 >()*(double)(this->n)) );
-
-	this->subMesh.save("submesh.tnl");
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	this->u0.load( initialCondition );
-
-	/* LOAD MAP */
-	const String& mapFile = parameters.getParameter <String>("map");
-	if(! this->map.load( mapFile ))
-		cout << "Failed to load map file : " << mapFile <<std::endl;
-
-
-	this->delta = parameters.getParameter <double>("delta");
-	this->delta *= mesh.template getSpaceStepsProducts< 1, 0 >()*mesh.template getSpaceStepsProducts< 0, 1 >();
-
-	cout << "Setting delta to " << this->delta <<std::endl;
-
-	this->tau0 = parameters.getParameter <double>("initial-tau");
-	cout << "Setting initial tau to " << this->tau0 <<std::endl;
-	this->stopTime = parameters.getParameter <double>("stop-time");
-
-	this->cflCondition = parameters.getParameter <double>("cfl-condition");
-	this -> cflCondition *= sqrt(mesh.template getSpaceStepsProducts< 1, 0 >()*mesh.template getSpaceStepsProducts< 0, 1 >());
-	cout << "Setting CFL to " << this->cflCondition <<std::endl;
-
-	stretchGrid();
-	this->stopTime /= (double)(this->gridCols);
-	this->stopTime *= (1.0+1.0/((double)(this->n) - 2.0));
-	cout << "Setting stopping time to " << this->stopTime <<std::endl;
-
-	cout << "Initializating scheme..." <<std::endl;
-	if(!this->schemeHost.init(parameters))
-	{
-		cerr << "SchemeHost failed to initialize." <<std::endl;
-		return false;
-	}
-	cout << "Scheme initialized." <<std::endl;
-
-	test();
-
-	VectorType* tmp = new VectorType[subgridValues.getSize()];
-	bool containsCurve = false;
-
-#ifdef HAVE_CUDA
-
-	if(this->device == tnlCudaDevice)
-	{
-		cudaMalloc(&(this->cudaSolver), sizeof(tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int >));
-		cudaMemcpy(this->cudaSolver, this,sizeof(tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int >), cudaMemcpyHostToDevice);
-
-		double** tmpdev = NULL;
-		cudaMalloc(&tmpdev, sizeof(double*));
-		cudaMalloc(&(this->tmpw), this->work_u.getSize()*sizeof(double));
-		cudaMalloc(&(this->tmp_map), this->map_stretched.getSize()*sizeof(double));
-		cudaMalloc(&(this->runcuda), sizeof(int));
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-
-		int* tmpUC;
-		cudaMalloc(&(tmpUC), this->work_u.getSize()*sizeof(int));
-		cudaMemcpy(tmpUC, this->unusedCell.getData(), this->unusedCell.getSize()*sizeof(int), cudaMemcpyHostToDevice);
-
-		initCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver, (this->tmpw), (this->runcuda),tmpUC, tmp_map);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-
-		double* tmpu = NULL;
-		cudaMemcpy(&tmpu, tmpdev,sizeof(double*), cudaMemcpyDeviceToHost);
-		cudaMemcpy((this->tmpw), this->work_u.getData(), this->work_u.getSize()*sizeof(double), cudaMemcpyHostToDevice);
-		cudaMemcpy((this->tmp_map), this->map_stretched.getData(), this->map_stretched.getSize()*sizeof(double), cudaMemcpyHostToDevice);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-
-	}
-#endif
-
-	if(this->device == TNL::Devices::HostDevice)
-	{
-		VectorType tmp_map;
-		tmp_map.setSize(this->n * this->n);
-		for(int i = 0; i < this->subgridValues.getSize(); i++)
-		{
-
-			if(! tmp[i].setSize(this->n * this->n))
-				cout << "Could not allocate tmp["<< i <<"] array." <<std::endl;
-				tmp[i] = getSubgrid(i);
-			containsCurve = false;
-
-			for(int j = 0; j < tmp[i].getSize(); j++)
-			{
-				if(tmp[i][0]*tmp[i][j] <= 0.0)
-				{
-					containsCurve = true;
-					j=tmp[i].getSize();
-				}
-
-			}
-			if(containsCurve)
-			{
-				for( int j = 0; j < tmp_map.getSize(); j++)
-				{
-					tmp_map[j] = this->map_stretched[ (i / this->gridCols) * this->n*this->n*this->gridCols
-										 + (i % this->gridCols) * this->n
-										 + (j/this->n) * this->n*this->gridCols
-										 + (j % this->n) ];
-				}
-				//cout << "Computing initial SDF on subgrid " << i << "." <<std::endl;
-				tmp[i] = runSubgrid(0, tmp[i],i,tmp_map);
-				insertSubgrid(tmp[i], i);
-				setSubgridValue(i, 4);
-				//cout << "Computed initial SDF on subgrid " << i  << "." <<std::endl;
-			}
-			containsCurve = false;
-
-		}
-	}
-#ifdef HAVE_CUDA
-	else if(this->device == tnlCudaDevice)
-	{
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		dim3 threadsPerBlock(this->n, this->n);
-		dim3 numBlocks(this->gridCols,this->gridRows);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		initRunCUDA2D<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-
-	}
-#endif
-
-
-	this->currentStep = 1;
-	if(this->device == TNL::Devices::HostDevice)
-		synchronize();
-#ifdef HAVE_CUDA
-	else if(this->device == tnlCudaDevice)
-	{
-		dim3 threadsPerBlock(this->n, this->n);
-		dim3 numBlocks(this->gridCols,this->gridRows);
-
-		synchronizeCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		synchronize2CUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-	}
-
-#endif
-	cout << "Solver initialized." <<std::endl;
-
-	return true;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::run()
-{
-	if(this->device == TNL::Devices::HostDevice)
-	{
-		while ((this->boundaryConditions.max() > 0 )/* || !end*/)
-		{
-
-#ifdef HAVE_OPENMP
-#pragma omp parallel for num_threads(4) schedule(dynamic)
-#endif
-			for(int i = 0; i < this->subgridValues.getSize(); i++)
-			{
-				if(getSubgridValue(i) != INT_MAX)
-				{
-					VectorType tmp, tmp_map;
-					tmp.setSize(this->n * this->n);
-					tmp_map.setSize(this->n * this->n);
-					for( int j = 0; j < tmp_map.getSize(); j++)
-					{
-						tmp_map[j] = this->map_stretched[ (i / this->gridCols) * this->n*this->n*this->gridCols
-											 + (i % this->gridCols) * this->n
-											 + (j/this->n) * this->n*this->gridCols
-											 + (j % this->n) ];
-					}
-
-					if(getSubgridValue(i) == currentStep+4)
-					{
-
-						if(getBoundaryCondition(i) & 1)
-						{
-							tmp = getSubgrid(i);
-							tmp = runSubgrid(1, tmp ,i,tmp_map);
-							insertSubgrid( tmp, i);
-							this->calculationsCount[i]++;
-						}
-						if(getBoundaryCondition(i) & 2)
-						{
-							tmp = getSubgrid(i);
-							tmp = runSubgrid(2, tmp ,i,tmp_map);
-							insertSubgrid( tmp, i);
-							this->calculationsCount[i]++;
-						}
-						if(getBoundaryCondition(i) & 4)
-						{
-							tmp = getSubgrid(i);
-							tmp = runSubgrid(4, tmp ,i,tmp_map);
-							insertSubgrid( tmp, i);
-							this->calculationsCount[i]++;
-						}
-						if(getBoundaryCondition(i) & 8)
-						{
-							tmp = getSubgrid(i);
-							tmp = runSubgrid(8, tmp ,i,tmp_map);
-							insertSubgrid( tmp, i);
-							this->calculationsCount[i]++;
-						}
-					}
-					else
-					{
-
-						if(getBoundaryCondition(i) == 1)
-						{
-							tmp = getSubgrid(i);
-							tmp = runSubgrid(1, tmp ,i,tmp_map);
-							insertSubgrid( tmp, i);
-							this->calculationsCount[i]++;
-						}
-						if(getBoundaryCondition(i) == 2)
-						{
-							tmp = getSubgrid(i);
-							tmp = runSubgrid(2, tmp ,i,tmp_map);
-							insertSubgrid( tmp, i);
-							this->calculationsCount[i]++;
-						}
-						if(getBoundaryCondition(i) == 4)
-						{
-							tmp = getSubgrid(i);
-							tmp = runSubgrid(4, tmp ,i,tmp_map);
-							insertSubgrid( tmp, i);
-							this->calculationsCount[i]++;
-						}
-						if(getBoundaryCondition(i) == 8)
-						{
-							tmp = getSubgrid(i);
-							tmp = runSubgrid(8, tmp ,i,tmp_map);
-							insertSubgrid( tmp, i);
-							this->calculationsCount[i]++;
-						}
-					}
-
-					if(getBoundaryCondition(i) & 3)
-					{
-						//cout << "3 @ " << getBoundaryCondition(i) <<std::endl;
-						tmp = getSubgrid(i);
-						tmp = runSubgrid(3, tmp ,i,tmp_map);
-						insertSubgrid( tmp, i);
-					}
-					if(getBoundaryCondition(i) & 5)
-					{
-						//cout << "5 @ " << getBoundaryCondition(i) <<std::endl;
-						tmp = getSubgrid(i);
-						tmp = runSubgrid(5, tmp ,i,tmp_map);
-						insertSubgrid( tmp, i);
-					}
-					if(getBoundaryCondition(i) & 10)
-					{
-						//cout << "10 @ " << getBoundaryCondition(i) <<std::endl;
-						tmp = getSubgrid(i);
-						tmp = runSubgrid(10, tmp ,i,tmp_map);
-						insertSubgrid( tmp, i);
-					}
-					if(getBoundaryCondition(i) & 12)
-					{
-						//cout << "12 @ " << getBoundaryCondition(i) <<std::endl;
-						tmp = getSubgrid(i);
-						tmp = runSubgrid(12, tmp ,i,tmp_map);
-						insertSubgrid( tmp, i);
-					}
-
-
-					setBoundaryCondition(i, 0);
-
-					setSubgridValue(i, getSubgridValue(i)-1);
-
-				}
-			}
-			synchronize();
-		}
-	}
-#ifdef HAVE_CUDA
-	else if(this->device == tnlCudaDevice)
-	{
-		bool end_cuda = false;
-		dim3 threadsPerBlock(this->n, this->n);
-		dim3 numBlocks(this->gridCols,this->gridRows);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-
-		bool* tmpb;
-		cudaMemcpy(&(this->run_host),this->runcuda,sizeof(int), cudaMemcpyDeviceToHost);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-
-		int i = 1;
-		time_diff = 0.0;
-		while (run_host || !end_cuda)
-		{
-			cout << "Computing at step "<< i++ <<std::endl;
-			if(run_host != 0 )
-				end_cuda = true;
-			else
-				end_cuda = false;
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			start = std::clock();
-			runCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
-			cudaDeviceSynchronize();
-			time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
-
-			//start = std::clock();
-			synchronizeCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			synchronize2CUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			//time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
-
-			cudaMemcpy(&run_host, (this->runcuda),sizeof(int), cudaMemcpyDeviceToHost);
-		}
-		cout << "Solving time was: " << time_diff <<std::endl;
-
-		cudaMemcpy(this->work_u.getData()/* test*/, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-
-		cudaDeviceSynchronize();
-	}
-#endif
-	contractGrid();
-	this->u0.save("u-00001.tnl");
-	cout << "Maximum number of calculations on one subgrid was " << this->calculationsCount.absMax() <<std::endl;
-	cout << "Average number of calculations on one subgrid was " << ( (double) this->calculationsCount.sum() / (double) this->calculationsCount.getSize() ) <<std::endl;
-	cout << "Solver finished" <<std::endl;
-
-#ifdef HAVE_CUDA
-	if(this->device == tnlCudaDevice)
-	{
-		cudaFree(this->runcuda);
-		cudaFree(this->tmpw);
-		cudaFree(this->tmp_map);
-		cudaFree(this->cudaSolver);
-	}
-#endif
-
-}
-
-//north - 1, east - 2, west - 4, south - 8
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::synchronize() //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now
-{
-	cout << "Synchronizig..." <<std::endl;
-	int tmp1, tmp2;
-	int grid1, grid2;
-
-//	if(this->currentStep & 1)
-//	{
-		for(int j = 0; j < this->gridRows - 1; j++)
-		{
-			for (int i = 0; i < this->gridCols*this->n; i++)
-			{
-				tmp1 = this->gridCols*this->n*((this->n-1)+j*this->n) + i;
-				tmp2 = this->gridCols*this->n*((this->n)+j*this->n) + i;
-				grid1 = getSubgridValue(getOwner(tmp1));
-				grid2 = getSubgridValue(getOwner(tmp2));
-				if(getOwner(tmp1)==getOwner(tmp2))
-					cout << "i, j" << i << "," << j <<std::endl;
-				if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
-				{
-					this->work_u[tmp2] = this->work_u[tmp1];
-					this->unusedCell[tmp2] = 0;
-					if(grid2 == INT_MAX)
-					{
-						setSubgridValue(getOwner(tmp2), -INT_MAX);
-					}
-					if(! (getBoundaryCondition(getOwner(tmp2)) & 8) )
-						setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+8);
-				}
-				else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
-				{
-					this->work_u[tmp1] = this->work_u[tmp2];
-					this->unusedCell[tmp1] = 0;
-					if(grid1 == INT_MAX)
-					{
-						setSubgridValue(getOwner(tmp1), -INT_MAX);
-					}
-					if(! (getBoundaryCondition(getOwner(tmp1)) & 1) )
-						setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+1);
-				}
-			}
-		}
-
-//	}
-//	else
-//	{
-		for(int i = 1; i < this->gridCols; i++)
-		{
-			for (int j = 0; j < this->gridRows*this->n; j++)
-			{
-				tmp1 = this->gridCols*this->n*j + i*this->n - 1;
-				tmp2 = this->gridCols*this->n*j + i*this->n ;
-				grid1 = getSubgridValue(getOwner(tmp1));
-				grid2 = getSubgridValue(getOwner(tmp2));
-				if(getOwner(tmp1)==getOwner(tmp2))
-					cout << "i, j" << i << "," << j <<std::endl;
-				if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
-				{
-					this->work_u[tmp2] = this->work_u[tmp1];
-					this->unusedCell[tmp2] = 0;
-					if(grid2 == INT_MAX)
-					{
-						setSubgridValue(getOwner(tmp2), -INT_MAX);
-					}
-					if(! (getBoundaryCondition(getOwner(tmp2)) & 4) )
-						setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+4);
-				}
-				else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
-				{
-					this->work_u[tmp1] = this->work_u[tmp2];
-					this->unusedCell[tmp1] = 0;
-					if(grid1 == INT_MAX)
-					{
-						setSubgridValue(getOwner(tmp1), -INT_MAX);
-					}
-					if(! (getBoundaryCondition(getOwner(tmp1)) & 2) )
-						setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+2);
-				}
-			}
-		}
-//	}
-
-
-	this->currentStep++;
-	int stepValue = this->currentStep + 4;
-	for (int i = 0; i < this->subgridValues.getSize(); i++)
-	{
-		if( getSubgridValue(i) == -INT_MAX )
-			setSubgridValue(i, stepValue);
-	}
-
-	cout << "Grid synchronized at step " << (this->currentStep - 1 ) <<std::endl;
-
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-int tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getOwner(int i) const
-{
-
-	return (i / (this->gridCols*this->n*this->n))*this->gridCols + (i % (this->gridCols*this->n))/this->n;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-int tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgridValue( int i ) const
-{
-	return this->subgridValues[i];
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setSubgridValue(int i, int value)
-{
-	this->subgridValues[i] = value;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-int tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getBoundaryCondition( int i ) const
-{
-	return this->boundaryConditions[i];
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setBoundaryCondition(int i, int value)
-{
-	this->boundaryConditions[i] = value;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::stretchGrid()
-{
-	cout << "Stretching grid..." <<std::endl;
-
-
-	this->gridCols = ceil( ((double)(this->mesh.getDimensions().x()-1)) / ((double)(this->n-1)) );
-	this->gridRows = ceil( ((double)(this->mesh.getDimensions().y()-1)) / ((double)(this->n-1)) );
-
-
-	cout << "Setting gridCols to " << this->gridCols << "." <<std::endl;
-	cout << "Setting gridRows to " << this->gridRows << "." <<std::endl;
-
-	this->subgridValues.setSize(this->gridCols*this->gridRows);
-	this->subgridValues.setValue(0);
-	this->boundaryConditions.setSize(this->gridCols*this->gridRows);
-	this->boundaryConditions.setValue(0);
-	this->calculationsCount.setSize(this->gridCols*this->gridRows);
-	this->calculationsCount.setValue(0);
-
-	for(int i = 0; i < this->subgridValues.getSize(); i++ )
-	{
-		this->subgridValues[i] = INT_MAX;
-		this->boundaryConditions[i] = 0;
-	}
-
-	int stretchedSize = this->n*this->n*this->gridCols*this->gridRows;
-
-	if(!this->work_u.setSize(stretchedSize))
-		cerr << "Could not allocate memory for stretched grid." <<std::endl;
-	if(!this->map_stretched.setSize(stretchedSize))
-		cerr << "Could not allocate memory for stretched map." <<std::endl;
-	if(!this->unusedCell.setSize(stretchedSize))
-		cerr << "Could not allocate memory for supporting stretched grid." <<std::endl;
-	int idealStretch =this->mesh.getDimensions().x() + (this->mesh.getDimensions().x()-2)/(this->n-1);
-	cout << idealStretch <<std::endl;
-
-	for(int i = 0; i < stretchedSize; i++)
-	{
-		this->unusedCell[i] = 1;
-		int diff =(this->n*this->gridCols) - idealStretch ;
-		int k = i/this->n - i/(this->n*this->gridCols) + this->mesh.getDimensions().x()*(i/(this->n*this->n*this->gridCols)) + (i/(this->n*this->gridCols))*diff;
-
-		if(i%(this->n*this->gridCols) - idealStretch  >= 0)
-		{
-			k+= i%(this->n*this->gridCols) - idealStretch +1 ;
-		}
-
-		if(i/(this->n*this->gridCols) - idealStretch + 1  > 0)
-		{
-			k+= (i/(this->n*this->gridCols) - idealStretch +1 )* this->mesh.getDimensions().x() ;
-		}
-
-
-		if(fabs(this->u0[i-k]) < mesh.template getSpaceStepsProducts< 1, 0 >()+mesh.template getSpaceStepsProducts< 0, 1 >() )
-			this->work_u[i] = this->u0[i-k];
-		else
-			this->work_u[i] = sign(this->u0[i-k])*MAP_SOLVER_MAX_VALUE;
-
-		this->map_stretched[i] = this->map[i-k];
-	}
-
-
-	cout << "Grid stretched." <<std::endl;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::contractGrid()
-{
-	cout << "Contracting grid..." <<std::endl;
-	int stretchedSize = this->n*this->n*this->gridCols*this->gridRows;
-
-	int idealStretch =this->mesh.getDimensions().x() + (this->mesh.getDimensions().x()-2)/(this->n-1);
-	cout << idealStretch <<std::endl;
-
-	for(int i = 0; i < stretchedSize; i++)
-	{
-		int diff =(this->n*this->gridCols) - idealStretch ;
-		int k = i/this->n - i/(this->n*this->gridCols) + this->mesh.getDimensions().x()*(i/(this->n*this->n*this->gridCols)) + (i/(this->n*this->gridCols))*diff;
-
-		if((i%(this->n*this->gridCols) - idealStretch  < 0) && (i/(this->n*this->gridCols) - idealStretch + 1  <= 0))
-		{
-			this->u0[i-k] = this->work_u[i];
-		}
-
-	}
-
-	cout << "Grid contracted" <<std::endl;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-typename tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::VectorType
-tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgrid( const int i ) const
-{
-	VectorType u;
-	u.setSize(this->n*this->n);
-
-	for( int j = 0; j < u.getSize(); j++)
-	{
-		u[j] = this->work_u[ (i / this->gridCols) * this->n*this->n*this->gridCols
-		                     + (i % this->gridCols) * this->n
-		                     + (j/this->n) * this->n*this->gridCols
-		                     + (j % this->n) ];
-	}
-	return u;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::insertSubgrid( VectorType u, const int i )
-{
-
-	for( int j = 0; j < this->n*this->n; j++)
-	{
-		int index = (i / this->gridCols)*this->n*this->n*this->gridCols + (i % this->gridCols)*this->n + (j/this->n)*this->n*this->gridCols + (j % this->n);
-		if( (fabs(this->work_u[index]) > fabs(u[j])) || (this->unusedCell[index] == 1) )
-		{
-			this->work_u[index] = u[j];
-			this->unusedCell[index] = 0;
-		}
-	}
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-typename tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::VectorType
-tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::runSubgrid( int boundaryCondition, VectorType u, int subGridID,VectorType map)
-{
-
-	VectorType fu;
-
-	fu.setLike(u);
-	fu.setValue( 0.0 );
-
-
-
-	bool tmp = false;
-	for(int i = 0; i < u.getSize(); i++)
-	{
-		if(u[0]*u[i] <= 0.0)
-			tmp=true;
-		int centerGID = (this->n*(subGridID / this->gridRows)+ (this->n >> 1))*(this->n*this->gridCols) + this->n*(subGridID % this->gridRows) + (this->n >> 1);
-		if(this->unusedCell[centerGID] == 0 || boundaryCondition == 0)
-			tmp = true;
-	}
-
-
-	double value = sign(u[0]) * u.absMax();
-
-	if(tmp)
-	{}
-
-
-	//north - 1, east - 2, west - 4, south - 8
-	else if(boundaryCondition == 4)
-	{
-		for(int i = 0; i < this->n; i++)
-			for(int j = 1;j < this->n; j++)
-				//if(fabs(u[i*this->n + j]) <  fabs(u[i*this->n]))
-				u[i*this->n + j] = value;// u[i*this->n];
-	}
-	else if(boundaryCondition == 2)
-	{
-		for(int i = 0; i < this->n; i++)
-			for(int j =0 ;j < this->n -1; j++)
-				//if(fabs(u[i*this->n + j]) < fabs(u[(i+1)*this->n - 1]))
-				u[i*this->n + j] = value;// u[(i+1)*this->n - 1];
-	}
-	else if(boundaryCondition == 1)
-	{
-		for(int j = 0; j < this->n; j++)
-			for(int i = 0;i < this->n - 1; i++)
-				//if(fabs(u[i*this->n + j]) < fabs(u[j + this->n*(this->n - 1)]))
-				u[i*this->n + j] = value;// u[j + this->n*(this->n - 1)];
-	}
-	else if(boundaryCondition == 8)
-	{
-		for(int j = 0; j < this->n; j++)
-			for(int i = 1;i < this->n; i++)
-				//if(fabs(u[i*this->n + j]) < fabs(u[j]))
-				u[i*this->n + j] = value;// u[j];
-	}
-
-
-
-   double time = 0.0;
-   double currentTau = this->tau0;
-   double finalTime = this->stopTime;// + 3.0*(u.max() - u.min());
-   if( time + currentTau > finalTime ) currentTau = finalTime - time;
-
-   double maxResidue( 1.0 );
-   tnlGridEntity<MeshType, 2, tnlGridEntityNoStencilStorage > Entity(subMesh);
-   tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-
-   for( int i = 0; i < u.getSize(); i ++ )
-   {
-		if(map[i] == 0.0)
-		{
-			u[i] = /*sign(u[l])**/MAP_SOLVER_MAX_VALUE;
-		}
-   }
-
-   while( time < finalTime )
-   {
-      /****
-       * Compute the RHS
-       */
-
-      for( int i = 0; i < fu.getSize(); i ++ )
-      {
-			Entity.setCoordinates(Containers::StaticVector<2,int>(i % subMesh.getDimensions().x(),i / subMesh.getDimensions().x()));
-			Entity.refresh();
-			neighborEntities.refresh(subMesh,Entity.getIndex());
-			if(map[i] != 0.0)
-				fu[ i ] = schemeHost.getValue( this->subMesh, i, Containers::StaticVector<2,int>(i % subMesh.getDimensions().x(),i / subMesh.getDimensions().x()), u, time, boundaryCondition,neighborEntities,map);
-      }
-      maxResidue = fu. absMax();
-
-
-      if(maxResidue != 0.0)
-    	  currentTau =  fabs(this -> cflCondition / maxResidue);
-
-
-      if(currentTau > 1.0 * this->subMesh.template getSpaceStepsProducts< 1, 0 >())
-      {
-    	  currentTau = 1.0 * this->subMesh.template getSpaceStepsProducts< 1, 0 >();
-      }
-
-
-      if( time + currentTau > finalTime ) currentTau = finalTime - time;
-
-
-
-      for( int i = 0; i < fu.getSize(); i ++ )
-      {
-    	  if(map[i] != 0.0)
-    		  u[ i ] += currentTau * fu[ i ];
-      }
-      time += currentTau;
-
-   }
-   return u;
-}
-
-
-#ifdef HAVE_CUDA
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgridCUDA2D( const int i ,tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
-{
-	int th = (blockIdx.y) * caller->n*caller->n*caller->gridCols
-            + (blockIdx.x) * caller->n
-            + threadIdx.y * caller->n*caller->gridCols
-            + threadIdx.x;
-
-	*a = caller->work_u_cuda[th];
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::updateSubgridCUDA2D( const int i ,tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
-{
-	int index = (blockIdx.y) * caller->n*caller->n*caller->gridCols
-            + (blockIdx.x) * caller->n
-            + threadIdx.y * caller->n*caller->gridCols
-            + threadIdx.x;
-
-	if( (fabs(caller->work_u_cuda[index]) > fabs(*a)) || (caller->unusedCell_cuda[index] == 1) )
-	{
-		caller->work_u_cuda[index] = *a;
-		caller->unusedCell_cuda[index] = 0;
-
-	}
-
-	*a = caller->work_u_cuda[index];
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::insertSubgridCUDA2D( double u, const int i )
-{
-		int index = (blockIdx.y)*this->n*this->n*this->gridCols
-					+ (blockIdx.x)*this->n
-					+ threadIdx.y*this->n*this->gridCols
-					+ threadIdx.x;
-
-		if( (fabs(this->work_u_cuda[index]) > fabs(u)) || (this->unusedCell_cuda[index] == 1) )
-		{
-			this->work_u_cuda[index] = u;
-			this->unusedCell_cuda[index] = 0;
-
-		}
-
-
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::runSubgridCUDA2D( int boundaryCondition, double* u, int subGridID)
-{
-
-	__shared__ int tmp;
-	__shared__ double value;
-	volatile double* sharedTau = &u[blockDim.x*blockDim.y];
-	double* map_local = &u[2*blockDim.x*blockDim.y];
-
-	int i = threadIdx.x;
-	int j = threadIdx.y;
-	int l = threadIdx.y * blockDim.x + threadIdx.x;
-	int gid = (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + threadIdx.x;
-
-	/* LOAD MAP */
-	map_local[l]=this->map_stretched_cuda[gid];
-	if(map_local[l] != 0.0)
-		map_local[l] = 1.0/map_local[l];
-	/* LOADED */
-
-	bool computeFU = !((i == 0 && (boundaryCondition & 4)) or
-			 (i == blockDim.x - 1 && (boundaryCondition & 2)) or
-			 (j == 0 && (boundaryCondition & 8)) or
-			 (j == blockDim.y - 1  && (boundaryCondition & 1)));
-
-	if(l == 0)
-	{
-		tmp = 0;
-		int centerGID = (blockDim.y*blockIdx.y + (blockDim.y>>1))*(blockDim.x*gridDim.x) + blockDim.x*blockIdx.x + (blockDim.x>>1);
-		if(this->unusedCell_cuda[centerGID] == 0 || boundaryCondition == 0)
-			tmp = 1;
-	}
-	__syncthreads();
-
-
-	if(tmp !=1)
-	{
-		if(computeFU)
-		{
-			if(boundaryCondition == 4)
-				u[l] = u[threadIdx.y * blockDim.x] ;//+ sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0 >()*(threadIdx.x);
-			else if(boundaryCondition == 2)
-				u[l] = u[threadIdx.y * blockDim.x + blockDim.x - 1] ;//+ sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0 >()*(this->n - 1 - threadIdx.x);
-			else if(boundaryCondition == 8)
-				u[l] = u[threadIdx.x] ;//+ sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0 >()*(threadIdx.y);
-			else if(boundaryCondition == 1)
-				u[l] = u[(blockDim.y - 1)* blockDim.x + threadIdx.x] ;//+ sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0 >()*(this->n - 1 - threadIdx.y);
-		}
-	}
-
-   double time = 0.0;
-   __shared__ double currentTau;
-   double cfl = this->cflCondition;
-   double fu = 0.0;
-
-   double finalTime = this->stopTime;
-   if(boundaryCondition == 0)
-	   finalTime*=2.0;
-   __syncthreads();
-
-   tnlGridEntity<MeshType, 2, tnlGridEntityNoStencilStorage > Entity(subMesh);
-   tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-   Entity.setCoordinates(Containers::StaticVector<2,int>(i,j));
-   Entity.refresh();
-   neighborEntities.refresh(subMesh,Entity.getIndex());
-
-
-	if(map_local[l] == 0.0)
-	{
-		u[l] = /*sign(u[l])**/MAP_SOLVER_MAX_VALUE;
-		computeFU = false;
-	}
-	__syncthreads();
-
-
-   while( time < finalTime )
-   {
-	  sharedTau[l] = finalTime;
-
-	  if(computeFU)
-	  {
-		  fu = schemeHost.getValueDev( this->subMesh, l, Containers::StaticVector<2,int>(i,j), u, time, boundaryCondition, neighborEntities, map_local);
-	  	  sharedTau[l]=abs(cfl/fu);
-	  }
-
-
-
-      if(l == 0)
-      {
-    	  if(sharedTau[0] > 1.0 * this->subMesh.template getSpaceStepsProducts< 1, 0 >())	sharedTau[0] = 1.0 * this->subMesh.template getSpaceStepsProducts< 1, 0 >();
-      }
-      else if(l == blockDim.x*blockDim.y - 1)
-    	  if( time + sharedTau[l] > finalTime )		sharedTau[l] = finalTime - time;
-
-
-      if((blockDim.x == 16) && (l < 128))		sharedTau[l] = Min(sharedTau[l],sharedTau[l+128]);
-      __syncthreads();
-      if((blockDim.x == 16) && (l < 64))		sharedTau[l] = Min(sharedTau[l],sharedTau[l+64]);
-      __syncthreads();
-      if(l < 32)    							sharedTau[l] = Min(sharedTau[l],sharedTau[l+32]);
-      if(l < 16)								sharedTau[l] = Min(sharedTau[l],sharedTau[l+16]);
-      if(l < 8)									sharedTau[l] = Min(sharedTau[l],sharedTau[l+8]);
-      if(l < 4)									sharedTau[l] = Min(sharedTau[l],sharedTau[l+4]);
-      if(l < 2)									sharedTau[l] = Min(sharedTau[l],sharedTau[l+2]);
-      if(l < 1)									currentTau   = Min(sharedTau[l],sharedTau[l+1]);
-      __syncthreads();
-
-      u[l] += currentTau * fu;
-      time += currentTau;
-   }
-
-
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-int tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getOwnerCUDA2D(int i) const
-{
-
-	return ((i / (this->gridCols*this->n*this->n))*this->gridCols
-			+ (i % (this->gridCols*this->n))/this->n);
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-int tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgridValueCUDA2D( int i ) const
-{
-	return this->subgridValues_cuda[i];
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setSubgridValueCUDA2D(int i, int value)
-{
-	this->subgridValues_cuda[i] = value;
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-int tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getBoundaryConditionCUDA2D( int i ) const
-{
-	return this->boundaryConditions_cuda[i];
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setBoundaryConditionCUDA2D(int i, int value)
-{
-	this->boundaryConditions_cuda[i] = value;
-}
-
-
-
-//north - 1, east - 2, west - 4, south - 8
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__
-void synchronizeCUDA2D(tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now
-{
-
-	__shared__ int boundary[4]; // north,east,west,south
-	__shared__ int subgridValue;
-	__shared__ int newSubgridValue;
-
-
-	int gid = (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + threadIdx.x;
-	double u = cudaSolver->work_u_cuda[gid];
-	double u_cmp;
-	int subgridValue_cmp=INT_MAX;
-	int boundary_index=0;
-
-
-	if(threadIdx.x+threadIdx.y == 0)
-	{
-		subgridValue = cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x);
-		boundary[0] = 0;
-		boundary[1] = 0;
-		boundary[2] = 0;
-		boundary[3] = 0;
-		newSubgridValue = 0;
-	}
-	__syncthreads();
-
-
-
-	if(		(threadIdx.x == 0 				 /*	&& !(cudaSolver->currentStep & 1)*/) 		||
-			(threadIdx.y == 0 				 /*	&& (cudaSolver->currentStep & 1)*/) 		||
-			(threadIdx.x == blockDim.x - 1 	 /*	&& !(cudaSolver->currentStep & 1)*/) 		||
-			(threadIdx.y == blockDim.y - 1 	 /*	&& (cudaSolver->currentStep & 1)*/) 		)
-	{
-		if(threadIdx.x == 0 && (blockIdx.x != 0)/* && !(cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid - 1];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x - 1);
-			boundary_index = 2;
-		}
-
-		if(threadIdx.x == blockDim.x - 1 && (blockIdx.x != gridDim.x - 1)/* && !(cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid + 1];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x + 1);
-			boundary_index = 1;
-		}
-
-		__threadfence();
-		if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
-		{
-			cudaSolver->unusedCell_cuda[gid] = 0;
-			atomicMax(&newSubgridValue, INT_MAX);
-			atomicMax(&boundary[boundary_index], 1);
-			cudaSolver->work_u_cuda[gid] = u_cmp;
-			u=u_cmp;
-		}
-		__threadfence();
-		if(threadIdx.y == 0 && (blockIdx.y != 0)/* && (cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D((blockIdx.y - 1)*gridDim.x + blockIdx.x);
-			boundary_index = 3;
-		}
-		if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1)/* && (cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D((blockIdx.y + 1)*gridDim.x + blockIdx.x);
-			boundary_index = 0;
-		}
-
-		if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
-		{
-			cudaSolver->unusedCell_cuda[gid] = 0;
-			atomicMax(&newSubgridValue, INT_MAX);
-			atomicMax(&boundary[boundary_index], 1);
-			cudaSolver->work_u_cuda[gid] = u_cmp;
-		}
-	}
-	__threadfence();
-	__syncthreads();
-
-	if(threadIdx.x+threadIdx.y == 0)
-	{
-		if(subgridValue == INT_MAX && newSubgridValue !=0)
-			cudaSolver->setSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x, -INT_MAX);
-
-		cudaSolver->setBoundaryConditionCUDA2D(blockIdx.y*gridDim.x + blockIdx.x, 	boundary[0] +
-																				2 * boundary[1] +
-																				4 * boundary[2] +
-																				8 * boundary[3]);
-
-
-		if(blockIdx.x+blockIdx.y ==0)
-		{
-			cudaSolver->currentStep += 1;
-			*(cudaSolver->runcuda) = 0;
-		}
-	}
-
-}
-
-
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__
-void synchronize2CUDA2D(tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver)
-{
-
-
-	int stepValue = cudaSolver->currentStep + 4;
-	if( cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX )
-			cudaSolver->setSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x, stepValue);
-
-	atomicMax((cudaSolver->runcuda),cudaSolver->getBoundaryConditionCUDA2D(blockIdx.y*gridDim.x + blockIdx.x));
-}
-
-
-
-
-
-
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__global__
-void initCUDA2D( tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr , int* ptr2, int* ptr3, double* tmp_map_ptr)
-{
-
-
-	cudaSolver->work_u_cuda = ptr;
-	cudaSolver->map_stretched_cuda = tmp_map_ptr;
-	cudaSolver->unusedCell_cuda = ptr3;
-	cudaSolver->subgridValues_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int));
-	cudaSolver->boundaryConditions_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int));
-	cudaSolver->runcuda = ptr2;
-	*(cudaSolver->runcuda) = 1;
-
-/* CHANGED !!!!!! from 1 to 0*/	cudaSolver->currentStep = 0;
-
-	printf("GPU memory allocated.\n");
-
-	for(int i = 0; i < cudaSolver->gridCols*cudaSolver->gridRows; i++)
-	{
-		cudaSolver->subgridValues_cuda[i] = INT_MAX;
-		cudaSolver->boundaryConditions_cuda[i] = 0;
-	}
-
-	printf("GPU memory initialized.\n");
-}
-
-
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device >
-__global__
-void initRunCUDA2D(tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller)
-
-{
-	extern __shared__ double u[];
-
-	int i = blockIdx.y * gridDim.x + blockIdx.x;
-	int l = threadIdx.y * blockDim.x + threadIdx.x;
-
-	__shared__ int containsCurve;
-	if(l == 0)
-		containsCurve = 0;
-
-
-	caller->getSubgridCUDA2D(i,caller, &u[l]);
-	__syncthreads();
-
-	if(u[0] * u[l] <= 0.0)
-		atomicMax( &containsCurve, 1);
-
-	__syncthreads();
-	if(containsCurve == 1)
-	{
-		caller->runSubgridCUDA2D(0,u,i);
-		caller->insertSubgridCUDA2D(u[l],i);
-		__syncthreads();
-		if(l == 0)
-			caller->setSubgridValueCUDA2D(i, 4);
-	}
-
-
-}
-
-
-
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device >
-__global__
-void runCUDA2D(tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller)
-{
-	extern __shared__ double u[];
-	int i = blockIdx.y * gridDim.x + blockIdx.x;
-	int l = threadIdx.y * blockDim.x + threadIdx.x;
-	int bound = caller->getBoundaryConditionCUDA2D(i);
-
-	if(caller->getSubgridValueCUDA2D(i) != INT_MAX && bound != 0 && caller->getSubgridValueCUDA2D(i) > 0)
-	{
-		caller->getSubgridCUDA2D(i,caller, &u[l]);
-
-
-		if(caller->getSubgridValueCUDA2D(i) == caller->currentStep+4)
-		{
-			if(bound & 1)
-			{
-				caller->runSubgridCUDA2D(1,u,i);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound & 2)
-			{
-				caller->runSubgridCUDA2D(2,u,i);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound & 4)
-			{
-				caller->runSubgridCUDA2D(4,u,i);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound & 8)
-			{
-				caller->runSubgridCUDA2D(8,u,i);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-		}
-		else
-		{
-
-			if(bound == 1)
-			{
-				caller->runSubgridCUDA2D(1,u,i);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound == 2)
-			{
-				caller->runSubgridCUDA2D(2,u,i);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound == 4)
-			{
-				caller->runSubgridCUDA2D(4,u,i);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound == 8)
-			{
-				caller->runSubgridCUDA2D(8,u,i);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-		}
-
-		if(bound & 3)
-		{
-			caller->runSubgridCUDA2D(3,u,i);
-			caller->updateSubgridCUDA2D(i,caller, &u[l]);
-			__syncthreads();
-		}
-		if(bound & 5)
-		{
-			caller->runSubgridCUDA2D(5,u,i);
-			caller->updateSubgridCUDA2D(i,caller, &u[l]);
-			__syncthreads();
-		}
-		if(bound & 10)
-		{
-			caller->runSubgridCUDA2D(10,u,i);
-			caller->updateSubgridCUDA2D(i,caller, &u[l]);
-			__syncthreads();
-		}
-		if(bound & 12)
-		{
-			caller->runSubgridCUDA2D(12,u,i);
-			caller->updateSubgridCUDA2D(i,caller, &u[l]);
-			__syncthreads();
-		}
-
-
-		if(l==0)
-		{
-			caller->setBoundaryConditionCUDA2D(i, 0);
-			caller->setSubgridValueCUDA2D(i, caller->getSubgridValueCUDA2D(i) - 1 );
-		}
-
-
-	}
-
-
-
-}
-
-#endif /*HAVE_CUDA*/
-
-#endif /* TNLPARALLELMAPSOLVER2D_IMPL_H_ */
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel/CMakeLists.txt b/src/TNL/Legacy/hamilton-jacobi-parallel/CMakeLists.txt
deleted file mode 100644
index f6a00127c..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-set( tnl_hamilton_jacobi_parallel_SOURCES
-#     MainBuildConfig.h
-#     tnlParallelEikonalSolver2D_impl.h
-#     tnlParallelEikonalSolver3D_impl.h
-#     tnlParallelEikonalSolver.h
-#     parallelEikonalConfig.h 
-     main.cpp)
-
-
-IF(  BUILD_CUDA ) 
-	CUDA_ADD_EXECUTABLE(hamilton-jacobi-parallel main.cu)
-ELSE(  BUILD_CUDA )                
-	ADD_EXECUTABLE(hamilton-jacobi-parallel main.cpp)
-ENDIF( BUILD_CUDA )
-target_link_libraries (hamilton-jacobi-parallel tnl )
-
-
-INSTALL( TARGETS hamilton-jacobi-parallel
-         RUNTIME DESTINATION bin
-         PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
-        
-#INSTALL( FILES ${tnl_hamilton_jacobi_parallel_SOURCES}
-#         DESTINATION ${TNL_TARGET_DATA_DIRECTORY}/examples/hamilton-jacobi-parallel )
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel/MainBuildConfig.h b/src/TNL/Legacy/hamilton-jacobi-parallel/MainBuildConfig.h
deleted file mode 100644
index ed3d686eb..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel/MainBuildConfig.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/***************************************************************************
-                          MainBuildConfig.h  -  description
-                             -------------------
-    begin                : Jul 7, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef MAINBUILDCONFIG_H_
-#define MAINBUILDCONFIG_H_
-
-#include <solvers/tnlBuildConfigTags.h>
-
-class MainBuildConfig
-{
-   public:
-
-      static void print() {std::cerr << "MainBuildConfig" <<std::endl; }
-};
-
-/****
- * Turn off support for float and long double.
- */
-template<> struct tnlConfigTagReal< MainBuildConfig, float > { enum { enabled = false }; };
-template<> struct tnlConfigTagReal< MainBuildConfig, long double > { enum { enabled = false }; };
-
-/****
- * Turn off support for short int and long int indexing.
- */
-template<> struct tnlConfigTagIndex< MainBuildConfig, short int >{ enum { enabled = false }; };
-template<> struct tnlConfigTagIndex< MainBuildConfig, long int >{ enum { enabled = false }; };
-
-/****
- * Use of tnlGrid is enabled for allowed dimensions and Real, Device and Index types.
- */
-template< int Dimensions, typename Real, typename Device, typename Index >
-   struct tnlConfigTagMesh< MainBuildConfig, tnlGrid< Dimensions, Real, Device, Index > >
-      { enum { enabled = tnlConfigTagDimensions< MainBuildConfig, Dimensions >::enabled  &&
-                         tnlConfigTagReal< MainBuildConfig, Real >::enabled &&
-                         tnlConfigTagDevice< MainBuildConfig, Device >::enabled &&
-                         tnlConfigTagIndex< MainBuildConfig, Index >::enabled }; };
-
-/****
- * Please, chose your preferred time discretisation  here.
- */
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlExplicitTimeDiscretisationTag >{ enum { enabled = true }; };
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlSemiImplicitTimeDiscretisationTag >{ enum { enabled = false}; };
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlImplicitTimeDiscretisationTag >{ enum { enabled = false }; };
-
-/****
- * Only the Runge-Kutta-Merson solver is enabled by default.
- */
-template<> struct tnlConfigTagExplicitSolver< MainBuildConfig, tnlExplicitEulerSolverTag >{ enum { enabled = false }; };
-
-#endif /* MAINBUILDCONFIG_H_ */
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel/main.cpp b/src/TNL/Legacy/hamilton-jacobi-parallel/main.cpp
deleted file mode 100644
index b13498e17..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel/main.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/***************************************************************************
-                          main.cpp  -  description
-                             -------------------
-    begin                : Jul 8 , 2014
-    copyright            : (C) 2014 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "main.h"
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel/main.cu b/src/TNL/Legacy/hamilton-jacobi-parallel/main.cu
deleted file mode 100644
index 710197671..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel/main.cu
+++ /dev/null
@@ -1,17 +0,0 @@
-/***************************************************************************
-                          main.cu  -  description
-                             -------------------
-    begin                : Mar 30 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "main.h"
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel/main.h b/src/TNL/Legacy/hamilton-jacobi-parallel/main.h
deleted file mode 100644
index dbaebdceb..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel/main.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/***************************************************************************
-                          main.h  -  description
-                             -------------------
-    begin                : Mar 30 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "tnlParallelEikonalSolver.h"
-#include "parallelEikonalConfig.h"
-#include "MainBuildConfig.h"
-#include <solvers/tnlBuildConfigTags.h>
-#include <operators/hamilton-jacobi/godunov-eikonal/parallelGodunovEikonal.h>
-#include <mesh/tnlGrid.h>
-#include <core/tnlDevice.h>
-#include <time.h>
-#include <ctime>
-
-typedef MainBuildConfig BuildConfig;
-
-int main( int argc, char* argv[] )
-{
-	time_t start;
-	time_t stop;
-	time(&start);
-	std::clock_t start2= std::clock();
-   Config::ParameterContainer parameters;
-   tnlConfigDescription configDescription;
-   parallelEikonalConfig< BuildConfig >::configSetup( configDescription );
-
-   if( ! parseCommandLine( argc, argv, configDescription, parameters ) )
-      return false;
-
-   //if (parameters.GetParameter <String>("scheme") == "godunov")
-   //{
-   tnlDeviceEnum device;
-   device = TNL::Devices::HostDevice;
-
-   const int& dim = parameters.getParameter< int >( "dim" );
-
-  if(dim == 2)
-  {
-
-	   typedef parallelGodunovEikonalScheme< tnlGrid<2,double,TNL::Devices::Host, int>, double, int > SchemeTypeHost;
-		/*#ifdef HAVE_CUDA
-		   typedef parallelGodunovEikonalScheme< tnlGrid<2,double,tnlCuda, int>, double, int > SchemeTypeDevice;
-		#endif
-		#ifndef HAVE_CUDA*/
-	   typedef parallelGodunovEikonalScheme< tnlGrid<2,double,TNL::Devices::Host, int>, double, int > SchemeTypeDevice;
-		/*#endif*/
-
-	   if(device==TNL::Devices::HostDevice)
-	   {
-		   typedef TNL::Devices::Host Device;
-
-
-		   tnlParallelEikonalSolver<2,SchemeTypeHost,SchemeTypeDevice, Device> solver;
-		   if(!solver.init(parameters))
-		   {
-			  std::cerr << "Solver failed to initialize." <<std::endl;
-			   return EXIT_FAILURE;
-		   }
-		  std::cout << "-------------------------------------------------------------" <<std::endl;
-		  std::cout << "Starting solver loop..." <<std::endl;
-		   solver.run();
-	   }
-	   else if(device==tnlCudaDevice )
-	   {
-		   typedef tnlCuda Device;
-		   //typedef parallelGodunovEikonalScheme< tnlGrid<2,double,Device, int>, double, int > SchemeType;
-
-		   tnlParallelEikonalSolver<2,SchemeTypeHost,SchemeTypeDevice, Device> solver;
-		   if(!solver.init(parameters))
-		   {
-			  std::cerr << "Solver failed to initialize." <<std::endl;
-			   return EXIT_FAILURE;
-		   }
-		  std::cout << "-------------------------------------------------------------" <<std::endl;
-		  std::cout << "Starting solver loop..." <<std::endl;
-		   solver.run();
-	   }
-  // }
-  }
-  else if(dim == 3)
-  {
-
-	   typedef parallelGodunovEikonalScheme< tnlGrid<3,double,TNL::Devices::Host, int>, double, int > SchemeTypeHost;
-		/*#ifdef HAVE_CUDA
-		   typedef parallelGodunovEikonalScheme< tnlGrid<2,double,tnlCuda, int>, double, int > SchemeTypeDevice;
-		#endif
-		#ifndef HAVE_CUDA*/
-	   typedef parallelGodunovEikonalScheme< tnlGrid<3,double,TNL::Devices::Host, int>, double, int > SchemeTypeDevice;
-		/*#endif*/
-
-	   if(device==TNL::Devices::HostDevice)
-	   {
-		   typedef TNL::Devices::Host Device;
-
-
-		   tnlParallelEikonalSolver<3,SchemeTypeHost,SchemeTypeDevice, Device> solver;
-		   if(!solver.init(parameters))
-		   {
-			  std::cerr << "Solver failed to initialize." <<std::endl;
-			   return EXIT_FAILURE;
-		   }
-		  std::cout << "-------------------------------------------------------------" <<std::endl;
-		  std::cout << "Starting solver loop..." <<std::endl;
-		   solver.run();
-	   }
-	   else if(device==tnlCudaDevice )
-	   {
-		   typedef tnlCuda Device;
-		   //typedef parallelGodunovEikonalScheme< tnlGrid<2,double,Device, int>, double, int > SchemeType;
-
-		   tnlParallelEikonalSolver<3,SchemeTypeHost,SchemeTypeDevice, Device> solver;
-		   if(!solver.init(parameters))
-		   {
-			  std::cerr << "Solver failed to initialize." <<std::endl;
-			   return EXIT_FAILURE;
-		   }
-		  std::cout << "-------------------------------------------------------------" <<std::endl;
-		  std::cout << "Starting solver loop..." <<std::endl;
-		   solver.run();
-	   }
- // }
-  }
-
-   time(&stop);
-  std::cout <<std::endl;
-  std::cout << "Running time was: " << difftime(stop,start) << " .... " << (std::clock() - start2) / (double)(CLOCKS_PER_SEC) <<std::endl;
-   return EXIT_SUCCESS;
-}
-
-
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel/no-Makefile b/src/TNL/Legacy/hamilton-jacobi-parallel/no-Makefile
deleted file mode 100644
index bfdc1ef23..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel/no-Makefile
+++ /dev/null
@@ -1,41 +0,0 @@
-TNL_VERSION=0.1
-TNL_INSTALL_DIR=${HOME}/local/lib
-TNL_INCLUDE_DIR=${HOME}/local/include/tnl-${TNL_VERSION}
-
-TARGET = hamiltonJacobiParallelSolver
-#CONFIG_FILE = $(TARGET).cfg.desc
-INSTALL_DIR = ${HOME}/local
-CXX = g++
-CUDA_CXX = nvcc
-OMP_FLAGS = -DHAVE_OPENMP -fopenmp
-CXX_FLAGS = -std=gnu++0x -I$(TNL_INCLUDE_DIR) -O3 $(OMP_FLAGS) -DDEBUG
-LD_FLAGS = -L$(TNL_INSTALL_DIR) -ltnl-0.1 -lgomp
-
-SOURCES = main.cpp
-HEADERS = 
-OBJECTS = main.o
-DIST = $(SOURCES) Makefile
-
-all: $(TARGET)
-clean: 
-	rm -f $(OBJECTS)
-	rm -f $(TARGET)-conf.h	
-
-dist: $(DIST)
-	tar zcvf $(TARGET).tgz $(DIST) 
-
-install: $(TARGET)
-	cp $(TARGET) $(INSTALL_DIR)/bin
-	cp $(CONFIG_FILE) $(INSTALL_DIR)/share
-
-uninstall: $(TARGET)
-	rm -f $(INSTALL_DIR)/bin/$(TARGET) 
-	rm -f $(CONFIG_FILE) $(INSTALL_DIR)/share
-
-$(TARGET): $(OBJECTS)
-	$(CXX) -o $(TARGET) $(OBJECTS) $(LD_FLAGS)
-
-%.o: %.cpp $(HEADERS)
-	$(CXX) -c -o $@ $(CXX_FLAGS) $<
-
-
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel/parallelEikonalConfig.h b/src/TNL/Legacy/hamilton-jacobi-parallel/parallelEikonalConfig.h
deleted file mode 100644
index c27f5ebb3..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel/parallelEikonalConfig.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/***************************************************************************
-                          parallelEikonalConfig.h  -  description
-                             -------------------
-    begin                : Oct 5, 2014
-    copyright            : (C) 2014 by Tomas Sobotik
-    email                :
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef HAMILTONJACOBIPARALLELEIKONALPROBLEMCONFIG_H_
-#define HAMILTONJACOBIPARALLELEIKONALPROBLEMCONFIG_H_
-
-#include <config/tnlConfigDescription.h>
-
-template< typename ConfigTag >
-class parallelEikonalConfig
-{
-   public:
-      static void configSetup( tnlConfigDescription& config )
-      {
-         config.addDelimiter( "Parallel Eikonal solver settings:" );
-         config.addEntry        < String > ( "problem-name", "This defines particular problem.", "hamilton-jacobi-parallel" );
-         config.addEntry       < String > ( "scheme", "This defines scheme used for discretization.", "godunov" );
-         config.addEntryEnum( "godunov" );
-         config.addEntryEnum( "upwind" );
-         config.addRequiredEntry        < String > ( "initial-condition", "Initial condition for solver");
-         config.addEntry       < String > ( "mesh", "Name of mesh.", "mesh.tnl" );
-         config.addEntry        < double > ( "epsilon", "This defines epsilon for smoothening of sign().", 0.0 );
-         config.addEntry        < double > ( "delta", " Allowed difference on subgrid boundaries", 0.0 );
-         config.addRequiredEntry        < double > ( "stop-time", " Final time for solver");
-         config.addRequiredEntry        < double > ( "initial-tau", " initial tau for solver" );
-         config.addEntry        < double > ( "cfl-condition", " CFL condition", 0.0 );
-         config.addEntry        < int > ( "subgrid-size", "Subgrid size.", 16 );
-         config.addRequiredEntry        < int > ( "dim", "Dimension of problem.");
-      }
-};
-
-#endif /* HAMILTONJACOBIPARALLELEIKONALPROBLEMCONFIG_H_ */
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel/run b/src/TNL/Legacy/hamilton-jacobi-parallel/run
deleted file mode 100755
index 3aece294a..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel/run
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-#GRID_SIZES="0897"
-GRID_SIZES="0008 0015 0029 0057 0113 0225 0449"
-#GRID_SIZES="1793"
-
-dimensions=2
-
-size=2
-
-time=3
-
-for grid_size in $GRID_SIZES;
-
-do
-
-	rm -r grid-${grid_size}
-   	mkdir grid-${grid_size}
-   	cd grid-${grid_size}
-
-	tnl-grid-setup --dimensions $dimensions \
-	               --origin-x -1.0 \
-	               --origin-y -1.0 \
-	               --origin-z -1.0 \
-	               --proportions-x $size \
-	               --proportions-y $size \
-	               --proportions-z $size \
-	               --size-x ${grid_size} \
-	               --size-y ${grid_size} \
-	               --size-z ${grid_size}
-
-	tnl-init --test-function sdf-para \
-		     --offset 0.25 \
-	             --output-file init.tnl \
-		     --final-time 0.0 \
-		     --snapshot-period 0.1 \
-
-
-	tnl-init --test-function sdf-para-sdf \
-		     --offset 0.25 \
-	             --output-file sdf.tnl \
-		     --final-time 0.0 \
-		     --snapshot-period 0.1
-
-	hamilton-jacobi-parallel --initial-condition init.tnl \
-	              --cfl-condition 1.0e-1 \
-		      	  --mesh mesh.tnl \
-		     	  --initial-tau 1.0e-3 \
-		      	  --epsilon 1.0 \
-	        	  --delta 0.0 \
-	       	      --stop-time $time \
-		          --scheme godunov \
-		          --subgrid-size 8
-
-        tnl-diff --mesh mesh.tnl --mode sequence --input-files sdf.tnl u-00001.tnl --write-difference yes --output-file ../${grid_size}.diff
-	
-	cd ..
-
-done
-
-
-./tnl-err2eoc-2.py --format txt --size $size *.diff
-
-              
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel/tnl-err2eoc-2.py b/src/TNL/Legacy/hamilton-jacobi-parallel/tnl-err2eoc-2.py
deleted file mode 100755
index f8cde3768..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel/tnl-err2eoc-2.py
+++ /dev/null
@@ -1,141 +0,0 @@
-#!/usr/bin/env python
-
-import sys, string, math
-
-arguments = sys. argv[1:]
-format = "txt"
-output_file_name = "eoc-table.txt"
-input_files = []
-verbose = 1
-size = 1.0
-
-i = 0
-while i < len( arguments ):
-   if arguments[ i ] == "--format":
-      format = arguments[ i + 1 ]
-      i = i + 2
-      continue
-   if arguments[ i ] == "--output-file":
-      output_file_name = arguments[ i + 1 ]
-      i = i + 2
-      continue
-   if arguments[ i ] == "--verbose":
-       verbose = float( arguments[ i + 1 ] )
-       i = i +2
-       continue
-   if arguments[ i ] == "--size":
-       size = float( arguments[ i + 1 ] )
-       i = i +2
-       continue
-   input_files. append( arguments[ i ] )
-   i = i + 1
-
-if not verbose == 0:
-   print "Writing to " + output_file_name + " in " + format + "."
-
-h_list = []
-l1_norm_list = []
-l2_norm_list = []
-max_norm_list = []
-items = 0
-
-for file_name in input_files:
-   if not verbose == 0:
-       print "Processing file " + file_name
-   file = open( file_name, "r" )
-   
-   l1_max = 0.0
-   l_max_max = 0.0
-   file.readline();
-   file.readline();
-   for line in file. readlines():
-         data = string. split( line )
-         h_list. append( size/(float(file_name[0:len(file_name)-5] ) - 1.0) )
-         l1_norm_list. append( float( data[ 1 ] ) )
-         l2_norm_list. append( float( data[ 2 ] ) )
-         max_norm_list. append( float( data[ 3 ] ) )
-         items = items + 1
-         if not verbose == 0:
-            print line
-   file. close()
-
-h_width = 12
-err_width = 15
-file = open( output_file_name, "w" )
-if format == "latex":
-      file. write( "\\begin{tabular}{|r|l|l|l|l|l|l|}\\hline\n" )
-      file. write( "\\raisebox{-1ex}[0ex]{$h$}& \n" )
-      file. write( "\\multicolumn{2}{|c|}{\\raisebox{1ex}[3.5ex]{$\\left\| \\cdot \\right\\|_{L_1\\left(\\omega_h;\\left[0,T\\right]\\right)}^{h,\\tau}$}}& \n" )
-      file. write( "\\multicolumn{2}{|c|}{\\raisebox{1ex}[3.5ex]{$\\left\| \\cdot \\right\\|_{L_2\\left(\\omega_h;\left[0,T\\right]\\right)}^{h,\\tau}$}}& \n" )
-      file. write( "\\multicolumn{2}{|c|}{\\raisebox{1ex}[3.5ex]{$\\left\| \\cdot \\right\\|_{L_\\infty\\left(\\omega_h;\\left[0,T\\right]\\right)}^{h,\\tau}$}}\\\\ \\cline{2-7} \n" )
-      file. write( " " + string. rjust( " ", h_width ) + "&" +
-                string. rjust( "Error", err_width ) + "&" +
-                string. rjust( "{\\bf EOC}", err_width ) + "&" +
-                string. rjust( "Error", err_width ) + "&" +
-                string. rjust( "{\\bf EOC}", err_width ) + "&" +
-                string. rjust( "Error.", err_width ) + "&" +
-                string. rjust( "{\\bf EOC}", err_width ) +
-                "\\\\ \\hline \\hline \n")
-if format == "txt":
-    file. write( "+--------------+----------------+----------------+----------------+----------------+----------------+----------------+\n" )
-    file. write( "|       h      |     L1 Err.    |     L1 EOC.    |     L2 Err.    |      L2 EOC    |    MAX Err.    |     MAX EOC    |\n" )
-    file. write( "+==============+================+================+================+================+================+================+\n" )
-                  
-
-i = 0
-while i < items:
-   if i == 0:
-      if format == "latex":
-         file. write( " " + string. ljust( str( h_list[ i ] ), h_width ) + "&" +
-                      string. rjust( "%.2g" % l1_norm_list[ i ], err_width ) + "&" + 
-                      string. rjust( " ", err_width ) + "&"+ 
-                      string. rjust( "%.2g" % l2_norm_list[ i ], err_width ) + "&" +
-                      string. rjust( " ", err_width ) + "&" +
-                      string. rjust( "%.2g" % max_norm_list[ i ], err_width ) + "&" +
-                      string. rjust( " ", err_width ) + "\\\\\n" )
-      if format == "txt":
-         file. write( "| " + string. ljust( str( h_list[ i ] ), h_width ) + " |" + 
-                      string. rjust( "%.2g" % l1_norm_list[ i ], err_width ) + " |" +
-                      string. rjust( " ", err_width ) + " |" +
-                      string. rjust( "%.2g" % l2_norm_list[ i ], err_width ) + " |" +
-                      string. rjust( " ", err_width ) + " |" +
-                      string. rjust( "%.2g" % max_norm_list[ i ], err_width ) + " |" +
-                      string. rjust( " ", err_width ) + " |\n" )
-         file. write( "+--------------+----------------+----------------+----------------+----------------+----------------+----------------+\n" )
-      i = i + 1;
-      continue
-   if h_list[ i ] == h_list[ i - 1 ]:
-      print "Unable to count eoc since h[ " + \
-      str( i ) + " ] = h[ " + str( i - 1 ) + \
-      " ] = " + str( h_list[ i ] ) + ". \n"
-      file. write( " eoc error:  h[ " + \
-      str( i ) + " ] = h[ " + str( i - 1 ) + \
-      " ] = " + str( h_list[ i ] ) + ". \n" )
-   else:
-      h_ratio = math. log( h_list[ i ] / h_list[ i - 1 ] )
-      l1_ratio = math. log( l1_norm_list[ i ] / l1_norm_list[ i - 1 ] )
-      l2_ratio = math. log( l2_norm_list[ i ] / l2_norm_list[ i - 1 ] )
-      max_ratio = math. log( max_norm_list[ i ] / max_norm_list[ i - 1 ] )
-      if format == "latex":
-         file. write( " " + string. ljust( str( h_list[ i ] ), h_width ) + "&" +
-                      string. rjust( "%.2g" % l1_norm_list[ i ], err_width ) + "&" +
-                      string. rjust( "{\\bf " + "%.2g" % ( l1_ratio / h_ratio ) + "}", err_width ) + "&" +
-                      string. rjust( "%.2g" % l2_norm_list[ i ], err_width ) + "&" +
-                      string. rjust( "{\\bf " + "%.2g" % ( l2_ratio / h_ratio ) + "}", err_width ) + "&" +
-                      string. rjust( "%.2g" % max_norm_list[ i ], err_width ) + "&" +
-                      string. rjust( "{\\bf " + "%.2g" % ( max_ratio / h_ratio ) + "}", err_width ) + "\\\\\n" )
-      if format == "txt":
-         file. write( "| " + string. ljust( str( h_list[ i ] ), h_width ) + " |" +
-                      string. rjust( "%.2g" % l1_norm_list[ i ], err_width ) + " |" +
-                      string. rjust( "**" + "%.2g" % ( l1_ratio / h_ratio ) + "**", err_width ) + " |" +
-                      string. rjust( "%.2g" % l2_norm_list[ i ], err_width ) + " |" +
-                      string. rjust( "**" + "%.2g" % ( l2_ratio / h_ratio ) + "**", err_width ) + " |" +
-                      string. rjust( "%.2g" % max_norm_list[ i ], err_width ) + " |" +
-                      string. rjust( "**" + "%.2g" % ( max_ratio / h_ratio ) + "**", err_width ) + " |\n" )
-         file. write( "+--------------+----------------+----------------+----------------+----------------+----------------+----------------+\n" )
-   i = i + 1
-
-if format == "latex":
-   file. write( "\\hline \n" )
-   file. write( "\\end{tabular} \n" )
-    
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h b/src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h
deleted file mode 100644
index 19cdd9493..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h
+++ /dev/null
@@ -1,366 +0,0 @@
-/***************************************************************************
-                          tnlParallelEikonalSolver.h  -  description
-                             -------------------
-    begin                : Nov 28 , 2014
-    copyright            : (C) 2014 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef TNLPARALLELEIKONALSOLVER_H_
-#define TNLPARALLELEIKONALSOLVER_H_
-
-#include <TNL/Config/ParameterContainer.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Containers/StaticVector.h>
-#include <functions/tnlMeshFunction.h>
-#include <TNL/Devices/Host.h>
-#include <mesh/tnlGrid.h>
-#include <mesh/grids/tnlGridEntity.h>
-#include <limits.h>
-#include <core/tnlDevice.h>
- #include <omp.h>
-
-
-#include <ctime>
-
-#ifdef HAVE_CUDA
-#include <core/tnlCuda.h>
-#endif
-
-
-template< int Dimension,
-		  typename SchemeHost,
-		  typename SchemeDevice,
-		  typename Device,
-		  typename RealType = double,
-          typename IndexType = int >
-class tnlParallelEikonalSolver
-{};
-
-template<typename SchemeHost, typename SchemeDevice, typename Device>
-class tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >
-{
-public:
-
-	typedef SchemeDevice SchemeTypeDevice;
-	typedef SchemeHost SchemeTypeHost;
-	typedef Device DeviceType;
-	typedef TNL::Containers::Vector< double, TNL::Devices::Host, int > VectorType;
-	typedef TNL::Containers::Vector< int, TNL::Devices::Host, int > IntVectorType;
-	typedef tnlGrid< 2, double, TNL::Devices::Host, int > MeshType;
-#ifdef HAVE_CUDA
-	typedef TNL::Containers::Vector< double, TNL::Devices::Host, int > VectorTypeCUDA;
-	typedef TNL::Containers::Vector< int, TNL::Devices::Host, int > IntVectorTypeCUDA;
-	typedef tnlGrid< 2, double, TNL::Devices::Host, int > MeshTypeCUDA;
-#endif
-	tnlParallelEikonalSolver();
-	bool init( const Config::ParameterContainer& parameters );
-	void run();
-
-	void test();
-
-/*private:*/
-
-
-	void synchronize();
-
-	int getOwner( int i) const;
-
-	int getSubgridValue( int i ) const;
-
-	void setSubgridValue( int i, int value );
-
-	int getBoundaryCondition( int i ) const;
-
-	void setBoundaryCondition( int i, int value );
-
-	void stretchGrid();
-
-	void contractGrid();
-
-	VectorType getSubgrid( const int i ) const;
-
-	void insertSubgrid( VectorType u, const int i );
-
-	VectorType runSubgrid( int boundaryCondition, VectorType u, int subGridID);
-
-
-	tnlMeshFunction<MeshType> u0;
-	VectorType work_u;
-	IntVectorType subgridValues, boundaryConditions, unusedCell, calculationsCount;
-	MeshType mesh, subMesh;
-
-//	tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage > Entity;
-
-	SchemeHost schemeHost;
-	SchemeDevice schemeDevice;
-	double delta, tau0, stopTime,cflCondition;
-	int gridRows, gridCols, gridLevels, currentStep, n;
-
-	std::clock_t start;
-	double time_diff;
-
-
-	tnlDeviceEnum device;
-
-	tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* getSelf()
-	{
-		return this;
-	};
-
-#ifdef HAVE_CUDA
-
-	tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver;
-
-	double* work_u_cuda;
-
-	int* subgridValues_cuda;
-	int*boundaryConditions_cuda;
-	int* unusedCell_cuda;
-	int* calculationsCount_cuda;
-	double* tmpw;
-	//MeshTypeCUDA mesh_cuda, subMesh_cuda;
-	//SchemeDevice scheme_cuda;
-	//double delta_cuda, tau0_cuda, stopTime_cuda,cflCondition_cuda;
-	//int gridRows_cuda, gridCols_cuda, currentStep_cuda, n_cuda;
-
-	int* runcuda;
-	int run_host;
-
-
-	__device__ void getSubgridCUDA2D( const int i, tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller, double* a);
-
-	__device__ void updateSubgridCUDA2D( const int i, tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller, double* a);
-
-	__device__ void insertSubgridCUDA2D( double u, const int i );
-
-	__device__ void runSubgridCUDA2D( int boundaryCondition, double* u, int subGridID);
-
-	/*__global__ void runCUDA();*/
-
-	//__device__ void synchronizeCUDA();
-
-	__device__ int getOwnerCUDA2D( int i) const;
-
-	__device__ int getSubgridValueCUDA2D( int i ) const;
-
-	__device__ void setSubgridValueCUDA2D( int i, int value );
-
-	__device__ int getBoundaryConditionCUDA2D( int i ) const;
-
-	__device__ void setBoundaryConditionCUDA2D( int i, int value );
-
-	//__device__ bool initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
-
-	/*__global__ void initRunCUDA(tnlParallelEikonalSolver<Scheme, double, TNL::Devices::Host, int >* caller);*/
-
-#endif
-
-};
-
-
-
-
-
-
-
-	template<typename SchemeHost, typename SchemeDevice, typename Device>
-	class tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >
-	{
-	public:
-
-		typedef SchemeDevice SchemeTypeDevice;
-		typedef SchemeHost SchemeTypeHost;
-		typedef Device DeviceType;
-		typedef TNL::Containers::Vector< double, TNL::Devices::Host, int > VectorType;
-		typedef TNL::Containers::Vector< int, TNL::Devices::Host, int > IntVectorType;
-		typedef tnlGrid< 3, double, TNL::Devices::Host, int > MeshType;
-	#ifdef HAVE_CUDA
-		typedef TNL::Containers::Vector< double, TNL::Devices::Host, int > VectorTypeCUDA;
-		typedef TNL::Containers::Vector< int, TNL::Devices::Host, int > IntVectorTypeCUDA;
-		typedef tnlGrid< 3, double, TNL::Devices::Host, int > MeshTypeCUDA;
-	#endif
-		tnlParallelEikonalSolver();
-		bool init( const Config::ParameterContainer& parameters );
-		void run();
-
-		void test();
-
-	/*private:*/
-
-
-		void synchronize();
-
-		int getOwner( int i) const;
-
-		int getSubgridValue( int i ) const;
-
-		void setSubgridValue( int i, int value );
-
-		int getBoundaryCondition( int i ) const;
-
-		void setBoundaryCondition( int i, int value );
-
-		void stretchGrid();
-
-		void contractGrid();
-
-		VectorType getSubgrid( const int i ) const;
-
-		void insertSubgrid( VectorType u, const int i );
-
-		VectorType runSubgrid( int boundaryCondition, VectorType u, int subGridID);
-
-
-		tnlMeshFunction<MeshType> u0;
-		VectorType work_u;
-		IntVectorType subgridValues, boundaryConditions, unusedCell, calculationsCount;
-		MeshType mesh, subMesh;
-		SchemeHost schemeHost;
-		SchemeDevice schemeDevice;
-		double delta, tau0, stopTime,cflCondition;
-		int gridRows, gridCols, gridLevels, currentStep, n;
-
-		std::clock_t start;
-		double time_diff;
-
-
-		tnlDeviceEnum device;
-
-		tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* getSelf()
-		{
-			return this;
-		};
-
-#ifdef HAVE_CUDA
-
-	tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver;
-
-	double* work_u_cuda;
-
-	int* subgridValues_cuda;
-	int*boundaryConditions_cuda;
-	int* unusedCell_cuda;
-	int* calculationsCount_cuda;
-	double* tmpw;
-	//MeshTypeCUDA mesh_cuda, subMesh_cuda;
-	//SchemeDevice scheme_cuda;
-	//double delta_cuda, tau0_cuda, stopTime_cuda,cflCondition_cuda;
-	//int gridRows_cuda, gridCols_cuda, currentStep_cuda, n_cuda;
-
-	int* runcuda;
-	int run_host;
-
-
-	__device__ void getSubgridCUDA3D( const int i, tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* caller, double* a);
-
-	__device__ void updateSubgridCUDA3D( const int i, tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* caller, double* a);
-
-	__device__ void insertSubgridCUDA3D( double u, const int i );
-
-	__device__ void runSubgridCUDA3D( int boundaryCondition, double* u, int subGridID);
-
-	/*__global__ void runCUDA();*/
-
-	//__device__ void synchronizeCUDA();
-
-	__device__ int getOwnerCUDA3D( int i) const;
-
-	__device__ int getSubgridValueCUDA3D( int i ) const;
-
-	__device__ void setSubgridValueCUDA3D( int i, int value );
-
-	__device__ int getBoundaryConditionCUDA3D( int i ) const;
-
-	__device__ void setBoundaryConditionCUDA3D( int i, int value );
-
-	//__device__ bool initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
-
-	/*__global__ void initRunCUDA(tnlParallelEikonalSolver<Scheme, double, TNL::Devices::Host, int >* caller);*/
-
-#endif
-
-};
-
-
-
-
-
-
-#ifdef HAVE_CUDA
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void runCUDA2D(tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller);
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void initRunCUDA2D(tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller);
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void initCUDA2D( tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr, int * ptr2, int* ptr3);
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void synchronizeCUDA2D(tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void synchronize2CUDA2D(tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
-
-
-
-
-
-
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void runCUDA3D(tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* caller);
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void initRunCUDA3D(tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* caller);
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void initCUDA3D( tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr, int * ptr2, int* ptr3);
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void synchronizeCUDA3D(tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void synchronize2CUDA3D(tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
-#endif
-
-
-#ifdef HAVE_CUDA
-__cuda_callable__
-double fabsMin( double x, double y)
-{
-	double fx = fabs(x);
-
-	if(Min(fx,fabs(y)) == fx)
-		return x;
-	else
-		return y;
-}
-
-__cuda_callable__
-double atomicFabsMin(double* address, double val)
-{
-	unsigned long long int* address_as_ull =
-						  (unsigned long long int*)address;
-	unsigned long long int old = *address_as_ull, assumed;
-	do {
-		assumed = old;
-			old = atomicCAS(address_as_ull, assumed,__double_as_longlong( fabsMin(__longlong_as_double(assumed),val) ));
-	} while (assumed != old);
-	return __longlong_as_double(old);
-}
-
-#endif
-
-#include "tnlParallelEikonalSolver2D_impl.h"
-#include "tnlParallelEikonalSolver3D_impl.h"
-#endif /* TNLPARALLELEIKONALSOLVER_H_ */
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h b/src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h
deleted file mode 100644
index 76cf49bc8..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h
+++ /dev/null
@@ -1,1928 +0,0 @@
-/***************************************************************************
-                          tnlParallelEikonalSolver2D_impl.h  -  description
-                             -------------------
-    begin                : Nov 28 , 2014
-    copyright            : (C) 2014 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef TNLPARALLELEIKONALSOLVER2D_IMPL_H_
-#define TNLPARALLELEIKONALSOLVER2D_IMPL_H_
-
-
-#include "tnlParallelEikonalSolver.h"
-#include <core/mfilename.h>
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver()
-{
-	cout << "a" <<std::endl;
-	this->device = tnlCudaDevice;  /////////////// tnlCuda Device --- vypocet na GPU, TNL::Devices::HostDevice   ---    vypocet na CPU
-
-#ifdef HAVE_CUDA
-	if(this->device == tnlCudaDevice)
-	{
-	run_host = 1;
-	}
-#endif
-
-	cout << "b" <<std::endl;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::test()
-{
-/*
-	for(int i =0; i < this->subgridValues.getSize(); i++ )
-	{
-		insertSubgrid(getSubgrid(i), i);
-	}
-*/
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-
-bool tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::init( const Config::ParameterContainer& parameters )
-{
-	cout << "Initializating solver..." <<std::endl;
-	const String& meshLocation = parameters.getParameter <String>("mesh");
-	this->mesh.load( meshLocation );
-
-	this->n = parameters.getParameter <int>("subgrid-size");
-	cout << "Setting N to " << this->n <<std::endl;
-
-	this->subMesh.setDimensions( this->n, this->n );
-	this->subMesh.setDomain( Containers::StaticVector<2,double>(0.0, 0.0),
-							 Containers::StaticVector<2,double>(mesh.template getSpaceStepsProducts< 1, 0 >()*(double)(this->n), mesh.template getSpaceStepsProducts< 0, 1 >()*(double)(this->n)) );
-
-	this->subMesh.save("submesh.tnl");
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	this->u0.load( initialCondition );
-
-	//cout << this->mesh.getCellCenter(0) <<std::endl;
-
-	this->delta = parameters.getParameter <double>("delta");
-	this->delta *= mesh.template getSpaceStepsProducts< 1, 0 >()*mesh.template getSpaceStepsProducts< 0, 1 >();
-
-	cout << "Setting delta to " << this->delta <<std::endl;
-
-	this->tau0 = parameters.getParameter <double>("initial-tau");
-	cout << "Setting initial tau to " << this->tau0 <<std::endl;
-	this->stopTime = parameters.getParameter <double>("stop-time");
-
-	this->cflCondition = parameters.getParameter <double>("cfl-condition");
-	this -> cflCondition *= sqrt(mesh.template getSpaceStepsProducts< 1, 0 >()*mesh.template getSpaceStepsProducts< 0, 1 >());
-	cout << "Setting CFL to " << this->cflCondition <<std::endl;
-
-	stretchGrid();
-	this->stopTime /= (double)(this->gridCols);
-	this->stopTime *= (1.0+1.0/((double)(this->n) - 2.0));
-	cout << "Setting stopping time to " << this->stopTime <<std::endl;
-	//this->stopTime = 1.5*((double)(this->n))*parameters.getParameter <double>("stop-time")*this->mesh.template getSpaceStepsProducts< 1, 0 >();
-	//cout << "Setting stopping time to " << this->stopTime <<std::endl;
-
-	cout << "Initializating scheme..." <<std::endl;
-	if(!this->schemeHost.init(parameters))
-	{
-		cerr << "SchemeHost failed to initialize." <<std::endl;
-		return false;
-	}
-	cout << "Scheme initialized." <<std::endl;
-
-	test();
-
-	VectorType* tmp = new VectorType[subgridValues.getSize()];
-	bool containsCurve = false;
-
-#ifdef HAVE_CUDA
-
-	if(this->device == tnlCudaDevice)
-	{
-	/*cout << "Testing... " <<std::endl;
-	if(this->device == tnlCudaDevice)
-	{
-	if( !initCUDA2D(parameters, gridRows, gridCols) )
-		return false;
-	}*/
-		//cout << "s" <<std::endl;
-	cudaMalloc(&(this->cudaSolver), sizeof(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >));
-	//cout << "s" <<std::endl;
-	cudaMemcpy(this->cudaSolver, this,sizeof(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >), cudaMemcpyHostToDevice);
-	//cout << "s" <<std::endl;
-	double** tmpdev = NULL;
-	cudaMalloc(&tmpdev, sizeof(double*));
-	//double* tmpw;
-	cudaMalloc(&(this->tmpw), this->work_u.getSize()*sizeof(double));
-	cudaMalloc(&(this->runcuda), sizeof(int));
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	int* tmpUC;
-	cudaMalloc(&(tmpUC), this->work_u.getSize()*sizeof(int));
-	cudaMemcpy(tmpUC, this->unusedCell.getData(), this->unusedCell.getSize()*sizeof(int), cudaMemcpyHostToDevice);
-
-	initCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver, (this->tmpw), (this->runcuda),tmpUC);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	//cout << "s " <<std::endl;
-	//cudaMalloc(&(cudaSolver->work_u_cuda), this->work_u.getSize()*sizeof(double));
-	double* tmpu = NULL;
-
-	cudaMemcpy(&tmpu, tmpdev,sizeof(double*), cudaMemcpyDeviceToHost);
-	//printf("%p %p \n",tmpu,tmpw);
-	cudaMemcpy((this->tmpw), this->work_u.getData(), this->work_u.getSize()*sizeof(double), cudaMemcpyHostToDevice);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	//cout << "s "<<std::endl;
-
-	}
-#endif
-
-	if(this->device == TNL::Devices::HostDevice)
-	{
-	for(int i = 0; i < this->subgridValues.getSize(); i++)
-	{
-
-		if(! tmp[i].setSize(this->n * this->n))
-			cout << "Could not allocate tmp["<< i <<"] array." <<std::endl;
-			tmp[i] = getSubgrid(i);
-		containsCurve = false;
-
-		for(int j = 0; j < tmp[i].getSize(); j++)
-		{
-			if(tmp[i][0]*tmp[i][j] <= 0.0)
-			{
-				containsCurve = true;
-				j=tmp[i].getSize();
-			}
-
-		}
-		if(containsCurve)
-		{
-			//cout << "Computing initial SDF on subgrid " << i << "." <<std::endl;
-			tmp[i] = runSubgrid(0, tmp[i],i);
-			insertSubgrid(tmp[i], i);
-			setSubgridValue(i, 4);
-			//cout << "Computed initial SDF on subgrid " << i  << "." <<std::endl;
-		}
-		containsCurve = false;
-
-	}
-	}
-#ifdef HAVE_CUDA
-	else if(this->device == tnlCudaDevice)
-	{
-//		cout << "pre 1 kernel" <<std::endl;
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		dim3 threadsPerBlock(this->n, this->n);
-		dim3 numBlocks(this->gridCols,this->gridRows);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		initRunCUDA2D<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
-		cudaDeviceSynchronize();
-//		cout << "post 1 kernel" <<std::endl;
-
-	}
-#endif
-
-
-	this->currentStep = 1;
-	if(this->device == TNL::Devices::HostDevice)
-		synchronize();
-#ifdef HAVE_CUDA
-	else if(this->device == tnlCudaDevice)
-	{
-		dim3 threadsPerBlock(this->n, this->n);
-		dim3 numBlocks(this->gridCols,this->gridRows);
-		//double * test = (double*)malloc(this->work_u.getSize()*sizeof(double));
-		//cout << test[0] <<"   " << test[1] <<"   " << test[2] <<"   " << test[3] <<std::endl;
-		//cudaMemcpy(/*this->work_u.getData()*/ test, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-		//cout << this->tmpw << "   " <<  test[0] <<"   " << test[1] << "   " <<test[2] << "   " <<test[3] <<std::endl;
-
-		TNL_CHECK_CUDA_DEVICE;
-
-		synchronizeCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		synchronize2CUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		//cout << test[0] << "   " <<test[1] <<"   " << test[2] << "   " <<test[3] <<std::endl;
-		//cudaMemcpy(/*this->work_u.getData()*/ test, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-		//TNL_CHECK_CUDA_DEVICE;
-		//cout << this->tmpw << "   " <<  test[0] << "   " <<test[1] << "   " <<test[2] <<"   " << test[3] <<std::endl;
-		//free(test);
-
-	}
-
-#endif
-	cout << "Solver initialized." <<std::endl;
-
-	return true;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::run()
-{
-	if(this->device == TNL::Devices::HostDevice)
-	{
-
-	bool end = false;
-	while ((this->boundaryConditions.max() > 0 ) || !end)
-	{
-		if(this->boundaryConditions.max() == 0 )
-			end=true;
-		else
-			end=false;
-#ifdef HAVE_OPENMP
-#pragma omp parallel for num_threads(4) schedule(dynamic)
-#endif
-		for(int i = 0; i < this->subgridValues.getSize(); i++)
-		{
-			if(getSubgridValue(i) != INT_MAX)
-			{
-				VectorType tmp;
-				tmp.setSize(this->n * this->n);
-				//cout << "subMesh: " << i << ", BC: " << getBoundaryCondition(i) <<std::endl;
-
-				if(getSubgridValue(i) == currentStep+4)
-				{
-
-				if(getBoundaryCondition(i) & 1)
-				{
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(1, tmp ,i);
-					insertSubgrid( tmp, i);
-					this->calculationsCount[i]++;
-				}
-				if(getBoundaryCondition(i) & 2)
-				{
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(1, tmp ,i);
-					insertSubgrid( tmp, 2);
-					this->calculationsCount[i]++;
-				}
-				if(getBoundaryCondition(i) & 4)
-				{
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(4, tmp ,i);
-					insertSubgrid( tmp, i);
-					this->calculationsCount[i]++;
-				}
-				if(getBoundaryCondition(i) & 8)
-				{
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(8, tmp ,i);
-					insertSubgrid( tmp, i);
-					this->calculationsCount[i]++;
-				}
-				}
-
-				if( ((getBoundaryCondition(i) & 2) )|| (getBoundaryCondition(i) & 1)//)
-					/*	&&(!(getBoundaryCondition(i) & 5) && !(getBoundaryCondition(i) & 10)) */)
-				{
-					//cout << "3 @ " << getBoundaryCondition(i) <<std::endl;
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(1, tmp ,i);
-					insertSubgrid( tmp, 3);
-				}
-				if( ((getBoundaryCondition(i) & 4) )|| (getBoundaryCondition(i) & 1)//)
-					/*	&&(!(getBoundaryCondition(i) & 3) && !(getBoundaryCondition(i) & 12)) */)
-				{
-					//cout << "5 @ " << getBoundaryCondition(i) <<std::endl;
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(5, tmp ,i);
-					insertSubgrid( tmp, i);
-				}
-				if( ((getBoundaryCondition(i) & 2) )|| (getBoundaryCondition(i) & 8)//)
-					/*	&&(!(getBoundaryCondition(i) & 12) && !(getBoundaryCondition(i) & 3))*/ )
-				{
-					//cout << "10 @ " << getBoundaryCondition(i) <<std::endl;
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(10, tmp ,i);
-					insertSubgrid( tmp, i);
-				}
-				if(   ((getBoundaryCondition(i) & 4) )|| (getBoundaryCondition(i) & 8)//)
-					/*&&(!(getBoundaryCondition(i) & 10) && !(getBoundaryCondition(i) & 5)) */)
-				{
-					//cout << "12 @ " << getBoundaryCondition(i) <<std::endl;
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(12, tmp ,i);
-					insertSubgrid( tmp, i);
-				}
-
-
-				/*if(getBoundaryCondition(i))
-				{
-					insertSubgrid( runSubgrid(15, getSubgrid(i),i), i);
-				}*/
-
-				setBoundaryCondition(i, 0);
-
-				setSubgridValue(i, getSubgridValue(i)-1);
-
-			}
-		}
-		synchronize();
-	}
-	}
-#ifdef HAVE_CUDA
-	else if(this->device == tnlCudaDevice)
-	{
-		//cout << "fn" <<std::endl;
-		bool end_cuda = false;
-		dim3 threadsPerBlock(this->n, this->n);
-		dim3 numBlocks(this->gridCols,this->gridRows);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		//cudaMalloc(&runcuda,sizeof(bool));
-		//cudaMemcpy(runcuda, &run_host, sizeof(bool), cudaMemcpyHostToDevice);
-		//cout << "fn" <<std::endl;
-		bool* tmpb;
-		//cudaMemcpy(tmpb, &(cudaSolver->runcuda),sizeof(bool*), cudaMemcpyDeviceToHost);
-		//cudaDeviceSynchronize();
-		//TNL_CHECK_CUDA_DEVICE;
-		cudaMemcpy(&(this->run_host),this->runcuda,sizeof(int), cudaMemcpyDeviceToHost);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		//cout << "fn" <<std::endl;
-		int i = 1;
-		time_diff = 0.0;
-		while (run_host || !end_cuda)
-		{
-			cout << "Computing at step "<< i++ <<std::endl;
-			if(run_host != 0 )
-				end_cuda = true;
-			else
-				end_cuda = false;
-			//cout << "a" <<std::endl;
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			start = std::clock();
-			runCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
-			//cout << "a" <<std::endl;
-			cudaDeviceSynchronize();
-			time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
-
-			//start = std::clock();
-			synchronizeCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			synchronize2CUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			//time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
-
-
-			//cout << "a" <<std::endl;
-			//run_host = false;
-			//cout << "in kernel loop" << run_host <<std::endl;
-			//cudaMemcpy(tmpb, &(cudaSolver->runcuda),sizeof(bool*), cudaMemcpyDeviceToHost);
-			cudaMemcpy(&run_host, (this->runcuda),sizeof(int), cudaMemcpyDeviceToHost);
-			//cout << "in kernel loop" << run_host <<std::endl;
-		}
-		cout << "Solving time was: " << time_diff <<std::endl;
-		//cout << "b" <<std::endl;
-
-		//double* tmpu;
-		//cudaMemcpy(tmpu, &(cudaSolver->work_u_cuda),sizeof(double*), cudaMemcpyHostToDevice);
-		//cudaMemcpy(this->work_u.getData(), tmpu, this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-		//cout << this->work_u.getData()[0] <<std::endl;
-
-		//double * test = (double*)malloc(this->work_u.getSize()*sizeof(double));
-		//cout << test[0] << test[1] << test[2] << test[3] <<std::endl;
-		cudaMemcpy(this->work_u.getData()/* test*/, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-		//cout << this->tmpw << "   " <<  test[0] << test[1] << test[2] << test[3] <<std::endl;
-		//free(test);
-
-		cudaDeviceSynchronize();
-	}
-#endif
-	contractGrid();
-	this->u0.save("u-00001.tnl");
-	cout << "Maximum number of calculations on one subgrid was " << this->calculationsCount.absMax() <<std::endl;
-	cout << "Average number of calculations on one subgrid was " << ( (double) this->calculationsCount.sum() / (double) this->calculationsCount.getSize() ) <<std::endl;
-	cout << "Solver finished" <<std::endl;
-
-#ifdef HAVE_CUDA
-	if(this->device == tnlCudaDevice)
-	{
-		cudaFree(this->runcuda);
-		cudaFree(this->tmpw);
-		cudaFree(this->cudaSolver);
-	}
-#endif
-
-}
-
-//north - 1, east - 2, west - 4, south - 8
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::synchronize() //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now
-{
-	cout << "Synchronizig..." <<std::endl;
-	int tmp1, tmp2;
-	int grid1, grid2;
-
-	if(this->currentStep & 1)
-	{
-		for(int j = 0; j < this->gridRows - 1; j++)
-		{
-			for (int i = 0; i < this->gridCols*this->n; i++)
-			{
-				tmp1 = this->gridCols*this->n*((this->n-1)+j*this->n) + i;
-				tmp2 = this->gridCols*this->n*((this->n)+j*this->n) + i;
-				grid1 = getSubgridValue(getOwner(tmp1));
-				grid2 = getSubgridValue(getOwner(tmp2));
-				if(getOwner(tmp1)==getOwner(tmp2))
-					cout << "i, j" << i << "," << j <<std::endl;
-				if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
-				{
-					this->work_u[tmp2] = this->work_u[tmp1];
-					this->unusedCell[tmp2] = 0;
-					if(grid2 == INT_MAX)
-					{
-						setSubgridValue(getOwner(tmp2), -INT_MAX);
-					}
-					if(! (getBoundaryCondition(getOwner(tmp2)) & 8) )
-						setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+8);
-				}
-				else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
-				{
-					this->work_u[tmp1] = this->work_u[tmp2];
-					this->unusedCell[tmp1] = 0;
-					if(grid1 == INT_MAX)
-					{
-						setSubgridValue(getOwner(tmp1), -INT_MAX);
-					}
-					if(! (getBoundaryCondition(getOwner(tmp1)) & 1) )
-						setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+1);
-				}
-			}
-		}
-
-	}
-	else
-	{
-		for(int i = 1; i < this->gridCols; i++)
-		{
-			for (int j = 0; j < this->gridRows*this->n; j++)
-			{
-				tmp1 = this->gridCols*this->n*j + i*this->n - 1;
-				tmp2 = this->gridCols*this->n*j + i*this->n ;
-				grid1 = getSubgridValue(getOwner(tmp1));
-				grid2 = getSubgridValue(getOwner(tmp2));
-				if(getOwner(tmp1)==getOwner(tmp2))
-					cout << "i, j" << i << "," << j <<std::endl;
-				if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
-				{
-					this->work_u[tmp2] = this->work_u[tmp1];
-					this->unusedCell[tmp2] = 0;
-					if(grid2 == INT_MAX)
-					{
-						setSubgridValue(getOwner(tmp2), -INT_MAX);
-					}
-					if(! (getBoundaryCondition(getOwner(tmp2)) & 4) )
-						setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+4);
-				}
-				else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
-				{
-					this->work_u[tmp1] = this->work_u[tmp2];
-					this->unusedCell[tmp1] = 0;
-					if(grid1 == INT_MAX)
-					{
-						setSubgridValue(getOwner(tmp1), -INT_MAX);
-					}
-					if(! (getBoundaryCondition(getOwner(tmp1)) & 2) )
-						setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+2);
-				}
-			}
-		}
-	}
-
-
-	this->currentStep++;
-	int stepValue = this->currentStep + 4;
-	for (int i = 0; i < this->subgridValues.getSize(); i++)
-	{
-		if( getSubgridValue(i) == -INT_MAX )
-			setSubgridValue(i, stepValue);
-	}
-
-	cout << "Grid synchronized at step " << (this->currentStep - 1 ) <<std::endl;
-
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getOwner(int i) const
-{
-
-	return (i / (this->gridCols*this->n*this->n))*this->gridCols + (i % (this->gridCols*this->n))/this->n;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgridValue( int i ) const
-{
-	return this->subgridValues[i];
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setSubgridValue(int i, int value)
-{
-	this->subgridValues[i] = value;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getBoundaryCondition( int i ) const
-{
-	return this->boundaryConditions[i];
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setBoundaryCondition(int i, int value)
-{
-	this->boundaryConditions[i] = value;
-}
-
-/**
- * Builds the stretched work array from u0.
- *
- * Computes how many n x n subgrids are needed in each direction, resets the
- * per-subgrid bookkeeping (subgridValues = INT_MAX, boundaryConditions = 0,
- * calculationsCount = 0), resizes work_u / unusedCell to hold
- * n*n*gridCols*gridRows cells, marks every cell as unused (1) and fills
- * work_u[i] from u0[i-k], where k is the index shift computed below.
- *
- * Progress messages go to cout; allocation failures are reported on cerr
- * and execution continues.
- */
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::stretchGrid()
-{
-	cout << "Stretching grid..." <<std::endl;
-
-	const int dimX = this->mesh.getDimensions().x();
-	const int dimY = this->mesh.getDimensions().y();
-	const double subgridSpan = (double)(this->n-1);
-
-	this->gridCols = ceil( ((double)(dimX-1)) / subgridSpan );
-	this->gridRows = ceil( ((double)(dimY-1)) / subgridSpan );
-
-	cout << "Setting gridCols to " << this->gridCols << "." <<std::endl;
-	cout << "Setting gridRows to " << this->gridRows << "." <<std::endl;
-
-	// Per-subgrid bookkeeping arrays.
-	const int subgridCount = this->gridCols*this->gridRows;
-	this->subgridValues.setSize(subgridCount);
-	this->subgridValues.setValue(0);
-	this->boundaryConditions.setSize(subgridCount);
-	this->boundaryConditions.setValue(0);
-	this->calculationsCount.setSize(subgridCount);
-	this->calculationsCount.setValue(0);
-
-	for(int s = 0; s < this->subgridValues.getSize(); s++ )
-	{
-		this->subgridValues[s] = INT_MAX;
-		this->boundaryConditions[s] = 0;
-	}
-
-	// Storage for the stretched grid itself.
-	const int stretchedSize = this->n*this->n*this->gridCols*this->gridRows;
-
-	if(!this->work_u.setSize(stretchedSize))
-		cerr << "Could not allocate memory for stretched grid." <<std::endl;
-	if(!this->unusedCell.setSize(stretchedSize))
-		cerr << "Could not allocate memory for supporting stretched grid." <<std::endl;
-
-	const int idealStretch = dimX + (dimX-2)/(this->n-1);
-	cout << idealStretch <<std::endl;
-
-	// Quantities that do not change from cell to cell.
-	const int rowLen  = this->n*this->gridCols;
-	const int bandLen = this->n*this->n*this->gridCols;
-	const int diff    = rowLen - idealStretch;
-
-	for(int i = 0; i < stretchedSize; i++)
-	{
-		this->unusedCell[i] = 1;
-
-		const int row = i / rowLen;
-		const int col = i % rowLen;
-
-		// k is the shift between the stretched index i and the source index in u0.
-		int k = i/this->n - row + dimX*(i/bandLen) + row*diff;
-
-		// Columns / rows at or beyond idealStretch get an additional shift.
-		if(col >= idealStretch)
-			k += col - idealStretch + 1;
-		if(row >= idealStretch)
-			k += (row - idealStretch + 1)*dimX;
-
-		this->work_u[i] = this->u0[i-k];
-	}
-
-	cout << "Grid stretched." <<std::endl;
-}
-
-/**
- * Copies the stretched work array back into u0.
- *
- * Only cells whose stretched row and column are both below idealStretch are
- * written back; each such cell i is stored to u0[i-k], using the same index
- * shift k that stretchGrid() applies to those cells.
- *
- * Progress messages go to cout.
- */
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::contractGrid()
-{
-	cout << "Contracting grid..." <<std::endl;
-
-	const int dimX          = this->mesh.getDimensions().x();
-	const int rowLen        = this->n*this->gridCols;
-	const int bandLen       = this->n*this->n*this->gridCols;
-	const int stretchedSize = this->n*this->n*this->gridCols*this->gridRows;
-
-	const int idealStretch = dimX + (dimX-2)/(this->n-1);
-	cout << idealStretch <<std::endl;
-
-	const int diff = rowLen - idealStretch;
-
-	for(int i = 0; i < stretchedSize; i++)
-	{
-		const int row = i / rowLen;
-		const int col = i % rowLen;
-
-		// Cells at or beyond idealStretch in either direction are not copied back.
-		if(col >= idealStretch || row >= idealStretch)
-			continue;
-
-		const int k = i/this->n - row + dimX*(i/bandLen) + row*diff;
-		this->u0[i-k] = this->work_u[i];
-	}
-
-	cout << "Grid contracted" <<std::endl;
-}
-
-/**
- * Extracts subgrid i from the stretched work array.
- *
- * \param i subgrid index (row-major, gridCols subgrids per row)
- * \return a vector of n*n values, stored row by row
- */
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-typename tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::VectorType
-tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgrid( const int i ) const
-{
-	VectorType u;
-	u.setSize(this->n*this->n);
-
-	// Length of one row of the stretched array and index of the subgrid's first cell.
-	const int rowLen = this->n*this->gridCols;
-	const int origin = (i / this->gridCols) * this->n*this->n*this->gridCols
-	                 + (i % this->gridCols) * this->n;
-
-	for( int j = 0; j < u.getSize(); j++)
-	{
-		const int localRow = j / this->n;
-		const int localCol = j % this->n;
-		u[j] = this->work_u[ origin + localRow*rowLen + localCol ];
-	}
-	return u;
-}
-
-/**
- * Merges the values of one subgrid back into the stretched work array.
- *
- * A cell of work_u is overwritten when it is still flagged in unusedCell (1)
- * or when the incoming value has a strictly smaller magnitude than the stored
- * one. Every overwritten cell gets its unusedCell flag cleared.
- *
- * \param u n*n subgrid values, stored row by row
- * \param i subgrid index (row-major, gridCols subgrids per row)
- */
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::insertSubgrid( VectorType u, const int i )
-{
-	const int cellCount = this->n*this->n;
-	const int rowLen    = this->n*this->gridCols;
-	const int origin    = (i / this->gridCols)*this->n*this->n*this->gridCols
-	                    + (i % this->gridCols)*this->n;
-
-	for( int j = 0; j < cellCount; j++)
-	{
-		const int index = origin + (j/this->n)*rowLen + (j % this->n);
-
-		// NOTE: the original marks this read-modify-write as the place for an OMP lock on `index`.
-		const bool incomingIsSmaller = fabs(this->work_u[index]) > fabs(u[j]);
-		const bool neverWritten      = (this->unusedCell[index] == 1);
-		if( !incomingIsSmaller && !neverWritten )
-			continue;
-
-		this->work_u[index] = u[j];
-		this->unusedCell[index] = 0;
-	}
-}
-
-/**
- * Advances one subgrid on the host by explicit time stepping up to stopTime.
- *
- * Steps:
- *  1. Decide whether the subgrid should be pre-filled. Pre-filling is skipped
- *     when the values change sign (or touch zero) relative to u[0], when the
- *     subgrid's centre cell in the stretched array has already been written
- *     (unusedCell == 0), or when boundaryCondition == 0.
- *  2. Otherwise, for a single-edge boundary condition (north = 1, east = 2,
- *     west = 4, south = 8), overwrite every cell except the row/column on
- *     that edge with sign(u[0]) * max|u|.
- *  3. Iterate u += tau * fu, where fu comes from schemeHost.getValue(), and
- *     tau = cflCondition / max|fu| (when that product is non-zero), capped by
- *     the space step and by the remaining time.
- *
- * \param boundaryCondition bit mask of edges carrying boundary data
- * \param u                 n*n subgrid values (taken by value, modified locally)
- * \param subGridID         subgrid index (row-major, gridCols subgrids per row)
- * \return the advanced subgrid values
- */
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-typename tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::VectorType
-tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::runSubgrid( int boundaryCondition, VectorType u, int subGridID)
-{
-
-	VectorType fu;
-
-	fu.setLike(u);
-	fu.setValue( 0.0 );
-
-	// --- 1. Decide whether the pre-fill has to be skipped ---------------------
-	bool tmp = false;
-	for(int i = 0; i < u.getSize(); i++)
-	{
-		if(u[0]*u[i] <= 0.0)
-		{
-			tmp = true;
-			break;
-		}
-	}
-
-	// Centre cell of this subgrid in the stretched array. Subgrids are laid out
-	// with gridCols per row (see getSubgrid()/insertSubgrid()), so row and column
-	// must be decoded with gridCols; the previous code used gridRows, which
-	// addressed the wrong cell whenever gridRows != gridCols.
-	const int half       = this->n >> 1;
-	const int subgridRow = subGridID / this->gridCols;
-	const int subgridCol = subGridID % this->gridCols;
-	const int centerGID  = (this->n*subgridRow + half)*(this->n*this->gridCols) + this->n*subgridCol + half;
-	if(this->unusedCell[centerGID] == 0 || boundaryCondition == 0)
-		tmp = true;
-
-	// --- 2. Pre-fill everything but the boundary edge --------------------------
-	double value = sign(u[0]) * u.absMax();
-
-	if(!tmp)
-	{
-		// Half-open ranges of rows/columns that get overwritten.
-		int rowBegin = 0, rowEnd = 0, colBegin = 0, colEnd = 0;
-
-		//north - 1, east - 2, west - 4, south - 8
-		if(boundaryCondition == 4)          // keep column 0
-		{
-			rowEnd = this->n;       colBegin = 1; colEnd = this->n;
-		}
-		else if(boundaryCondition == 2)     // keep column n-1
-		{
-			rowEnd = this->n;       colEnd = this->n - 1;
-		}
-		else if(boundaryCondition == 1)     // keep row n-1
-		{
-			rowEnd = this->n - 1;   colEnd = this->n;
-		}
-		else if(boundaryCondition == 8)     // keep row 0
-		{
-			rowBegin = 1; rowEnd = this->n; colEnd = this->n;
-		}
-		// Any other code leaves all ranges empty, i.e. no pre-fill.
-
-		for(int i = rowBegin; i < rowEnd; i++)
-			for(int j = colBegin; j < colEnd; j++)
-				u[i*this->n + j] = value;
-	}
-
-	// --- 3. Explicit time stepping ---------------------------------------------
-	double time = 0.0;
-	double currentTau = this->tau0;
-	double finalTime = this->stopTime;
-	if( time + currentTau > finalTime ) currentTau = finalTime - time;
-
-	const double spaceStep = this->subMesh.template getSpaceStepsProducts< 1, 0 >();
-	const int subDimX = subMesh.getDimensions().x();
-
-	double maxResidue( 1.0 );
-	tnlGridEntity<MeshType, 2, tnlGridEntityNoStencilStorage > Entity(subMesh);
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	while( time < finalTime )
-	{
-		/****
-		 * Compute the RHS
-		 */
-		for( int i = 0; i < fu.getSize(); i ++ )
-		{
-			const Containers::StaticVector<2,int> coordinates(i % subDimX, i / subDimX);
-			Entity.setCoordinates(coordinates);
-			Entity.refresh();
-			neighborEntities.refresh(subMesh,Entity.getIndex());
-			fu[ i ] = schemeHost.getValue( this->subMesh, i, coordinates, u, time, boundaryCondition,neighborEntities);
-		}
-		maxResidue = fu. absMax();
-
-		// CFL-limited step; the previous tau is kept when the residue (or the CFL number) is zero.
-		if( this -> cflCondition * maxResidue != 0.0)
-			currentTau =  this -> cflCondition / maxResidue;
-
-		// Never step further than one space step ...
-		if(currentTau > 1.0 * spaceStep)
-			currentTau = 1.0 * spaceStep;
-
-		// ... nor beyond the final time.
-		if( time + currentTau > finalTime ) currentTau = finalTime - time;
-
-		for( int i = 0; i < fu.getSize(); i ++ )
-			u[ i ] = u[i] + currentTau * fu[ i ];
-
-		time += currentTau;
-	}
-
-	// Element-wise copy into a fresh vector, as the original did.
-	VectorType solution;
-	solution.setLike(u);
-	for( int i = 0; i < u.getSize(); i ++ )
-		solution[i]=u[i];
-	return solution;
-}
-
-
-#ifdef HAVE_CUDA
-
-
-/**
- * Loads the calling thread's cell of the stretched device array into *a.
- *
- * The cell is addressed from blockIdx (subgrid position) and threadIdx
- * (position inside the subgrid); the parameter i is not used.
- *
- * \param i      unused
- * \param caller solver whose work_u_cuda, n and gridCols are read
- * \param a      output: value of the thread's cell
- */
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgridCUDA2D( const int i ,tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
-{
-	// Length of one row of the stretched array.
-	const int rowLen = caller->n*caller->gridCols;
-
-	// First cell of this block's subgrid, then this thread's offset inside it.
-	const int subgridOrigin = blockIdx.y*caller->n*rowLen + blockIdx.x*caller->n;
-	const int localOffset   = threadIdx.y*rowLen + threadIdx.x;
-
-	*a = caller->work_u_cuda[subgridOrigin + localOffset];
-}
-
-
-/**
- * Merges *a into the calling thread's cell of the stretched device array and
- * reads the resulting cell value back into *a.
- *
- * The cell is overwritten when the stored magnitude is strictly larger than
- * |*a| or when the cell is still flagged in unusedCell_cuda (1); overwriting
- * clears the flag. The parameter i is not used.
- *
- * \param i      unused
- * \param caller solver whose device arrays are accessed
- * \param a      in: candidate value, out: value stored in the cell afterwards
- */
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::updateSubgridCUDA2D( const int i ,tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
-{
-	const int rowLen        = caller->n*caller->gridCols;
-	const int subgridOrigin = blockIdx.y*caller->n*rowLen + blockIdx.x*caller->n;
-	const int cell          = subgridOrigin + threadIdx.y*rowLen + threadIdx.x;
-
-	const bool candidateIsSmaller = fabs(caller->work_u_cuda[cell]) > fabs(*a);
-	if( candidateIsSmaller || caller->unusedCell_cuda[cell] == 1 )
-	{
-		caller->work_u_cuda[cell] = *a;
-		caller->unusedCell_cuda[cell] = 0;
-	}
-
-	// Hand back whatever the cell holds now.
-	*a = caller->work_u_cuda[cell];
-}
-
-
-/**
- * Merges u into the calling thread's cell of the stretched device array.
- *
- * The cell is overwritten when the stored magnitude is strictly larger than
- * |u| or when the cell is still flagged in unusedCell_cuda (1); overwriting
- * clears the flag. The parameter i is not used.
- *
- * \param u candidate value for the thread's cell
- * \param i unused
- */
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::insertSubgridCUDA2D( double u, const int i )
-{
-	const int rowLen        = this->n*this->gridCols;
-	const int subgridOrigin = blockIdx.y*this->n*rowLen + blockIdx.x*this->n;
-	const int cell          = subgridOrigin + threadIdx.y*rowLen + threadIdx.x;
-
-	const bool candidateIsSmaller = fabs(this->work_u_cuda[cell]) > fabs(u);
-	if( candidateIsSmaller || this->unusedCell_cuda[cell] == 1 )
-	{
-		this->work_u_cuda[cell] = u;
-		this->unusedCell_cuda[cell] = 0;
-	}
-}
-
-
-/**
- * Advances the calling block's subgrid by explicit time stepping up to
- * stopTime. Must be called by every thread of the block (it contains
- * __syncthreads()).
- *
- * Layout expected from the caller:
- *  - one thread per cell, cell (threadIdx.x, threadIdx.y) stored at
- *    u[threadIdx.y*blockDim.x + threadIdx.x];
- *  - u must hold at least 2*blockDim.x*blockDim.y doubles: the second half is
- *    used as scratch for the time-step reduction.
- *    NOTE(review): u is presumably block-shared memory filled by the caller
- *    before this call - confirm at the call site.
- *
- * Steps:
- *  1. Thread 0 decides whether the pre-fill is skipped: it is skipped when the
- *     subgrid's centre cell was already written (unusedCell_cuda == 0) or when
- *     boundaryCondition == 0.
- *  2. Otherwise, for a single-edge condition (north = 1, east = 2, west = 4,
- *     south = 8), every cell not on that edge is set to the edge value of its
- *     row/column plus sign(u[0]) * spaceStep * distance-to-edge.
- *  3. Iterate u += tau * fu with tau = min over the block of |cfl / fu|,
- *     capped by the space step and by the remaining time. Threads on an edge
- *     selected by boundaryCondition keep fu = 0.
- *
- * Changes against the previous version (all are data-race fixes):
- *  - sign(u[0]) is read between two barriers, before any thread overwrites u[0];
- *  - a barrier separates writing sharedTau from the reduction that reads it;
- *  - every reduction step is followed by __syncthreads() instead of relying on
- *    implicit warp synchrony, and the reduction works for any block size;
- *  - a barrier after the update of u keeps neighbours from reading half-updated
- *    values in the next iteration.
- *
- * \param boundaryCondition bit mask of edges carrying boundary data
- * \param u                 subgrid values followed by reduction scratch
- * \param subGridID         unused
- */
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::runSubgridCUDA2D( int boundaryCondition, double* u, int subGridID)
-{
-
-	__shared__ int tmp;
-	__shared__ double currentTau;
-
-	const int threadCount = blockDim.x*blockDim.y;
-	volatile double* sharedTau = &u[blockDim.x*blockDim.y];
-	const int i = threadIdx.x;
-	const int j = threadIdx.y;
-	const int l = threadIdx.y * blockDim.x + threadIdx.x;
-
-	// Threads sitting on an edge that carries boundary data do not evolve their cell.
-	const bool computeFU = !((i == 0 && (boundaryCondition & 4)) or
-			 (i == blockDim.x - 1 && (boundaryCondition & 2)) or
-			 (j == 0 && (boundaryCondition & 8)) or
-			 (j == blockDim.y - 1  && (boundaryCondition & 1)));
-
-	// --- 1. Skip decision, taken by a single thread ----------------------------
-	if(l == 0)
-	{
-		tmp = 0;
-		int centerGID = (blockDim.y*blockIdx.y + (blockDim.y>>1))*(blockDim.x*gridDim.x) + blockDim.x*blockIdx.x + (blockDim.x>>1);
-		if(this->unusedCell_cuda[centerGID] == 0 || boundaryCondition == 0)
-			tmp = 1;
-	}
-	__syncthreads();
-
-	// Snapshot the sign before the pre-fill: for boundaryCondition 2 and 1
-	// thread 0 rewrites u[0] itself, which used to race with the other readers.
-	const double sgn = sign(u[0]);
-	__syncthreads();
-
-	// --- 2. Pre-fill from the boundary edge ------------------------------------
-	if(tmp != 1 && computeFU)
-	{
-		const double spaceStep = this->subMesh.template getSpaceStepsProducts< 1, 0 >();
-		if(boundaryCondition == 4)
-			u[l] = u[threadIdx.y * blockDim.x] + sgn*spaceStep*(threadIdx.x);
-		else if(boundaryCondition == 2)
-			u[l] = u[threadIdx.y * blockDim.x + blockDim.x - 1] + sgn*spaceStep*(this->n - 1 - threadIdx.x);
-		else if(boundaryCondition == 8)
-			u[l] = u[threadIdx.x] + sgn*spaceStep*(threadIdx.y);
-		else if(boundaryCondition == 1)
-			u[l] = u[(blockDim.y - 1)* blockDim.x + threadIdx.x] + sgn*spaceStep*(this->n - 1 - threadIdx.y);
-	}
-
-	// --- 3. Explicit time stepping ---------------------------------------------
-	double time = 0.0;
-	const double cfl = this->cflCondition;
-	double fu = 0.0;
-	const double finalTime = this->stopTime;
-	const double maxTau = 1.0 * this->subMesh.template getSpaceStepsProducts< 1, 0 >();
-	__syncthreads();
-
-	tnlGridEntity<MeshType, 2, tnlGridEntityNoStencilStorage > Entity(subMesh);
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Entity.setCoordinates(Containers::StaticVector<2,int>(i,j));
-	Entity.refresh();
-	neighborEntities.refresh(subMesh,Entity.getIndex());
-
-	// Smallest power of two that is >= threadCount; its half is the first reduction stride.
-	int firstStride = 1;
-	while(firstStride < threadCount)
-		firstStride <<= 1;
-	firstStride >>= 1;
-
-	// `time` is advanced by the same block-wide tau in every thread, so the
-	// loop condition is uniform and the barriers inside are reached by all threads.
-	while( time < finalTime )
-	{
-		if(computeFU)
-			fu = schemeHost.getValueDev( this->subMesh, l, Containers::StaticVector<2,int>(i,j), u, time, boundaryCondition, neighborEntities);
-
-		// Per-thread CFL limit; threads with fu == 0 contribute +inf.
-		sharedTau[l] = fabs(cfl/fu);
-
-		// Two entries additionally carry the global caps so that the minimum respects them.
-		if(l == 0)
-		{
-			if(sharedTau[l] > maxTau)	sharedTau[l] = maxTau;
-		}
-		if(l == threadCount - 1)
-		{
-			if( time + sharedTau[l] > finalTime )	sharedTau[l] = finalTime - time;
-		}
-		__syncthreads();
-
-		// Block-wide minimum, one barrier per step.
-		for(int stride = firstStride; stride > 0; stride >>= 1)
-		{
-			if(l < stride && l + stride < threadCount)
-				sharedTau[l] = Min(sharedTau[l],sharedTau[l+stride]);
-			__syncthreads();
-		}
-		if(l == 0)
-			currentTau = sharedTau[0];
-		__syncthreads();
-
-		u[l] += currentTau * fu;
-		time += currentTau;
-
-		// Make the new u visible to neighbours (and finish reading currentTau)
-		// before the next iteration starts.
-		__syncthreads();
-	}
-}
-
-
-/**
- * Device counterpart of getOwner(): maps a linear index of the stretched
- * array to the index of the subgrid containing that cell.
- *
- * \param i linear index into the stretched array
- * \return index of the owning subgrid (row-major, gridCols subgrids per row)
- */
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getOwnerCUDA2D(int i) const
-{
-	// Cells in one row of the stretched array, and in one full band (n rows) of subgrids.
-	const int cellsPerRow  = this->gridCols*this->n;
-	const int cellsPerBand = cellsPerRow*this->n;
-
-	const int subgridRow = i / cellsPerBand;
-	const int subgridCol = (i % cellsPerRow) / this->n;
-
-	return subgridRow*this->gridCols + subgridCol;
-}
-
-
-/**
- * Reads the integer state stored for one subgrid from the device array.
- *
- * \param i subgrid index
- * \return the entry of subgridValues_cuda at position i
- */
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgridValueCUDA2D( int i ) const
-{
-	const int storedState = this->subgridValues_cuda[i];
-	return storedState;
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setSubgridValueCUDA2D(int i, int value) // Stores `value` as the state of subgrid `i` in the device array.
-{
-	this->subgridValues_cuda[i] = value; // Plain (non-atomic) store with no range check on i.
-}
-
-
-/**
- * Reads the boundary-condition code stored for one subgrid from the device array.
- *
- * \param i subgrid index
- * \return the entry of boundaryConditions_cuda at position i
- */
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getBoundaryConditionCUDA2D( int i ) const
-{
-	const int storedCode = this->boundaryConditions_cuda[i];
-	return storedCode;
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setBoundaryConditionCUDA2D(int i, int value) // Stores `value` as the boundary-condition code of subgrid `i` in the device array.
-{
-	this->boundaryConditions_cuda[i] = value; // Plain (non-atomic) store with no range check on i.
-}
-
-
-
-//north - 1, east - 2, west - 4, south - 8
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__
-void /*tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::*/synchronizeCUDA2D(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now
-{
-
-	__shared__ int boundary[4]; // north,east,west,south
-	__shared__ int subgridValue;
-	__shared__ int newSubgridValue;
-
-
-	int gid = (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + threadIdx.x;
-	double u = cudaSolver->work_u_cuda[gid];
-	double u_cmp;
-	int subgridValue_cmp=INT_MAX;
-	int boundary_index=0;
-
-
-	if(threadIdx.x+threadIdx.y == 0)
-	{
-		subgridValue = cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x);
-		boundary[0] = 0;
-		boundary[1] = 0;
-		boundary[2] = 0;
-		boundary[3] = 0;
-		newSubgridValue = 0;
-		//printf("%d   %d\n", blockDim.x, gridDim.x);
-	}
-	__syncthreads();
-
-
-
-	if(		(threadIdx.x == 0 				/*				&& !(cudaSolver->currentStep & 1)*/) 		||
-			(threadIdx.y == 0 				 	/*			&& (cudaSolver->currentStep & 1)*/) 		||
-			(threadIdx.x == blockDim.x - 1 	 /*	&& !(cudaSolver->currentStep & 1)*/) 		||
-			(threadIdx.y == blockDim.y - 1 	 /*	&& (cudaSolver->currentStep & 1)*/) 		)
-	{
-		if(threadIdx.x == 0 && (blockIdx.x != 0)/* && !(cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid - 1];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x - 1);
-			boundary_index = 2;
-		}
-
-		if(threadIdx.x == blockDim.x - 1 && (blockIdx.x != gridDim.x - 1)/* && !(cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid + 1];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x + 1);
-			boundary_index = 1;
-		}
-
-		__threadfence();
-		if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
-		{
-			cudaSolver->unusedCell_cuda[gid] = 0;
-			atomicMax(&newSubgridValue, INT_MAX);
-			atomicMax(&boundary[boundary_index], 1);
-			cudaSolver->work_u_cuda[gid] = u_cmp;
-			u=u_cmp;
-		}
-		__threadfence();
-		if(threadIdx.y == 0 && (blockIdx.y != 0)/* && (cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D((blockIdx.y - 1)*gridDim.x + blockIdx.x);
-			boundary_index = 3;
-		}
-		if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1)/* && (cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D((blockIdx.y + 1)*gridDim.x + blockIdx.x);
-			boundary_index = 0;
-		}
-
-//		__threadfence();
-		if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
-		{
-			cudaSolver->unusedCell_cuda[gid] = 0;
-			atomicMax(&newSubgridValue, INT_MAX);
-			atomicMax(&boundary[boundary_index], 1);
-			cudaSolver->work_u_cuda[gid] = u_cmp;
-		}
-	}
-	__threadfence();
-	__syncthreads();
-
-	if(threadIdx.x+threadIdx.y == 0)
-	{
-		if(subgridValue == INT_MAX && newSubgridValue !=0)
-			cudaSolver->setSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x, -INT_MAX);
-
-		cudaSolver->setBoundaryConditionCUDA2D(blockIdx.y*gridDim.x + blockIdx.x, 	boundary[0] +
-																				2 * boundary[1] +
-																				4 * boundary[2] +
-																				8 * boundary[3]);
-
-
-		if(blockIdx.x+blockIdx.y ==0)
-		{
-			cudaSolver->currentStep = cudaSolver->currentStep + 1;
-			*(cudaSolver->runcuda) = 0;
-		}
-//
-//		int stepValue = cudaSolver->currentStep + 4;
-//		if( cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX )
-//				cudaSolver->setSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x, stepValue);
-//
-//		atomicMax((cudaSolver->runcuda),cudaSolver->getBoundaryConditionCUDA2D(blockIdx.y*gridDim.x + blockIdx.x));
-	}
-
-
-	/*
-	//printf("I am not an empty kernel!\n");
-	//cout << "Synchronizig..." <<std::endl;
-	int tmp1, tmp2;
-	int grid1, grid2;
-
-	if(cudaSolver->currentStep & 1)
-	{
-		//printf("I am not an empty kernel! 1\n");
-		for(int j = 0; j < cudaSolver->gridRows - 1; j++)
-		{
-			//printf("I am not an empty kernel! 3\n");
-			for (int i = 0; i < cudaSolver->gridCols*cudaSolver->n; i++)
-			{
-				tmp1 = cudaSolver->gridCols*cudaSolver->n*((cudaSolver->n-1)+j*cudaSolver->n) + i;
-				tmp2 = cudaSolver->gridCols*cudaSolver->n*((cudaSolver->n)+j*cudaSolver->n) + i;
-				grid1 = cudaSolver->getSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1));
-				grid2 = cudaSolver->getSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2));
-
-				if ((fabs(cudaSolver->work_u_cuda[tmp1]) < fabs(cudaSolver->work_u_cuda[tmp2]) - cudaSolver->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
-				{
-					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA2D(tmp1),cudaSolver->getOwnerCUDA2D(tmp2));
-					cudaSolver->work_u_cuda[tmp2] = cudaSolver->work_u_cuda[tmp1];
-					cudaSolver->unusedCell_cuda[tmp2] = 0;
-					if(grid2 == INT_MAX)
-					{
-						cudaSolver->setSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2), -INT_MAX);
-					}
-					if(! (cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2)) & 8) )
-						cudaSolver->setBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2), cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2))+8);
-				}
-				else if ((fabs(cudaSolver->work_u_cuda[tmp1]) > fabs(cudaSolver->work_u_cuda[tmp2]) + cudaSolver->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
-				{
-					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA2D(tmp1),cudaSolver->getOwnerCUDA2D(tmp2));
-					cudaSolver->work_u_cuda[tmp1] = cudaSolver->work_u_cuda[tmp2];
-					cudaSolver->unusedCell_cuda[tmp1] = 0;
-					if(grid1 == INT_MAX)
-					{
-						cudaSolver->setSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1), -INT_MAX);
-					}
-					if(! (cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1)) & 1) )
-						cudaSolver->setBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1), cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1))+1);
-				}
-			}
-		}
-
-	}
-	else
-	{
-		//printf("I am not an empty kernel! 2\n");
-		for(int i = 1; i < cudaSolver->gridCols; i++)
-		{
-			//printf("I am not an empty kernel! 4\n");
-			for (int j = 0; j < cudaSolver->gridRows*cudaSolver->n; j++)
-			{
-
-				tmp1 = cudaSolver->gridCols*cudaSolver->n*j + i*cudaSolver->n - 1;
-				tmp2 = cudaSolver->gridCols*cudaSolver->n*j + i*cudaSolver->n ;
-				grid1 = cudaSolver->getSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1));
-				grid2 = cudaSolver->getSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2));
-
-				if ((fabs(cudaSolver->work_u_cuda[tmp1]) < fabs(cudaSolver->work_u_cuda[tmp2]) - cudaSolver->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
-				{
-					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA2D(tmp1),cudaSolver->getOwnerCUDA2D(tmp2));
-					cudaSolver->work_u_cuda[tmp2] = cudaSolver->work_u_cuda[tmp1];
-					cudaSolver->unusedCell_cuda[tmp2] = 0;
-					if(grid2 == INT_MAX)
-					{
-						cudaSolver->setSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2), -INT_MAX);
-					}
-					if(! (cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2)) & 4) )
-						cudaSolver->setBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2), cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2))+4);
-				}
-				else if ((fabs(cudaSolver->work_u_cuda[tmp1]) > fabs(cudaSolver->work_u_cuda[tmp2]) + cudaSolver->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
-				{
-					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA2D(tmp1),cudaSolver->getOwnerCUDA2D(tmp2));
-					cudaSolver->work_u_cuda[tmp1] = cudaSolver->work_u_cuda[tmp2];
-					cudaSolver->unusedCell_cuda[tmp1] = 0;
-					if(grid1 == INT_MAX)
-					{
-						cudaSolver->setSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1), -INT_MAX);
-					}
-					if(! (cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1)) & 2) )
-						cudaSolver->setBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1), cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1))+2);
-				}
-			}
-		}
-	}
-	//printf("I am not an empty kernel! 5 cudaSolver->currentStep : %d \n", cudaSolver->currentStep);
-
-	cudaSolver->currentStep = cudaSolver->currentStep + 1;
-	int stepValue = cudaSolver->currentStep + 4;
-	for (int i = 0; i < cudaSolver->gridRows * cudaSolver->gridCols; i++)
-	{
-		if( cudaSolver->getSubgridValueCUDA2D(i) == -INT_MAX )
-			cudaSolver->setSubgridValueCUDA2D(i, stepValue);
-	}
-
-	int maxi = 0;
-	for(int q=0; q < cudaSolver->gridRows*cudaSolver->gridCols;q++)
-	{
-		//printf("%d : %d\n", q, cudaSolver->boundaryConditions_cuda[q]);
-		maxi=Max(maxi,cudaSolver->getBoundaryConditionCUDA2D(q));
-	}
-	//printf("I am not an empty kernel! %d\n", maxi);
-	*(cudaSolver->runcuda) = (maxi > 0);
-	//printf("I am not an empty kernel! 7 %d\n", cudaSolver->boundaryConditions_cuda[0]);
-	//cout << "Grid synchronized at step " << (this->currentStep - 1 ) <<std::endl;
-*/
-}
-
-
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__
-void synchronize2CUDA2D(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver)
-{
-//	if(blockIdx.x+blockIdx.y ==0)
-//	{
-//		cudaSolver->currentStep = cudaSolver->currentStep + 1;
-//		*(cudaSolver->runcuda) = 0;
-//	}
-
-	int stepValue = cudaSolver->currentStep + 4;
-	if( cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX )
-			cudaSolver->setSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x, stepValue);
-
-	atomicMax((cudaSolver->runcuda),cudaSolver->getBoundaryConditionCUDA2D(blockIdx.y*gridDim.x + blockIdx.x));
-}
-
-
-
-
-
-
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__global__
-void /*tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::*/initCUDA2D( tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr , int* ptr2, int* ptr3)
-{
-	//cout << "Initializating solver..." <<std::endl;
-	//const String& meshLocation = parameters.getParameter <String>("mesh");
-	//this->mesh_cuda.load( meshLocation );
-
-	//this->n_cuda = parameters.getParameter <int>("subgrid-size");
-	//cout << "Setting N << this->n_cuda <<std::endl;
-
-	//this->subMesh_cuda.setDimensions( this->n_cuda, this->n_cuda );
-	//this->subMesh_cuda.setDomain( Containers::StaticVector<2,double>(0.0, 0.0),
-							 //Containers::StaticVector<2,double>(this->mesh_cuda.template getSpaceStepsProducts< 1, 0 >()*(double)(this->n_cuda), this->mesh_cuda.template getSpaceStepsProducts< 0, 1 >()*(double)(this->n_cuda)) );
-
-	//this->subMesh_cuda.save("submesh.tnl");
-
-//	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-//	this->u0.load( initialCondition );
-
-	//cout << this->mesh.getCellCenter(0) <<std::endl;
-
-	//this->delta_cuda = parameters.getParameter <double>("delta");
-	//this->delta_cuda *= this->mesh_cuda.template getSpaceStepsProducts< 1, 0 >()*this->mesh_cuda.template getSpaceStepsProducts< 0, 1 >();
-
-	//cout << "Setting delta to " << this->delta <<std::endl;
-
-	//this->tau0_cuda = parameters.getParameter <double>("initial-tau");
-	//cout << "Setting initial tau to " << this->tau0_cuda <<std::endl;
-	//this->stopTime_cuda = parameters.getParameter <double>("stop-time");
-
-	//this->cflCondition_cuda = parameters.getParameter <double>("cfl-condition");
-	//this -> cflCondition_cuda *= sqrt(this->mesh_cuda.template getSpaceStepsProducts< 1, 0 >()*this->mesh_cuda.template getSpaceStepsProducts< 0, 1 >());
-	//cout << "Setting CFL to " << this->cflCondition <<std::endl;
-////
-////
-
-//	this->gridRows_cuda = gridRows;
-//	this->gridCols_cuda = gridCols;
-
-	cudaSolver->work_u_cuda = ptr;//(double*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->n*cudaSolver->n*sizeof(double));
-	cudaSolver->unusedCell_cuda = ptr3;//(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->n*cudaSolver->n*sizeof(int));
-	cudaSolver->subgridValues_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int));
-	cudaSolver->boundaryConditions_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int));
-	cudaSolver->runcuda = ptr2;//(bool*)malloc(sizeof(bool));
-	*(cudaSolver->runcuda) = 1;
-	cudaSolver->currentStep = 1;
-	//cudaMemcpy(ptr,&(cudaSolver->work_u_cuda), sizeof(double*),cudaMemcpyDeviceToHost);
-	//ptr = cudaSolver->work_u_cuda;
-	printf("GPU memory allocated.\n");
-
-	for(int i = 0; i < cudaSolver->gridCols*cudaSolver->gridRows; i++)
-	{
-		cudaSolver->subgridValues_cuda[i] = INT_MAX;
-		cudaSolver->boundaryConditions_cuda[i] = 0;
-	}
-
-	/*for(long int j = 0; j < cudaSolver->n*cudaSolver->n*cudaSolver->gridCols*cudaSolver->gridRows; j++)
-	{
-		printf("%d\n",j);
-		cudaSolver->unusedCell_cuda[ j] = 1;
-	}*/
-	printf("GPU memory initialized.\n");
-
-
-	//cudaSolver->work_u_cuda[50] = 32.153438;
-////
-////
-	//stretchGrid();
-	//this->stopTime_cuda /= (double)(this->gridCols_cuda);
-	//this->stopTime_cuda *= (1.0+1.0/((double)(this->n_cuda) - 1.0));
-	//cout << "Setting stopping time to " << this->stopTime <<std::endl;
-	//this->stopTime_cuda = 1.5*((double)(this->n_cuda))*parameters.getParameter <double>("stop-time")*this->mesh_cuda.template getSpaceStepsProducts< 1, 0 >();
-	//cout << "Setting stopping time to " << this->stopTime <<std::endl;
-
-	//cout << "Initializating scheme..." <<std::endl;
-	//if(!this->schemeDevice.init(parameters))
-//	{
-		//cerr << "Scheme failed to initialize." <<std::endl;
-//		return false;
-//	}
-	//cout << "Scheme initialized." <<std::endl;
-
-	//test();
-
-//	this->currentStep_cuda = 1;
-	//return true;
-}
-
-
-
-
-//extern __shared__ double array[];
-template< typename SchemeHost, typename SchemeDevice, typename Device >
-__global__
-void /*tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::*/initRunCUDA2D(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller)
-
-{
-
-
-	extern __shared__ double u[];
-	//printf("%p\n",caller->work_u_cuda);
-
-	int i = blockIdx.y * gridDim.x + blockIdx.x;
-	int l = threadIdx.y * blockDim.x + threadIdx.x;
-
-	__shared__ int containsCurve;
-	if(l == 0)
-		containsCurve = 0;
-
-	//double a;
-	caller->getSubgridCUDA2D(i,caller, &u[l]);
-	//printf("%f   %f\n",a , u[l]);
-	//u[l] = a;
-	//printf("Hi %f \n", u[l]);
-	__syncthreads();
-	//printf("hurewrwr %f \n", u[l]);
-	if(u[0] * u[l] <= 0.0)
-	{
-		//printf("contains %d \n",i);
-		atomicMax( &containsCurve, 1);
-	}
-
-	__syncthreads();
-	//printf("hu");
-	//printf("%d : %f\n", l, u[l]);
-	if(containsCurve == 1)
-	{
-		//printf("have curve \n");
-		caller->runSubgridCUDA2D(0,u,i);
-		//printf("%d : %f\n", l, u[l]);
-		__syncthreads();
-		caller->insertSubgridCUDA2D(u[l],i);
-		__syncthreads();
-		if(l == 0)
-			caller->setSubgridValueCUDA2D(i, 4);
-	}
-
-
-}
-
-
-
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device >
-__global__
-void /*tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::*/runCUDA2D(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller)
-{
-	extern __shared__ double u[];
-	int i = blockIdx.y * gridDim.x + blockIdx.x;
-	int l = threadIdx.y * blockDim.x + threadIdx.x;
-	int bound = caller->getBoundaryConditionCUDA2D(i);
-
-	if(caller->getSubgridValueCUDA2D(i) != INT_MAX && bound != 0 && caller->getSubgridValueCUDA2D(i) > 0)
-	{
-		caller->getSubgridCUDA2D(i,caller, &u[l]);
-
-		//if(l == 0)
-			//printf("i = %d, bound = %d\n",i,caller->getSubgridValueCUDA2D(i));
-		if(caller->getSubgridValueCUDA2D(i) == caller->currentStep+4)
-		{
-			if(bound & 1)
-			{
-				caller->runSubgridCUDA2D(1,u,i);
-				//__syncthreads();
-				//caller->insertSubgridCUDA2D(u[l],i);
-				//__syncthreads();
-				//caller->getSubgridCUDA2D(i,caller, &u[l]);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound & 2 )
-			{
-				caller->runSubgridCUDA2D(2,u,i);
-				//__syncthreads();
-				//caller->insertSubgridCUDA2D(u[l],i);
-				//__syncthreads();
-				//caller->getSubgridCUDA2D(i,caller, &u[l]);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound & 4)
-			{
-				caller->runSubgridCUDA2D(4,u,i);
-				//__syncthreads();
-				//caller->insertSubgridCUDA2D(u[l],i);
-				//__syncthreads();
-				//caller->getSubgridCUDA2D(i,caller, &u[l]);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound & 8)
-			{
-				caller->runSubgridCUDA2D(8,u,i);
-				//__syncthreads();
-				//caller->insertSubgridCUDA2D(u[l],i);
-				//__syncthreads();
-				//caller->getSubgridCUDA2D(i,caller, &u[l]);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-
-
-
-
-
-			if( ((bound & 3 )))
-				{
-					caller->runSubgridCUDA2D(3,u,i);
-					//__syncthreads();
-					//caller->insertSubgridCUDA2D(u[l],i);
-					//__syncthreads();
-					//caller->getSubgridCUDA2D(i,caller, &u[l]);
-					caller->updateSubgridCUDA2D(i,caller, &u[l]);
-					__syncthreads();
-				}
-				if( ((bound & 5 )))
-				{
-					caller->runSubgridCUDA2D(5,u,i);
-					//__syncthreads();
-					//caller->insertSubgridCUDA2D(u[l],i);
-					//__syncthreads();
-					//caller->getSubgridCUDA2D(i,caller, &u[l]);
-					caller->updateSubgridCUDA2D(i,caller, &u[l]);
-					__syncthreads();
-				}
-				if( ((bound & 10 )))
-				{
-					caller->runSubgridCUDA2D(10,u,i);
-					//__syncthreads();
-					//caller->insertSubgridCUDA2D(u[l],i);
-					//__syncthreads();
-					//caller->getSubgridCUDA2D(i,caller, &u[l]);
-					caller->updateSubgridCUDA2D(i,caller, &u[l]);
-					__syncthreads();
-				}
-				if(   (bound & 12 ))
-				{
-					caller->runSubgridCUDA2D(12,u,i);
-					//__syncthreads();
-					//caller->insertSubgridCUDA2D(u[l],i);
-					//__syncthreads();
-					//caller->getSubgridCUDA2D(i,caller, &u[l]);
-					caller->updateSubgridCUDA2D(i,caller, &u[l]);
-					__syncthreads();
-				}
-
-
-
-
-
-		}
-
-
-		else
-		{
-
-
-
-
-
-
-
-
-
-			if( ((bound == 2)))
-						{
-							caller->runSubgridCUDA2D(2,u,i);
-							//__syncthreads();
-							//caller->insertSubgridCUDA2D(u[l],i);
-							//__syncthreads();
-							//caller->getSubgridCUDA2D(i,caller, &u[l]);
-							caller->updateSubgridCUDA2D(i,caller, &u[l]);
-							__syncthreads();
-						}
-						if( ((bound == 1) ))
-						{
-							caller->runSubgridCUDA2D(1,u,i);
-							//__syncthreads();
-							//caller->insertSubgridCUDA2D(u[l],i);
-							//__syncthreads();
-							//caller->getSubgridCUDA2D(i,caller, &u[l]);
-							caller->updateSubgridCUDA2D(i,caller, &u[l]);
-							__syncthreads();
-						}
-						if( ((bound == 8) ))
-						{
-							caller->runSubgridCUDA2D(8,u,i);
-							//__syncthreads();
-							//caller->insertSubgridCUDA2D(u[l],i);
-							//__syncthreads();
-							//caller->getSubgridCUDA2D(i,caller, &u[l]);
-							caller->updateSubgridCUDA2D(i,caller, &u[l]);
-							__syncthreads();
-						}
-						if(   (bound == 4))
-						{
-							caller->runSubgridCUDA2D(4,u,i);
-							//__syncthreads();
-							//caller->insertSubgridCUDA2D(u[l],i);
-							//__syncthreads();
-							//caller->getSubgridCUDA2D(i,caller, &u[l]);
-							caller->updateSubgridCUDA2D(i,caller, &u[l]);
-							__syncthreads();
-						}
-
-
-
-
-
-
-
-
-
-
-			if( ((bound & 3) ))
-			{
-				caller->runSubgridCUDA2D(3,u,i);
-				//__syncthreads();
-				//caller->insertSubgridCUDA2D(u[l],i);
-				//__syncthreads();
-				//caller->getSubgridCUDA2D(i,caller, &u[l]);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if( ((bound & 5) ))
-			{
-				caller->runSubgridCUDA2D(5,u,i);
-				//__syncthreads();
-				//caller->insertSubgridCUDA2D(u[l],i);
-				//__syncthreads();
-				//caller->getSubgridCUDA2D(i,caller, &u[l]);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if( ((bound & 10) ))
-			{
-				caller->runSubgridCUDA2D(10,u,i);
-				//__syncthreads();
-				//caller->insertSubgridCUDA2D(u[l],i);
-				//__syncthreads();
-				//caller->getSubgridCUDA2D(i,caller, &u[l]);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(   (bound & 12) )
-			{
-				caller->runSubgridCUDA2D(12,u,i);
-				//__syncthreads();
-				//caller->insertSubgridCUDA2D(u[l],i);
-				//__syncthreads();
-				//caller->getSubgridCUDA2D(i,caller, &u[l]);
-				caller->updateSubgridCUDA2D(i,caller, &u[l]);
-				__syncthreads();
-			}
-
-
-
-
-
-
-
-
-
-
-
-
-		}
-		/*if( bound )
-		{
-			caller->runSubgridCUDA2D(15,u,i);
-			__syncthreads();
-			//caller->insertSubgridCUDA2D(u[l],i);
-			//__syncthreads();
-			//caller->getSubgridCUDA2D(i,caller, &u[l]);
-			caller->updateSubgridCUDA2D(i,caller, &u[l]);
-			__syncthreads();
-		}*/
-
-		if(l==0)
-		{
-			caller->setBoundaryConditionCUDA2D(i, 0);
-			caller->setSubgridValueCUDA2D(i, caller->getSubgridValueCUDA2D(i) - 1 );
-		}
-
-
-	}
-
-
-
-}
-
-#endif /*HAVE_CUDA*/
-
-#endif /* TNLPARALLELEIKONALSOLVER2D_IMPL_H_ */
diff --git a/src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h b/src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
deleted file mode 100644
index dc3fd5467..000000000
--- a/src/TNL/Legacy/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
+++ /dev/null
@@ -1,1706 +0,0 @@
-/***************************************************************************
-                          tnlParallelEikonalSolver2D_impl.h  -  description
-                             -------------------
-    begin                : Nov 28 , 2014
-    copyright            : (C) 2014 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef TNLPARALLELEIKONALSOLVER3D_IMPL_H_
-#define TNLPARALLELEIKONALSOLVER3D_IMPL_H_
-
-
-#include "tnlParallelEikonalSolver.h"
-#include <core/mfilename.h>
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver()
-{
-	cout << "a" <<std::endl;
-	this->device = TNL::Devices::HostDevice;  /////////////// tnlCuda Device --- vypocet na GPU, TNL::Devices::HostDevice   ---    vypocet na CPU
-
-#ifdef HAVE_CUDA
-	if(this->device == tnlCudaDevice)
-	{
-	run_host = 1;
-	}
-#endif
-
-	cout << "b" <<std::endl;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::test()
-{
-/*
-	for(int i =0; i < this->subgridValues.getSize(); i++ )
-	{
-		insertSubgrid(getSubgrid(i), i);
-	}
-*/
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-
-bool tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::init( const Config::ParameterContainer& parameters )
-{
-	cout << "Initializating solver..." <<std::endl;
-	const String& meshLocation = parameters.getParameter <String>("mesh");
-	this->mesh.load( meshLocation );
-
-	this->n = parameters.getParameter <int>("subgrid-size");
-	cout << "Setting N to " << this->n <<std::endl;
-
-	this->subMesh.setDimensions( this->n, this->n, this->n );
-	this->subMesh.setDomain( Containers::StaticVector<3,double>(0.0, 0.0, 0.0),
-							 Containers::StaticVector<3,double>(mesh.template getSpaceStepsProducts< 1, 0, 0 >()*(double)(this->n), mesh.template getSpaceStepsProducts< 0, 1, 0 >()*(double)(this->n),mesh.template getSpaceStepsProducts< 0, 0, 1 >()*(double)(this->n)) );
-
-	this->subMesh.save("submesh.tnl");
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	this->u0.load( initialCondition );
-
-	//cout << this->mesh.getCellCenter(0) <<std::endl;
-
-	this->delta = parameters.getParameter <double>("delta");
-	this->delta *= mesh.template getSpaceStepsProducts< 1, 0, 0 >()*mesh.template getSpaceStepsProducts< 0, 1, 0 >();
-
-	cout << "Setting delta to " << this->delta <<std::endl;
-
-	this->tau0 = parameters.getParameter <double>("initial-tau");
-	cout << "Setting initial tau to " << this->tau0 <<std::endl;
-	this->stopTime = parameters.getParameter <double>("stop-time");
-
-	this->cflCondition = parameters.getParameter <double>("cfl-condition");
-	this -> cflCondition *= sqrt(mesh.template getSpaceStepsProducts< 1, 0, 0 >()*mesh.template getSpaceStepsProducts< 0, 1, 0 >());
-	cout << "Setting CFL to " << this->cflCondition <<std::endl;
-
-	stretchGrid();
-	this->stopTime /= (double)(this->gridCols);
-	this->stopTime *= (1.0+1.0/((double)(this->n) - 2.0));
-	cout << "Setting stopping time to " << this->stopTime <<std::endl;
-	//this->stopTime = 1.5*((double)(this->n))*parameters.getParameter <double>("stop-time")*mesh.template getSpaceStepsProducts< 1, 0, 0 >();
-	//cout << "Setting stopping time to " << this->stopTime <<std::endl;
-
-	cout << "Initializating scheme..." <<std::endl;
-	if(!this->schemeHost.init(parameters))
-	{
-		cerr << "SchemeHost failed to initialize." <<std::endl;
-		return false;
-	}
-	cout << "Scheme initialized." <<std::endl;
-
-	test();
-
-	VectorType* tmp = new VectorType[subgridValues.getSize()];
-
-
-#ifdef HAVE_CUDA
-
-	if(this->device == tnlCudaDevice)
-	{
-	/*cout << "Testing... " <<std::endl;
-	if(this->device == tnlCudaDevice)
-	{
-	if( !initCUDA3D(parameters, gridRows, gridCols) )
-		return false;
-	}*/
-		//cout << "s" <<std::endl;
-	cudaMalloc(&(this->cudaSolver), sizeof(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >));
-	//cout << "s" <<std::endl;
-	cudaMemcpy(this->cudaSolver, this,sizeof(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >), cudaMemcpyHostToDevice);
-	//cout << "s" <<std::endl;
-	double** tmpdev = NULL;
-	cudaMalloc(&tmpdev, sizeof(double*));
-	//double* tmpw;
-	cudaMalloc(&(this->tmpw), this->work_u.getSize()*sizeof(double));
-	cudaMalloc(&(this->runcuda), sizeof(int));
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	int* tmpUC;
-	cudaMalloc(&(tmpUC), this->work_u.getSize()*sizeof(int));
-	cudaMemcpy(tmpUC, this->unusedCell.getData(), this->unusedCell.getSize()*sizeof(int), cudaMemcpyHostToDevice);
-
-	initCUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver, (this->tmpw), (this->runcuda),tmpUC);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	//cout << "s " <<std::endl;
-	//cudaMalloc(&(cudaSolver->work_u_cuda), this->work_u.getSize()*sizeof(double));
-	double* tmpu = NULL;
-
-	cudaMemcpy(&tmpu, tmpdev,sizeof(double*), cudaMemcpyDeviceToHost);
-	//printf("%p %p \n",tmpu,tmpw);
-	cudaMemcpy((this->tmpw), this->work_u.getData(), this->work_u.getSize()*sizeof(double), cudaMemcpyHostToDevice);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	//cout << "s "<<std::endl;
-
-	}
-#endif
-
-	if(this->device == TNL::Devices::HostDevice)
-	{
-#ifdef HAVE_OPENMP
-#pragma omp parallel for num_threads(4) schedule(dynamic)
-#endif
-		for(int i = 0; i < this->subgridValues.getSize(); i++)
-		{
-			bool containsCurve = false;
-//			cout << "Working on subgrid " << i <<" --- check 1" <<std::endl;
-
-			if(! tmp[i].setSize(this->n*this->n*this->n))
-				cout << "Could not allocate tmp["<< i <<"] array." <<std::endl;
-//			cout << "Working on subgrid " << i <<" --- check 2" <<std::endl;
-
-			tmp[i] = getSubgrid(i);
-			containsCurve = false;
-//			cout << "Working on subgrid " << i <<" --- check 3" <<std::endl;
-
-
-			for(int j = 0; j < tmp[i].getSize(); j++)
-			{
-				if(tmp[i][0]*tmp[i][j] <= 0.0)
-				{
-					containsCurve = true;
-					j=tmp[i].getSize();
-//					cout << tmp[i][0] << " " << tmp[i][j] <<std::endl;
-				}
-
-			}
-//			cout << "Working on subgrid " << i <<" --- check 4" <<std::endl;
-
-			if(containsCurve)
-			{
-//				cout << "Computing initial SDF on subgrid " << i << "." <<std::endl;
-				tmp[i] = runSubgrid(0, tmp[i] ,i);
-				insertSubgrid( tmp[i], i);
-				setSubgridValue(i, 4);
-//				cout << "Computed initial SDF on subgrid " << i  << "." <<std::endl;
-			}
-			containsCurve = false;
-
-		}
-//		cout << "CPU: Curve found" <<std::endl;
-	}
-#ifdef HAVE_CUDA
-	else if(this->device == tnlCudaDevice)
-	{
-//		cout << "pre 1 kernel" <<std::endl;
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		dim3 threadsPerBlock(this->n, this->n, this->n);
-		dim3 numBlocks(this->gridCols,this->gridRows,this->gridLevels);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		initRunCUDA3D<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,2*this->n*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
-		cudaDeviceSynchronize();
-//		cout << "post 1 kernel" <<std::endl;
-
-	}
-#endif
-
-
-	this->currentStep = 1;
-	if(this->device == TNL::Devices::HostDevice)
-		synchronize();
-#ifdef HAVE_CUDA
-	else if(this->device == tnlCudaDevice)
-	{
-		dim3 threadsPerBlock(this->n, this->n, this->n);
-		dim3 numBlocks(this->gridCols,this->gridRows,this->gridLevels);
-		//double * test = (double*)malloc(this->work_u.getSize()*sizeof(double));
-		//cout << test[0] <<"   " << test[1] <<"   " << test[2] <<"   " << test[3] <<std::endl;
-		//cudaMemcpy(/*this->work_u.getData()*/ test, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-		//cout << this->tmpw << "   " <<  test[0] <<"   " << test[1] << "   " <<test[2] << "   " <<test[3] <<std::endl;
-
-		TNL_CHECK_CUDA_DEVICE;
-
-		synchronizeCUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
-		cout << cudaGetErrorString(cudaDeviceSynchronize()) <<std::endl;
-		TNL_CHECK_CUDA_DEVICE;
-		synchronize2CUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		//cout << test[0] << "   " <<test[1] <<"   " << test[2] << "   " <<test[3] <<std::endl;
-		//cudaMemcpy(/*this->work_u.getData()*/ test, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-		//TNL_CHECK_CUDA_DEVICE;
-		//cout << this->tmpw << "   " <<  test[0] << "   " <<test[1] << "   " <<test[2] <<"   " << test[3] <<std::endl;
-		//free(test);
-
-	}
-
-#endif
-	cout << "Solver initialized." <<std::endl;
-
-	return true;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::run()
-{
-	if(this->device == TNL::Devices::HostDevice)
-	{
-
-	bool end = false;
-	while (/*(this->boundaryConditions.max() > 0 ) ||*/ !end)
-	{
-		if(this->boundaryConditions.max() == 0 || this->subgridValues.max() < 0)
-			end=true;
-		else
-			end=false;
-#ifdef HAVE_OPENMP
-#pragma omp parallel for num_threads(4) schedule(dynamic)
-#endif
-		for(int i = 0; i < this->subgridValues.getSize(); i++)
-		{
-			VectorType tmp;
-			tmp.setSize(this->n*this->n*this->n);
-			if(getSubgridValue(i) != INT_MAX)
-			{
-				//cout << "subMesh: " << i << ", BC: " << getBoundaryCondition(i) <<std::endl;
-
-				if(getSubgridValue(i) == currentStep+4)
-				{
-
-					if(getBoundaryCondition(i) & 1)
-					{
-						tmp = getSubgrid(i);
-						tmp = runSubgrid(1, tmp ,i);
-						insertSubgrid( tmp, i);
-						this->calculationsCount[i]++;
-					}
-					if(getBoundaryCondition(i) & 2)
-					{
-						tmp = getSubgrid(i);
-						tmp = runSubgrid(2, tmp ,i);
-						insertSubgrid( tmp, i);
-						this->calculationsCount[i]++;
-					}
-					if(getBoundaryCondition(i) & 4)
-					{
-						tmp = getSubgrid(i);
-						tmp = runSubgrid(4, tmp ,i);
-						insertSubgrid( tmp, i);
-						this->calculationsCount[i]++;
-					}
-					if(getBoundaryCondition(i) & 8)
-					{
-						tmp = getSubgrid(i);
-						tmp = runSubgrid(8, tmp ,i);
-						insertSubgrid( tmp, i);
-						this->calculationsCount[i]++;
-					}
-					if(getBoundaryCondition(i) & 16)
-					{
-						tmp = getSubgrid(i);
-						tmp = runSubgrid(16, tmp ,i);
-						insertSubgrid( tmp, i);
-						this->calculationsCount[i]++;
-					}
-					if(getBoundaryCondition(i) & 32)
-					{
-						tmp = getSubgrid(i);
-						tmp = runSubgrid(32, tmp ,i);
-						insertSubgrid( tmp, i);
-						this->calculationsCount[i]++;
-					}
-				}
-
-				if( getBoundaryCondition(i) & 19)
-				{
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(19, tmp ,i);
-					insertSubgrid( tmp, i);
-				}
-				if( getBoundaryCondition(i) & 21)
-				{
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(21, tmp ,i);
-					insertSubgrid( tmp, i);
-				}
-				if( getBoundaryCondition(i) & 26)
-				{
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(26, tmp ,i);
-					insertSubgrid( tmp, i);
-				}
-				if( getBoundaryCondition(i) & 28)
-				{
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(28, tmp ,i);
-					insertSubgrid( tmp, i);
-				}
-
-				if( getBoundaryCondition(i) & 35)
-				{
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(35, tmp ,i);
-					insertSubgrid( tmp, i);
-				}
-				if( getBoundaryCondition(i) & 37)
-				{
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(37, tmp ,i);
-					insertSubgrid( tmp, i);
-				}
-				if( getBoundaryCondition(i) & 42)
-				{
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(42, tmp ,i);
-					insertSubgrid( tmp, i);
-				}
-				if( getBoundaryCondition(i) & 44)
-				{
-					tmp = getSubgrid(i);
-					tmp = runSubgrid(44, tmp ,i);
-					insertSubgrid( tmp, i);
-				}
-
-
-				setBoundaryCondition(i, 0);
-				setSubgridValue(i, getSubgridValue(i)-1);
-
-			}
-		}
-		synchronize();
-	}
-	}
-#ifdef HAVE_CUDA
-	else if(this->device == tnlCudaDevice)
-	{
-		//cout << "fn" <<std::endl;
-		bool end_cuda = false;
-		dim3 threadsPerBlock(this->n, this->n, this->n);
-		dim3 numBlocks(this->gridCols,this->gridRows,this->gridLevels);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		//cudaMalloc(&runcuda,sizeof(bool));
-		//cudaMemcpy(runcuda, &run_host, sizeof(bool), cudaMemcpyHostToDevice);
-		//cout << "fn" <<std::endl;
-		bool* tmpb;
-		//cudaMemcpy(tmpb, &(cudaSolver->runcuda),sizeof(bool*), cudaMemcpyDeviceToHost);
-		//cudaDeviceSynchronize();
-		//TNL_CHECK_CUDA_DEVICE;
-		cudaMemcpy(&(this->run_host),this->runcuda,sizeof(int), cudaMemcpyDeviceToHost);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		//cout << "fn" <<std::endl;
-		int i = 1;
-		time_diff = 0.0;
-		while (run_host || !end_cuda)
-		{
-			cout << "Computing at step "<< i++ <<std::endl;
-			if(run_host != 0 )
-				end_cuda = true;
-			else
-				end_cuda = false;
-			//cout << "a" <<std::endl;
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			start = std::clock();
-			runCUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,2*this->n*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
-			//cout << "a" <<std::endl;
-			cudaDeviceSynchronize();
-			time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
-
-			//start = std::clock();
-			synchronizeCUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			synchronize2CUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			//time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
-
-
-			//cout << "a" <<std::endl;
-			//run_host = false;
-			//cout << "in kernel loop" << run_host <<std::endl;
-			//cudaMemcpy(tmpb, &(cudaSolver->runcuda),sizeof(bool*), cudaMemcpyDeviceToHost);
-			cudaMemcpy(&run_host, (this->runcuda),sizeof(int), cudaMemcpyDeviceToHost);
-			//cout << "in kernel loop" << run_host <<std::endl;
-		}
-		cout << "Solving time was: " << time_diff <<std::endl;
-		//cout << "b" <<std::endl;
-
-		//double* tmpu;
-		//cudaMemcpy(tmpu, &(cudaSolver->work_u_cuda),sizeof(double*), cudaMemcpyHostToDevice);
-		//cudaMemcpy(this->work_u.getData(), tmpu, this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-		//cout << this->work_u.getData()[0] <<std::endl;
-
-		//double * test = (double*)malloc(this->work_u.getSize()*sizeof(double));
-		//cout << test[0] << test[1] << test[2] << test[3] <<std::endl;
-		cudaMemcpy(this->work_u.getData()/* test*/, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-		//cout << this->tmpw << "   " <<  test[0] << test[1] << test[2] << test[3] <<std::endl;
-		//free(test);
-
-		cudaDeviceSynchronize();
-	}
-#endif
-	contractGrid();
-	this->u0.save("u-00001.tnl");
-	cout << "Maximum number of calculations on one subgrid was " << this->calculationsCount.absMax() <<std::endl;
-	cout << "Average number of calculations on one subgrid was " << ( (double) this->calculationsCount.sum() / (double) this->calculationsCount.getSize() ) <<std::endl;
-	cout << "Solver finished" <<std::endl;
-
-#ifdef HAVE_CUDA
-	if(this->device == tnlCudaDevice)
-	{
-		cudaFree(this->runcuda);
-		cudaFree(this->tmpw);
-		cudaFree(this->cudaSolver);
-	}
-#endif
-
-}
-
-//north - 1, east - 2, west - 4, south - 8
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::synchronize() //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now
-{
-	cout << "Synchronizig..." <<std::endl;
-	int tmp1, tmp2;
-	int grid1, grid2;
-
-//	if(this->currentStep & 1)
-//	{
-		for(int j = 0; j < this->gridRows - 1; j++)
-		{
-			for (int i = 0; i < this->gridCols*this->n; i++)
-			{
-				for (int k = 0; k < this->gridLevels*this->n; k++)
-				{
-//					cout << "a" <<std::endl;
-					tmp1 = this->gridCols*this->n*((this->n-1)+j*this->n) + i + k*this->gridCols*this->n*this->gridRows*this->n;
-//					cout << "b" <<std::endl;
-					tmp2 = this->gridCols*this->n*((this->n)+j*this->n) + i + k*this->gridCols*this->n*this->gridRows*this->n;
-//					cout << "c" <<std::endl;
-					if(tmp1 > work_u.getSize())
-						cout << "tmp1: " << tmp1 << " x: " << j <<" y: " << i <<" z: " << k <<std::endl;
-					if(tmp2 > work_u.getSize())
-						cout << "tmp2: " << tmp2 << " x: " << j <<" y: " << i <<" z: " << k <<std::endl;
-					grid1 = getSubgridValue(getOwner(tmp1));
-//					cout << "d" <<std::endl;
-					grid2 = getSubgridValue(getOwner(tmp2));
-//					cout << "e" <<std::endl;
-					if(getOwner(tmp1)==getOwner(tmp2))
-						cout << "i, j, k" << i << "," << j << "," << k <<std::endl;
-					if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
-					{
-						this->work_u[tmp2] = this->work_u[tmp1];
-//						cout << "f" <<std::endl;
-						this->unusedCell[tmp2] = 0;
-//						cout << "g" <<std::endl;
-						if(grid2 == INT_MAX)
-						{
-							setSubgridValue(getOwner(tmp2), -INT_MAX);
-						}
-//						cout << "h" <<std::endl;
-						if(! (getBoundaryCondition(getOwner(tmp2)) & 8) )
-							setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+8);
-//						cout << "i" <<std::endl;
-					}
-					else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
-					{
-						this->work_u[tmp1] = this->work_u[tmp2];
-//						cout << "j" <<std::endl;
-						this->unusedCell[tmp1] = 0;
-//						cout << "k" <<std::endl;
-						if(grid1 == INT_MAX)
-						{
-							setSubgridValue(getOwner(tmp1), -INT_MAX);
-						}
-//						cout << "l" <<std::endl;
-						if(! (getBoundaryCondition(getOwner(tmp1)) & 1) )
-							setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+1);
-//						cout << "m" <<std::endl;
-					}
-				}
-			}
-		}
-
-//	}
-//	else
-//	{
-
-		cout << "sync 2" <<std::endl;
-		for(int i = 1; i < this->gridCols; i++)
-		{
-			for (int j = 0; j < this->gridRows*this->n; j++)
-			{
-				for (int k = 0; k < this->gridLevels*this->n; k++)
-				{
-					tmp1 = this->gridCols*this->n*j + i*this->n - 1 + k*this->gridCols*this->n*this->gridRows*this->n;
-					tmp2 = this->gridCols*this->n*j + i*this->n + k*this->gridCols*this->n*this->gridRows*this->n;
-					grid1 = getSubgridValue(getOwner(tmp1));
-					grid2 = getSubgridValue(getOwner(tmp2));
-					if(getOwner(tmp1)==getOwner(tmp2))
-						cout << "i, j, k" << i << "," << j << "," << k <<std::endl;
-					if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
-					{
-						this->work_u[tmp2] = this->work_u[tmp1];
-						this->unusedCell[tmp2] = 0;
-						if(grid2 == INT_MAX)
-						{
-							setSubgridValue(getOwner(tmp2), -INT_MAX);
-						}
-						if(! (getBoundaryCondition(getOwner(tmp2)) & 4) )
-							setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+4);
-					}
-					else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
-					{
-						this->work_u[tmp1] = this->work_u[tmp2];
-						this->unusedCell[tmp1] = 0;
-						if(grid1 == INT_MAX)
-						{
-							setSubgridValue(getOwner(tmp1), -INT_MAX);
-						}
-						if(! (getBoundaryCondition(getOwner(tmp1)) & 2) )
-							setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+2);
-					}
-				}
-			}
-		}
-
-		cout << "sync 3" <<std::endl;
-
-		for(int k = 1; k < this->gridLevels; k++)
-		{
-			for (int j = 0; j < this->gridRows*this->n; j++)
-			{
-				for (int i = 0; i < this->gridCols*this->n; i++)
-				{
-					tmp1 = this->gridCols*this->n*j + i + (k*this->n-1)*this->gridCols*this->n*this->gridRows*this->n;
-					tmp2 = this->gridCols*this->n*j + i + k*this->n*this->gridCols*this->n*this->gridRows*this->n;
-					grid1 = getSubgridValue(getOwner(tmp1));
-					grid2 = getSubgridValue(getOwner(tmp2));
-					if(getOwner(tmp1)==getOwner(tmp2))
-						cout << "i, j, k" << i << "," << j << "," << k <<std::endl;
-					if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
-					{
-						this->work_u[tmp2] = this->work_u[tmp1];
-						this->unusedCell[tmp2] = 0;
-						if(grid2 == INT_MAX)
-						{
-							setSubgridValue(getOwner(tmp2), -INT_MAX);
-						}
-						if(! (getBoundaryCondition(getOwner(tmp2)) & 32) )
-							setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+32);
-					}
-					else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
-					{
-						this->work_u[tmp1] = this->work_u[tmp2];
-						this->unusedCell[tmp1] = 0;
-						if(grid1 == INT_MAX)
-						{
-							setSubgridValue(getOwner(tmp1), -INT_MAX);
-						}
-						if(! (getBoundaryCondition(getOwner(tmp1)) & 16) )
-							setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+16);
-					}
-				}
-			}
-		}
-//		}
-
-
-
-	this->currentStep++;
-	int stepValue = this->currentStep + 4;
-	for (int i = 0; i < this->subgridValues.getSize(); i++)
-	{
-		if( getSubgridValue(i) == -INT_MAX )
-			setSubgridValue(i, stepValue);
-	}
-
-	cout << "Grid synchronized at step " << (this->currentStep - 1 ) <<std::endl;
-
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getOwner(int i) const
-{
-
-	int j = i % (this->gridCols*this->gridRows*this->n*this->n);
-
-	return ( (i / (this->gridCols*this->gridRows*this->n*this->n*this->n))*this->gridCols*this->gridRows
-			+ (j / (this->gridCols*this->n*this->n))*this->gridCols
-			+ (j % (this->gridCols*this->n))/this->n);
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgridValue( int i ) const
-{
-	return this->subgridValues[i];
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::setSubgridValue(int i, int value)
-{
-	this->subgridValues[i] = value;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getBoundaryCondition( int i ) const
-{
-	return this->boundaryConditions[i];
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::setBoundaryCondition(int i, int value)
-{
-	this->boundaryConditions[i] = value;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::stretchGrid()
-{
-	cout << "Stretching grid..." <<std::endl;
-
-
-	this->gridCols = ceil( ((double)(this->mesh.getDimensions().x()-1)) / ((double)(this->n-1)) );
-	this->gridRows = ceil( ((double)(this->mesh.getDimensions().y()-1)) / ((double)(this->n-1)) );
-	this->gridLevels = ceil( ((double)(this->mesh.getDimensions().z()-1)) / ((double)(this->n-1)) );
-
-	//this->gridCols = (this->mesh.getDimensions().x()-1) / (this->n-1) ;
-	//this->gridRows = (this->mesh.getDimensions().y()-1) / (this->n-1) ;
-
-	cout << "Setting gridCols to " << this->gridCols << "." <<std::endl;
-	cout << "Setting gridRows to " << this->gridRows << "." <<std::endl;
-	cout << "Setting gridLevels to " << this->gridLevels << "." <<std::endl;
-
-	this->subgridValues.setSize(this->gridCols*this->gridRows*this->gridLevels);
-	this->subgridValues.setValue(0);
-	this->boundaryConditions.setSize(this->gridCols*this->gridRows*this->gridLevels);
-	this->boundaryConditions.setValue(0);
-	this->calculationsCount.setSize(this->gridCols*this->gridRows*this->gridLevels);
-	this->calculationsCount.setValue(0);
-
-	for(int i = 0; i < this->subgridValues.getSize(); i++ )
-	{
-		this->subgridValues[i] = INT_MAX;
-		this->boundaryConditions[i] = 0;
-	}
-
-	int levelSize = this->n*this->n*this->gridCols*this->gridRows;
-	int stretchedSize = this->n*levelSize*this->gridLevels;
-
-	if(!this->work_u.setSize(stretchedSize))
-		cerr << "Could not allocate memory for stretched grid." <<std::endl;
-	if(!this->unusedCell.setSize(stretchedSize))
-		cerr << "Could not allocate memory for supporting stretched grid." <<std::endl;
-	int idealStretch =this->mesh.getDimensions().x() + (this->mesh.getDimensions().x()-2)/(this->n-1);
-	cout << idealStretch <<std::endl;
-
-
-
-
-	for(int i = 0; i < levelSize; i++)
-	{
-		int diff =(this->n*this->gridCols) - idealStretch ;
-
-		int k = i/this->n - i/(this->n*this->gridCols) + this->mesh.getDimensions().x()*(i/(this->n*this->n*this->gridCols)) + (i/(this->n*this->gridCols))*diff;
-
-		if(i%(this->n*this->gridCols) - idealStretch  >= 0)
-		{
-			k+= i%(this->n*this->gridCols) - idealStretch +1 ;
-		}
-
-		if(i/(this->n*this->gridCols) - idealStretch + 1  > 0)
-		{
-			k+= (i/(this->n*this->gridCols) - idealStretch +1 )* this->mesh.getDimensions().x() ;
-		}
-
-		for( int j = 0; j<this->n*this->gridLevels; j++)
-		{
-			this->unusedCell[i+j*levelSize] = 1;
-			int l = j/this->n;
-
-			if(j - idealStretch  >= 0)
-			{
-				l+= j - idealStretch + 1;
-			}
-
-			this->work_u[i+j*levelSize] = this->u0[i+(j-l)*mesh.getDimensions().x()*mesh.getDimensions().y()-k];
-		}
-
-	}
-
-
-
-	cout << "Grid stretched." <<std::endl;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::contractGrid()
-{
-	cout << "Contracting grid..." <<std::endl;
-	int levelSize = this->n*this->n*this->gridCols*this->gridRows;
-	int stretchedSize = this->n*levelSize*this->gridLevels;
-
-	int idealStretch =this->mesh.getDimensions().x() + (this->mesh.getDimensions().x()-2)/(this->n-1);
-	cout << idealStretch <<std::endl;
-
-
-	for(int i = 0; i < levelSize; i++)
-	{
-		int diff =(this->n*this->gridCols) - idealStretch ;
-		int k = i/this->n - i/(this->n*this->gridCols) + this->mesh.getDimensions().x()*(i/(this->n*this->n*this->gridCols)) + (i/(this->n*this->gridCols))*diff;
-
-		if((i%(this->n*this->gridCols) - idealStretch  < 0) && (i/(this->n*this->gridCols) - idealStretch + 1  <= 0) )
-		{
-			for( int j = 0; j<this->n*this->gridLevels; j++)
-			{
-				int l = j/this->n;
-				if(j - idealStretch  < 0)
-					this->u0[i+(j-l)*mesh.getDimensions().x()*mesh.getDimensions().y()-k] = this->work_u[i+j*levelSize];
-			}
-		}
-
-	}
-
-	cout << "Grid contracted" <<std::endl;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-typename tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::VectorType
-tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgrid( const int i ) const
-{
-
-	VectorType u;
-	u.setSize(this->n*this->n*this->n);
-
-	int idx, idy, idz;
-	idz = i / (gridRows*this->gridCols);
-	idy = (i % (this->gridRows*this->gridCols)) / this->gridCols;
-	idx = i %  (this->gridCols);
-
-	for( int j = 0; j < this->n; j++)
-	{
-	//	int index = (i / this->gridCols)*this->n*this->n*this->gridCols + (i % this->gridCols)*this->n + (j/this->n)*this->n*this->gridCols + (j % this->n);
-		for( int k = 0; k < this->n; k++)
-		{
-			for( int l = 0; l < this->n; l++)
-			{
-				int index = (idz*this->n + l) * this->n*this->n*this->gridCols*this->gridRows
-						 + (idy) * this->n*this->n*this->gridCols
-						 + (idx) * this->n
-						 + k * this->n*this->gridCols
-						 + j;
-
-				u[j + k*this->n  + l*this->n*this->n] = this->work_u[ index ];
-			}
-		}
-	}
-	return u;
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::insertSubgrid( VectorType u, const int i )
-{
-	int idx, idy, idz;
-	idz = i / (this->gridRows*this->gridCols);
-	idy = (i % (this->gridRows*this->gridCols)) / this->gridCols;
-	idx = i %  (this->gridCols);
-
-	for( int j = 0; j < this->n; j++)
-	{
-	//	int index = (i / this->gridCols)*this->n*this->n*this->gridCols + (i % this->gridCols)*this->n + (j/this->n)*this->n*this->gridCols + (j % this->n);
-		for( int k = 0; k < this->n; k++)
-		{
-			for( int l = 0; l < this->n; l++)
-			{
-
-				int index = (idz*this->n + l) * this->n*this->n*this->gridCols*this->gridRows
-						 + (idy) * this->n*this->n*this->gridCols
-						 + (idx) * this->n
-						 + k * this->n*this->gridCols
-						 + j;
-
-				//OMP LOCK index
-//				cout<< idx << " " << idy << " " << idz << " " << j << " " << k << " " << l << " " << idz << " " << unusedCell.getSize() << " " << u.getSize() << " " << index <<endl;
-				if( (fabs(this->work_u[index]) > fabs(u[j + k*this->n  + l*this->n*this->n])) || (this->unusedCell[index] == 1) )
-				{
-					this->work_u[index] = u[j + k*this->n  + l*this->n*this->n];
-					this->unusedCell[index] = 0;
-				}
-				//OMP UNLOCK index
-			}
-		}
-	}
-}
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-typename tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::VectorType
-tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::runSubgrid( int boundaryCondition, VectorType u, int subGridID)
-{
-
-	VectorType fu;
-
-	fu.setLike(u);
-	fu.setValue( 0.0 );
-
-
-	bool tmp = false;
-	for(int i = 0; i < u.getSize(); i++)
-	{
-		if(u[0]*u[i] <= 0.0)
-			tmp=true;
-	}
-	int idx,idy,idz;
-	idz = subGridID / (this->gridRows*this->gridCols);
-	idy = (subGridID % (this->gridRows*this->gridCols)) / this->gridCols;
-	idx = subGridID %  (this->gridCols);
-	int centerGID = (this->n*idy + (this->n>>1) )*(this->n*this->gridCols) + this->n*idx + (this->n>>1)
-			      + ((this->n>>1)+this->n*idz)*this->n*this->n*this->gridRows*this->gridCols;
-	if(this->unusedCell[centerGID] == 0 || boundaryCondition == 0)
-		tmp = true;
-	//if(this->currentStep + 3 < getSubgridValue(subGridID))
-		//tmp = true;
-
-
-	double value = sign(u[0]) * u.absMax();
-
-	if(tmp)
-	{}
-
-
-	//north - 1, east - 2, west - 4, south - 8
-	else if(boundaryCondition == 4)
-	{
-		for(int i = 0; i < this->n; i++)
-			for(int j = 1;j < this->n; j++)
-				for(int k = 0;k < this->n; k++)
-				//if(fabs(u[i*this->n + j]) <  fabs(u[i*this->n]))
-				u[k*this->n*this->n + i*this->n + j] = value;// u[i*this->n];
-	}
-	else if(boundaryCondition == 2)
-	{
-		for(int i = 0; i < this->n; i++)
-			for(int j =0 ;j < this->n -1; j++)
-				for(int k = 0;k < this->n; k++)
-				//if(fabs(u[i*this->n + j]) < fabs(u[(i+1)*this->n - 1]))
-				u[k*this->n*this->n + i*this->n + j] = value;// u[(i+1)*this->n - 1];
-	}
-	else if(boundaryCondition == 1)
-	{
-		for(int j = 0; j < this->n; j++)
-			for(int i = 0;i < this->n - 1; i++)
-				for(int k = 0;k < this->n; k++)
-				//if(fabs(u[i*this->n + j]) < fabs(u[j + this->n*(this->n - 1)]))
-				u[k*this->n*this->n + i*this->n + j] = value;// u[j + this->n*(this->n - 1)];
-	}
-	else if(boundaryCondition == 8)
-	{
-		for(int j = 0; j < this->n; j++)
-			for(int i = 1;i < this->n; i++)
-				for(int k = 0;k < this->n; k++)
-				//if(fabs(u[i*this->n + j]) < fabs(u[j]))
-				u[k*this->n*this->n + i*this->n + j] = value;// u[j];
-	}
-	else if(boundaryCondition == 16)
-	{
-		for(int j = 0; j < this->n; j++)
-			for(int i = 0;i < this->n ; i++)
-				for(int k = 0;k < this->n-1; k++)
-				//if(fabs(u[i*this->n + j]) < fabs(u[j + this->n*(this->n - 1)]))
-				u[k*this->n*this->n + i*this->n + j] = value;// u[j + this->n*(this->n - 1)];
-	}
-	else if(boundaryCondition == 32)
-	{
-		for(int j = 0; j < this->n; j++)
-			for(int i = 0;i < this->n; i++)
-				for(int k = 1;k < this->n; k++)
-				//if(fabs(u[i*this->n + j]) < fabs(u[j]))
-				u[k*this->n*this->n + i*this->n + j] = value;// u[j];
-	}
-
-
-   double time = 0.0;
-   double currentTau = this->tau0;
-   double finalTime = this->stopTime;// + 3.0*(u.max() - u.min());
-   if(boundaryCondition == 0) finalTime *= 2.0;
-   if( time + currentTau > finalTime ) currentTau = finalTime - time;
-
-   double maxResidue( 1.0 );
-   //double lastResidue( 10000.0 );
-   tnlGridEntity<MeshType, 3, tnlGridEntityNoStencilStorage > Entity(subMesh);
-   tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage >,3> neighborEntities(Entity);
-   while( time < finalTime /*|| maxResidue > subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*/)
-   {
-      /****
-       * Compute the RHS
-       */
-
-      for( int i = 0; i < fu.getSize(); i ++ )
-      {
-//    	 std::cout << "i: " << i << ", time: " << time <<endl;
-    	  Containers::StaticVector<3,int> coords(i % subMesh.getDimensions().x(),
-    	  								(i % (subMesh.getDimensions().x()*subMesh.getDimensions().y())) / subMesh.getDimensions().x(),
-    	  								i / (subMesh.getDimensions().x()*subMesh.getDimensions().y()));
-//    	  	cout << "b " << i << " " << i % subMesh.getDimensions().x() << " " << (i % (subMesh.getDimensions().x()*subMesh.getDimensions().y())) << " " << (i % subMesh.getDimensions().x()*subMesh.getDimensions().y()) / subMesh.getDimensions().x() << " " << subMesh.getDimensions().x()*subMesh.getDimensions().y() << " " <<endl;
-			Entity.setCoordinates(coords);
-//			cout <<"c" << coords <<std::endl;
-			Entity.refresh();
-//			cout << "d" <<endl;
-			neighborEntities.refresh(subMesh,Entity.getIndex());
-//			cout << "e" <<endl;
-    	  fu[ i ] = schemeHost.getValue( this->subMesh, i, coords,u, time, boundaryCondition, neighborEntities );
-//    	 std::cout << "f" <<endl;
-      }
-      maxResidue = fu. absMax();
-
-
-      if( this -> cflCondition * maxResidue != 0.0)
-    	  currentTau =  this -> cflCondition / maxResidue;
-
-     /* if (maxResidue < 0.05)
-    	 std::cout << "Max < 0.05" <<std::endl;*/
-      if(currentTau > 0.5 * this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >())
-    	  currentTau = 0.5 * this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >();
-      /*if(maxResidue > lastResidue)
-    	  currentTau *=(1.0/10.0);*/
-
-
-      if( time + currentTau > finalTime ) currentTau = finalTime - time;
-//      for( int i = 0; i < fu.getSize(); i ++ )
-//      {
-//    	  //cout << "Too big RHS! i = " << i << ", fu = " << fu[i] << ", u = " << u[i] <<std::endl;
-//    	  if((u[i]+currentTau * fu[ i ])*u[i] < 0.0 && fu[i] != 0.0 && u[i] != 0.0 )
-//    		  currentTau = fabs(u[i]/(2.0*fu[i]));
-//
-//      }
-
-
-      for( int i = 0; i < fu.getSize(); i ++ )
-      {
-    	  double add = u[i] + currentTau * fu[ i ];
-    	  //if( fabs(u[i]) < fabs(add) or (this->subgridValues[subGridID] == this->currentStep +4) )
-    		  u[ i ] = add;
-      }
-      time += currentTau;
-
-      //cout << '\r' << flush;
-     //cout << maxResidue << "   " << currentTau << " @ " << time << flush;
-     //lastResidue = maxResidue;
-   }
-   //cout << "Time: " << time << ", Res: " << maxResidue <<endl;
-	/*if (u.max() > 0.0)
-		this->stopTime /=(double) this->gridCols;*/
-
-//	VectorType solution;
-//	solution.setLike(u);
-//    for( int i = 0; i < u.getSize(); i ++ )
-//  	{
-//    	solution[i]=u[i];
-//   	}
-//	return solution;
-	return u;
-}
-
-
-#ifdef HAVE_CUDA
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgridCUDA3D( const int i ,tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
-{
-	//int j = threadIdx.x + threadIdx.y * blockDim.x;
-//	int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
-//			 + (blockIdx.y) * this->n*this->n*this->gridCols
-//             + (blockIdx.x) * this->n
-//             + threadIdx.y * this->n*this->gridCols
-//             + threadIdx.x;
-
-
-	int index =  blockDim.x*blockIdx.x + threadIdx.x +
-			  (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x +
-			  (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
-
-	//printf("i= %d,j= %d,th= %d\n",i,j,th);
-	*a = caller->work_u_cuda[index];
-	//printf("Hi %f \n", *a);
-	//return ret;
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::updateSubgridCUDA3D( const int i ,tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
-{
-//	int j = threadIdx.x + threadIdx.y * blockDim.x;
-//	int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
-//			 + (blockIdx.y) * this->n*this->n*this->gridCols
-//             + (blockIdx.x) * this->n
-//             + threadIdx.y * this->n*this->gridCols
-//             + threadIdx.x;
-
-	int index =  blockDim.x*blockIdx.x + threadIdx.x +
-			  (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x +
-			  (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
-
-	if( (fabs(caller->work_u_cuda[index]) > fabs(*a)) || (caller->unusedCell_cuda[index] == 1) )
-	{
-		caller->work_u_cuda[index] = *a;
-		caller->unusedCell_cuda[index] = 0;
-
-	}
-
-	*a = caller->work_u_cuda[index];
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::insertSubgridCUDA3D( double u, const int i )
-{
-
-
-//	int j = threadIdx.x + threadIdx.y * blockDim.x;
-	//printf("j = %d, u = %f\n", j,u);
-
-//		int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
-//				 + (blockIdx.y) * this->n*this->n*this->gridCols
-//	             + (blockIdx.x) * this->n
-//	             + threadIdx.y * this->n*this->gridCols
-//	             + threadIdx.x;
-
-		int index =  blockDim.x*blockIdx.x + threadIdx.x +
-				  (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x +
-				  (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
-
-		//printf("i= %d,j= %d,index= %d\n",i,j,index);
-		if( (fabs(this->work_u_cuda[index]) > fabs(u)) || (this->unusedCell_cuda[index] == 1) )
-		{
-			this->work_u_cuda[index] = u;
-			this->unusedCell_cuda[index] = 0;
-
-		}
-
-
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::runSubgridCUDA3D( int boundaryCondition, double* u, int subGridID)
-{
-
-	__shared__ int tmp;
-	__shared__ double value;
-	//double tmpRes = 0.0;
-	volatile double* sharedTau = &u[blockDim.x*blockDim.y*blockDim.z];
-//	volatile double* absVal = &u[2*blockDim.x*blockDim.y*blockDim.z];
-	int i = threadIdx.x;
-	int j = threadIdx.y;
-	int k = threadIdx.z;
-	int l = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z*blockDim.x*blockDim.y;
-	bool computeFU = !((i == 0 && (boundaryCondition & 4)) or
-			 (i == blockDim.x - 1 && (boundaryCondition & 2)) or
-			 (j == 0 && (boundaryCondition & 8)) or
-			 (j == blockDim.y - 1  && (boundaryCondition & 1))or
-			 (k == 0 && (boundaryCondition & 32)) or
-			 (k == blockDim.z - 1  && (boundaryCondition & 16)));
-
-	if(l == 0)
-	{
-		tmp = 0;
-		int centerGID = (blockDim.y*blockIdx.y + (blockDim.y>>1) )*(blockDim.x*gridDim.x) + blockDim.x*blockIdx.x + (blockDim.x>>1)
-				      + ((blockDim.z>>1)+blockDim.z*blockIdx.z)*blockDim.x*blockDim.y*gridDim.x*gridDim.y;
-		if(this->unusedCell_cuda[centerGID] == 0 || boundaryCondition == 0)
-			tmp = 1;
-	}
-	__syncthreads();
-
-
-	__syncthreads();
-	if(tmp !=1)
-	{
-//		if(computeFU)
-//			absVal[l]=0.0;
-//		else
-//			absVal[l] = fabs(u[l]);
-//
-//		__syncthreads();
-//
-//	      if((blockDim.x == 16) && (l < 128))		absVal[l] = Max(absVal[l],absVal[l+128]);
-//	      __syncthreads();
-//	      if((blockDim.x == 16) && (l < 64))		absVal[l] = Max(absVal[l],absVal[l+64]);
-//	      __syncthreads();
-//	      if(l < 32)    							absVal[l] = Max(absVal[l],absVal[l+32]);
-//	      if(l < 16)								absVal[l] = Max(absVal[l],absVal[l+16]);
-//	      if(l < 8)									absVal[l] = Max(absVal[l],absVal[l+8]);
-//	      if(l < 4)									absVal[l] = Max(absVal[l],absVal[l+4]);
-//	      if(l < 2)									absVal[l] = Max(absVal[l],absVal[l+2]);
-//	      if(l < 1)									value   = sign(u[0])*Max(absVal[l],absVal[l+1]);
-//		__syncthreads();
-//
-//		if(computeFU)
-//			u[l] = value;
-		if(computeFU)
-		{
-			tnlGridEntity<MeshType, 3, tnlGridEntityNoStencilStorage > Ent(subMesh);
-			if(boundaryCondition == 4)
-			{
-				Ent.setCoordinates(Containers::StaticVector<3,int>(0,j,k));
-			   	Ent.refresh();
-				u[l] = u[Ent.getIndex()];// + sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.x) ;//+  2*sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.x+this->n);
-			}
-			else if(boundaryCondition == 2)
-			{
-				Ent.setCoordinates(Containers::StaticVector<3,int>(blockDim.x - 1,j,k));
-			   	Ent.refresh();
-				u[l] = u[Ent.getIndex()];// + sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(this->n - 1 - threadIdx.x);//+ 2*sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(blockDim.x - threadIdx.x - 1+this->n);
-			}
-			else if(boundaryCondition == 8)
-			{
-				Ent.setCoordinates(Containers::StaticVector<3,int>(i,0,k));
-			   	Ent.refresh();
-				u[l] = u[Ent.getIndex()];// + sign(u[0])*this->subMesh.template getSpaceStepsProducts< 0, 1, 0 >()*(threadIdx.y) ;//+ 2*sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(threadIdx.y+this->n);
-			}
-			else if(boundaryCondition == 1)
-			{
-				Ent.setCoordinates(Containers::StaticVector<3,int>(i,blockDim.y - 1,k));
-			   	Ent.refresh();
-				u[l] = u[Ent.getIndex()];// + sign(u[0])*this->subMesh.template getSpaceStepsProducts< 0, 1, 0 >()*(this->n - 1 - threadIdx.y) ;//+ sign(u[0])*this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >()*(blockDim.y - threadIdx.y  - 1 +this->n);
-			}
-			else if(boundaryCondition == 32)
-			{
-				Ent.setCoordinates(Containers::StaticVector<3,int>(i,j,0));
-			   	Ent.refresh();
-				u[l] = u[Ent.getIndex()];// + sign(u[0])*this->subMesh.template getSpaceStepsProducts< 0, 0, 1 >()*(threadIdx.z);
-			}
-			else if(boundaryCondition == 16)
-			{
-				Ent.setCoordinates(Containers::StaticVector<3,int>(i,j,blockDim.z - 1));
-			   	Ent.refresh();
-				u[l] = u[Ent.getIndex()];// + sign(u[0])*this->subMesh.template getSpaceStepsProducts< 0, 0, 1 >()*(this->n - 1 - threadIdx.z) ;
-			}
-		}
-	}
-
-   double time = 0.0;
-   __shared__ double currentTau;
-   double cfl = this->cflCondition;
-   double fu = 0.0;
-//   if(threadIdx.x * threadIdx.y * threadIdx.z == 0)
-//   {
-//	   currentTau = this->tau0;
-//   }
-   double finalTime = this->stopTime;
-   __syncthreads();
-   if( boundaryCondition == 0 ) finalTime *= 2.0;
-
-   tnlGridEntity<MeshType, 3, tnlGridEntityNoStencilStorage > Entity(subMesh);
-   tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage >,3> neighborEntities(Entity);
-   Entity.setCoordinates(Containers::StaticVector<3,int>(i,j,k));
-   Entity.refresh();
-   neighborEntities.refresh(subMesh,Entity.getIndex());
-
-
-   while( time < finalTime )
-   {
-	  sharedTau[l]=finalTime;
-
-	  if(computeFU)
-	  {
-		  fu = schemeHost.getValueDev( this->subMesh, l, Containers::StaticVector<3,int>(i,j,k), u, time, boundaryCondition, neighborEntities);
-		  if(abs(fu) > 0.0)
-			  sharedTau[l]=abs(cfl/fu);
-	  }
-
-      if(l == 0)
-      {
-    	  if(sharedTau[0] > 0.5 * this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >())	sharedTau[0] = 0.5 * this->subMesh.template getSpaceStepsProducts< 1, 0, 0 >();
-      }
-      else if(l == blockDim.x*blockDim.y*blockDim.z - 1)
-      {
-    	  if( time + sharedTau[l] > finalTime )		sharedTau[l] = finalTime - time;
-      }
-
-      __syncthreads();
-      if(l < 256)								sharedTau[l] = Min(sharedTau[l],sharedTau[l+256]);
-      __syncthreads();
-      if(l < 128)								sharedTau[l] = Min(sharedTau[l],sharedTau[l+128]);
-      __syncthreads();
-      if(l < 64)								sharedTau[l] = Min(sharedTau[l],sharedTau[l+64]);
-      __syncthreads();
-      if(l < 32)    							sharedTau[l] = Min(sharedTau[l],sharedTau[l+32]);
-      __syncthreads();
-      if(l < 16)								sharedTau[l] = Min(sharedTau[l],sharedTau[l+16]);
-      if(l < 8)									sharedTau[l] = Min(sharedTau[l],sharedTau[l+8]);
-      if(l < 4)									sharedTau[l] = Min(sharedTau[l],sharedTau[l+4]);
-      if(l < 2)									sharedTau[l] = Min(sharedTau[l],sharedTau[l+2]);
-      if(l < 1)									currentTau   = Min(sharedTau[l],sharedTau[l+1]);
-      __syncthreads();
-
-//	if(abs(fu) < 10000.0)
-//		printf("bla");
-      if(computeFU)
-    	  u[l] += currentTau * fu;
-      time += currentTau;
-      __syncthreads();
-   }
-
-
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getOwnerCUDA3D(int i) const
-{
-	int j = i % (this->gridCols*this->gridRows*this->n*this->n);
-
-	return ( (i / (this->gridCols*this->gridRows*this->n*this->n))*this->gridCols*this->gridRows
-			+ (j / (this->gridCols*this->n*this->n))*this->gridCols
-			+ (j % (this->gridCols*this->n))/this->n);
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgridValueCUDA3D( int i ) const
-{
-	return this->subgridValues_cuda[i];
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::setSubgridValueCUDA3D(int i, int value)
-{
-	this->subgridValues_cuda[i] = value;
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getBoundaryConditionCUDA3D( int i ) const
-{
-	return this->boundaryConditions_cuda[i];
-}
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__device__
-void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::setBoundaryConditionCUDA3D(int i, int value)
-{
-	this->boundaryConditions_cuda[i] = value;
-}
-
-
-
-//north - 1, east - 2, west - 4, south - 8, up -16, down - 32
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__
-void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::*/synchronizeCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now
-{
-
-	__shared__ int boundary[6]; // north,east,west,south
-	__shared__ int subgridValue;
-	__shared__ int newSubgridValue;
-
-
-	int gid =  blockDim.x*blockIdx.x + threadIdx.x +
-			  (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x +
-			  (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
-	double u = cudaSolver->work_u_cuda[gid];
-	double u_cmp;
-	int subgridValue_cmp=INT_MAX;
-	int boundary_index=0;
-
-
-	if(threadIdx.x+threadIdx.y+threadIdx.z == 0)
-	{
-		subgridValue = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y);
-		boundary[0] = 0;
-		boundary[1] = 0;
-		boundary[2] = 0;
-		boundary[3] = 0;
-		boundary[4] = 0;
-		boundary[5] = 0;
-		newSubgridValue = 0;
-//		printf("aaa z = %d, y = %d, x = %d\n",blockIdx.z,blockIdx.y,blockIdx.x);
-	}
-	__syncthreads();
-
-
-
-	if(		(threadIdx.x == 0 				/*				&& !(cudaSolver->currentStep & 1)*/) 		||
-			(threadIdx.y == 0 				 	/*			&& (cudaSolver->currentStep & 1)*/) 		||
-			(threadIdx.z == 0 	 /*	&& !(cudaSolver->currentStep & 1)*/) 		||
-			(threadIdx.x == blockDim.x - 1 	 /*	&& !(cudaSolver->currentStep & 1)*/) 		||
-			(threadIdx.y == blockDim.y - 1 	 /*	&& (cudaSolver->currentStep & 1)*/) 		||
-			(threadIdx.z == blockDim.z - 1 	 /*	&& (cudaSolver->currentStep & 1)*/) 		)
-	{
-		if(threadIdx.x == 0 && (blockIdx.x != 0)/* && !(cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid - 1];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y - 1);
-			boundary_index = 2;
-		}
-
-		if(threadIdx.x == blockDim.x - 1 && (blockIdx.x != gridDim.x - 1)/* && !(cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid + 1];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y + 1);
-			boundary_index = 1;
-		}
-
-		__threadfence();
-		if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
-		{
-			cudaSolver->unusedCell_cuda[gid] = 0;
-			atomicMax(&newSubgridValue, INT_MAX);
-			atomicMax(&boundary[boundary_index], 1);
-			cudaSolver->work_u_cuda[gid] = u_cmp;
-			u=u_cmp;
-		}
-		__threadfence();
-		if(threadIdx.y == 0 && (blockIdx.y != 0)/* && (cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y - 1)*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y);
-			boundary_index = 3;
-		}
-		if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1)/* && (cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y + 1)*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y);
-			boundary_index = 0;
-		}
-
-		__threadfence();
-		if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
-		{
-			cudaSolver->unusedCell_cuda[gid] = 0;
-			atomicMax(&newSubgridValue, INT_MAX);
-			atomicMax(&boundary[boundary_index], 1);
-			cudaSolver->work_u_cuda[gid] = u_cmp;
-			u=u_cmp;
-		}
-		__threadfence();
-
-		if(threadIdx.z == 0 && (blockIdx.z != 0)/* && (cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x*blockDim.y*gridDim.y];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + (blockIdx.z - 1)*gridDim.x*gridDim.y);
-			boundary_index = 5;
-		}
-		if(threadIdx.z == blockDim.z - 1 && (blockIdx.z != gridDim.z - 1)/* && (cudaSolver->currentStep & 1)*/)
-		{
-			u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x*blockDim.y*gridDim.y];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + (blockIdx.z + 1)*gridDim.x*gridDim.y);
-			boundary_index = 4;
-		}
-		__threadfence();
-
-		if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
-		{
-			cudaSolver->unusedCell_cuda[gid] = 0;
-			atomicMax(&newSubgridValue, INT_MAX);
-			atomicMax(&boundary[boundary_index], 1);
-			cudaSolver->work_u_cuda[gid] = u_cmp;
-		}
-		__threadfence();
-
-	}
-	__syncthreads();
-
-	if(threadIdx.x+threadIdx.y+threadIdx.z == 0)
-	{
-
-		if(subgridValue == INT_MAX && newSubgridValue != 0)
-			cudaSolver->setSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y, -INT_MAX);
-
-		cudaSolver->setBoundaryConditionCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y, 	1  * boundary[0] +
-																													2  * boundary[1] +
-																													4  * boundary[2] +
-																													8  * boundary[3] +
-																													16 * boundary[4] +
-																													32 * boundary[5] );
-		if(blockIdx.x+blockIdx.y+blockIdx.z == 0)
-		{
-			cudaSolver->currentStep = cudaSolver->currentStep + 1;
-			*(cudaSolver->runcuda) = 0;
-		}
-	}
-}
-
-
-
-template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__
-void synchronize2CUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver)
-{
-	int stepValue = cudaSolver->currentStep + 4;
-	if( cudaSolver->getSubgridValueCUDA3D(blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX )
-			cudaSolver->setSubgridValueCUDA3D(blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x, stepValue);
-
-	atomicMax((cudaSolver->runcuda),cudaSolver->getBoundaryConditionCUDA3D(blockIdx.z*gridDim.x*gridDim.y + blockIdx.y*gridDim.x + blockIdx.x));
-}
-
-
-
-
-
-
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-__global__
-void initCUDA3D( tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr , int* ptr2, int* ptr3)
-{
-
-
-	cudaSolver->work_u_cuda = ptr;//(double*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->n*cudaSolver->n*sizeof(double));
-	cudaSolver->unusedCell_cuda = ptr3;//(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->n*cudaSolver->n*sizeof(int));
-	cudaSolver->subgridValues_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->gridLevels*sizeof(int));
-	cudaSolver->boundaryConditions_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->gridLevels*sizeof(int));
-	cudaSolver->runcuda = ptr2;//(bool*)malloc(sizeof(bool));
-	*(cudaSolver->runcuda) = 1;
-	cudaSolver->currentStep = 1;
-	//cudaMemcpy(ptr,&(cudaSolver->work_u_cuda), sizeof(double*),cudaMemcpyDeviceToHost);
-	//ptr = cudaSolver->work_u_cuda;
-	printf("GPU memory allocated.\n");
-
-	for(int i = 0; i < cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->gridLevels; i++)
-	{
-		cudaSolver->subgridValues_cuda[i] = INT_MAX;
-		cudaSolver->boundaryConditions_cuda[i] = 0;
-	}
-
-	/*for(long int j = 0; j < cudaSolver->n*cudaSolver->n*cudaSolver->gridCols*cudaSolver->gridRows; j++)
-	{
-		printf("%d\n",j);
-		cudaSolver->unusedCell_cuda[ j] = 1;
-	}*/
-	printf("GPU memory initialized.\n");
-}
-
-
-
-
-//extern __shared__ double array[];
-template< typename SchemeHost, typename SchemeDevice, typename Device >
-__global__
-void initRunCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller)
-
-{
-
-
-	extern __shared__ double u[];
-
-	int i =  blockIdx.z *  gridDim.x *  gridDim.y +  blockIdx.y *  gridDim.x +  blockIdx.x;
-	int l = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
-
-	__shared__ int containsCurve;
-	if(l == 0)
-	{
-//		printf("z = %d, y = %d, x = %d\n",blockIdx.z,blockIdx.y,blockIdx.x);
-		containsCurve = 0;
-	}
-
-	caller->getSubgridCUDA3D(i,caller, &u[l]);
-	__syncthreads();
-	if(u[0] * u[l] <= 0.0)
-	{
-		atomicMax( &containsCurve, 1);
-	}
-
-	__syncthreads();
-	if(containsCurve == 1)
-	{
-		caller->runSubgridCUDA3D(0,u,i);
-		__syncthreads();
-//		caller->insertSubgridCUDA3D(u[l],i);
-		caller->updateSubgridCUDA3D(i,caller, &u[l]);
-
-		__syncthreads();
-		if(l == 0)
-			caller->setSubgridValueCUDA3D(i, 4);
-	}
-
-
-}
-
-
-
-
-
-template< typename SchemeHost, typename SchemeDevice, typename Device >
-__global__
-void runCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller)
-{
-	extern __shared__ double u[];
-	int i =  blockIdx.z *  gridDim.x *  gridDim.y +  blockIdx.y *  gridDim.x +  blockIdx.x;
-	int l = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
-	int bound = caller->getBoundaryConditionCUDA3D(i);
-
-	if(caller->getSubgridValueCUDA3D(i) != INT_MAX && bound != 0 && caller->getSubgridValueCUDA3D(i) > 0)
-	{
-		caller->getSubgridCUDA3D(i,caller, &u[l]);
-
-		//if(l == 0)
-			//printf("i = %d, bound = %d\n",i,caller->getSubgridValueCUDA3D(i));
-		if(caller->getSubgridValueCUDA3D(i) == caller->currentStep+4)
-		{
-			if(bound & 1)
-			{
-				caller->runSubgridCUDA3D(1,u,i);
-				caller->updateSubgridCUDA3D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound & 2 )
-			{
-				caller->runSubgridCUDA3D(2,u,i);
-				caller->updateSubgridCUDA3D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound & 4)
-			{
-				caller->runSubgridCUDA3D(4,u,i);
-				caller->updateSubgridCUDA3D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound & 8)
-			{
-				caller->runSubgridCUDA3D(8,u,i);
-				caller->updateSubgridCUDA3D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound & 16)
-			{
-				caller->runSubgridCUDA3D(16,u,i);
-				caller->updateSubgridCUDA3D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound & 32)
-			{
-				caller->runSubgridCUDA3D(32,u,i);
-				caller->updateSubgridCUDA3D(i,caller, &u[l]);
-				__syncthreads();
-			}
-
-		}
-		else
-		{
-			if( ((bound == 2)))
-			{
-				caller->runSubgridCUDA3D(2,u,i);
-				caller->updateSubgridCUDA3D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if( ((bound == 1) ))
-			{
-				caller->runSubgridCUDA3D(1,u,i);
-				caller->updateSubgridCUDA3D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if( ((bound == 8) ))
-			{
-				caller->runSubgridCUDA3D(8,u,i);
-				caller->updateSubgridCUDA3D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if((bound == 4))
-			{
-				caller->runSubgridCUDA3D(4,u,i);
-				caller->updateSubgridCUDA3D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound == 16)
-			{
-				caller->runSubgridCUDA3D(16,u,i);
-				caller->updateSubgridCUDA3D(i,caller, &u[l]);
-				__syncthreads();
-			}
-			if(bound == 32)
-			{
-				caller->runSubgridCUDA3D(32,u,i);
-				caller->updateSubgridCUDA3D(i,caller, &u[l]);
-				__syncthreads();
-			}
-		}
-																/*  1  2  4  8  16  32  */
-
-		if( ((bound & 19 )))									/*  1  1  0  0   1   0  */
-		{
-			caller->runSubgridCUDA3D(19,u,i);
-			caller->updateSubgridCUDA3D(i,caller, &u[l]);
-			__syncthreads();
-		}
-		if( ((bound & 21 )))									/*  1  0  1  0   1   0  */
-		{
-			caller->runSubgridCUDA3D(21,u,i);
-			caller->updateSubgridCUDA3D(i,caller, &u[l]);
-			__syncthreads();
-		}
-		if( ((bound & 26 )))									/*  0  1  0  1   1   0  */
-		{
-			caller->runSubgridCUDA3D(26,u,i);
-			caller->updateSubgridCUDA3D(i,caller, &u[l]);
-			__syncthreads();
-		}
-		if(   (bound & 28 ))									/*  0  0  1  1   1   0  */
-		{
-			caller->runSubgridCUDA3D(28,u,i);
-			caller->updateSubgridCUDA3D(i,caller, &u[l]);
-			__syncthreads();
-		}
-
-
-
-		if( ((bound & 35 )))									/*  1  0  1  0   0   1  */
-		{
-			caller->runSubgridCUDA3D(35,u,i);
-			caller->updateSubgridCUDA3D(i,caller, &u[l]);
-			__syncthreads();
-		}
-		if( ((bound & 37 )))									/*  1  0  1  0   0   1  */
-		{
-			caller->runSubgridCUDA3D(37,u,i);
-			caller->updateSubgridCUDA3D(i,caller, &u[l]);
-			__syncthreads();
-		}
-		if( ((bound & 42 )))									/*  0  1  0  1   0   1  */
-		{
-			caller->runSubgridCUDA3D(42,u,i);
-			caller->updateSubgridCUDA3D(i,caller, &u[l]);
-			__syncthreads();
-		}
-		if(   (bound & 44 ))									/*  0  0  1  1   0   1  */
-		{
-			caller->runSubgridCUDA3D(44,u,i);
-			caller->updateSubgridCUDA3D(i,caller, &u[l]);
-			__syncthreads();
-		}
-
-		if(l==0)
-		{
-			caller->setBoundaryConditionCUDA3D(i, 0);
-			caller->setSubgridValueCUDA3D(i, caller->getSubgridValueCUDA3D(i) - 1 );
-		}
-
-
-	}
-
-
-
-}
-
-#endif /*HAVE_CUDA*/
-
-#endif /* TNLPARALLELEIKONALSOLVER3D_IMPL_H_ */
diff --git a/src/TNL/Legacy/narrow-band/CMakeLists.txt b/src/TNL/Legacy/narrow-band/CMakeLists.txt
deleted file mode 100644
index 158cd2013..000000000
--- a/src/TNL/Legacy/narrow-band/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-set( tnl_narrow_band_SOURCES
-#     MainBuildConfig.h
-#     tnlNarrowBand2D_impl.h
-#     tnlNarrowBand.h
-#     narrowBandConfig.h 
-     main.cpp)
-
-
-IF(  BUILD_CUDA ) 
-	CUDA_ADD_EXECUTABLE(narrow-band main.cu)
-ELSE(  BUILD_CUDA )                
-	ADD_EXECUTABLE(narrow-band main.cpp)
-ENDIF( BUILD_CUDA )
-target_link_libraries (narrow-band tnl )
-
-
-INSTALL( TARGETS narrow-band
-         RUNTIME DESTINATION bin
-         PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
-        
-#INSTALL( FILES ${tnl_narrow_band_SOURCES}
-#         DESTINATION ${TNL_TARGET_DATA_DIRECTORY}/examples/narrow-band )
diff --git a/src/TNL/Legacy/narrow-band/MainBuildConfig.h b/src/TNL/Legacy/narrow-band/MainBuildConfig.h
deleted file mode 100644
index ed3d686eb..000000000
--- a/src/TNL/Legacy/narrow-band/MainBuildConfig.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/***************************************************************************
-                          MainBuildConfig.h  -  description
-                             -------------------
-    begin                : Jul 7, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef MAINBUILDCONFIG_H_
-#define MAINBUILDCONFIG_H_
-
-#include <solvers/tnlBuildConfigTags.h>
-
-class MainBuildConfig
-{
-   public:
-
-      static void print() {std::cerr << "MainBuildConfig" <<std::endl; }
-};
-
-/****
- * Turn off support for float and long double.
- */
-template<> struct tnlConfigTagReal< MainBuildConfig, float > { enum { enabled = false }; };
-template<> struct tnlConfigTagReal< MainBuildConfig, long double > { enum { enabled = false }; };
-
-/****
- * Turn off support for short int and long int indexing.
- */
-template<> struct tnlConfigTagIndex< MainBuildConfig, short int >{ enum { enabled = false }; };
-template<> struct tnlConfigTagIndex< MainBuildConfig, long int >{ enum { enabled = false }; };
-
-/****
- * Use of tnlGrid is enabled for allowed dimensions and Real, Device and Index types.
- */
-template< int Dimensions, typename Real, typename Device, typename Index >
-   struct tnlConfigTagMesh< MainBuildConfig, tnlGrid< Dimensions, Real, Device, Index > >
-      { enum { enabled = tnlConfigTagDimensions< MainBuildConfig, Dimensions >::enabled  &&
-                         tnlConfigTagReal< MainBuildConfig, Real >::enabled &&
-                         tnlConfigTagDevice< MainBuildConfig, Device >::enabled &&
-                         tnlConfigTagIndex< MainBuildConfig, Index >::enabled }; };
-
-/****
- * Please, chose your preferred time discretisation  here.
- */
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlExplicitTimeDiscretisationTag >{ enum { enabled = true }; };
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlSemiImplicitTimeDiscretisationTag >{ enum { enabled = false}; };
-template<> struct tnlConfigTagTimeDiscretisation< MainBuildConfig, tnlImplicitTimeDiscretisationTag >{ enum { enabled = false }; };
-
-/****
- * Only the Runge-Kutta-Merson solver is enabled by default.
- */
-template<> struct tnlConfigTagExplicitSolver< MainBuildConfig, tnlExplicitEulerSolverTag >{ enum { enabled = false }; };
-
-#endif /* MAINBUILDCONFIG_H_ */
diff --git a/src/TNL/Legacy/narrow-band/main.cpp b/src/TNL/Legacy/narrow-band/main.cpp
deleted file mode 100644
index 8849008ff..000000000
--- a/src/TNL/Legacy/narrow-band/main.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/***************************************************************************
-                          main.cpp  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "main.h"
diff --git a/src/TNL/Legacy/narrow-band/main.cu b/src/TNL/Legacy/narrow-band/main.cu
deleted file mode 100644
index 8849008ff..000000000
--- a/src/TNL/Legacy/narrow-band/main.cu
+++ /dev/null
@@ -1,17 +0,0 @@
-/***************************************************************************
-                          main.cpp  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "main.h"
diff --git a/src/TNL/Legacy/narrow-band/main.h b/src/TNL/Legacy/narrow-band/main.h
deleted file mode 100644
index 51dbdac37..000000000
--- a/src/TNL/Legacy/narrow-band/main.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/***************************************************************************
-                          main.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-
-#include "MainBuildConfig.h"
-	//for HOST versions:
-//#include "tnlNarrowBand.h"
-	//for DEVICE versions:
-#include "tnlNarrowBand_CUDA.h"
-#include "narrowBandConfig.h"
-#include <solvers/tnlBuildConfigTags.h>
-
-#include <mesh/tnlGrid.h>
-#include <core/tnlDevice.h>
-#include <time.h>
-#include <ctime>
-
-typedef MainBuildConfig BuildConfig;
-
-int main( int argc, char* argv[] )
-{
-	time_t start;
-	time_t stop;
-	time(&start);
-	std::clock_t start2= std::clock();
-   Config::ParameterContainer parameters;
-   tnlConfigDescription configDescription;
-   narrowBandConfig< BuildConfig >::configSetup( configDescription );
-
-   if( ! parseCommandLine( argc, argv, configDescription, parameters ) )
-      return false;
-
-   const int& dim = parameters.getParameter< int >( "dim" );
-
-   if(dim == 2)
-   {
-		tnlNarrowBand<tnlGrid<2,double,TNL::Devices::Host, int>, double, int> solver;
-		if(!solver.init(parameters))
-	   {
-			cerr << "Solver failed to initialize." <<std::endl;
-			return EXIT_FAILURE;
-	   }
-		TNL_CHECK_CUDA_DEVICE;
-	  std::cout << "-------------------------------------------------------------" <<std::endl;
-	  std::cout << "Starting solver..." <<std::endl;
-	   solver.run();
-   }
-//   else if(dim == 3)
-//   {
-//		tnlNarrowBand<tnlGrid<3,double,TNL::Devices::Host, int>, double, int> solver;
-//		if(!solver.init(parameters))
-//	   {
-//			cerr << "Solver failed to initialize." <<std::endl;
-//			return EXIT_FAILURE;
-//	   }
-//		TNL_CHECK_CUDA_DEVICE;
-//	  std::cout << "-------------------------------------------------------------" <<std::endl;
-//	  std::cout << "Starting solver..." <<std::endl;
-//	   solver.run();
-//   }
-   else
-   {
-	  std::cerr << "Unsupported number of dimensions: " << dim << "!" <<std::endl;
-	   return EXIT_FAILURE;
-   }
-
-
-   time(&stop);
-  std::cout << "Solver stopped..." <<std::endl;
-  std::cout <<std::endl;
-  std::cout << "Running time was: " << difftime(stop,start) << " .... " << (std::clock() - start2) / (double)(CLOCKS_PER_SEC) <<std::endl;
-   return EXIT_SUCCESS;
-}
-
-
diff --git a/src/TNL/Legacy/narrow-band/narrowBandConfig.h b/src/TNL/Legacy/narrow-band/narrowBandConfig.h
deleted file mode 100644
index bab58ceac..000000000
--- a/src/TNL/Legacy/narrow-band/narrowBandConfig.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/***************************************************************************
-                          narrowBandConfig.h  -  description
-                             -------------------
-    begin                : Oct 15, 2015
-    copyright            : (C) 2015 by Tomas Sobotik
-    email                :
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef NARROWBANDCONFIG_H_
-#define NARROWBANDCONFIG_H_
-
-#include <config/tnlConfigDescription.h>
-
-template< typename ConfigTag >
-class narrowBandConfig
-{
-   public:
-      static void configSetup( tnlConfigDescription& config )
-      {
-         config.addDelimiter( "Narrow Band Solver solver settings:" );
-         config.addEntry        < String > ( "problem-name", "This defines particular problem.", "fast-sweeping" );
-         config.addRequiredEntry        < String > ( "initial-condition", "Initial condition for solver");
-         config.addRequiredEntry        < int > ( "dim", "Dimension of problem.");
-         config.addRequiredEntry        < double > ( "tau", "Time step.");
-         config.addRequiredEntry        < double > ( "final-time", "Final time.");
-         config.addEntry       < String > ( "mesh", "Name of mesh.", "mesh.tnl" );
-         config.addEntry       < String > ( "exact-input", "Are the function values near the curve equal to the SDF? (yes/no)", "no" );
-      }
-};
-
-#endif /* NARROWBANDCONFIG_H_ */
diff --git a/src/TNL/Legacy/narrow-band/tnlNarrowBand.h b/src/TNL/Legacy/narrow-band/tnlNarrowBand.h
deleted file mode 100644
index 7d3d19bc0..000000000
--- a/src/TNL/Legacy/narrow-band/tnlNarrowBand.h
+++ /dev/null
@@ -1,186 +0,0 @@
-/***************************************************************************
-                          tnlNarrowBand.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLNARROWBAND_H_
-#define TNLNARROWBAND_H_
-
-#include <TNL/Config/ParameterContainer.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Containers/StaticVector.h>
-#include <functions/tnlMeshFunction.h>
-#include <TNL/Devices/Host.h>
-#include <mesh/tnlGrid.h>
-#include <mesh/grids/tnlGridEntity.h>
-#include <limits.h>
-#include <core/tnlDevice.h>
-#include <ctime>
-#ifdef HAVE_OPENMP
-#include <omp.h>
-#endif
-
-
-
-
-template< typename Mesh,
-		  typename Real,
-		  typename Index >
-class tnlNarrowBand
-{};
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-class tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >
-{
-
-public:
-	typedef Real RealType;
-	typedef Device DeviceType;
-	typedef Index IndexType;
-	typedef tnlGrid< 2, Real, Device, Index > MeshType;
-	typedef TNL::Containers::Vector< RealType, DeviceType, IndexType> DofVectorType;
-	typedef typename MeshType::CoordinatesType CoordinatesType;
-
-
-	tnlNarrowBand();
-
-	static String getType();
-	bool init( const Config::ParameterContainer& parameters );
-
-	bool initGrid();
-	bool run();
-
-	//for single core version use this implementation:
-	void updateValue(const Index i, const Index j);
-	//for parallel version use this one instead:
-//	void updateValue(const Index i, const Index j, DofVectorType* grid);
-
-
-	void setupSquare1000(Index i, Index j);
-	void setupSquare1100(Index i, Index j);
-	void setupSquare1010(Index i, Index j);
-	void setupSquare1001(Index i, Index j);
-	void setupSquare1110(Index i, Index j);
-	void setupSquare1101(Index i, Index j);
-	void setupSquare1011(Index i, Index j);
-	void setupSquare1111(Index i, Index j);
-	void setupSquare0000(Index i, Index j);
-	void setupSquare0100(Index i, Index j);
-	void setupSquare0010(Index i, Index j);
-	void setupSquare0001(Index i, Index j);
-	void setupSquare0110(Index i, Index j);
-	void setupSquare0101(Index i, Index j);
-	void setupSquare0011(Index i, Index j);
-	void setupSquare0111(Index i, Index j);
-
-	Real fabsMin(const Real x, const Real y);
-
-
-protected:
-
-	MeshType Mesh;
-
-	bool exactInput;
-
-	tnlMeshFunction<MeshType> dofVector, dofVector2;
-	DofVectorType data;
-
-	RealType h;
-
-	tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage > Entity;
-
-
-#ifdef HAVE_OPENMP
-//	omp_lock_t* gridLock;
-#endif
-
-
-};
-
-
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-class tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >
-{
-
-public:
-	typedef Real RealType;
-	typedef Device DeviceType;
-	typedef Index IndexType;
-	typedef tnlGrid< 3, Real, Device, Index > MeshType;
-	typedef TNL::Containers::Vector< RealType, DeviceType, IndexType> DofVectorType;
-	typedef typename MeshType::CoordinatesType CoordinatesType;
-
-	tnlNarrowBand();
-
-	static String getType();
-	bool init( const Config::ParameterContainer& parameters );
-
-	bool initGrid();
-	bool run();
-
-	//for single core version use this implementation:
-	void updateValue(const Index i, const Index j, const Index k);
-	//for parallel version use this one instead:
-//	void updateValue(const Index i, const Index j, DofVectorType* grid);
-
-	Real fabsMin(const Real x, const Real y);
-
-
-protected:
-
-	MeshType Mesh;
-
-	bool exactInput;
-
-
-	tnlMeshFunction<MeshType> dofVector, dofVector2;
-	DofVectorType data;
-
-	RealType h;
-
-	tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage > Entity;
-
-#ifdef HAVE_OPENMP
-//	omp_lock_t* gridLock;
-#endif
-
-
-};
-
-
-	//for single core version use this implementation:
-#include "tnlNarrowBand2D_impl.h"
-	//for parallel version use this one instead:
-// #include "tnlNarrowBand2D_openMP_impl.h"
-
-#include "tnlNarrowBand3D_impl.h"
-
-#endif /* TNLNARROWBAND_H_ */
diff --git a/src/TNL/Legacy/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h b/src/TNL/Legacy/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
deleted file mode 100644
index dff0b48c8..000000000
--- a/src/TNL/Legacy/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
+++ /dev/null
@@ -1,1317 +0,0 @@
-/***************************************************************************
-                          tnlNarrowBand2D_CUDA_v4_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLNARROWBAND2D_IMPL_H_
-#define TNLNARROWBAND2D_IMPL_H_
-
-#define NARROWBAND_SUBGRID_SIZE 32
-
-#include "tnlNarrowBand.h"
-
-#ifdef HAVE_CUDA
-__device__
-double fabsMin( double x, double y)
-{
-	double fx = abs(x);
-
-	if(Min(fx,abs(y)) == fx)
-		return x;
-	else
-		return y;
-}
-
-__device__
-double atomicFabsMin(double* address, double val)
-{
-	unsigned long long int* address_as_ull =
-						  (unsigned long long int*)address;
-	unsigned long long int old = *address_as_ull, assumed;
-	do {
-		assumed = old;
-			old = atomicCAS(address_as_ull, assumed,__double_as_longlong( fabsMin(__longlong_as_double(assumed),val) ));
-	} while (assumed != old);
-	return __longlong_as_double(old);
-}
-#endif
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-Real tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >:: positivePart(const Real arg) const
-{
-	if(arg > 0.0)
-		return arg;
-	return 0.0;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-Real  tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: negativePart(const Real arg) const
-{
-	if(arg < 0.0)
-		return -arg;
-	return 0.0;
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlNarrowBand< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: tnlNarrowBand()
-:dofVector(Mesh)
-{
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-	h = Mesh.template getSpaceStepsProducts< 1, 0 >();
-	//Entity.refresh();
-	counter = 0;
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-
-	tau = parameters.getParameter< double >( "tau" );
-
-	finalTime = parameters.getParameter< double >( "final-time" );
-
-	statusGridSize = ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE);
-#ifdef HAVE_CUDA
-
-	cudaMalloc(&(cudaDofVector), this->dofVector.getData().getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&(cudaDofVector2), this->dofVector.getData().getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector2, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&(cudaStatusVector),  statusGridSize*statusGridSize*sizeof(int));
-//	cudaMemcpy(cudaDofVector, this->dofVector.getData().getData(),  statusGridSize*statusGridSize* sizeof(int)), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&reinitialize, sizeof(int));
-
-
-	cudaMalloc(&(this->cudaSolver), sizeof(tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >));
-	cudaMemcpy(this->cudaSolver, this,sizeof(tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
-
-
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-#endif
-
-	int n = Mesh.getDimensions().x();
-
-	dim3 threadsPerBlock2(NARROWBAND_SUBGRID_SIZE, NARROWBAND_SUBGRID_SIZE);
-	dim3 numBlocks2(statusGridSize ,statusGridSize);
-	initSetupGridCUDA<<<numBlocks2,threadsPerBlock2>>>(this->cudaSolver);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	initSetupGrid2CUDA<<<numBlocks2,1>>>(this->cudaSolver);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-
-	/*dim3 threadsPerBlock(16, 16);
-	dim3 numBlocks(n/16 + 1 ,n/16 +1);*/
-	initCUDA<<<numBlocks2,threadsPerBlock2>>>(this->cudaSolver);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-
-	cout << "Solver initialized." <<std::endl;
-	return true;
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlockFS(1, 512);
-	dim3 numBlocksFS(4,1);
-	dim3 threadsPerBlockNB(NARROWBAND_SUBGRID_SIZE, NARROWBAND_SUBGRID_SIZE);
-	dim3 numBlocksNB(n/NARROWBAND_SUBGRID_SIZE + 1,n/NARROWBAND_SUBGRID_SIZE + 1);
-
-	double time = 0.0;
-	int reinit = 0;
-
-	cout << "Hi!" <<std::endl;
-	runCUDA<<<numBlocksFS,threadsPerBlockFS>>>(this->cudaSolver,0,0);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	cout << "Hi2!" <<std::endl;
-	while(time < finalTime)
-	{
-		if(tau+time > finalTime)
-			tau=finalTime-time;
-
-		runNarrowBandCUDA<<<numBlocksNB,threadsPerBlockNB>>>(this->cudaSolver,tau);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-
-		time += tau;
-
-
-		cudaMemcpy(&reinit, this->reinitialize, sizeof(int), cudaMemcpyDeviceToHost);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		if(reinit != 0 /*&& time != finalTime */)
-		{
-			cout << time <<std::endl;
-
-			initSetupGridCUDA<<<numBlocksNB,threadsPerBlockNB>>>(this->cudaSolver);
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			initSetupGrid2CUDA<<<numBlocksNB,1>>>(this->cudaSolver);
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			initCUDA<<<numBlocksNB,threadsPerBlockNB>>>(this->cudaSolver);
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			runCUDA<<<numBlocksFS,threadsPerBlockFS>>>(this->cudaSolver,0,0);
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-		}
-	}
-
-	//data.setLike(dofVector.getData());
-	//cudaMemcpy(data.getData(), cudaDofVector2, this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-	cudaMemcpy(dofVector.getData().getData(), cudaDofVector2, this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-	cudaDeviceSynchronize();
-	cudaFree(cudaDofVector);
-	cudaFree(cudaDofVector2);
-	cudaFree(cudaSolver);
-	//data.save("u-00001.tnl");
-	dofVector.save("u-00001.tnl");
-	cudaDeviceSynchronize();
-	return true;
-}
-
-
-
-
-#ifdef HAVE_CUDA
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
-{
-	//			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
-	//								8 - to the east of curve, 	16 - to the west of curve.
-	int subgridID = i/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * statusGridSize;
-	if(cudaStatusVector[subgridID] != 0 && i<Mesh.getDimensions().x() && j < Mesh.getDimensions().y())
-	{
-		tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-		Entity.setCoordinates(CoordinatesType(i,j));
-		Entity.refresh();
-		tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-		Real value = cudaDofVector2[Entity.getIndex()];
-		Real a,b, tmp;
-
-		if( i == 0 /*|| (i/NARROWBAND_SUBGRID_SIZE == 0 && !(cudaStatusVector[subgridID] & 9))*/ )
-			a = cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()];
-		else if( i == Mesh.getDimensions().x() - 1 /*|| (i/NARROWBAND_SUBGRID_SIZE == NARROWBAND_SUBGRID_SIZE - 1 && !(cudaStatusVector[subgridID] & 17))*/ )
-			a = cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0 >()];
-		else
-		{
-			a = fabsMin( cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0 >()],
-					 cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()] );
-		}
-
-		if( j == 0 /*|| (j/NARROWBAND_SUBGRID_SIZE == 0 && !(cudaStatusVector[subgridID] & 3))*/ )
-			b = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()];
-		else if( j == Mesh.getDimensions().y() - 1 /* || (j/NARROWBAND_SUBGRID_SIZE == NARROWBAND_SUBGRID_SIZE - 1 && !(cudaStatusVector[subgridID] & 5)) */)
-			b = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()];
-		else
-		{
-			b = fabsMin( cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()],
-					 cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()] );
-		}
-
-
-		if(abs(a-b) >= h)
-			tmp = fabsMin(a,b) + sign(value)*h;
-		else
-			tmp = 0.5 * (a + b + sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
-
-	//	cudaDofVector2[Entity.getIndex()]  = fabsMin(value, tmp);
-		atomicFabsMin(&(cudaDofVector2[Entity.getIndex()]), tmp);
-	}
-
-}
-
-
-__global__ void initCUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver)
-{
-
-
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-
-
-	if(solver->Mesh.getDimensions().x() > gx  && solver->Mesh.getDimensions().y() > gy)
-	{
-		solver->initGrid();
-	}
-
-
-}
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
-{
-	int i = threadIdx.x + blockDim.x*blockIdx.x;
-	int j = blockDim.y*blockIdx.y + threadIdx.y;
-
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-
-	int gid = Entity.getIndex();
-
-	if(abs(cudaDofVector2[gid]) > 1.5*h)
-		cudaDofVector2[gid] = INT_MAX*sign(cudaDofVector2[gid]);
-
-//	if (i >0 && j > 0 && i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y())
-//	{
-//		if(cudaDofVector2[gid]*cudaDofVector2[gid+1] <= 0 )
-//		{
-//			cudaDofVector2[gid] = sign(cudaDofVector2[gid])*0.5*h;
-//			cudaDofVector2[gid+1] = sign(cudaDofVector2[gid+1])*0.5*h;
-//		}
-//		if( cudaDofVector2[gid]*cudaDofVector2[gid+Mesh.getDimensions().x()] <= 0 )
-//		{
-//			cudaDofVector2[gid] = sign(cudaDofVector2[gid])*0.5*h;
-//			cudaDofVector2[gid+Mesh.getDimensions().x()] = sign(cudaDofVector2[gid+Mesh.getDimensions().x()])*0.5*h;
-//		}
-//
-//		if(cudaDofVector2[gid]*cudaDofVector2[gid-1] <= 0 )
-//		{
-//			cudaDofVector2[gid] = sign(cudaDofVector2[gid])*0.5*h;
-//			cudaDofVector2[gid-1] = sign(cudaDofVector2[gid-1])*0.5*h;
-//		}
-//		if( cudaDofVector2[gid]*cudaDofVector2[gid-Mesh.getDimensions().x()] <= 0 )
-//		{
-//			cudaDofVector2[gid] = sign(cudaDofVector2[gid])*0.5*h;
-//			cudaDofVector2[gid-Mesh.getDimensions().x()] = sign(cudaDofVector2[gid-Mesh.getDimensions().x()])*0.5*h;
-//		}
-//	}
-
-
-//
-
-
-
-
-
-
-//	if(i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y() )
-//	{
-//		if(cudaDofVector[Entity.getIndex()] > 0)
-//		{
-//			if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()] > 0)
-//			{
-//				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare1111(i,j);
-//					else
-//						setupSquare1110(i,j);
-//				}
-//				else
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare1101(i,j);
-//					else
-//						setupSquare1100(i,j);
-//				}
-//			}
-//			else
-//			{
-//				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare1011(i,j);
-//					else
-//						setupSquare1010(i,j);
-//				}
-//				else
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare1001(i,j);
-//					else
-//						setupSquare1000(i,j);
-//				}
-//			}
-//		}
-//		else
-//		{
-//			if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()] > 0)
-//			{
-//				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare0111(i,j);
-//					else
-//						setupSquare0110(i,j);
-//				}
-//				else
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare0101(i,j);
-//					else
-//						setupSquare0100(i,j);
-//				}
-//			}
-//			else
-//			{
-//				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare0011(i,j);
-//					else
-//						setupSquare0010(i,j);
-//				}
-//				else
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare0001(i,j);
-//					else
-//						setupSquare0000(i,j);
-//				}
-//			}
-//		}
-//
-//	}
-
-	return true;
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-Real tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = abs(x);
-	//Real fy = abs(y);
-
-	//Real tmpMin = Min(fx,abs(y));
-
-	if(Min(fx,abs(y)) == fx)
-		return x;
-	else
-		return y;
-
-
-}
-
-
-
-__global__ void runCUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int i)
-{
-
-
-	int gx = 0;
-	int gy = threadIdx.y;
-	//if(solver->Mesh.getDimensions().x() <= gx || solver->Mesh.getDimensions().y() <= gy)
-	//	return;
-	int n = solver->Mesh.getDimensions().x();
-	int blockCount = n/blockDim.y +1;
-	//int gid = solver->Mesh.getDimensions().x() * gy + gx;
-	//int max = solver->Mesh.getDimensions().x()*solver->Mesh.getDimensions().x();
-
-	//int id1 = gx+gy;
-	//int id2 = (solver->Mesh.getDimensions().x() - gx - 1) + gy;
-
-	if(blockIdx.x==0)
-	{
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-	else if(blockIdx.x==1)
-	{
-		gx=n-1;
-		gy=threadIdx.y;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-	else if(blockIdx.x==2)
-	{
-		gx=0;
-		gy=n-threadIdx.y-1;
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-	else if(blockIdx.x==3)
-	{
-		gx=n-1;
-		gy=n-threadIdx.y-1;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-
-}
-
-
-
-
-__global__ void initSetupGridCUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver)
-{
-	__shared__ double u0;
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-
-	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy)
-	{
-
-//		printf("Hello from  block = %d, thread = %d, x = %d, y = %d\n", blockIdx.x + gridDim.x*blockIdx.y,(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x, threadIdx.x, threadIdx.y);
-		if(threadIdx.x+threadIdx.y == 0)
-		{
-//			printf("Hello from  block = %d, thread = %d, x = %d, y = %d\n", blockIdx.x + gridDim.x*blockIdx.y,(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x, threadIdx.x, threadIdx.y);
-
-			if(blockIdx.x+blockIdx.y == 0)
-				*(solver->reinitialize) = 0;
-
-			solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y] = 0;
-
-			u0 = solver->cudaDofVector2[(blockDim.y*blockIdx.y + 0)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + 0];
-		}
-		__syncthreads();
-
-		double u = solver->cudaDofVector2[(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x];
-
-		if(u*u0 <=0.0)
-			atomicMax(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y]),1);
-	}
-//	if(threadIdx.x+threadIdx.y == 0)
-
-//	printf("Bye from  block = %d, thread = %d, x = %d, y = %d\n", blockIdx.x + gridDim.x*blockIdx.y,(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x, threadIdx.x, threadIdx.y);
-
-
-}
-
-
-
-// run this with one thread per block
-__global__ void initSetupGrid2CUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver)
-{
-//	printf("Hello\n");
-	if(solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y] == 1)
-	{
-//			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
-//								8 - to the east of curve, 	16 - to the west of curve.
-			if(blockIdx.x > 0)
-			{
-				atomicAdd(&(solver->cudaStatusVector[blockIdx.x - 1 + gridDim.x*blockIdx.y]), 16);
-			}
-
-			if(blockIdx.x < gridDim.x - 1)
-				atomicAdd(&(solver->cudaStatusVector[blockIdx.x + 1 + gridDim.x*blockIdx.y]), 8);
-
-			if(blockIdx.y > 0 )
-				atomicAdd(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*(blockIdx.y - 1)]), 4);
-
-			if(blockIdx.y < gridDim.y - 1)
-				atomicAdd(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*(blockIdx.y + 1)]), 2);
-	}
-
-
-}
-
-
-
-
-
-__global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, double tau)
-{
-	int gid = (blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x()+ threadIdx.x;
-	int i = threadIdx.x + blockIdx.x*blockDim.x;
-	int j = threadIdx.y + blockIdx.y*blockDim.y;
-
-//	if(i+j == 0)
-//		printf("Hello\n");
-
-	int blockID = blockIdx.x + blockIdx.y*gridDim.x; /*i/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE);*/
-
-	int status = solver->cudaStatusVector[blockID];
-
-	if(solver->Mesh.getDimensions().x() > i && solver->Mesh.getDimensions().y() > j)
-	{
-
-		if(status != 0)
-		{
-			tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(solver->Mesh);
-			Entity.setCoordinates(Containers::StaticVector<2,double>(i,j));
-			Entity.refresh();
-			tnlNeighborGridEntityGetter<tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-			double value = solver->cudaDofVector2[Entity.getIndex()];
-			double xf,xb,yf,yb, grad, fu, a,b;
-			a = b = 0.0;
-
-			if( i == 0 || (threadIdx.x == 0 && !(status & 9)) )
-			{
-				xb = value - solver->cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()];
-				xf = solver->cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()] - value;
-			}
-			else if( i == solver->Mesh.getDimensions().x() - 1 || (threadIdx.x == blockDim.x - 1 && !(status & 17)) )
-			{
-				xb = value - solver->cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0 >()];
-				xf = solver->cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0 >()] - value;
-			}
-			else
-			{
-				xb =  value - solver->cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0 >()];
-				xf = solver-> cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()] - value;
-			}
-
-			if( j == 0 || (threadIdx.y == 0 && !(status & 3)) )
-			{
-				yb = value - solver->cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()] ;
-				yf = solver->cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()] - value;
-			}
-			else if( j == solver->Mesh.getDimensions().y() - 1  || (threadIdx.y == blockDim.y - 1 && !(status & 5)) )
-			{
-				yb = value - solver->cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()];
-				yf = solver->cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()] - value;
-			}
-			else
-			{
-				yb = value - solver->cudaDofVector2[neighborEntities.template getEntityIndex< 0, -1 >()];
-				yf = solver-> cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()] - value;
-			}
-			__syncthreads();
-
-
-
-
-
-			   if(sign(value) >= 0.0)
-			   {
-				   xf = solver->negativePart(xf);
-
-				   xb = solver->positivePart(xb);
-
-				   yf = solver->negativePart(yf);
-
-				   yb = solver->positivePart(yb);
-
-			   }
-			   else
-			   {
-
-				   xb = solver->negativePart(xb);
-
-				   xf = solver->positivePart(xf);
-
-				   yb = solver->negativePart(yb);
-
-				   yf = solver->positivePart(yf);
-			   }
-
-
-			   if(xb > xf)
-				   a = xb*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
-			   else
-				   a = xf*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
-
-			   if(yb > yf)
-				   b = yb*solver->Mesh.template getSpaceStepsProducts< 0, -1 >();
-			   else
-				   b = yf*solver->Mesh.template getSpaceStepsProducts< 0, -1 >();
-
-
-
-//			grad = sqrt(0.5 * (xf*xf + xb*xb    +   yf*yf + yb*yb ) )*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
-
-			grad = sqrt(/*0.5 **/ (a*a    +   b*b ) );
-
-			fu = -1.0 * grad;
-
-			if((tau*fu+value)*value <=0 )
-			{
-				//			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
-				//								8 - to the east of curve, 	16 - to the west of curve.
-
-				if((threadIdx.x == 6 && !(status & 9)) && (blockIdx.x > 0) )
-					atomicMax(solver->reinitialize,1);
-				else if((threadIdx.x == blockDim.x - 7 && !(status & 17)) && (blockIdx.x < gridDim.x - 1) )
-					atomicMax(solver->reinitialize,1);
-				else if((threadIdx.y == 6 && !(status & 3)) && (blockIdx.y > 0) )
-					atomicMax(solver->reinitialize,1);
-				else if((threadIdx.y == blockDim.y - 7 && !(status & 5)) && (blockIdx.y < gridDim.y - 1) )
-					atomicMax(solver->reinitialize,1);
-			}
-
-			solver->cudaDofVector2[Entity.getIndex()]  += tau*fu;
-		}
-	}
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(INT_MAX,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-INT_MAX,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1110( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1101( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1011( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0111( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0001( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0010( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0100( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1000( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1100( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-al;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1010( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1001( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(cudaDofVector[Entity.getIndex()],cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0011( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-al;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0101( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0110( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(cudaDofVector[Entity.getIndex()],cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-}
-#endif
-
-
-
-
-#endif /* TNLNARROWBAND_IMPL_H_ */
diff --git a/src/TNL/Legacy/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h b/src/TNL/Legacy/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h
deleted file mode 100644
index c92810490..000000000
--- a/src/TNL/Legacy/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h
+++ /dev/null
@@ -1,1313 +0,0 @@
-/***************************************************************************
-                          tnlNarrowBand2D_CUDA_v4_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLNARROWBAND2D_IMPL_H_
-#define TNLNARROWBAND2D_IMPL_H_
-
-#define NARROWBAND_SUBGRID_SIZE 32
-
-#include "tnlNarrowBand.h"
-
-__device__
-double fabsMin( double x, double y)
-{
-	double fx = abs(x);
-
-	if(Min(fx,abs(y)) == fx)
-		return x;
-	else
-		return y;
-}
-
-__device__
-double atomicFabsMin(double* address, double val)
-{
-	unsigned long long int* address_as_ull =
-						  (unsigned long long int*)address;
-	unsigned long long int old = *address_as_ull, assumed;
-	do {
-		assumed = old;
-			old = atomicCAS(address_as_ull, assumed,__double_as_longlong( fabsMin(__longlong_as_double(assumed),val) ));
-	} while (assumed != old);
-	return __longlong_as_double(old);
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-Real tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >:: positivePart(const Real arg) const
-{
-	if(arg > 0.0)
-		return arg;
-	return 0.0;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-Real  tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: negativePart(const Real arg) const
-{
-	if(arg < 0.0)
-		return -arg;
-	return 0.0;
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlNarrowBand< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: tnlNarrowBand()
-:dofVector(Mesh)
-{
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-	h = Mesh.template getSpaceStepsProducts< 1, 0 >();
-	//Entity.refresh();
-	counter = 0;
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-
-	tau = parameters.getParameter< double >( "tau" );
-
-	finalTime = parameters.getParameter< double >( "final-time" );
-
-	statusGridSize = ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE);
-#ifdef HAVE_CUDA
-
-	cudaMalloc(&(cudaDofVector), this->dofVector.getData().getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&(cudaDofVector2), this->dofVector.getData().getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector2, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&(cudaStatusVector),  statusGridSize*statusGridSize*sizeof(int));
-//	cudaMemcpy(cudaDofVector, this->dofVector.getData().getData(),  statusGridSize*statusGridSize* sizeof(int)), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&reinitialize, sizeof(int));
-
-
-	cudaMalloc(&(this->cudaSolver), sizeof(tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >));
-	cudaMemcpy(this->cudaSolver, this,sizeof(tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
-
-
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-#endif
-
-	int n = Mesh.getDimensions().x();
-
-	dim3 threadsPerBlock2(NARROWBAND_SUBGRID_SIZE, NARROWBAND_SUBGRID_SIZE);
-	dim3 numBlocks2(statusGridSize ,statusGridSize);
-	initSetupGridCUDA<<<numBlocks2,threadsPerBlock2>>>(this->cudaSolver);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	initSetupGrid2CUDA<<<numBlocks2,1>>>(this->cudaSolver);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-
-	/*dim3 threadsPerBlock(16, 16);
-	dim3 numBlocks(n/16 + 1 ,n/16 +1);*/
-	initCUDA<<<numBlocks2,threadsPerBlock2>>>(this->cudaSolver);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-
-	cout << "Solver initialized." <<std::endl;
-	return true;
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlockFS(1, 512);
-	dim3 numBlocksFS(4,1);
-	dim3 threadsPerBlockNB(NARROWBAND_SUBGRID_SIZE, NARROWBAND_SUBGRID_SIZE);
-	dim3 numBlocksNB(n/NARROWBAND_SUBGRID_SIZE + 1,n/NARROWBAND_SUBGRID_SIZE + 1);
-
-	double time = 0.0;
-	int reinit = 0;
-
-	cout << "Hi!" <<std::endl;
-	runCUDA<<<numBlocksFS,threadsPerBlockFS>>>(this->cudaSolver,0,0);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	cout << "Hi2!" <<std::endl;
-	while(time < finalTime)
-	{
-		if(tau+time > finalTime)
-			tau=finalTime-time;
-
-		runNarrowBandCUDA<<<numBlocksNB,threadsPerBlockNB>>>(this->cudaSolver,tau);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-
-		time += tau;
-
-
-		cudaMemcpy(&reinit, this->reinitialize, sizeof(int), cudaMemcpyDeviceToHost);
-		cudaDeviceSynchronize();
-		TNL_CHECK_CUDA_DEVICE;
-		if(reinit != 0 /*&& time != finalTime */)
-		{
-			cout << time <<std::endl;
-
-			initSetupGridCUDA<<<numBlocksNB,threadsPerBlockNB>>>(this->cudaSolver);
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			initSetupGrid2CUDA<<<numBlocksNB,1>>>(this->cudaSolver);
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			initCUDA<<<numBlocksNB,threadsPerBlockNB>>>(this->cudaSolver);
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-			runCUDA<<<numBlocksFS,threadsPerBlockFS>>>(this->cudaSolver,0,0);
-			cudaDeviceSynchronize();
-			TNL_CHECK_CUDA_DEVICE;
-		}
-	}
-
-	//data.setLike(dofVector.getData());
-	//cudaMemcpy(data.getData(), cudaDofVector2, this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-	cudaMemcpy(dofVector.getData().getData(), cudaDofVector2, this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-	cudaDeviceSynchronize();
-	cudaFree(cudaDofVector);
-	cudaFree(cudaDofVector2);
-	cudaFree(cudaSolver);
-	//data.save("u-00001.tnl");
-	dofVector.save("u-00001.tnl");
-	cudaDeviceSynchronize();
-	return true;
-}
-
-
-
-
-#ifdef HAVE_CUDA
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
-{
-	//			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
-	//								8 - to the east of curve, 	16 - to the west of curve.
-	int subgridID = i/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE);
-	if(/*cudaStatusVector[subgridID] != 0 &&*/ i<Mesh.getDimensions().x() && Mesh.getDimensions().y())
-	{
-		tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-		Entity.setCoordinates(CoordinatesType(i,j));
-		Entity.refresh();
-		tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-		Real value = cudaDofVector2[Entity.getIndex()];
-		Real a,b, tmp;
-
-		if( i == 0 /*|| (i/NARROWBAND_SUBGRID_SIZE == 0 && !(cudaStatusVector[subgridID] & 9)) */)
-			a = cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()];
-		else if( i == Mesh.getDimensions().x() - 1 /*|| (i/NARROWBAND_SUBGRID_SIZE == NARROWBAND_SUBGRID_SIZE - 1 && !(cudaStatusVector[subgridID] & 17)) */)
-			a = cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0 >()];
-		else
-		{
-			a = fabsMin( cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0 >()],
-					 cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()] );
-		}
-
-		if( j == 0/* || (j/NARROWBAND_SUBGRID_SIZE == 0 && !(cudaStatusVector[subgridID] & 3)) */)
-			b = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()];
-		else if( j == Mesh.getDimensions().y() - 1 /* || (j/NARROWBAND_SUBGRID_SIZE == NARROWBAND_SUBGRID_SIZE - 1 && !(cudaStatusVector[subgridID] & 5))*/ )
-			b = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()];
-		else
-		{
-			b = fabsMin( cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()],
-					 cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()] );
-		}
-
-
-		if(abs(a-b) >= h)
-			tmp = fabsMin(a,b) + sign(value)*h;
-		else
-			tmp = 0.5 * (a + b + sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
-
-	//	cudaDofVector2[Entity.getIndex()]  = fabsMin(value, tmp);
-		atomicFabsMin(&(cudaDofVector2[Entity.getIndex()]), tmp);
-	}
-
-}
-
-
-__global__ void initCUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver)
-{
-
-
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-
-
-	if(solver->Mesh.getDimensions().x() > gx  && solver->Mesh.getDimensions().y() > gy)
-	{
-		solver->initGrid();
-	}
-
-
-}
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
-{
-	int i = threadIdx.x + blockDim.x*blockIdx.x;
-	int j = blockDim.y*blockIdx.y + threadIdx.y;
-
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-
-	int gid = Entity.getIndex();
-
-	cudaDofVector2[gid] = INT_MAX*sign(cudaDofVector2[gid]);
-
-	if (i >0 && j > 0 && i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y())
-	{
-		if(cudaDofVector2[gid]*cudaDofVector2[gid+1] <= 0 )
-		{
-			cudaDofVector2[gid] = sign(cudaDofVector2[gid])*0.5*h;
-			cudaDofVector2[gid+1] = sign(cudaDofVector2[gid+1])*0.5*h;
-		}
-		if( cudaDofVector2[gid]*cudaDofVector2[gid+Mesh.getDimensions().x()] <= 0 )
-		{
-			cudaDofVector2[gid] = sign(cudaDofVector2[gid])*0.5*h;
-			cudaDofVector2[gid+Mesh.getDimensions().x()] = sign(cudaDofVector2[gid+Mesh.getDimensions().x()])*0.5*h;
-		}
-
-		if(cudaDofVector2[gid]*cudaDofVector2[gid-1] <= 0 )
-		{
-			cudaDofVector2[gid] = sign(cudaDofVector2[gid])*0.5*h;
-			cudaDofVector2[gid-1] = sign(cudaDofVector2[gid-1])*0.5*h;
-		}
-		if( cudaDofVector2[gid]*cudaDofVector2[gid-Mesh.getDimensions().x()] <= 0 )
-		{
-			cudaDofVector2[gid] = sign(cudaDofVector2[gid])*0.5*h;
-			cudaDofVector2[gid-Mesh.getDimensions().x()] = sign(cudaDofVector2[gid-Mesh.getDimensions().x()])*0.5*h;
-		}
-	}
-
-
-//
-
-
-
-
-
-
-//	if(i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y() )
-//	{
-//		if(cudaDofVector[Entity.getIndex()] > 0)
-//		{
-//			if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()] > 0)
-//			{
-//				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare1111(i,j);
-//					else
-//						setupSquare1110(i,j);
-//				}
-//				else
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare1101(i,j);
-//					else
-//						setupSquare1100(i,j);
-//				}
-//			}
-//			else
-//			{
-//				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare1011(i,j);
-//					else
-//						setupSquare1010(i,j);
-//				}
-//				else
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare1001(i,j);
-//					else
-//						setupSquare1000(i,j);
-//				}
-//			}
-//		}
-//		else
-//		{
-//			if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()] > 0)
-//			{
-//				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare0111(i,j);
-//					else
-//						setupSquare0110(i,j);
-//				}
-//				else
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare0101(i,j);
-//					else
-//						setupSquare0100(i,j);
-//				}
-//			}
-//			else
-//			{
-//				if(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare0011(i,j);
-//					else
-//						setupSquare0010(i,j);
-//				}
-//				else
-//				{
-//					if(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-//						setupSquare0001(i,j);
-//					else
-//						setupSquare0000(i,j);
-//				}
-//			}
-//		}
-//
-//	}
-
-	return true;
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-Real tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = abs(x);
-	//Real fy = abs(y);
-
-	//Real tmpMin = Min(fx,abs(y));
-
-	if(Min(fx,abs(y)) == fx)
-		return x;
-	else
-		return y;
-
-
-}
-
-
-
-__global__ void runCUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int i)
-{
-
-
-	int gx = 0;
-	int gy = threadIdx.y;
-	//if(solver->Mesh.getDimensions().x() <= gx || solver->Mesh.getDimensions().y() <= gy)
-	//	return;
-	int n = solver->Mesh.getDimensions().x();
-	int blockCount = n/blockDim.y +1;
-	//int gid = solver->Mesh.getDimensions().x() * gy + gx;
-	//int max = solver->Mesh.getDimensions().x()*solver->Mesh.getDimensions().x();
-
-	//int id1 = gx+gy;
-	//int id2 = (solver->Mesh.getDimensions().x() - gx - 1) + gy;
-
-	if(blockIdx.x==0)
-	{
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-	else if(blockIdx.x==1)
-	{
-		gx=n-1;
-		gy=threadIdx.y;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-	else if(blockIdx.x==2)
-	{
-		gx=0;
-		gy=n-threadIdx.y-1;
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-	else if(blockIdx.x==3)
-	{
-		gx=n-1;
-		gy=n-threadIdx.y-1;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-	}
-
-}
-
-
-
-
-__global__ void initSetupGridCUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver)
-{
-	__shared__ double u0;
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-
-	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy)
-	{
-
-//		printf("Hello from  block = %d, thread = %d, x = %d, y = %d\n", blockIdx.x + gridDim.x*blockIdx.y,(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x, threadIdx.x, threadIdx.y);
-		if(threadIdx.x+threadIdx.y == 0)
-		{
-//			printf("Hello from  block = %d, thread = %d, x = %d, y = %d\n", blockIdx.x + gridDim.x*blockIdx.y,(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x, threadIdx.x, threadIdx.y);
-
-			if(blockIdx.x+blockIdx.y == 0)
-				*(solver->reinitialize) = 0;
-
-			solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y] = 0;
-
-			u0 = solver->cudaDofVector2[(blockDim.y*blockIdx.y + 0)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + 0];
-		}
-		__syncthreads();
-
-		double u = solver->cudaDofVector2[(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x];
-
-		if(u*u0 <=0.0)
-			atomicMax(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y]),1);
-	}
-//	if(threadIdx.x+threadIdx.y == 0)
-
-//	printf("Bye from  block = %d, thread = %d, x = %d, y = %d\n", blockIdx.x + gridDim.x*blockIdx.y,(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x, threadIdx.x, threadIdx.y);
-
-
-}
-
-
-
-// run this with one thread per block
-__global__ void initSetupGrid2CUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver)
-{
-//	printf("Hello\n");
-	if(solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y] == 1)
-	{
-//			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
-//								8 - to the east of curve, 	16 - to the west of curve.
-			if(blockIdx.x > 0)
-				atomicAdd(&(solver->cudaStatusVector[blockIdx.x - 1 + gridDim.x*blockIdx.y]), 16);
-
-			if(blockIdx.x < gridDim.x - 1)
-				atomicAdd(&(solver->cudaStatusVector[blockIdx.x + 1 + gridDim.x*blockIdx.y]), 8);
-
-			if(blockIdx.y > 0 )
-				atomicAdd(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*(blockIdx.y - 1)]), 4);
-
-			if(blockIdx.y < gridDim.y - 1)
-				atomicAdd(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*(blockIdx.y + 1)]), 2);
-	}
-
-
-}
-
-
-
-
-
-__global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, double tau)
-{
-	int gid = (blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x()+ threadIdx.x;
-	int i = threadIdx.x + blockIdx.x*blockDim.x;
-	int j = threadIdx.y + blockIdx.y*blockDim.y;
-
-//	if(i+j == 0)
-//		printf("Hello\n");
-
-	int blockID = blockIdx.x + blockIdx.y*gridDim.x; /*i/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE);*/
-
-	int status = solver->cudaStatusVector[blockID];
-
-	if(solver->Mesh.getDimensions().x() > i && solver->Mesh.getDimensions().y() > j)
-	{
-
-//		if(status != 0)
-		{
-			tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(solver->Mesh);
-			Entity.setCoordinates(Containers::StaticVector<2,double>(i,j));
-			Entity.refresh();
-			tnlNeighborGridEntityGetter<tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-			double value = solver->cudaDofVector2[Entity.getIndex()];
-			double xf,xb,yf,yb, grad, fu, a,b;
-			a = b = 0.0;
-
-			if( i == 0 /*|| (threadIdx.x == 0 && !(status & 9)) */)
-			{
-				xb = value - solver->cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()];
-				xf = solver->cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()] - value;
-			}
-			else if( i == solver->Mesh.getDimensions().x() - 1 /*|| (threadIdx.x == blockDim.x - 1 && !(status & 17)) */)
-			{
-				xb = value - solver->cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0 >()];
-				xf = solver->cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0 >()] - value;
-			}
-			else
-			{
-				xb =  value - solver->cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0 >()];
-				xf = solver-> cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()] - value;
-			}
-
-			if( j == 0/* || (threadIdx.y == 0 && !(status & 3))*/ )
-			{
-				yb = value - solver->cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()] ;
-				yf = solver->cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()] - value;
-			}
-			else if( j == solver->Mesh.getDimensions().y() - 1  /*|| (threadIdx.y == blockDim.y - 1 && !(status & 5)) */)
-			{
-				yb = value - solver->cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()];
-				yf = solver->cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()] - value;
-			}
-			else
-			{
-				yb = value - solver->cudaDofVector2[neighborEntities.template getEntityIndex< 0, -1 >()];
-				yf = solver-> cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()] - value;
-			}
-			__syncthreads();
-
-
-
-
-
-			   if(sign(value) > 0.0)
-			   {
-				   xf = solver->negativePart(xf);
-
-				   xb = solver->positivePart(xb);
-
-				   yf = solver->negativePart(yf);
-
-				   yb = solver->positivePart(yb);
-
-			   }
-			   else
-			   {
-
-				   xb = solver->negativePart(xb);
-
-				   xf = solver->positivePart(xf);
-
-				   yb = solver->negativePart(yb);
-
-				   yf = solver->positivePart(yf);
-			   }
-
-
-			   if(xb > xf)
-				   a = xb*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
-			   else
-				   a = xf*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
-
-			   if(yb > yf)
-				   b = yb*solver->Mesh.template getSpaceStepsProducts< 0, -1 >();
-			   else
-				   b = yf*solver->Mesh.template getSpaceStepsProducts< 0, -1 >();
-
-
-
-//			grad = sqrt(0.5 * (xf*xf + xb*xb    +   yf*yf + yb*yb ) )*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
-
-			grad = sqrt(/*0.5 **/ (a*a    +   b*b ) );
-
-			fu = -1.0 * grad;
-
-//			if((tau*fu+value)*value <=0 )
-//			{
-//				//			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
-//				//								8 - to the east of curve, 	16 - to the west of curve.
-//
-//				if((threadIdx.x == 1 && !(status & 9)) && (blockIdx.x > 0) )
-//					atomicMax(solver->reinitialize,1);
-//				else if((threadIdx.x == blockDim.x - 2 && !(status & 17)) && (blockIdx.x < gridDim.x - 1) )
-//					atomicMax(solver->reinitialize,1);
-//				else if((threadIdx.y == 1 && !(status & 3)) && (blockIdx.y > 0) )
-//					atomicMax(solver->reinitialize,1);
-//				else if((threadIdx.y == blockDim.y - 2 && !(status & 5)) && (blockIdx.y < gridDim.y - 1) )
-//					atomicMax(solver->reinitialize,1);
-//			}
-
-			solver->cudaDofVector2[Entity.getIndex()]  += tau*fu;
-		}
-	}
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(INT_MAX,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-INT_MAX,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1110( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1101( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1011( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0111( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0001( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0010( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0100( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[Entity.getIndex()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1000( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1100( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-al;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1010( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1001( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(cudaDofVector[Entity.getIndex()],cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0011( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-al;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0101( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	Real al,be, a,b,c,s;
-	al=abs(cudaDofVector[Entity.getIndex()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 cudaDofVector[Entity.getIndex()]));
-
-	be=abs(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0110( Index i, Index j)
-{
-	tnlGridEntity< tnlGrid< 2,double, TNL::Devices::Host, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(cudaDofVector[Entity.getIndex()],cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 0,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  1 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(cudaDofVector[neighborEntities.template getEntityIndex< 1,  0 >()],cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-}
-#endif
-
-
-
-
-#endif /* TNLNARROWBAND_IMPL_H_ */
diff --git a/src/TNL/Legacy/narrow-band/tnlNarrowBand2D_impl.h b/src/TNL/Legacy/narrow-band/tnlNarrowBand2D_impl.h
deleted file mode 100644
index d42bc2a76..000000000
--- a/src/TNL/Legacy/narrow-band/tnlNarrowBand2D_impl.h
+++ /dev/null
@@ -1,927 +0,0 @@
-/***************************************************************************
-                          tnlNarrowBand2D_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLNARROWBAND2D_IMPL_H_
-#define TNLNARROWBAND2D_IMPL_H_
-
-#include "tnlNarrowBand.h"
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlNarrowBand< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: tnlNarrowBand()
-:Entity(Mesh),
- dofVector(Mesh),
- dofVector2(Mesh)
-{
-}
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-	dofVector2.load(initialCondition);
-
-	h = Mesh.template getSpaceStepsProducts< 1, 0 >();
-	Entity.refresh();
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-
-	cout << "a" <<std::endl;
-	return initGrid();
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
-{
-
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-	for(int i=0; i< Mesh.getDimensions().x()*Mesh.getDimensions().x();i++)
-	{
-		dofVector2[i]=INT_MAX*sign(dofVector[i]);
-	}
-
-	for(int i = 0 ; i < Mesh.getDimensions().x()-1; i++)
-	{
-		for(int j = 0 ; j < Mesh.getDimensions().x()-1; j++)
-			{
-			this->Entity.setCoordinates(CoordinatesType(i,j));
-			this->Entity.refresh();
-			neighborEntities.refresh(Mesh,Entity.getIndex());
-
-				if(dofVector[this->Entity.getIndex()] > 0)
-				{
-					if(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()] > 0)
-					{
-						if(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare1111(i,j);
-							else
-								setupSquare1110(i,j);
-						}
-						else
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare1101(i,j);
-							else
-								setupSquare1100(i,j);
-						}
-					}
-					else
-					{
-						if(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare1011(i,j);
-							else
-								setupSquare1010(i,j);
-						}
-						else
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare1001(i,j);
-							else
-								setupSquare1000(i,j);
-						}
-					}
-				}
-				else
-				{
-					if(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()] > 0)
-					{
-						if(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare0111(i,j);
-							else
-								setupSquare0110(i,j);
-						}
-						else
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare0101(i,j);
-							else
-								setupSquare0100(i,j);
-						}
-					}
-					else
-					{
-						if(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()] > 0)
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare0011(i,j);
-							else
-								setupSquare0010(i,j);
-						}
-						else
-						{
-							if(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()] > 0)
-								setupSquare0001(i,j);
-							else
-								setupSquare0000(i,j);
-						}
-					}
-				}
-
-			}
-	}
-	cout << "a" <<std::endl;
-
-//	Real tmp = 0.0;
-//	Real ax=0.5/sqrt(2.0);
-//
-//	if(!exactInput)
-//	{
-//		for(Index i = 0; i < Mesh.getDimensions().x()*Mesh.getDimensions().y(); i++)
-//				dofVector[i]=0.5*h*sign(dofVector[i]);
-//	}
-//
-//
-//	for(Index i = 1; i < Mesh.getDimensions().x()-1; i++)
-//	{
-//		for(Index j = 1; j < Mesh.getDimensions().y()-1; j++)
-//		{
-//			 tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//
-//			if(tmp == 0.0)
-//			{}
-//			else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-//					dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-//					dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-//					dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-//			{}
-//			else
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//		}
-//	}
-//
-//
-//
-//	for(int i = 1; i < Mesh.getDimensions().x()-1; i++)
-//	{
-//		Index j = 0;
-//		tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//
-//
-//		if(tmp == 0.0)
-//		{}
-//		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 )
-//		{}
-//		else
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//	}
-//
-//	for(int i = 1; i < Mesh.getDimensions().x()-1; i++)
-//	{
-//		Index j = Mesh.getDimensions().y() - 1;
-//		tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//
-//
-//		if(tmp == 0.0)
-//		{}
-//		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-//		{}
-//		else
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//	}
-//
-//	for(int j = 1; j < Mesh.getDimensions().y()-1; j++)
-//	{
-//		Index i = 0;
-//		tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//
-//
-//		if(tmp == 0.0)
-//		{}
-//		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-//		{}
-//		else
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//	}
-//
-//	for(int j = 1; j < Mesh.getDimensions().y()-1; j++)
-//	{
-//		Index i = Mesh.getDimensions().x() - 1;
-//		tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//
-//
-//		if(tmp == 0.0)
-//		{}
-//		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
-//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
-//		{}
-//		else
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//	}
-//
-//
-//	Index i = Mesh.getDimensions().x() - 1;
-//	Index j = Mesh.getDimensions().y() - 1;
-//
-//	tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//	if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp > 0.0 &&
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp > 0.0)
-//
-//		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//
-//
-//
-//	j = 0;
-//	tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//	if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp > 0.0 &&
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp > 0.0)
-//
-//		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//
-//
-//
-//	i = 0;
-//	j = Mesh.getDimensions().y() -1;
-//	tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//	if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp > 0.0 &&
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp > 0.0)
-//
-//		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-//
-//
-//
-//	j = 0;
-//	tmp = sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
-//	if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp > 0.0 &&
-//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp > 0.0)
-//
-//		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
-
-	//data.setLike(dofVector2.getData());
-	//data=dofVector2.getData();
-	//cout << data.getType() <<std::endl;
-	dofVector2.save("u-00000.tnl");
-	//dofVector2.getData().save("u-00000.tnl");
-
-	return true;
-}
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-
-	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-	{
-		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-		{
-			updateValue(i,j);
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-	{
-		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-		{
-			updateValue(i,j);
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-	{
-		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-		{
-			updateValue(i,j);
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-	{
-		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-		{
-			updateValue(i,j);
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-
-//	data.setLike(dofVector2.getData());
-//	data = dofVector2.getData();
-//	cout << data.getType() <<std::endl;
-	dofVector2.save("u-00001.tnl");
-	//dofVector2.getData().save("u-00001.tnl");
-
-	return true;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
-{
-
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighborEntities(Entity);
-
-	Real value = dofVector2[Entity.getIndex()];
-	Real a,b, tmp;
-
-	if( i == 0 )
-		a = dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()];
-	else if( i == Mesh.getDimensions().x() - 1 )
-		a = dofVector2[neighborEntities.template getEntityIndex< -1,  0 >()];
-	else
-	{
-		a = fabsMin( dofVector2[neighborEntities.template getEntityIndex< -1,  0 >()],
-				 dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()] );
-	}
-
-	if( j == 0 )
-		b = dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()];
-	else if( j == Mesh.getDimensions().y() - 1 )
-		b = dofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()];
-	else
-	{
-		b = fabsMin( dofVector2[neighborEntities.template getEntityIndex< 0,  -1 >()],
-				 dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()] );
-	}
-
-
-	if(fabs(a-b) >= h)
-		tmp = fabsMin(a,b) + sign(value)*h;
-	else
-		tmp = 0.5 * (a + b + sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
-
-
-	dofVector2[Entity.getIndex()] = fabsMin(value, tmp);
-
-//	if(dofVector2[Entity.getIndex()] > 1.0)
-//		cout << value << "    " << tmp << " " << dofVector2[Entity.getIndex()] <<std::endl;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-Real tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = fabs(x);
-	Real fy = fabs(y);
-
-	Real tmpMin = Min(fx,fy);
-
-	if(tmpMin == fx)
-		return x;
-	else
-		return y;
-
-}
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j)
-{
-//	this->Entity.setCoordinates(CoordinatesType(i,j));
-//	this->Entity.refresh();
-//	auto neighborEntities =  Entity.getNeighborEntities();
-//	dofVector2[Entity.getIndex()]=fabsMin(INT_MAX,dofVector2[Entity.getIndex()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j)
-{
-//	this->Entity.setCoordinates(CoordinatesType(i,j));
-//	this->Entity.refresh();
-//	auto neighborEntities =  Entity.getNeighborEntities();
-//	dofVector2[Entity.getIndex()]=fabsMin(-INT_MAX,dofVector2[(Entity.getIndex())]);
-//	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-//	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-INT_MAX,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1110( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1101( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[Entity.getIndex()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1011( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[Entity.getIndex()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0111( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0001( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0010( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[Entity.getIndex()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0100( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[Entity.getIndex()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1000( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	a = be/al;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1100( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-al;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1010( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1001( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	dofVector2[Entity.getIndex()]=fabsMin(dofVector[Entity.getIndex()],dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()],dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()],dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()],dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0011( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-al;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0101( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	Real al,be, a,b,c,s;
-	al=abs(dofVector[Entity.getIndex()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()]-
-			 dofVector[Entity.getIndex()]));
-
-	be=abs(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]/
-			(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()]-
-			 dofVector[neighborEntities.template getEntityIndex< 0,  1 >()]));
-
-	a = al-be;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
-
-
-	dofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0110( Index i, Index j)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j));
-	this->Entity.refresh();
-	auto neighborEntities =  Entity.getNeighborEntities();
-	dofVector2[Entity.getIndex()]=fabsMin(dofVector[Entity.getIndex()],dofVector2[(Entity.getIndex())]);
-	dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 0,  1 >()],dofVector2[neighborEntities.template getEntityIndex< 0,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 1,  1 >()],dofVector2[neighborEntities.template getEntityIndex< 1,  1 >()]);
-	dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]=fabsMin(dofVector[neighborEntities.template getEntityIndex< 1,  0 >()],dofVector2[neighborEntities.template getEntityIndex< 1,  0 >()]);
-}
-
-
-
-
-#endif /* TNLNARROWBAND_IMPL_H_ */
diff --git a/src/TNL/Legacy/narrow-band/tnlNarrowBand3D_CUDA_impl.h b/src/TNL/Legacy/narrow-band/tnlNarrowBand3D_CUDA_impl.h
deleted file mode 100644
index d362f249a..000000000
--- a/src/TNL/Legacy/narrow-band/tnlNarrowBand3D_CUDA_impl.h
+++ /dev/null
@@ -1,961 +0,0 @@
-/***************************************************************************
-                          tnlNarrowBand2D_CUDA_v4_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLNARROWBAND3D_IMPL_H_
-#define TNLNARROWBAND3D_IMPL_H_
-
-#include "tnlNarrowBand.h"
-
-//__device__
-//double fabsMin( double x, double y)
-//{
-//	double fx = abs(x);
-//
-//	if(Min(fx,abs(y)) == fx)
-//		return x;
-//	else
-//		return y;
-//}
-//
-//__device__
-//double atomicFabsMin(double* address, double val)
-//{
-//	unsigned long long int* address_as_ull =
-//						  (unsigned long long int*)address;
-//	unsigned long long int old = *address_as_ull, assumed;
-//	do {
-//		assumed = old;
-//			old = atomicCAS(address_as_ull, assumed,__double_as_longlong( fabsMin(assumed,val) ));
-//	} while (assumed != old);
-//	return __longlong_as_double(old);
-//}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlNarrowBand< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-	this->h = Mesh.template getSpaceStepsProducts< 1, 0, 0 >();
-	counter = 0;
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-
-
-#ifdef HAVE_CUDA
-
-	cudaMalloc(&(cudaDofVector), this->dofVector.getData().getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-	cudaMalloc(&(cudaDofVector2), this->dofVector.getData().getSize()*sizeof(double));
-	cudaMemcpy(cudaDofVector2, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
-
-
-	cudaMalloc(&(this->cudaSolver), sizeof(tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >));
-	cudaMemcpy(this->cudaSolver, this,sizeof(tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
-
-#endif
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(8, 8,8);
-	dim3 numBlocks(n/8 + 1, n/8 +1, n/8 +1);
-
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-	initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	return true;
-}
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-
-	int n = Mesh.getDimensions().x();
-	dim3 threadsPerBlock(1, 512);
-	dim3 numBlocks(8,1);
-
-
-	runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,0,0);
-
-	cudaDeviceSynchronize();
-	TNL_CHECK_CUDA_DEVICE;
-
-	cudaMemcpy(this->dofVector.getData().getData(), cudaDofVector2, this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyDeviceToHost);
-	cudaDeviceSynchronize();
-	cudaFree(cudaDofVector);
-	cudaFree(cudaDofVector2);
-	cudaFree(cudaSolver);
-	dofVector.save("u-00001.tnl");
-	cudaDeviceSynchronize();
-	return true;
-}
-
-
-
-
-#ifdef HAVE_CUDA
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j, Index k)
-{
-	tnlGridEntity< tnlGrid< 3,double, TNL::Devices::Host, int >, 3, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j,k));
-	Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage >,3> neighborEntities(Entity);
-	Real value = cudaDofVector2[Entity.getIndex()];
-	Real a,b,c, tmp;
-
-	if( i == 0 )
-		a = cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0,  0 >()];
-	else if( i == Mesh.getDimensions().x() - 1 )
-		a = cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0,  0 >()];
-	else
-	{
-		a = fabsMin( cudaDofVector2[neighborEntities.template getEntityIndex< -1,  0,  0 >()],
-				 cudaDofVector2[neighborEntities.template getEntityIndex< 1,  0,  0 >()] );
-	}
-
-	if( j == 0 )
-		b = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1,  0 >()];
-	else if( j == Mesh.getDimensions().y() - 1 )
-		b = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1,  0 >()];
-	else
-	{
-		b = fabsMin( cudaDofVector2[neighborEntities.template getEntityIndex< 0,  -1,  0 >()],
-				 cudaDofVector2[neighborEntities.template getEntityIndex< 0,  1,  0 >()] );
-	}
-
-	if( k == 0 )
-		c = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  0,  1 >()];
-	else if( k == Mesh.getDimensions().z() - 1 )
-		c = cudaDofVector2[neighborEntities.template getEntityIndex< 0,  0,  -1 >()];
-	else
-	{
-		c = fabsMin( cudaDofVector2[neighborEntities.template getEntityIndex< 0,  0,  -1 >()],
-				 cudaDofVector2[neighborEntities.template getEntityIndex< 0,  0,  1 >()] );
-	}
-
-	Real hD = 3.0*h*h - 2.0*(a*a + b*b + c*c - a*b - a*c - b*c);
-
-	if(hD < 0.0)
-		tmp = fabsMin(a,fabsMin(b,c)) + sign(value)*h;
-	else
-		tmp = (1.0/3.0) * ( a + b + c + sign(value)*sqrt(hD) );
-
-	atomicFabsMin(&cudaDofVector2[Entity.getIndex()],tmp);
-
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-bool tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid(int i, int j, int k)
-{
-	tnlGridEntity< tnlGrid< 3,double, TNL::Devices::Host, int >, 3, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j,k));
-	Entity.refresh();
-	int gid = Entity.getIndex();
-
-	if(abs(cudaDofVector[gid]) < 1.8*h)
-		cudaDofVector2[gid] = cudaDofVector[gid];
-	else
-		cudaDofVector2[gid] = INT_MAX*sign(cudaDofVector[gid]);
-
-	return true;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-__device__
-Real tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = abs(x);
-	if(Min(fx,abs(y)) == fx)
-		return x;
-	else
-		return y;
-
-
-}
-
-
-
-__global__ void runCUDA(tnlNarrowBand< tnlGrid< 3,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int i)
-{
-
-	int gx = 0;
-	int gy = threadIdx.y;
-
-	int n = solver->Mesh.getDimensions().x();
-	int blockCount = n/blockDim.y +1;
-
-	if(blockIdx.x==0)
-	{
-		for(int gz = 0; gz < n;gz++)
-		{
-		gx = 0;
-		gy = threadIdx.y;
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		__syncthreads();
-		}
-	}
-	else if(blockIdx.x==1)
-	{
-		for(int gz = 0; gz < n;gz++)
-		{
-		gx=n-1;
-		gy=threadIdx.y;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		}
-	}
-	else if(blockIdx.x==2)
-	{
-
-		for(int gz = 0; gz < n;gz++)
-		{
-		gx=0;
-		gy=n-threadIdx.y-1;
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		}
-	}
-	else if(blockIdx.x==3)
-	{
-		for(int gz = 0; gz < n;gz++)
-		{
-		gx=n-1;
-		gy=n-threadIdx.y-1;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		}
-	}
-
-
-
-
-	else if(blockIdx.x==4)
-	{
-		for(int gz = n-1; gz > -1;gz--)
-		{
-		gx = 0;
-		gy = threadIdx.y;
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		}
-	}
-	else if(blockIdx.x==5)
-	{
-		for(int gz = n-1; gz > -1;gz--)
-		{
-		gx=n-1;
-		gy=threadIdx.y;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy < n)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy+=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		}
-	}
-	else if(blockIdx.x==6)
-	{
-
-		for(int gz = n-1; gz > -1;gz--)
-		{
-		gx=0;
-		gy=n-threadIdx.y-1;
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx++;
-				if(gx==n)
-				{
-					gx=0;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		}
-	}
-	else if(blockIdx.x==7)
-	{
-		for(int gz = n-1; gz > -1;gz--)
-		{
-		gx=n-1;
-		gy=n-threadIdx.y-1;
-
-		for(int k = 0; k < n*blockCount + blockDim.y; k++)
-		{
-			if(threadIdx.y  < k+1 && gy > -1)
-			{
-				solver->updateValue(gx,gy,gz);
-				gx--;
-				if(gx==-1)
-				{
-					gx=n-1;
-					gy-=blockDim.y;
-				}
-			}
-
-
-			__syncthreads();
-		}
-		}
-	}
-
-
-
-
-}
-
-
-__global__ void initCUDA(tnlNarrowBand< tnlGrid< 3,double, TNL::Devices::Host, int >, double, int >* solver)
-{
-	int gx = threadIdx.x + blockDim.x*blockIdx.x;
-	int gy = blockDim.y*blockIdx.y + threadIdx.y;
-	int gz = blockDim.z*blockIdx.z + threadIdx.z;
-
-	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && solver->Mesh.getDimensions().z() > gz)
-	{
-		solver->initGrid(gx,gy,gz);
-	}
-
-
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	cudaDofVector2[index]=fabsMin(INT_MAX,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	cudaDofVector2[index]=fabsMin(-INT_MAX,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1110( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1101( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1011( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0111( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0001( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0010( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0100( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1000( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	a = be/al;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//
-//
-//
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1100( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
-//
-//	a = al-be;
-//	b=1.0;
-//	c=-al;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1010( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
-//
-//	a = al-be;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1001( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	cudaDofVector2[index]=fabsMin(cudaDofVector[index],cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//
-//
-//
-//
-//
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0011( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
-//
-//	a = al-be;
-//	b=1.0;
-//	c=-al;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0101( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	Real al,be, a,b,c,s;
-//	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
-//
-//	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
-//			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
-//			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
-//
-//	a = al-be;
-//	b=1.0;
-//	c=-be;
-//	s= h/sqrt(a*a+b*b);
-//
-//
-//	cudaDofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//
-//}
-//
-//template< typename MeshReal,
-//          typename Device,
-//          typename MeshIndex,
-//          typename Real,
-//          typename Index >
-//void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0110( Index i, Index j)
-//{
-//	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
-//	cudaDofVector2[index]=fabsMin(cudaDofVector[index],cudaDofVector2[(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
-//	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
-//}
-#endif
-
-
-
-
-#endif /* TNLNARROWBAND_IMPL_H_ */
diff --git a/src/TNL/Legacy/narrow-band/tnlNarrowBand3D_impl.h b/src/TNL/Legacy/narrow-band/tnlNarrowBand3D_impl.h
deleted file mode 100644
index 6e63d527b..000000000
--- a/src/TNL/Legacy/narrow-band/tnlNarrowBand3D_impl.h
+++ /dev/null
@@ -1,307 +0,0 @@
-/***************************************************************************
-                          tnlNarrowBand2D_impl.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLNARROWBAND3D_IMPL_H_
-#define TNLNARROWBAND3D_IMPL_H_
-
-#include "tnlNarrowBand.h"
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-String tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
-{
-	   return String( "tnlNarrowBand< " ) +
-	          MeshType::getType() + ", " +
-	          ::getType< Real >() + ", " +
-	          ::getType< Index >() + " >";
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: tnlNarrowBand()
-:Entity(Mesh),
- dofVector(Mesh),
- dofVector2(Mesh)
-{
-}
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: init( const Config::ParameterContainer& parameters )
-{
-	const String& meshFile = parameters.getParameter< String >( "mesh" );
-
-	if( ! Mesh.load( meshFile ) )
-	{
-		  std::cerr << "I am not able to load the mesh from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-
-
-	const String& initialCondition = parameters.getParameter <String>("initial-condition");
-	if( ! dofVector.load( initialCondition ) )
-	{
-		  std::cerr << "I am not able to load the initial condition from the file " << meshFile << "." <<std::endl;
-		   return false;
-	}
-	dofVector2.load(initialCondition);
-
-	h = Mesh.template getSpaceStepsProducts< 1, 0, 0 >();
-	Entity.refresh();
-
-	const String& exact_input = parameters.getParameter< String >( "exact-input" );
-
-	if(exact_input == "no")
-		exactInput=false;
-	else
-		exactInput=true;
-//	cout << "bla "<<endl;
-	return initGrid();
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
-{
-	for(int i=0; i< Mesh.getDimensions().x()*Mesh.getDimensions().y()*Mesh.getDimensions().z();i++)
-	{
-
-		if (abs(dofVector[i]) < 1.8*h)
-			dofVector2[i]=dofVector[i];
-		else
-			dofVector2[i]=INT_MAX*sign(dofVector[i]);
-	}
-
-	return true;
-}
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-bool tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: run()
-{
-
-	for(Index k = 0; k < Mesh.getDimensions().z(); k++)
-	{
-		for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-		{
-			for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-	for(Index k = 0; k < Mesh.getDimensions().z(); k++)
-	{
-		for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-		{
-			for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-	for(Index k = 0; k < Mesh.getDimensions().z(); k++)
-	{
-		for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-		{
-			for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-	for(Index k = 0; k < Mesh.getDimensions().z(); k++)
-	{
-		for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-		{
-			for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-
-
-
-
-
-
-
-	for(Index k = Mesh.getDimensions().z() -1; k > -1; k--)
-	{
-		for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-		{
-			for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-	for(Index k = Mesh.getDimensions().z() -1; k > -1; k--)
-	{
-		for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-		{
-			for(Index j = 0; j < Mesh.getDimensions().y(); j++)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-	for(Index k = Mesh.getDimensions().z() -1; k > -1; k--)
-	{
-		for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
-		{
-			for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-	for(Index k = Mesh.getDimensions().z() -1; k > -1; k--)
-	{
-		for(Index i = 0; i < Mesh.getDimensions().x(); i++)
-		{
-			for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
-			{
-				updateValue(i,j,k);
-			}
-		}
-	}
-
-/*---------------------------------------------------------------------------------------------------------------------------*/
-
-
-	dofVector2.save("u-00001.tnl");
-
-	cout << "bla 3"<<endl;
-	return true;
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-void tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j, Index k)
-{
-	this->Entity.setCoordinates(CoordinatesType(i,j,k));
-	this->Entity.refresh();
-	tnlNeighborGridEntityGetter<tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage >,3> neighborEntities(Entity);
-	Real value = dofVector2[Entity.getIndex()];
-	Real a,b,c, tmp;
-
-	if( i == 0 )
-		a = dofVector2[neighborEntities.template getEntityIndex< 1,  0,  0>()];
-	else if( i == Mesh.getDimensions().x() - 1 )
-		a = dofVector2[neighborEntities.template getEntityIndex< -1,  0,  0 >()];
-	else
-	{
-		a = fabsMin( dofVector2[neighborEntities.template getEntityIndex< -1,  0,  0>()],
-				 dofVector2[neighborEntities.template getEntityIndex< 1,  0,  0>()] );
-	}
-
-	if( j == 0 )
-		b = dofVector2[neighborEntities.template getEntityIndex< 0,  1,  0>()];
-	else if( j == Mesh.getDimensions().y() - 1 )
-		b = dofVector2[neighborEntities.template getEntityIndex< 0,  -1,  0>()];
-	else
-	{
-		b = fabsMin( dofVector2[neighborEntities.template getEntityIndex< 0,  -1,  0>()],
-				 dofVector2[neighborEntities.template getEntityIndex< 0,  1,  0>()] );
-	}
-
-	if( k == 0 )
-		c = dofVector2[neighborEntities.template getEntityIndex< 0,  0,  1>()];
-	else if( k == Mesh.getDimensions().z() - 1 )
-		c = dofVector2[neighborEntities.template getEntityIndex< 0,  0,  -1>()];
-	else
-	{
-		c = fabsMin( dofVector2[neighborEntities.template getEntityIndex< 0,  0,  -1>()],
-				 dofVector2[neighborEntities.template getEntityIndex< 0,  0,  1>()] );
-	}
-
-	Real hD = 3.0*h*h - 2.0*(a*a+b*b+c*c-a*b-a*c-b*c);
-
-	if(hD < 0.0)
-		tmp = fabsMin(a,fabsMin(b,c)) + sign(value)*h;
-	else
-		tmp = (1.0/3.0) * ( a + b + c + sign(value)*sqrt(hD) );
-
-
-	dofVector2[Entity.getIndex()]  = fabsMin(value, tmp);
-}
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-Real tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
-{
-	Real fx = fabs(x);
-	Real fy = fabs(y);
-
-	Real tmpMin = Min(fx,fy);
-
-	if(tmpMin == fx)
-		return x;
-	else
-		return y;
-
-}
-
-
-
-#endif /* TNLNARROWBAND_IMPL_H_ */
diff --git a/src/TNL/Legacy/narrow-band/tnlNarrowBand_CUDA.h b/src/TNL/Legacy/narrow-band/tnlNarrowBand_CUDA.h
deleted file mode 100644
index ca9b1da2c..000000000
--- a/src/TNL/Legacy/narrow-band/tnlNarrowBand_CUDA.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/***************************************************************************
-                          tnlNarrowBand_CUDA.h  -  description
-                             -------------------
-    begin                : Oct 15 , 2015
-    copyright            : (C) 2015 by Tomas Sobotik
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-#ifndef TNLNARROWBAND_H_
-#define TNLNARROWBAND_H_
-
-#include <TNL/Config/ParameterContainer.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Containers/StaticVector.h>
-#include <TNL/Devices/Host.h>
-#include <mesh/tnlGrid.h>
-#include <mesh/grids/tnlGridEntity.h>
-
-#include <functions/tnlMeshFunction.h>
-#include <limits.h>
-#include <core/tnlDevice.h>
-#include <ctime>
-
-
-
-
-
-template< typename Mesh,
-		  typename Real,
-		  typename Index >
-class tnlNarrowBand
-{};
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-class tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >
-{
-
-public:
-	typedef Real RealType;
-	typedef Device DeviceType;
-	typedef Index IndexType;
-	typedef tnlGrid< 2, Real, Device, Index > MeshType;
-	typedef TNL::Containers::Vector< RealType, DeviceType, IndexType> DofVectorType;
-	typedef typename MeshType::CoordinatesType CoordinatesType;
-
-	tnlNarrowBand();
-
-        static String getType();
-	bool init( const Config::ParameterContainer& parameters );
-	bool run();
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-	RealType positivePart(const RealType arg) const;
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-	RealType negativePart(const RealType arg) const;
-
-#ifdef HAVE_CUDA
-	__device__ bool initGrid();
-	__device__ void updateValue(const Index i, const Index j);
-	__device__ void updateValue(const Index i, const Index j, double** sharedMem, const int k3);
-	__device__ Real fabsMin(const Real x, const Real y);
-
-	tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >* cudaSolver;
-	double* cudaDofVector;
-	double* cudaDofVector2;
-	int* cudaStatusVector;
-	int counter;
-	int* reinitialize;
-	__device__ void setupSquare1000(Index i, Index j);
-	__device__ void setupSquare1100(Index i, Index j);
-	__device__ void setupSquare1010(Index i, Index j);
-	__device__ void setupSquare1001(Index i, Index j);
-	__device__ void setupSquare1110(Index i, Index j);
-	__device__ void setupSquare1101(Index i, Index j);
-	__device__ void setupSquare1011(Index i, Index j);
-	__device__ void setupSquare1111(Index i, Index j);
-	__device__ void setupSquare0000(Index i, Index j);
-	__device__ void setupSquare0100(Index i, Index j);
-	__device__ void setupSquare0010(Index i, Index j);
-	__device__ void setupSquare0001(Index i, Index j);
-	__device__ void setupSquare0110(Index i, Index j);
-	__device__ void setupSquare0101(Index i, Index j);
-	__device__ void setupSquare0011(Index i, Index j);
-	__device__ void setupSquare0111(Index i, Index j);
-#endif
-
-	MeshType Mesh;
-
-protected:
-
-	int statusGridSize;
-	bool exactInput;
-
-	tnlMeshFunction<MeshType> dofVector;
-	DofVectorType data;
-
-
-	RealType h, tau, finalTime;
-
-
-};
-
-
-
-
-
-
-
-
-
-template< typename MeshReal,
-          typename Device,
-          typename MeshIndex,
-          typename Real,
-          typename Index >
-class tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >
-{
-
-public:
-	typedef Real RealType;
-	typedef Device DeviceType;
-	typedef Index IndexType;
-	typedef tnlGrid< 3, Real, Device, Index > MeshType;
-	typedef TNL::Containers::Vector< RealType, DeviceType, IndexType> DofVectorType;
-	typedef typename MeshType::CoordinatesType CoordinatesType;
-
-
-
-	static String getType();
-	bool init( const Config::ParameterContainer& parameters );
-	bool run();
-
-#ifdef HAVE_CUDA
-	__device__ bool initGrid(int i, int j, int k);
-	__device__ void updateValue(const Index i, const Index j, const Index k);
-	__device__ void updateValue(const Index i, const Index j, const Index k, double** sharedMem, const int k3);
-	__device__ Real fabsMin(const Real x, const Real y);
-
-	tnlNarrowBand< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >* cudaSolver;
-	double* cudaDofVector;
-	double* cudaDofVector2;
-	int counter;
-#endif
-
-	MeshType Mesh;
-
-protected:
-
-
-
-	bool exactInput;
-
-	tnlMeshFunction<MeshType> dofVector;
-	DofVectorType data;
-
-	RealType h;
-
-
-};
-
-
-
-
-
-
-
-#ifdef HAVE_CUDA
-//template<int sweep_t>
-__global__ void runCUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int i);
-//__global__ void runCUDA(tnlNarrowBand< tnlGrid< 3,double, TNL::Devices::Host, int >, double, int >* solver, int sweep, int i);
-
-__global__ void initCUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver);
-
-__global__ void initSetupGridCUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver);
-__global__ void initSetupGrid2CUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver);
-__global__ void initSetupGrid1_2CUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver);
-__global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, TNL::Devices::Host, int >, double, int >* solver, double tau);
-//__global__ void initCUDA(tnlNarrowBand< tnlGrid< 3,double, TNL::Devices::Host, int >, double, int >* solver);
-#endif
-
-
-
-#include "tnlNarrowBand2D_CUDA_v4_impl.h"
-//											#include "tnlNarrowBand3D_CUDA_impl.h"
-
-#endif /* TNLNARROWBAND_H_ */
-- 
GitLab