From 5d1fab6e4fa455940aa0edf159e1ca3229738a36 Mon Sep 17 00:00:00 2001
From: fencl <fenclmat@fjfi.cvut.cz>
Date: Mon, 23 Apr 2018 11:01:29 +0200
Subject: [PATCH] CUDA implemented for exact number of loops in 2D and 3D

---
 .../tnlDirectEikonalMethodsBase.h             |  23 +-
 .../tnlDirectEikonalMethodsBase_impl.h        | 456 +++++++++++-------
 .../hamilton-jacobi/tnlFastSweepingMethod.h   |   2 +
 .../tnlFastSweepingMethod2D_impl.h            |  49 +-
 .../tnlFastSweepingMethod3D_impl.h            | 380 ++++++++-------
 5 files changed, 542 insertions(+), 368 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index d8770b0e73..138671a3b7 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -67,9 +67,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
       template< typename MeshEntity >
       __cuda_callable__ void updateCell( MeshFunctionType& u,
                                          const MeshEntity& cell,
-                                         const RealType velocity = 1.0 );
-   protected:
-       
+                                         const RealType velocity = 1.0 );      
 };
 
 template< typename Real,
@@ -78,7 +76,6 @@ template< typename Real,
 class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
 {
    public:
-      
       typedef Meshes::Grid< 3, Real, Device, Index > MeshType;
       typedef Real RealType;
       typedef Device DevcieType;
@@ -96,11 +93,6 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
       __cuda_callable__ void updateCell( MeshFunctionType& u,
                                          const MeshEntity& cell,
                                          const RealType velocity = 1.0);
-      
-      /*Real sort( Real a, Real b, Real c,
-                 const RealType& ha,
-                 const RealType& hb,
-                 const RealType& hc ); */
 };
 
 template < typename T1, typename T2 >
@@ -112,7 +104,8 @@ __cuda_callable__ void sortMinims( T1 pom[] );
 
 #ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
-__global__ void CudaUpdateCellCaller( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
+                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux );
 
 template < typename Real, typename Device, typename Index >
@@ -120,7 +113,15 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
                                 Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
                                 Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap );
 
-//__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, double, TNL::Devices::Cuda, int > >& input );
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
+                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
+                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );
+
+template < typename Real, typename Device, typename Index >
+__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
+                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux );
 #endif
 
 #include "tnlDirectEikonalMethodsBase_impl.h"
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 37fd245128..bf78a02a48 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -11,6 +11,7 @@
 
 #include <iostream>
 #include "tnlFastSweepingMethod.h"
+
 template< typename Real,
           typename Device,
           typename Index >
@@ -81,13 +82,7 @@ initInterface( const MeshFunctionPointer& _input,
                MeshFunctionPointer& _output,
                InterfaceMapPointer& _interfaceMap )
 {
-    /*
-     doplnit přepočty pro cudu:
-     * overit is_same device
-     * na kazdy bod jedno cuda vlakno
-     */
-    
-        
+            
     if( std::is_same< Device, Devices::Cuda >::value )
     {
 #ifdef HAVE_CUDA
@@ -102,7 +97,6 @@ initInterface( const MeshFunctionPointer& _input,
         CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
                                                    _output.template modifyData< Device >(),
                                                    _interfaceMap.template modifyData< Device >() );
-        //CudaInitCaller<<< gridSize, blockSize >>>( input );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
 #endif
@@ -215,6 +209,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename MeshEntity >
+__cuda_callable__
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 updateCell( MeshFunctionType& u,
@@ -301,162 +296,187 @@ initInterface( const MeshFunctionPointer& _input,
                MeshFunctionPointer& _output,
                InterfaceMapPointer& _interfaceMap  )
 {
-   const MeshFunctionType& input =  _input.getData();
-   MeshFunctionType& output =  _output.modifyData();
-   InterfaceMapType& interfaceMap =  _interfaceMap.modifyData();
-   const MeshType& mesh = input.getMesh();
-   typedef typename MeshType::Cell Cell;
-   Cell cell( mesh );
-   for( cell.getCoordinates().z() = 0;
-        cell.getCoordinates().z() < mesh.getDimensions().z();
-        cell.getCoordinates().z() ++ )
-        for( cell.getCoordinates().y() = 0;
-             cell.getCoordinates().y() < mesh.getDimensions().y();
-             cell.getCoordinates().y() ++ )
-            for( cell.getCoordinates().x() = 0;
-                 cell.getCoordinates().x() < mesh.getDimensions().x();
-                 cell.getCoordinates().x() ++ )
-            {
-                cell.refresh();
-                output[ cell.getIndex() ] =
-                input( cell ) > 0 ? TypeInfo< RealType >::getMaxValue() :
-                                   - TypeInfo< RealType >::getMaxValue();
-                interfaceMap[ cell.getIndex() ] = false;
-            }
-    
-   const RealType& hx = mesh.getSpaceSteps().x();
-   const RealType& hy = mesh.getSpaceSteps().y();
-   const RealType& hz = mesh.getSpaceSteps().z();
-   for( cell.getCoordinates().z() = 0;
-        cell.getCoordinates().z() < mesh.getDimensions().z();
-        cell.getCoordinates().z() ++ )   
-      for( cell.getCoordinates().y() = 0;
-           cell.getCoordinates().y() < mesh.getDimensions().y();
-           cell.getCoordinates().y() ++ )
-         for( cell.getCoordinates().x() = 0;
-              cell.getCoordinates().x() < mesh.getDimensions().x();
-              cell.getCoordinates().x() ++ )
-         {
-            cell.refresh();
-            const RealType& c = input( cell );
-            if( ! cell.isBoundaryEntity() )
-            {
-               auto neighbors = cell.getNeighborEntities();
-               Real pom = 0;
-               const IndexType e = neighbors.template getEntityIndex<  1,  0,  0 >();
-               const IndexType n = neighbors.template getEntityIndex<  0,  1,  0 >();
-               const IndexType t = neighbors.template getEntityIndex<  0,  0,  1 >();
-               //Try exact initiation
-               /*const IndexType w = neighbors.template getEntityIndex< -1,  0,  0 >();
-               const IndexType s = neighbors.template getEntityIndex<  0, -1,  0 >();
-               const IndexType b = neighbors.template getEntityIndex<  0,  0, -1 >();
-               if( c * input[ e ] <= 0 )
-               {
-                  output[ cell.getIndex() ] = c;
-                  output[ e ] = input[ e ];
-                  interfaceMap[ e ] = true;   
-                  interfaceMap[ cell.getIndex() ] = true;
-               }
-               else if( c * input[ n ] <= 0 )
-               {
-                  output[ cell.getIndex() ] = c;
-                  output[ n ] = input[ n ];
-                  interfaceMap[ n ] = true;   
-                  interfaceMap[ cell.getIndex() ] = true;
-               }
-               else if( c * input[ t ] <= 0 )
-               {
-                  output[ cell.getIndex() ] = c;
-                  output[ t ] = input[ t ];
-                  interfaceMap[ t ] = true;   
-                  interfaceMap[ cell.getIndex() ] = true;
-               }*/
-               if( c * input[ n ] <= 0 )
-               {
-                   if( c >= 0 )
-                   {
-                   pom = ( hy * c )/( c - input[ n ]);
-                   if( output[ cell.getIndex() ] > pom ) 
-                       output[ cell.getIndex() ] = pom;
-                    
-                   if ( output[ n ] < pom - hy)
-                        output[ n ] = pom - hy; // ( hy * c )/( c - input[ n ]) - hy;
-                   
-                   }else
-                   {
-                     pom = - ( hy * c )/( c - input[ n ]);
-                     if( output[ cell.getIndex() ] < pom )
-                         output[ cell.getIndex() ] = pom;
-                     if( output[ n ] > hy + pom )
-                         output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
-                   
-                   }
-               interfaceMap[ cell.getIndex() ] = true;
-               interfaceMap[ n ] = true;
-               }
-               if( c * input[ e ] <= 0 )
-               {
-                   if( c >= 0 )
-                   {
-                       pom = ( hx * c )/( c - input[ e ]);
-                       if( output[ cell.getIndex() ] > pom )
-                           output[ cell.getIndex() ] = pom;
-                   
-                       pom = pom - hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-                       if( output[ e ] < pom )
-                           output[ e ] = pom;      
-                       
-                   }else
-                   {
-                       pom = - (hx * c)/( c - input[ e ]);
-                       if( output[ cell.getIndex() ] < pom )
-                           output[ cell.getIndex() ] = pom;
-                    
-                       pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
-                       if( output[ e ] > pom )
-                           output[ e ] = pom;
-                   }
-               interfaceMap[ cell.getIndex() ] = true;
-               interfaceMap[ e ] = true;
-               }
-               if( c * input[ t ] <= 0 )
-               {
-                   if( c >= 0 )
-                   {
-                       pom = ( hz * c )/( c - input[ t ]);
-                       if( output[ cell.getIndex() ] > pom )
-                           output[ cell.getIndex() ] = pom;
-                   
-                       pom = pom - hz; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
-                       if( output[ t ] < pom )
-                           output[ t ] = pom; 
-                       
-                   }else
-                   {
-                       pom = - (hz * c)/( c - input[ t ]);
-                       if( output[ cell.getIndex() ] < pom )
-                           output[ cell.getIndex() ] = pom;
-                    
-                       pom = pom + hz; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
-                       if( output[ t ] > pom )
-                           output[ t ] = pom;
-                       
-                   }
-               interfaceMap[ cell.getIndex() ] = true;
-               interfaceMap[ t ] = true;
-               }    
-            }
-            /*output[ cell.getIndex() ] =
-               c > 0 ? TypeInfo< RealType >::getMaxValue() :
-                      -TypeInfo< RealType >::getMaxValue();
-            interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245
-         }
+    if( std::is_same< Device, Devices::Cuda >::value )
+    {
+#ifdef HAVE_CUDA
+        const MeshType& mesh = _input->getMesh();
+        
+        const int cudaBlockSize( 8 );
+        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
+        int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize );
+        if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
+            std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
+        dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+        dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
+        Devices::Cuda::synchronizeDevice();
+        CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
+                                                     _output.template modifyData< Device >(),
+                                                     _interfaceMap.template modifyData< Device >() );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+#endif
+    }
+    if( std::is_same< Device, Devices::Host >::value )
+    {
+        const MeshFunctionType& input =  _input.getData();
+        MeshFunctionType& output =  _output.modifyData();
+        InterfaceMapType& interfaceMap =  _interfaceMap.modifyData();
+        const MeshType& mesh = input.getMesh();
+        typedef typename MeshType::Cell Cell;
+        Cell cell( mesh );
+        for( cell.getCoordinates().z() = 0;
+             cell.getCoordinates().z() < mesh.getDimensions().z();
+             cell.getCoordinates().z() ++ )
+             for( cell.getCoordinates().y() = 0;
+                  cell.getCoordinates().y() < mesh.getDimensions().y();
+                  cell.getCoordinates().y() ++ )
+                 for( cell.getCoordinates().x() = 0;
+                      cell.getCoordinates().x() < mesh.getDimensions().x();
+                      cell.getCoordinates().x() ++ )
+                 {
+                     cell.refresh();
+                     output[ cell.getIndex() ] =
+                     input( cell ) > 0 ? TypeInfo< RealType >::getMaxValue() :
+                                        - TypeInfo< RealType >::getMaxValue();
+                     interfaceMap[ cell.getIndex() ] = false;
+                 }
+
+        const RealType& hx = mesh.getSpaceSteps().x();
+        const RealType& hy = mesh.getSpaceSteps().y();
+        const RealType& hz = mesh.getSpaceSteps().z();
+        for( cell.getCoordinates().z() = 0;
+             cell.getCoordinates().z() < mesh.getDimensions().z();
+             cell.getCoordinates().z() ++ )   
+           for( cell.getCoordinates().y() = 0;
+                cell.getCoordinates().y() < mesh.getDimensions().y();
+                cell.getCoordinates().y() ++ )
+              for( cell.getCoordinates().x() = 0;
+                   cell.getCoordinates().x() < mesh.getDimensions().x();
+                   cell.getCoordinates().x() ++ )
+              {
+                 cell.refresh();
+                 const RealType& c = input( cell );
+                 if( ! cell.isBoundaryEntity() )
+                 {
+                    auto neighbors = cell.getNeighborEntities();
+                    Real pom = 0;
+                    const IndexType e = neighbors.template getEntityIndex<  1,  0,  0 >();
+                    const IndexType n = neighbors.template getEntityIndex<  0,  1,  0 >();
+                    const IndexType t = neighbors.template getEntityIndex<  0,  0,  1 >();
+                    //Try exact initiation
+                    /*const IndexType w = neighbors.template getEntityIndex< -1,  0,  0 >();
+                    const IndexType s = neighbors.template getEntityIndex<  0, -1,  0 >();
+                    const IndexType b = neighbors.template getEntityIndex<  0,  0, -1 >();
+                    if( c * input[ e ] <= 0 )
+                    {
+                       output[ cell.getIndex() ] = c;
+                       output[ e ] = input[ e ];
+                       interfaceMap[ e ] = true;   
+                       interfaceMap[ cell.getIndex() ] = true;
+                    }
+                    else if( c * input[ n ] <= 0 )
+                    {
+                       output[ cell.getIndex() ] = c;
+                       output[ n ] = input[ n ];
+                       interfaceMap[ n ] = true;   
+                       interfaceMap[ cell.getIndex() ] = true;
+                    }
+                    else if( c * input[ t ] <= 0 )
+                    {
+                       output[ cell.getIndex() ] = c;
+                       output[ t ] = input[ t ];
+                       interfaceMap[ t ] = true;   
+                       interfaceMap[ cell.getIndex() ] = true;
+                    }*/
+                    if( c * input[ n ] <= 0 )
+                    {
+                        if( c >= 0 )
+                        {
+                        pom = ( hy * c )/( c - input[ n ]);
+                        if( output[ cell.getIndex() ] > pom ) 
+                            output[ cell.getIndex() ] = pom;
+
+                        if ( output[ n ] < pom - hy)
+                             output[ n ] = pom - hy; // ( hy * c )/( c - input[ n ]) - hy;
+
+                        }else
+                        {
+                          pom = - ( hy * c )/( c - input[ n ]);
+                          if( output[ cell.getIndex() ] < pom )
+                              output[ cell.getIndex() ] = pom;
+                          if( output[ n ] > hy + pom )
+                              output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]);
+
+                        }
+                    interfaceMap[ cell.getIndex() ] = true;
+                    interfaceMap[ n ] = true;
+                    }
+                    if( c * input[ e ] <= 0 )
+                    {
+                        if( c >= 0 )
+                        {
+                            pom = ( hx * c )/( c - input[ e ]);
+                            if( output[ cell.getIndex() ] > pom )
+                                output[ cell.getIndex() ] = pom;
+
+                            pom = pom - hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+                            if( output[ e ] < pom )
+                                output[ e ] = pom;      
+
+                        }else
+                        {
+                            pom = - (hx * c)/( c - input[ e ]);
+                            if( output[ cell.getIndex() ] < pom )
+                                output[ cell.getIndex() ] = pom;
+
+                            pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
+                            if( output[ e ] > pom )
+                                output[ e ] = pom;
+                        }
+                    interfaceMap[ cell.getIndex() ] = true;
+                    interfaceMap[ e ] = true;
+                    }
+                    if( c * input[ t ] <= 0 )
+                    {
+                        if( c >= 0 )
+                        {
+                            pom = ( hz * c )/( c - input[ t ]);
+                            if( output[ cell.getIndex() ] > pom )
+                                output[ cell.getIndex() ] = pom;
+
+                            pom = pom - hz; //output[ e ] = (hx * c)/( c - input[ e ]) - hx;
+                            if( output[ t ] < pom )
+                                output[ t ] = pom; 
+
+                        }else
+                        {
+                            pom = - (hz * c)/( c - input[ t ]);
+                            if( output[ cell.getIndex() ] < pom )
+                                output[ cell.getIndex() ] = pom;
+
+                            pom = pom + hz; //output[ e ] = hx - (hx * c)/( c - input[ e ]);
+                            if( output[ t ] > pom )
+                                output[ t ] = pom;
+
+                        }
+                    interfaceMap[ cell.getIndex() ] = true;
+                    interfaceMap[ t ] = true;
+                    }    
+                 }
+                 /*output[ cell.getIndex() ] =
+                    c > 0 ? TypeInfo< RealType >::getMaxValue() :
+                           -TypeInfo< RealType >::getMaxValue();
+                 interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245
+              }
+    }
 }
 
 template< typename Real,
           typename Device,
           typename Index >
    template< typename MeshEntity >
+__cuda_callable__
 void
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
 updateCell( MeshFunctionType& u,
@@ -628,12 +648,13 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
         Cell cell( mesh );
         cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
         cell.refresh();
+        const Index cind = cell.getIndex();
 
 
-        output[ cell.getIndex() ] =
+        output[ cind ] =
                input( cell ) >= 0 ? TypeInfo< Real >::getMaxValue() :
                                     - TypeInfo< Real >::getMaxValue();
-        interfaceMap[ cell.getIndex() ] = false; 
+        interfaceMap[ cind ] = false; 
 
         const Real& hx = mesh.getSpaceSteps().x();
         const Real& hy = mesh.getSpaceSteps().y();
@@ -647,48 +668,131 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
            const Index w = neighbors.template getEntityIndex<  -1,  0 >();
            const Index n = neighbors.template getEntityIndex<  0,  1 >();
            const Index s = neighbors.template getEntityIndex<  0,  -1 >();
-          
+           
            if( c * input[ n ] <= 0 )
            {
                pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
-               if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
-                   output[ cell.getIndex() ] = pom;
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
 
                interfaceMap[ cell.getIndex() ] = true;
            }
            if( c * input[ e ] <= 0 )
            {
                pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
-               if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) )
-                   output[ cell.getIndex() ] = pom;                       
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+                   output[ cind ] = pom;                       
 
-               interfaceMap[ cell.getIndex() ] = true;
+               interfaceMap[ cind ] = true;
            }
            if( c * input[ w ] <= 0 )
            {
                pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
-               if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
-                   output[ cell.getIndex() ] = pom;
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
 
-               interfaceMap[ cell.getIndex() ] = true;
+               interfaceMap[ cind ] = true;
            }
            if( c * input[ s ] <= 0 )
            {
                pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
-               if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) 
-                   output[ cell.getIndex() ] = pom;
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
 
-               interfaceMap[ cell.getIndex() ] = true;
+               interfaceMap[ cind ] = true;
            }
         }
     }
 }
 
-
-/*__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, double, TNL::Devices::Cuda, int > >& input )
+template < typename Real, typename Device, typename Index >
+__global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, 
+                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
+                                  Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap )
 {
     int i = threadIdx.x + blockDim.x*blockIdx.x;
     int j = blockDim.y*blockIdx.y + threadIdx.y;
-    //const Meshes::Grid< 2, double, TNL::Devices::Cuda, int >& mesh = input.getMesh();
+    int k = blockDim.z*blockIdx.z + threadIdx.z;
+    const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >();
     
-}*/
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
+    {
+        typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
+        Cell cell( mesh );
+        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
+        cell.refresh();
+        const Index cind = cell.getIndex();
+
+
+        output[ cind ] =
+               input( cell ) >= 0 ? TypeInfo< Real >::getMaxValue() :
+                                    - TypeInfo< Real >::getMaxValue();
+        interfaceMap[ cind ] = false; 
+        cell.refresh();
+
+        const Real& hx = mesh.getSpaceSteps().x();
+        const Real& hy = mesh.getSpaceSteps().y();
+        const Real& hz = mesh.getSpaceSteps().z();
+        const Real& c = input( cell );
+        if( ! cell.isBoundaryEntity()  )
+        {
+           auto neighbors = cell.getNeighborEntities();
+           Real pom = 0;
+           const Index e = neighbors.template getEntityIndex<  1, 0, 0 >();
+           const Index w = neighbors.template getEntityIndex<  -1, 0, 0 >();
+           const Index n = neighbors.template getEntityIndex<  0, 1, 0 >();
+           const Index s = neighbors.template getEntityIndex<  0, -1, 0 >();
+           const Index t = neighbors.template getEntityIndex<  0, 0, 1 >();
+           const Index b = neighbors.template getEntityIndex<  0, 0, -1 >();
+           
+           if( c * input[ n ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hy * c )/( c - input[ n ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
+
+               interfaceMap[ cind ] = true;
+           }
+           if( c * input[ e ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hx * c )/( c - input[ e ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) )
+                   output[ cind ] = pom;                       
+
+               interfaceMap[ cind ] = true;
+           }
+           if( c * input[ w ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hx * c )/( c - input[ w ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
+
+               interfaceMap[ cind ] = true;
+           }
+           if( c * input[ s ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hy * c )/( c - input[ s ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
+
+               interfaceMap[ cind ] = true;
+           }
+           if( c * input[ b ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hz * c )/( c - input[ b ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
+
+               interfaceMap[ cind ] = true;
+           }
+           if( c * input[ t ] <= 0 )
+           {
+               pom = TNL::sign( c )*( hz * c )/( c - input[ t ]);
+               if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) 
+                   output[ cind ] = pom;
+
+               interfaceMap[ cind ] = true;
+           }
+        }
+    }
+}
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
index 48f3301320..e44aaca871 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h
@@ -15,6 +15,7 @@
 #include <TNL/SharedPointer.h>
 #include "tnlDirectEikonalMethodsBase.h"
 
+
 template< typename Mesh,
           typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > >
 class FastSweepingMethod
@@ -145,6 +146,7 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >
       const IndexType maxIterations;
 };
 
+
 #include "tnlFastSweepingMethod1D_impl.h"
 #include "tnlFastSweepingMethod2D_impl.h"
 #include "tnlFastSweepingMethod3D_impl.h"
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index e36224d075..73e1708958 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -14,6 +14,9 @@
 #pragma once
 
 #include "tnlFastSweepingMethod.h"
+#include <TNL/TypeInfo.h>
+#include <TNL/Devices/Cuda.h>
+
 
 template< typename Real,
           typename Device,
@@ -212,15 +215,23 @@ solve( const MeshPointer& mesh,
       {
          // TODO: CUDA code
 #ifdef HAVE_CUDA
-          /*int numBlocks = 2;
-          int threadsPerBlock;
-          if( mesh->getDimensions().x() >= mesh->getDimensions().y() )
-               threadsPerBlock = (int)( mesh->getDimensions().x() );
-          else
-               threadsPerBlock = (int)( mesh->getDimensions().y() );
+          const int cudaBlockSize( 16 );
+          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+          int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
+          dim3 blockSize( cudaBlockSize, cudaBlockSize );
+          dim3 gridSize( numBlocksX, numBlocksY );
+          Devices::Cuda::synchronizeDevice();
+          int DIM = mesh->getDimensions().x();
           
-          CudaUpdateCellCaller< Real, Device, Index ><<< numBlocks, threadsPerBlock >>>( interfaceMap, aux );
-          cudaDeviceSynchronize(); //copak dela?*/
+          tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
+          for( int k = 0; k < numBlocksX; k++)
+            CudaUpdateCellCaller< Real, Device, Index ><<< gridSize, blockSize >>>( ptr,
+                                                                                    interfaceMapPtr.template getData< Device >(),
+                                                                                    auxPtr.template modifyData< Device>() );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          aux = *auxPtr;
+          interfaceMap = *interfaceMapPtr;
 #endif
       }
       iteration++;
@@ -228,30 +239,30 @@ solve( const MeshPointer& mesh,
    aux.save("aux-final.tnl");
 }
 
+//#ifdef HAVE_CUDA
 template < typename Real, typename Device, typename Index >
-__global__ void CudaUpdateCellCaller( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
+__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
+                                      const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux )
 {
     int i = threadIdx.x + blockDim.x*blockIdx.x;
-    int j = threadIdx.y + blockDim.y*blockIdx.y;
-    const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.getMesh();
+    int j = blockDim.y*blockIdx.y + threadIdx.y;
+    const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
     
     if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
     {
-        //make cell of aux from index
         typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell;
         Cell cell( mesh );
         cell.getCoordinates().x() = i; cell.getCoordinates().y() = j;
         cell.refresh();
-
-        //update cell value few times 
-        //for( int i = 0; i < mesh.getDimensions() ; i++ )
-        //{
-            cell.refresh();
+        //tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
+        for( int k = 0; k < 16; k++ )
+        {
             if( ! interfaceMap( cell ) )
             {
-        //        tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::updateCell( aux, cell );
+               ptr.updateCell( aux, cell );
             }
-        //}
+        }
     }
 }
+//#endif
\ No newline at end of file
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 1fd8c9f2a0..3e8cb7bb19 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -21,7 +21,7 @@ template< typename Real,
           typename Anisotropy >
 FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >::
 FastSweepingMethod()
-: maxIterations( 1 )
+: maxIterations( 2 )
 {
    
 }
@@ -64,6 +64,7 @@ solve( const MeshPointer& mesh,
    interfaceMapPtr->setMesh( mesh );
    std::cout << "Initiating the interface cells ..." << std::endl;
    BaseType::initInterface( u, auxPtr, interfaceMapPtr );
+   cudaDeviceSynchronize();
    auxPtr->save( "aux-ini.tnl" );   
    
    typename MeshType::Cell cell( *mesh );
@@ -71,172 +72,201 @@ solve( const MeshPointer& mesh,
    IndexType iteration( 0 );
    MeshFunctionType aux = *auxPtr;
    InterfaceMapType interfaceMap = * interfaceMapPtr;
-   while( iteration < this->maxIterations )
-   {
-      for( cell.getCoordinates().z() = 0;
-           cell.getCoordinates().z() < mesh->getDimensions().z();
-           cell.getCoordinates().z()++ )
-      {
-         for( cell.getCoordinates().y() = 0;
-              cell.getCoordinates().y() < mesh->getDimensions().y();
-              cell.getCoordinates().y()++ )
-         {
-            for( cell.getCoordinates().x() = 0;
-                 cell.getCoordinates().x() < mesh->getDimensions().x();
-                 cell.getCoordinates().x()++ )
-            {
-               cell.refresh();
-               if( ! interfaceMap( cell ) )
-                  this->updateCell( aux, cell );
-            }
-         }
-      }
-      //aux.save( "aux-1.tnl" );
+    while( iteration < this->maxIterations )
+    {
+        if( std::is_same< DeviceType, Devices::Host >::value )
+        {
+           for( cell.getCoordinates().z() = 0;
+                cell.getCoordinates().z() < mesh->getDimensions().z();
+                cell.getCoordinates().z()++ )
+           {
+              for( cell.getCoordinates().y() = 0;
+                   cell.getCoordinates().y() < mesh->getDimensions().y();
+                   cell.getCoordinates().y()++ )
+              {
+                 for( cell.getCoordinates().x() = 0;
+                      cell.getCoordinates().x() < mesh->getDimensions().x();
+                      cell.getCoordinates().x()++ )
+                 {
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }
+           //aux.save( "aux-1.tnl" );
 
-      for( cell.getCoordinates().z() = 0;
-           cell.getCoordinates().z() < mesh->getDimensions().z();
-           cell.getCoordinates().z()++ )
-      {
-         for( cell.getCoordinates().y() = 0;
-              cell.getCoordinates().y() < mesh->getDimensions().y();
-              cell.getCoordinates().y()++ )
-         {
-            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                 cell.getCoordinates().x() >= 0 ;
-                 cell.getCoordinates().x()-- )		
-            {
-               //std::cerr << "2 -> ";
-               cell.refresh();
-               if( ! interfaceMap( cell ) )            
-                  this->updateCell( aux, cell );
-            }
-         }
-      }
-      //aux.save( "aux-2.tnl" );
-      for( cell.getCoordinates().z() = 0;
-           cell.getCoordinates().z() < mesh->getDimensions().z();
-           cell.getCoordinates().z()++ )
-      {
-         for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-              cell.getCoordinates().y() >= 0 ;
-              cell.getCoordinates().y()-- )
-         {
-            for( cell.getCoordinates().x() = 0;
-                 cell.getCoordinates().x() < mesh->getDimensions().x();
-                 cell.getCoordinates().x()++ )
-            {
-               //std::cerr << "3 -> ";
-               cell.refresh();
-               if( ! interfaceMap( cell ) )            
-                  this->updateCell( aux, cell );
-            }
-         }
-      }
-      //aux.save( "aux-3.tnl" );
-      
-      for( cell.getCoordinates().z() = 0;
-           cell.getCoordinates().z() < mesh->getDimensions().z();
-           cell.getCoordinates().z()++ )
-      {
-         for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-              cell.getCoordinates().y() >= 0;
-              cell.getCoordinates().y()-- )
-         {
-            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                 cell.getCoordinates().x() >= 0 ;
-                 cell.getCoordinates().x()-- )		
-            {
-               //std::cerr << "4 -> ";
-               cell.refresh();
-               if( ! interfaceMap( cell ) )            
-                  this->updateCell( aux, cell );
-            }
-         }
-      }     
-      //aux.save( "aux-4.tnl" );
-      
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-           cell.getCoordinates().z() >= 0;
-           cell.getCoordinates().z()-- )
-      {
-         for( cell.getCoordinates().y() = 0;
-              cell.getCoordinates().y() < mesh->getDimensions().y();
-              cell.getCoordinates().y()++ )
-         {
-            for( cell.getCoordinates().x() = 0;
-                 cell.getCoordinates().x() < mesh->getDimensions().x();
-                 cell.getCoordinates().x()++ )
-            {
-               //std::cerr << "5 -> ";
-               cell.refresh();
-               if( ! interfaceMap( cell ) )
-                  this->updateCell( aux, cell );
-            }
-         }
-      }
-      //aux.save( "aux-5.tnl" );
+           for( cell.getCoordinates().z() = 0;
+                cell.getCoordinates().z() < mesh->getDimensions().z();
+                cell.getCoordinates().z()++ )
+           {
+              for( cell.getCoordinates().y() = 0;
+                   cell.getCoordinates().y() < mesh->getDimensions().y();
+                   cell.getCoordinates().y()++ )
+              {
+                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                      cell.getCoordinates().x() >= 0 ;
+                      cell.getCoordinates().x()-- )		
+                 {
+                    //std::cerr << "2 -> ";
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )            
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }
+           //aux.save( "aux-2.tnl" );
+           for( cell.getCoordinates().z() = 0;
+                cell.getCoordinates().z() < mesh->getDimensions().z();
+                cell.getCoordinates().z()++ )
+           {
+              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                   cell.getCoordinates().y() >= 0 ;
+                   cell.getCoordinates().y()-- )
+              {
+                 for( cell.getCoordinates().x() = 0;
+                      cell.getCoordinates().x() < mesh->getDimensions().x();
+                      cell.getCoordinates().x()++ )
+                 {
+                    //std::cerr << "3 -> ";
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )            
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }
+           //aux.save( "aux-3.tnl" );
 
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-           cell.getCoordinates().z() >= 0;
-           cell.getCoordinates().z()-- )
-      {
-         for( cell.getCoordinates().y() = 0;
-              cell.getCoordinates().y() < mesh->getDimensions().y();
-              cell.getCoordinates().y()++ )
-         {
-            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                 cell.getCoordinates().x() >= 0 ;
-                 cell.getCoordinates().x()-- )		
-            {
-               //std::cerr << "6 -> ";
-               cell.refresh();
-               if( ! interfaceMap( cell ) )            
-                  this->updateCell( aux, cell );
-            }
-         }
-      }
-      //aux.save( "aux-6.tnl" );
-      
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-           cell.getCoordinates().z() >= 0;
-           cell.getCoordinates().z()-- )
-      {
-         for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-              cell.getCoordinates().y() >= 0 ;
-              cell.getCoordinates().y()-- )
-         {
-            for( cell.getCoordinates().x() = 0;
-                 cell.getCoordinates().x() < mesh->getDimensions().x();
-                 cell.getCoordinates().x()++ )
-            {
-               //std::cerr << "7 -> ";
-               cell.refresh();
-               if( ! interfaceMap( cell ) )            
-                  this->updateCell( aux, cell );
-            }
-         }
-      }
-      //aux.save( "aux-7.tnl" );
+           for( cell.getCoordinates().z() = 0;
+                cell.getCoordinates().z() < mesh->getDimensions().z();
+                cell.getCoordinates().z()++ )
+           {
+              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                   cell.getCoordinates().y() >= 0;
+                   cell.getCoordinates().y()-- )
+              {
+                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                      cell.getCoordinates().x() >= 0 ;
+                      cell.getCoordinates().x()-- )		
+                 {
+                    //std::cerr << "4 -> ";
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )            
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }     
+           //aux.save( "aux-4.tnl" );
 
-      for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
-           cell.getCoordinates().z() >= 0;
-           cell.getCoordinates().z()-- )
+           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+                cell.getCoordinates().z() >= 0;
+                cell.getCoordinates().z()-- )
+           {
+              for( cell.getCoordinates().y() = 0;
+                   cell.getCoordinates().y() < mesh->getDimensions().y();
+                   cell.getCoordinates().y()++ )
+              {
+                 for( cell.getCoordinates().x() = 0;
+                      cell.getCoordinates().x() < mesh->getDimensions().x();
+                      cell.getCoordinates().x()++ )
+                 {
+                    //std::cerr << "5 -> ";
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }
+           //aux.save( "aux-5.tnl" );
+
+           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+                cell.getCoordinates().z() >= 0;
+                cell.getCoordinates().z()-- )
+           {
+              for( cell.getCoordinates().y() = 0;
+                   cell.getCoordinates().y() < mesh->getDimensions().y();
+                   cell.getCoordinates().y()++ )
+              {
+                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                      cell.getCoordinates().x() >= 0 ;
+                      cell.getCoordinates().x()-- )		
+                 {
+                    //std::cerr << "6 -> ";
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )            
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }
+           //aux.save( "aux-6.tnl" );
+
+           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+                cell.getCoordinates().z() >= 0;
+                cell.getCoordinates().z()-- )
+           {
+              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                   cell.getCoordinates().y() >= 0 ;
+                   cell.getCoordinates().y()-- )
+              {
+                 for( cell.getCoordinates().x() = 0;
+                      cell.getCoordinates().x() < mesh->getDimensions().x();
+                      cell.getCoordinates().x()++ )
+                 {
+                    //std::cerr << "7 -> ";
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )            
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }
+           //aux.save( "aux-7.tnl" );
+
+           for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1;
+                cell.getCoordinates().z() >= 0;
+                cell.getCoordinates().z()-- )
+           {
+              for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
+                   cell.getCoordinates().y() >= 0;
+                   cell.getCoordinates().y()-- )
+              {
+                 for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
+                      cell.getCoordinates().x() >= 0 ;
+                      cell.getCoordinates().x()-- )		
+                 {
+                    //std::cerr << "8 -> ";
+                    cell.refresh();
+                    if( ! interfaceMap( cell ) )            
+                       this->updateCell( aux, cell );
+                 }
+              }
+           }
+      }
+      if( std::is_same< DeviceType, Devices::Cuda >::value )
       {
-         for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1;
-              cell.getCoordinates().y() >= 0;
-              cell.getCoordinates().y()-- )
-         {
-            for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1;
-                 cell.getCoordinates().x() >= 0 ;
-                 cell.getCoordinates().x()-- )		
-            {
-               //std::cerr << "8 -> ";
-               cell.refresh();
-               if( ! interfaceMap( cell ) )            
-                  this->updateCell( aux, cell );
-            }
-         }
+         // TODO: CUDA code
+#ifdef HAVE_CUDA
+          const int cudaBlockSize( 8 );
+          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+          int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
+          int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); 
+          if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
+              std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
+          dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+          dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
+          Devices::Cuda::synchronizeDevice();
+          
+          tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
+          for( int k = 0; k < numBlocksX; k++)
+          CudaUpdateCellCaller< Real, Device, Index ><<< gridSize, blockSize >>>( ptr,
+                                                                                  interfaceMapPtr.template getData< Device >(),
+                                                                                  auxPtr.template modifyData< Device>() );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          aux = *auxPtr;
+          interfaceMap = *interfaceMapPtr;
+#endif
       }
+        
       //aux.save( "aux-8.tnl" );
       iteration++;
       
@@ -244,3 +274,29 @@ solve( const MeshPointer& mesh,
    aux.save("aux-final.tnl");
 }
 
+template < typename Real, typename Device, typename Index >
+__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
+                                      const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
+                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux )
+{
+    int i = threadIdx.x + blockDim.x*blockIdx.x;
+    int j = blockDim.y*blockIdx.y + threadIdx.y;
+    int k = blockDim.z*blockIdx.z + threadIdx.z;
+    const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+    
+    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
+    {
+        typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
+        Cell cell( mesh );
+        cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
+        cell.refresh();
+        //tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
+        for( int l = 0; l < 8; l++ )
+        {
+            if( ! interfaceMap( cell ) )
+            {
+               ptr.updateCell( aux, cell );
+            }
+        }
+    }
+}
\ No newline at end of file
-- 
GitLab