diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
index 50ea7bde8ee87b24317363feb78e30d45128fa3b..c470a77efdbbb7fe8fc91b584ae8af2440a2eb59 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
@@ -353,8 +353,8 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2,
 
 
 template < typename Index >
-__global__ void GetNeighbours( const TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
-        TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY )
+__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY )
 {
   int i = blockIdx.x * 1024 + threadIdx.x;
   
@@ -389,7 +389,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
         Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-        TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
         const Containers::StaticVector< 2, Index > vecLowerOverlaps, 
         const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock )
 {
@@ -598,7 +598,7 @@ tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
 updateBlocks( InterfaceMapType interfaceMap,
         MeshFunctionType aux,
         MeshFunctionType helpFunc,
-        ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
+        ArrayContainerView BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ )
 {
 #pragma omp parallel for schedule( dynamic )
   for( IndexType i = 0; i < BlockIterHost.getSize(); i++ )
@@ -769,7 +769,7 @@ template< typename Real,
         typename Index >
 void 
 tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
-getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )
+getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY )
 {
   int* BlockIterPom; 
   BlockIterPom = new int [numBlockX * numBlockY];
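
The kernel-signature changes above replace owning Array parameters with ArrayView ones: passing an Array by value to a __global__ kernel copies the container, while an ArrayView is a cheap, non-owning handle (pointer plus size) that writes straight into the caller's data. A minimal sketch of the pattern, assuming only Array::getView() and ArrayView as they are used elsewhere in this patch (markKernel and sketch are hypothetical names):

#include <TNL/Containers/Array.h>
#include <TNL/Containers/ArrayView.h>

template< typename Index >
__global__ void markKernel( TNL::Containers::ArrayView< int, TNL::Devices::Cuda, Index > view )
{
   Index i = blockIdx.x * blockDim.x + threadIdx.x;
   if( i < view.getSize() )
      view[ i ] = 1;   // writes directly into the caller's array
}

void sketch()
{
   TNL::Containers::Array< int, TNL::Devices::Cuda, int > blocks( 2048 );
   blocks.setValue( 0 );
   // The updated call sites obtain the view the same way, e.g. blockCalculationIndicator.getView().
   markKernel<<< 2, 1024 >>>( blocks.getView() );
   cudaDeviceSynchronize();
}
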
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
index 5b2a4b685b75a4bb616631b139bca7f1215b4c0b..32548abcfe66affd71a79d3ed5f2f21d67df644e 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
@@ -480,8 +480,8 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
 
 
 template < typename Index >
-__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,
+__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
         int numBlockX, int numBlockY, int numBlockZ )
 {
   int i = blockIdx.x * 1024 + threadIdx.x;
@@ -520,7 +520,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
         const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
         Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-        TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
         Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps )
 {
   int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
@@ -1056,7 +1056,7 @@ template< typename Real,
         typename Index >
 void 
 tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
-getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
+getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ )
 {
   int* BlockIterPom; 
   BlockIterPom = new int [ numBlockX * numBlockY * numBlockZ ];
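
The host-side getNeighbours above fills a temporary BlockIterPom flag for every block of the grid. A host-only sketch of the neighbour-activation idea (illustrative, not the patch's exact kernel logic): a block is flagged for the next pass when one of its grid neighbours was updated in the current pass.

#include <vector>

// 'updated' holds a 0/1 flag per block for the current pass,
// 'next' receives the 0/1 flags for the following pass.
void activateNeighbours( const std::vector< int >& updated, std::vector< int >& next,
                         int numBlockX, int numBlockY )
{
   for( int j = 0; j < numBlockY; j++ )
      for( int i = 0; i < numBlockX; i++ )
      {
         const int idx = j * numBlockX + i;
         int active = 0;
         if( i > 0             && updated[ idx - 1 ] )         active = 1; // west neighbour
         if( i < numBlockX - 1 && updated[ idx + 1 ] )         active = 1; // east neighbour
         if( j > 0             && updated[ idx - numBlockX ] ) active = 1; // south neighbour
         if( j < numBlockY - 1 && updated[ idx + numBlockX ] ) active = 1; // north neighbour
         next[ idx ] = active;
      }
}
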
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index e0ece04bf01d4a6e3356f716aa13b471f9933f73..7cba99f6586b84e31fbe1a49da4223c9e621dd5b 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -62,6 +62,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
     typedef Functions::MeshFunction< MeshType > MeshFunctionType;
     typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
     typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+    using ArrayContainerView = typename ArrayContainer::ViewType;
     typedef Containers::StaticVector< 2, Index > StaticVector;
     
     using MeshPointer = Pointers::SharedPointer<  MeshType >;
@@ -87,15 +88,18 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
             const RealType velocity = 1.0 );
         
 // FOR OPENMP WILL BE REMOVED
-    void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY  );
+    void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY  );
         
     template< int sizeSArray >
-    void updateBlocks( const InterfaceMapType& interfaceMap,
-            MeshFunctionType& aux,
-            MeshFunctionType& helpFunc,
-            ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
+    void updateBlocks( InterfaceMapType interfaceMap,
+            MeshFunctionType aux,
+            MeshFunctionType helpFunc,
+            ArrayContainerView BlockIterHost, int numThreadsPerBlock );
+    
+  protected:
     
-    void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY  );
+   __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[],
+           const RealType originalValue, const RealType v );
 };
 
 template< typename Real,
@@ -111,6 +115,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
     typedef Functions::MeshFunction< MeshType > MeshFunctionType;
     typedef Functions::MeshFunction< MeshType, 3, bool > InterfaceMapType;
     typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
+    using ArrayContainerView = typename ArrayContainer::ViewType;
     typedef Containers::StaticVector< 3, Index > StaticVector;
     using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
     using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;      
@@ -134,15 +139,15 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
             const RealType velocity = 1.0 );
     
     // OPENMP WILL BE REMOVED
-    void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
+    void getNeighbours( ArrayContainerView BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
     
     template< int sizeSArray >
-    void updateBlocks( const InterfaceMapType& interfaceMap,
-            const MeshFunctionType& aux,
+    void updateBlocks( const InterfaceMapType interfaceMap,
+            const MeshFunctionType aux,
             MeshFunctionType& helpFunc,
-            ArrayContainer& BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ );
+            ArrayContainer BlockIterHost, int numThreadsPerBlock );
     
-    void getNeighbours( ArrayContainer& BlockIterHost, int numBlockX, int numBlockY, int numBlockZ );
+  protected:
     
     __cuda_callable__ RealType getNewValue( RealType valuesAndSteps[],
            const RealType originalValue, const RealType v );
@@ -180,17 +185,14 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
         const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
         Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-        TNL::Containers::Array< int, Devices::Cuda, Index > blockCalculationIndicator,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
         const Containers::StaticVector< 2, Index > vecLowerOverlaps, 
         const Containers::StaticVector< 2, Index > vecUpperOverlaps, int oddEvenBlock =0);
 
 template < typename Index >
-__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks );
+__global__ void GetNeighbours( const TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicator,
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > blockCalculationIndicatorHelp, int numBlockX, int numBlockY );
 
-template < typename Index >
-__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
 
 
 // 3D
@@ -205,10 +207,11 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
         const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
         const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
         Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice );
+        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+        Containers::StaticVector< 3, Index > vecLowerOverlaps, Containers::StaticVector< 3, Index > vecUpperOverlaps );
 
 template < typename Index >
-__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
+__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
         TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
         int numBlockX, int numBlockY, int numBlockZ );
 #endif
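
The new ArrayContainerView alias lets the host methods take a non-owning view of the block indicator instead of copying the owning array. A small standalone sketch of the alias and the getView() handshake, assuming Array::ViewType resolves to the matching ArrayView (shown with Devices::Host so it runs without a GPU; the names are hypothetical):

#include <TNL/Containers/Array.h>

using ArrayContainer = TNL::Containers::Array< int, TNL::Devices::Host, int >;
using ArrayContainerView = ArrayContainer::ViewType;   // non-owning view over the same data

int sketch()
{
   ArrayContainer blockIter( 16 );
   blockIter.setValue( 1 );                        // all blocks active, as in the first pass
   ArrayContainerView view = blockIter.getView();
   view.setElement( 0, 0 );                        // writes through to blockIter
   return blockIter.getElement( 0 );               // returns 0
}
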
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 31d3f8b324a7fc2a02d94e54b5909190cacd965d..e5638c11dd71d88d72d8d0590c8e51c0df6baaab 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -80,40 +80,9 @@ solve( const MeshPointer& mesh,
   IndexType iteration( 0 );
   InterfaceMapType interfaceMap = *interfaceMapPtr;
   MeshFunctionType aux = *auxPtr;
-  aux.template synchronize< Communicator >();
-  
-  
-#ifdef HAVE_MPI
-  int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup );
-  //printf( "Hello world from rank: %d ", i );
-  //Communicators::MpiCommunicator::Request r = Communicators::MpiCommunicator::ISend( auxPtr, 0, 0, Communicators::MpiCommunicator::AllGroup );
-  if( i == 1 ) {
-    /*for( int k = 0; k < 16*16; k++ )
-      aux[ k ] = 10;*/
-    printf( "1: mesh x: %d\n", mesh->getDimensions().x() );
-    printf( "1: mesh y: %d\n", mesh->getDimensions().y() );
-    //aux.save("aux_proc1.tnl");
-  }
-  if( i == 0 ) {
-    printf( "0: mesh x: %d\n", mesh->getDimensions().x() );
-    printf( "0: mesh y: %d\n", mesh->getDimensions().y() );
-    //aux.save("aux_proc0.tnl");
-    /*for( int k = 0; k < mesh->getDimensions().x()*mesh->getDimensions().y(); k++ )
-      aux[ k ] = 10;
-    for( int k = 0; k < mesh->getDimensions().x(); k++ ){
-      for( int l = 0; l < mesh->getDimensions().y(); l++ )
-        printf("%f.2\t",aux[ k * 16 + l ] );
-    printf("\n");
-    }*/
-  }
-    
-  /*bool a = Communicators::MpiCommunicator::IsInitialized();
-  if( a )
-    printf("Je Init\n");
-  else
-    printf("Neni Init\n");*/
-#endif
+  aux.template synchronize< Communicator >(); // synchronize the initialized overlaps
   
+  std::cout << "Calculating the values ..." << std::endl; 
   while( iteration < this->maxIterations )
   {
     // calculatedBefore indicates weather we calculated in the last passage of the while cycle 
@@ -290,41 +259,8 @@ solve( const MeshPointer& mesh,
         // Need for calling functions from kernel
         BaseType ptr;
         
-        int BlockIterD = 1;
-        /*auxPtr = helpFunc;
-         
-         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-         interfaceMapPtr.template getData< Device >(),
-         auxPtr.template getData< Device>(),
-         helpFunc.template modifyData< Device>(),
-         BlockIterDevice,
-         oddEvenBlock.getView() );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         auxPtr = helpFunc;
-         
-         oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-         
-         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-         interfaceMapPtr.template getData< Device >(),
-         auxPtr.template getData< Device>(),
-         helpFunc.template modifyData< Device>(),
-         BlockIterDevice,
-         oddEvenBlock.getView() );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         auxPtr = helpFunc;
-         
-         oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
-         
-         CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-         
-         BlockIterD = dBlock.getElement( 0 );*/
+        // Nonzero if the CUDA blocks should be calculated again.
+        int calculateCudaBlocksAgain = 1;
         
         // Array that identifies which blocks should be calculated.
         // All blocks should calculate in first passage ( setValue(1) )
@@ -343,16 +279,9 @@ solve( const MeshPointer& mesh,
         MeshFunctionPointer helpFunc( mesh );
         helpFunc.template modifyData() = auxPtr.template getData(); 
         
-        //int pocBloku = 0;
-        Devices::Cuda::synchronizeDevice();
-        CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-                interfaceMapPtr.template getData< Device >(),
-                auxPtr.template modifyData< Device>(),
-                helpFunc.template modifyData< Device>(),
-                BlockIterDevice.getView() );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
-        
+        // Number of iterations of the while( calculateCudaBlocksAgain ) loop below.
+        int numIter = 0;
+
         //int oddEvenBlock = 0;
         while( calculateCudaBlocksAgain )
         {
@@ -390,44 +319,16 @@ solve( const MeshPointer& mesh,
           Devices::Cuda::synchronizeDevice();
           CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(),
                   auxPtr.template getData< Device>(), helpFunc.template modifyData< Device>(),
-                  blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps );
+                  blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps );
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
-        
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
-        
-        
-        aux = *auxPtr;
-        interfaceMap = *interfaceMapPtr;
-#endif
-      }
-
-      
-/**----------------------MPI-TO-DO---------------------------------------------**/
-        
-#ifdef HAVE_MPI
-        //int i = MPI::GetRank( MPI::AllGroup );
-        //TNL::Meshes::DistributedMeshes::DistributedMesh< MeshType > Mesh;
-        int neighCount = 0; // should this thread calculate again?
-        int calculpom[4] = {0,0,0,0};
-        
-          if( i == 0 ){
-            BlockIterPom1 = BlockIterDevice;
-            for( int i =0; i< numBlocksX; i++ ){
-              for( int j = 0; j < numBlocksY; j++ )
-              {
-                std::cout << BlockIterPom1[j*numBlocksX + i];
-              }
-              std::cout << std::endl;
-            }
-            std::cout << std::endl;
-          }
-#endif
+          
+          // Swap auxPtr and helpFunc so the newly computed values (in helpFunc's memory) are accessible through auxPtr.
+          auxPtr.swap( helpFunc );
           
           // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now.
           Devices::Cuda::synchronizeDevice(); 
-          GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator, blockCalculationIndicatorHelp, numBlocksX, numBlocksY );
+          GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY );
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
           blockCalculationIndicator = blockCalculationIndicatorHelp;
@@ -445,46 +346,24 @@ solve( const MeshPointer& mesh,
 /**-----------------------------------------------------------------------------------------------------------*/
           numIter ++;
         }
-        if( numIter%2  == 1 ){
-          auxPtr = helpFunc;
-        }
-        /*cudaFree( BlockIterDevice );
-         cudaFree( dBlock );
-         delete BlockIter;*/
-        
-        if( neigh[1] != -1 )
-        {
-          req[neighCount] = MPI::ISend( &calculated, 1, neigh[1], 0, MPI::AllGroup ); 
-          neighCount++;
-          
-          
-          req[neighCount] = MPI::IRecv( &calculpom[1], 1, neigh[1], 0, MPI::AllGroup );
-          neighCount++;
-        }
-        
-        if( neigh[2] != -1 )
-        {
-          req[neighCount] = MPI::ISend( &calculated, 1, neigh[2], 0, MPI::AllGroup );
-          neighCount++;
-          
-          req[neighCount] = MPI::IRecv( &calculpom[2], 1, neigh[2], 0, MPI::AllGroup  );
-          neighCount++;
-        }
-        
-        if( neigh[5] != -1 )
+        if( numIter%2 == 1 ) // After an odd number of passes, swap back so the MPI overlap synchronization works on auxPtr (otherwise it does not work).
         {
-          req[neighCount] = MPI::ISend( &calculated, 1, neigh[5], 0, MPI::AllGroup );
-          neighCount++;
-          
-          req[neighCount] = MPI::IRecv( &calculpom[3], 1, neigh[5], 0, MPI::AllGroup );
-          neighCount++;
+          helpFunc.swap( auxPtr );
+          Devices::Cuda::synchronizeDevice();
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
         }
-        
-        MPI::WaitAll(req,neighCount);
-#if ForDebug
-        printf( "%d: Sending Calculated = %d.\n", i, calculated );
-#endif        
-        MPI::Allreduce( &calculated, &calculated, 1, MPI_LOR,  MPI::AllGroup );
+        aux = *auxPtr;
+        interfaceMap = *interfaceMapPtr;
+#endif
+      }
+
+      
+/**----------------------MPI-TO-DO---------------------------------------------**/        
+#ifdef HAVE_MPI
+      if( CommunicatorType::isDistributed() ){
+        getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh );
+       
         aux.template synchronize< Communicator >();
       }
 #endif
@@ -518,9 +397,16 @@ setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps,
 #endif
 }
 
-template < typename Index >
-__global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY )
+
+
+
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+bool 
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
+goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, 
+        MeshFunctionType& aux, const InterfaceMapType& interfaceMap,
+        const AnisotropyPointer& anisotropy )
 {
   bool calculated = false;
   const MeshType& mesh = aux.getMesh();
@@ -548,97 +434,15 @@ __global__ void GetNeighbours( TNL::Containers::ArrayView< int, Devices::Cuda, I
   return calculated;
 }
 
-template < typename Index >
-__global__ void CudaParallelReduc( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > dBlock, int nBlocks )
-{
-  int i = threadIdx.x;
-  int blId = blockIdx.x;
-  int blockSize = blockDim.x;
-  /*if ( i == 0 && blId == 0 ){
-    printf( "nBlocks = %d\n", nBlocks );
-    for( int j = nBlocks-1; j > -1 ; j--){
-      printf( "%d: cislo = %d \n", j, BlockIterDevice[ j ] );
-    }
-  }*/
-  __shared__ int sArray[ 1024 ];
-  sArray[ i ] = 0;
-  if( blId * 1024 + i < nBlocks )
-    sArray[ i ] = BlockIterDevice[ blId * 1024 + i ];
-  __syncthreads();
-  /*if ( i == 0 && blId == 0 ){
-   printf( "nBlocks = %d\n", nBlocks );
-   for( int j = 4; j > -1 ; j--){
-   printf( "%d: cislo = %d \n", j, sArray[ j ] );
-   }
-  }*/
-  /*extern __shared__ volatile int sArray[];
-   unsigned int i = threadIdx.x;
-   unsigned int gid = blockIdx.x * blockSize * 2 + threadIdx.x;
-   unsigned int gridSize = blockSize * 2 * gridDim.x;
-   sArray[ i ] = 0;
-   while( gid < nBlocks )
-   {
-   sArray[ i ] += BlockIterDevice[ gid ] + BlockIterDevice[ gid + blockSize ];
-   gid += gridSize;
-   }
-   __syncthreads();*/
-  
-  if ( blockSize == 1024) {
-    if (i < 512)
-      sArray[ i ] += sArray[ i + 512 ];
-  }
-  __syncthreads();
-  if (blockSize >= 512) {
-    if (i < 256) {
-      sArray[ i ] += sArray[ i + 256 ];
-    }
-  }
-  __syncthreads();
-  if (blockSize >= 256) {
-    if (i < 128) {
-      sArray[ i ] += sArray[ i + 128 ];
-    }
-  }
-  __syncthreads();
-  if (blockSize >= 128) {
-    if (i < 64) {
-      sArray[ i ] += sArray[ i + 64 ];
-    }
-  }
-  __syncthreads();
-  if (i < 32 )
-  {
-    if(  blockSize >= 64 ){ sArray[ i ] += sArray[ i + 32 ];}
-  __syncthreads();
-    if(  blockSize >= 32 ){  sArray[ i ] += sArray[ i + 16 ];}
-  __syncthreads();
-    if(  blockSize >= 16 ){  sArray[ i ] += sArray[ i + 8 ];}
-    if(  blockSize >= 8 ){  sArray[ i ] += sArray[ i + 4 ];}
-  __syncthreads();
-    if(  blockSize >= 4 ){  sArray[ i ] += sArray[ i + 2 ];}
-  __syncthreads();
-    if(  blockSize >= 2 ){  sArray[ i ] += sArray[ i + 1 ];}
-  __syncthreads();
-  }
-  __syncthreads();
-  
-  if( i == 0 )
-    dBlock[ blId ] = sArray[ 0 ];
-}
 
 
 
-template < int sizeSArray, typename Real, typename Device, typename Index >
-__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
-        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-        const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-        Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
-        CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY ) );
-        TNL_CHECK_CUDA_DEVICE;
-        
-        CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
-        TNL_CHECK_CUDA_DEVICE;
+#ifdef HAVE_MPI
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+void 
+FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >::
+getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
 {
   Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
   
@@ -687,4 +491,3 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
               calculateFromNeighbours[2] || calculateFromNeighbours[3];
 }
 #endif
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock )
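
The rewritten solve() driver (here for 2D, and for 3D below) follows one pattern: update into helpFunc, swap the pointers, activate the neighbouring blocks, and stop once no block is flagged (containsValue replaces the removed CudaParallelReduc kernel); an odd number of passes needs one extra swap so the result ends up behind the original pointer again. A stripped-down, host-level sketch of that control flow, with the kernel launches elided and hypothetical names:

#include <algorithm>
#include <vector>

// 'active' plays the role of BlockIterDevice / blockCalculationIndicator.
bool containsValue( const std::vector< int >& a, int v )
{
   return std::find( a.begin(), a.end(), v ) != a.end();   // TNL does this as a parallel reduction
}

void driverSketch( std::vector< double >& aux, std::vector< double >& help,
                   std::vector< int >& active, const std::vector< int >& activeHelp )
{
   int numIter = 0;
   while( containsValue( active, 1 ) )
   {
      // updateBlocks( aux -> help, active );          // hypothetical: compute new values into 'help'
      std::swap( aux, help );                          // new values are now reachable through 'aux'
      // activateNeighbours( active -> activeHelp );   // hypothetical: flag the neighbouring blocks
      active = activeHelp;
      numIter++;
   }
   if( numIter % 2 == 1 )
      std::swap( aux, help );   // odd number of passes: swap back so 'aux' is the original buffer
}
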
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 4895c7693772a9f33458a119506668a493996ec0..325b626f7bf5262637f8e1b43ec9e156bbeca26b 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -262,24 +262,86 @@ solve( const MeshPointer& mesh,
         // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2)
         const int cudaBlockSize( 8 );
         
-        CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
-                interfaceMapPtr.template getData< Device >(),
-                auxPtr.template getData< Device>(),
-                helpFunc.template modifyData< Device>(),
-                BlockIterDevice.getView() );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+        // Get the number of CUDA blocks of the grid in each direction (overlaps excluded, since we do not calculate on overlaps)
+        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
+        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
+        int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); 
+        if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
+          std::cout << "Invalid kernel call. Maximum grid dimensions are [1024,1024,64] and the maximum number of threads per block is 1024!" << std::endl;
         
-        GetNeighbours3D<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
-        BlockIterDevice = BlockIterPom;
+        // Set up the launch configuration for the global function CudaUpdateCellCaller.
+        dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
+        dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
         
-        CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice.getView(), dBlock.getView(), ( numBlocksX * numBlocksY * numBlocksZ ) );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
+        BaseType ptr; // tnlDirectEikonalMethodsBase instance used to call its member functions inside CudaUpdateCellCaller
+        
+        
+        int BlockIterD = 1; // tells us whether the main CUDA body should be calculated again
+        
+        // Array with one entry per block of the grid, answering the question: should this block be calculated?
+        TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice( numBlocksX * numBlocksY * numBlocksZ );
+        BlockIterDevice.setValue( 1 ); // calculate all blocks in the first pass
+        
+        // Helper array for the GetNeighbours kernel
+        TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom( numBlocksX * numBlocksY * numBlocksZ );
+        BlockIterPom.setValue( 0 ); // the initial value does not matter
+        
+        
+        
+        // number of CUDA blocks (1024 threads each) needed by the GetNeighbours kernel to cover all grid blocks
+        int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0);
+        
+        
+        //MeshFunctionPointer helpFunc1( mesh );      
+        MeshFunctionPointer helpFunc( mesh );
+        helpFunc.template modifyData() = auxPtr.template getData();
+        Devices::Cuda::synchronizeDevice(); 
+                
+        int numIter = 0; // number of iterations of the following while loop
         
-        CudaParallelReduc<<< 1, nBlocks >>>( dBlock.getView(), dBlock.getView(), nBlocks );
+        while( BlockIterD ) // main CUDA loop
+        {
+          
+          Devices::Cuda::synchronizeDevice();          
+          // main kernel that calculates the values in every active block
+          // the calculated values are written into helpFunc
+          CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
+                  interfaceMapPtr.template getData< Device >(),
+                  auxPtr.template getData< Device>(),
+                  helpFunc.template modifyData< Device>(),
+                  BlockIterDevice.getView(), vecLowerOverlaps, vecUpperOverlaps );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          // Swap the helpFunc and auxPtr pointers: the new results live in the memory of helpFunc, but are now accessible through auxPtr.
+          auxPtr.swap( helpFunc );
+          
+          Devices::Cuda::synchronizeDevice();
+          // Neighbours of blocks that were calculated in this pass should be calculated in the next one.
+          // BlockIterDevice marks the blocks calculated in this pass; BlockIterPom receives the blocks (their neighbours) that should be calculated next.
+          GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ );
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          BlockIterDevice = BlockIterPom;
+          Devices::Cuda::synchronizeDevice();
+          
+          // containsValue( 1 ) is a parallel reduction implemented in TNL
+          BlockIterD = BlockIterDevice.containsValue(1);
+          cudaDeviceSynchronize();
+          TNL_CHECK_CUDA_DEVICE;
+          
+          numIter++;
+          if( BlockIterD ){ 
+            // if we calculated in this passage, we should send the info via MPI so neighbours should calculate after synchronization
+            calculatedBefore = 1;
+          }
+        }
+        if( numIter%2 == 1 ){
+          
+          // auxPtr has to point to the memory of the original auxPtr again (not to helpFunc).
+          // The last pass of the while loop above did not change any value, so swapping the names does not affect the results.
+          auxPtr.swap( helpFunc ); 
+          Devices::Cuda::synchronizeDevice();
+        }
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         aux = *auxPtr;
@@ -375,10 +437,15 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo,
   return calculated;
 }
 
-template < typename Index >
-__global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterPom,
-        int numBlockX, int numBlockY, int numBlockZ )
+
+
+
+#ifdef HAVE_MPI
+template< typename Real, typename Device, typename Index, 
+          typename Communicator, typename Anisotropy >
+void 
+FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >::
+getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh )
 {
   Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh();
   
@@ -397,22 +464,6 @@ __global__ void GetNeighbours3D( TNL::Containers::ArrayView< int, Devices::Cuda,
     requestsInformation[neighCount++] = 
             MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup );
   }
-}
-
-template < int sizeSArray, typename Real, typename Device, typename Index >
-__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
-        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-        const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
-        Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
-        TNL::Containers::ArrayView< int, Devices::Cuda, Index > BlockIterDevice )
-{
-  int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
-  int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
-  int i = threadIdx.x + blockDim.x*blockIdx.x + vLower[0]; // WITH OVERLAPS!!! i,j,k aren't coordinates of all values
-  int j = blockDim.y*blockIdx.y + threadIdx.y + vLower[1];
-  int k = blockDim.z*blockIdx.z + threadIdx.z + vLower[2];
-  int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
-  const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
   
   if( neighbours[1] != -1 ) // EAST
   {