diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index 08ed947ed751935bab8a4ef31df8573619364b3e..0f45be71c87b69c6b76c283a5e8fa970c45b816b 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -138,7 +138,11 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+<<<<<<< HEAD
                                       TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 );
+=======
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
 
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index e23148db5f278f8cba4a9241023e5dfff6f0c7be..4520eab0a45dceeb5c8a530ffc5140dc8954292e 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -187,6 +187,7 @@ solve( const MeshPointer& mesh,
    {
       // TODO: CUDA code
 #ifdef HAVE_CUDA
+<<<<<<< HEAD
       TNL_CHECK_CUDA_DEVICE;
       const int cudaBlockSize( 16 );
       int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
@@ -271,6 +272,86 @@ solve( const MeshPointer& mesh,
       /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
             BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
       //pocIter ++;
+=======
+      const int cudaBlockSize( 16 );
+      int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+      int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
+      dim3 blockSize( cudaBlockSize, cudaBlockSize );
+      dim3 gridSize( numBlocksX, numBlocksY );
+
+      tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
+
+      TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter;
+      BlockIter.setSize( numBlocksX * numBlocksY );
+      BlockIter.setValue( 0 );
+      /*int* BlockIter = (int*)malloc( ( numBlocksX * numBlocksY ) * sizeof( int ) );
+      for( int i = 0; i < numBlocksX*numBlocksY +1; i++)
+          BlockIter[i] = 1;*/
+
+      int BlockIterD = 1;
+
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
+      BlockIterDevice.setSize( numBlocksX * numBlocksY );
+      BlockIterDevice.setValue( 1 );
+      /*int *BlockIterDevice;
+      cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );
+      cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/
+
+      int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
+
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
+      dBlock.setSize( nBlocks );
+      dBlock.setValue( 0 );
+      /*int *dBlock;
+      cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
+
+      while( BlockIterD )
+      {
+         /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+               BlockIter[ i ] = false;*/
+
+         CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
+                                                          interfaceMapPtr.template getData< Device >(),
+                                                          auxPtr.template modifyData< Device>(),
+                                                          BlockIterDevice );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+
+         BlockIter = BlockIterDevice;
+         //cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost);
+         GetNeighbours( BlockIter, numBlocksX, numBlocksY );
+
+         BlockIterDevice = BlockIter;
+         //cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+
+
+         CudaParallelReduc<<< nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+
+         CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+
+         cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+
+         /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
+               BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
+
+      }
+      /*cudaFree( BlockIterDevice );
+      cudaFree( dBlock );
+      delete BlockIter;*/
+      cudaDeviceSynchronize();
+
+      TNL_CHECK_CUDA_DEVICE;
+
+      aux = *auxPtr;
+      interfaceMap = *interfaceMapPtr;
+#endif
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
    }
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
@@ -285,6 +366,7 @@ solve( const MeshPointer& mesh,
       }
       aux.save("aux-final.tnl");
    }
+<<<<<<< HEAD
 
 #ifdef HAVE_CUDA
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index
@@ -318,10 +400,60 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index
    }
 }
+=======
+template < typename Index >
+void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY )
+{
+   TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom;
+   BlockIterPom.setSize( numBlockX * numBlockY );
+   BlockIterPom.setValue( 0 );
+   /*int* BlockIterPom;
+   BlockIterPom = new int[numBlockX * numBlockY];*/
+   /*for(int i = 0; i < numBlockX * numBlockY; i++)
+       BlockIterPom[ i ] = 0;*/
+   for(int i = 0; i < numBlockX * numBlockY; i++)
+   {
+
+      if( BlockIter[ i ] )
+      {
+         // i = k*numBlockY + m;
+         int m=0, k=0;
+         m = i%numBlockY;
+         k = i/numBlockY;
+         if( k > 0 && numBlockY > 1 )
+            BlockIterPom[i - numBlockX] = 1;
+         if( k < numBlockY-1 && numBlockY > 1 )
+            BlockIterPom[i + numBlockX] = 1;
+
+         if( m >= 0 && m < numBlockX - 1 && numBlockX > 1 )
+            BlockIterPom[ i+1 ] = 1;
+         if( m <= numBlockX -1 && m > 0 && numBlockX > 1 )
+            BlockIterPom[ i-1 ] = 1;
+      }
+   }
+   for(int i = 0; i < numBlockX * numBlockY; i++ ){
+///      if( !BlockIter[ i ] )
+         BlockIter[ i ] = BlockIterPom[ i ];
+///      else
+///         BlockIter[ i ] = 0;
+   }
+   /*for( int i = numBlockX-1; i > -1; i-- )
+   {
+      for( int j = 0; j< numBlockY; j++ )
+         std::cout << BlockIter[ i*numBlockY + j ];
+      std::cout << std::endl;
+   }
+   std::cout << std::endl;*/
+   //delete[] BlockIterPom;
+}
+
+#ifdef HAVE_CUDA
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
 template < typename Index >
 __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
                                    TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks )
 {
+<<<<<<< HEAD
    int i = threadIdx.x;
   int blId = blockIdx.x;
 
    __shared__ volatile int sArray[ 512 ];
@@ -363,6 +495,57 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I
 
    if( i == 0 )
       dBlock[ blId ] = sArray[ 0 ];
+=======
+   int i = threadIdx.x;
+   int blId = blockIdx.x;
+   /*if ( i == 0 && blId == 0 ){
+      printf( "nBlocks = %d \n", nBlocks );
+      for( int j = nBlocks-1; j > -1 ; j--){
+         printf( "number = %d \n", BlockIterDevice[ j ] );
+      }
+   }*/
+   __shared__ volatile int sArray[ 512 ];
+   sArray[ i ] = 0;
+   if( blId * 512 + i < nBlocks )
+      sArray[ i ] = BlockIterDevice[ blId * 512 + i ];
+   __syncthreads();
+
+   if (blockDim.x == 1024) {
+      if (i < 512)
+         sArray[ i ] += sArray[ i + 512 ];
+   }
+   __syncthreads();
+   if (blockDim.x >= 512) {
+      if (i < 256) {
+         sArray[ i ] += sArray[ i + 256 ];
+      }
+   }
+   __syncthreads();
+   if (blockDim.x >= 256) {
+      if (i < 128) {
+         sArray[ i ] += sArray[ i + 128 ];
+      }
+   }
+   __syncthreads();
+   if (blockDim.x >= 128) {
+      if (i < 64) {
+         sArray[ i ] += sArray[ i + 64 ];
+      }
+   }
+   __syncthreads();
+   if (i < 32 )
+   {
+      if( blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ];
+      if( blockDim.x >= 32 ) sArray[ i ] += sArray[ i + 16 ];
+      if( blockDim.x >= 16 ) sArray[ i ] += sArray[ i + 8 ];
+      if( blockDim.x >= 8 ) sArray[ i ] += sArray[ i + 4 ];
+      if( blockDim.x >= 4 ) sArray[ i ] += sArray[ i + 2 ];
+      if( blockDim.x >= 2 ) sArray[ i ] += sArray[ i + 1 ];
+   }
+
+   if( i == 0 )
+      dBlock[ blId ] = sArray[ 0 ];
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
 }
 
 
@@ -371,7 +554,11 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+<<<<<<< HEAD
                                       TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne )
+=======
+                                      TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
 {
    int thri = threadIdx.x; int thrj = threadIdx.y;
    int blIdx = blockIdx.x; int blIdy = blockIdx.y;
@@ -408,7 +595,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
    int i = thri + blockDim.x*blIdx;
    int j = blockDim.y*blIdy + thrj;
    int currentIndex = thrj * blockDim.x + thri;
-
+   if( BlockIterDevice[ blIdy * gridDim.x + blIdx] )
+   {
    //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
    __shared__ volatile bool changed[16*16];
    changed[ currentIndex ] = false;
@@ -444,14 +632,24 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
          sArray[thrj+1][0] = std::numeric_limits< Real >::max();
    }
 
+<<<<<<< HEAD
    if( thri == 2 )
    {
       if( dimY > (blIdy+1) * blockDim.y && thri+1 < xkolik && NE == 1 )
          sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ];
       else
         sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
+=======
+   if( numOfBlockx - 1 == blIdx )
+      xkolik = dimX - (blIdx)*blockDim.x+1;
+
+   if( numOfBlocky -1 == blIdy )
+      ykolik = dimY - (blIdy)*blockDim.y+1;
+   //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
    }
 
+<<<<<<< HEAD
    if( thri == 3 )
    {
      if( blIdy != 0 && thrj+1 < xkolik && NE == 1 )
@@ -477,6 +675,20 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
    {
       if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
+=======
+   if(thri == 0 && thrj == 0 )
+      BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
+
+   if( thri == 0 )
+   {
+      if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik )
+         sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ];
+      else
+         sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
+   }
+
+   if( thri == 1 )
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
    {
       changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
    }
@@ -512,6 +724,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
    {
       if( currentIndex < 64 )
       {
+<<<<<<< HEAD
          changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
       }
    }
@@ -533,5 +746,74 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
       aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
 }
+=======
+      __syncthreads();
+
+      changed[ currentIndex] = false;
+
+      //calculation of update cell
+      if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
+      {
+         if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
+         {
+            changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
+         }
+      }
+      __syncthreads();
+
+      //pyramid reduction
+      if( blockDim.x*blockDim.y == 1024 )
+      {
+         if( currentIndex < 512 )
+         {
+            changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
+         }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y >= 512 )
+      {
+         if( currentIndex < 256 )
+         {
+            changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+         }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y >= 256 )
+      {
+         if( currentIndex < 128 )
+         {
+            changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
+         }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y >= 128 )
+      {
+         if( currentIndex < 64 )
+         {
+            changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+         }
+      }
+      __syncthreads();
+      if( currentIndex < 32 ) //ONLY IFs: A SINGLE WARP IS SYNCHRONOUS
+      {
+         if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+         if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+         if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+         if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+         if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+         if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+      }
+      if( changed[ 0 ] && thri == 0 && thrj == 0 ){
+         BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
+      }
+      __syncthreads();
+   }
+
+   if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
+      aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
+   }
+   /*if( thri == 0 && thrj == 0 )
+      printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
 }
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 4daf9fc920eddd50fb25a4147865193c62149214..8d71bfe06d7c1eb63e63e0527ca58d8d4e95ad9f 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -280,12 +280,17 @@ solve( const MeshPointer& mesh,
                                                        interfaceMapPtr.template getData< Device >(),
                                                        auxPtr.template modifyData< Device>(),
                                                        BlockIterDevice );
+<<<<<<< HEAD
       cudaDeviceSynchronize();
       TNL_CHECK_CUDA_DEVICE;
 
       CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
       cudaDeviceSynchronize();
       TNL_CHECK_CUDA_DEVICE;
+=======
+      //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
+      //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
 
       CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
       cudaDeviceSynchronize();
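
Note on the reduction used above: both sides of the CudaParallelReduc conflict implement the same shared-memory tree reduction, summing the per-block BlockIterDevice flags into dBlock so the host loop can stop once no block reports a change (BlockIterD == 0). Below is a minimal, self-contained sketch of that pattern in plain CUDA, with the TNL containers replaced by raw pointers; the kernel name reduceFlags and the buffers flags/blockSums are illustrative only, and a fixed block size of 512 is assumed to match the <<< nBlocks, 512 >>> launches in the diff.

#include <cuda_runtime.h>

// Per-block sum of 0/1 activity flags; one thread per flag, 512 threads per block.
__global__ void reduceFlags( const int* flags, int* blockSums, int n )
{
   __shared__ volatile int sArray[ 512 ];
   const int i = threadIdx.x;
   const int g = blockIdx.x * 512 + i;

   // Load one flag per thread; threads past the end of the input contribute 0.
   sArray[ i ] = ( g < n ) ? flags[ g ] : 0;
   __syncthreads();

   // Tree reduction: halve the active range, synchronizing between levels.
   for( int s = 256; s > 32; s >>= 1 )
   {
      if( i < s )
         sArray[ i ] += sArray[ i + s ];
      __syncthreads();
   }

   // Final 64 -> 1 elements live in a single warp, so the diff relies on
   // lockstep execution instead of __syncthreads() (the "only ifs" idiom);
   // volatile keeps the compiler from caching sArray in registers. On Volta
   // and newer GPUs this implicit warp synchronization is no longer
   // guaranteed, so __syncwarp() or cub::BlockReduce is the safe replacement.
   if( i < 32 )
   {
      sArray[ i ] += sArray[ i + 32 ];
      sArray[ i ] += sArray[ i + 16 ];
      sArray[ i ] += sArray[ i + 8 ];
      sArray[ i ] += sArray[ i + 4 ];
      sArray[ i ] += sArray[ i + 2 ];
      sArray[ i ] += sArray[ i + 1 ];
   }

   if( i == 0 )
      blockSums[ blockIdx.x ] = sArray[ 0 ];
}

A second launch with a single block, mirroring the CudaParallelReduc<<< 1, nBlocks >>> call in the diff, then reduces blockSums to the one value the host copies back and polls.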