From 042a23f5c51e2e9ff59d58b6daa187216b25d94d Mon Sep 17 00:00:00 2001
From: Tomas Sobotik <sobotto4@fjfi.cvut.cz>
Date: Tue, 3 May 2016 08:30:51 +0200
Subject: [PATCH] Tweaks

---
 examples/fast-sweeping-map/main.h             |    4 +-
 .../tnlParallelMapSolver2D_impl.h             |    2 +
 .../tnlParallelEikonalSolver3D_impl.h         |    2 +-
 .../tnlNarrowBand2D_CUDA_v4_impl.h            |  123 +-
 .../tnlNarrowBand2D_CUDA_v5_impl.h            | 1313 +++++++++++++++++
 examples/narrow-band/tnlNarrowBand_CUDA.h     |    1 +
 .../parallelGodunovMap2D_impl.h               |   10 +-
 7 files changed, 1394 insertions(+), 61 deletions(-)
 create mode 100644 examples/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h

diff --git a/examples/fast-sweeping-map/main.h b/examples/fast-sweeping-map/main.h
index 6d88aa7d34..4aee944c0f 100644
--- a/examples/fast-sweeping-map/main.h
+++ b/examples/fast-sweeping-map/main.h
@@ -17,9 +17,9 @@
 
 #include "MainBuildConfig.h"
 	//for HOST versions:
-#include "tnlFastSweepingMap.h"
+//#include "tnlFastSweepingMap.h"
 	//for DEVICE versions:
-//#include "tnlFastSweepingMap_CUDA.h"
+#include "tnlFastSweepingMap_CUDA.h"
 #include "fastSweepingMapConfig.h"
 #include <solvers/tnlBuildConfigTags.h>
 
diff --git a/examples/hamilton-jacobi-parallel-map/tnlParallelMapSolver2D_impl.h b/examples/hamilton-jacobi-parallel-map/tnlParallelMapSolver2D_impl.h
index 78232c7b1c..3be87ca707 100644
--- a/examples/hamilton-jacobi-parallel-map/tnlParallelMapSolver2D_impl.h
+++ b/examples/hamilton-jacobi-parallel-map/tnlParallelMapSolver2D_impl.h
@@ -904,6 +904,8 @@ void tnlParallelMapSolver<2,SchemeHost, SchemeDevice, Device, double, int>::runS
    double fu = 0.0;
 
    double finalTime = this->stopTime;
+   if(boundaryCondition == 0)
+	   finalTime*=2.0;
    __syncthreads();
 
    tnlGridEntity<MeshType, 2, tnlGridEntityNoStencilStorage > Entity(subMesh);
diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
index 4b3b37306e..f8cc71933e 100644
--- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
+++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
@@ -25,7 +25,7 @@ template< typename SchemeHost, typename SchemeDevice, typename Device>
 tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver()
 {
 	cout << "a" << endl;
-	this->device = tnlHostDevice;  /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice   ---    vypocet na CPU
+	this->device = tnlCudaDevice;  /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice   ---    vypocet na CPU
 
 #ifdef HAVE_CUDA
 	if(this->device == tnlCudaDevice)
diff --git a/examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h b/examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
index 6017d1d359..61714d8988 100644
--- a/examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
+++ b/examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
@@ -16,7 +16,7 @@
 #ifndef TNLNARROWBAND2D_IMPL_H_
 #define TNLNARROWBAND2D_IMPL_H_
 
-#define NARROWBAND_SUBGRID_SIZE 8
+#define NARROWBAND_SUBGRID_SIZE 16
 
 #include "tnlNarrowBand.h"
 
@@ -225,7 +225,7 @@ bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: r
 		cudaMemcpy(&reinit, this->reinitialize, sizeof(int), cudaMemcpyDeviceToHost);
 		cudaDeviceSynchronize();
 		checkCudaDevice;
-		if(reinit != 0 && time != finalTime )
+		if(reinit != 0 /*&& time != finalTime */)
 		{
 			cout << time << endl;
 
@@ -273,8 +273,8 @@ void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: u
 {
 	//			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
 	//								8 - to the east of curve, 	16 - to the west of curve.
-	int subgridID = i/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE);
-	/*if(cudaStatusVector[subgridID] != 0)*/
+	int subgridID = i/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * statusGridSize;
+	if(/*cudaStatusVector[subgridID] != 0 &&*/ i<Mesh.getDimensions().x() && j < Mesh.getDimensions().y())
 	{
 		tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
 		Entity.setCoordinates(CoordinatesType(i,j));
@@ -283,9 +283,9 @@ void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: u
 		Real value = cudaDofVector2[Entity.getIndex()];
 		Real a,b, tmp;
 
-		if( i == 0 /*|| (i/NARROWBAND_SUBGRID_SIZE == 0 && !(cudaStatusVector[subgridID] & 9)) */)
+		if( i == 0 /*|| (i/NARROWBAND_SUBGRID_SIZE == 0 && !(cudaStatusVector[subgridID] & 9))*/ )
 			a = cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()];
-		else if( i == Mesh.getDimensions().x() - 1 /*|| (i/NARROWBAND_SUBGRID_SIZE == NARROWBAND_SUBGRID_SIZE - 1 && !(cudaStatusVector[subgridID] & 17)) */)
+		else if( i == Mesh.getDimensions().x() - 1 /*|| (i/NARROWBAND_SUBGRID_SIZE == NARROWBAND_SUBGRID_SIZE - 1 && !(cudaStatusVector[subgridID] & 17))*/ )
 			a = cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()];
 		else
 		{
@@ -293,9 +293,9 @@ void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: u
 					 cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()] );
 		}
 
-		if( j == 0 /*|| (j/NARROWBAND_SUBGRID_SIZE == 0 && !(cudaStatusVector[subgridID] & 3)) */)
+		if( j == 0 /*|| (j/NARROWBAND_SUBGRID_SIZE == 0 && !(cudaStatusVector[subgridID] & 3))*/ )
 			b = cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()];
-		else if( j == Mesh.getDimensions().y() - 1  /*|| (j/NARROWBAND_SUBGRID_SIZE == NARROWBAND_SUBGRID_SIZE - 1 && !(cudaStatusVector[subgridID] & 5)) */)
+		else if( j == Mesh.getDimensions().y() - 1 /* || (j/NARROWBAND_SUBGRID_SIZE == NARROWBAND_SUBGRID_SIZE - 1 && !(cudaStatusVector[subgridID] & 5)) */)
 			b = cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  -1 >()];
 		else
 		{
@@ -353,32 +353,33 @@ bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: i
 
 	int gid = Entity.getIndex();
 
-	cudaDofVector2[gid] = INT_MAX*Sign(cudaDofVector2[gid]);
+	if(abs(cudaDofVector2[gid]) > 1.5*h)
+		cudaDofVector2[gid] = INT_MAX*Sign(cudaDofVector2[gid]);
 
-	if (i >0 && j > 0 && i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y())
-	{
-		if(cudaDofVector2[gid]*cudaDofVector2[gid+1] <= 0 )
-		{
-			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
-			cudaDofVector2[gid+1] = Sign(cudaDofVector2[gid+1])*0.5*h;
-		}
-		if( cudaDofVector2[gid]*cudaDofVector2[gid+Mesh.getDimensions().x()] <= 0 )
-		{
-			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
-			cudaDofVector2[gid+Mesh.getDimensions().x()] = Sign(cudaDofVector2[gid+Mesh.getDimensions().x()])*0.5*h;
-		}
-
-		if(cudaDofVector2[gid]*cudaDofVector2[gid-1] <= 0 )
-		{
-			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
-			cudaDofVector2[gid-1] = Sign(cudaDofVector2[gid-1])*0.5*h;
-		}
-		if( cudaDofVector2[gid]*cudaDofVector2[gid-Mesh.getDimensions().x()] <= 0 )
-		{
-			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
-			cudaDofVector2[gid-Mesh.getDimensions().x()] = Sign(cudaDofVector2[gid-Mesh.getDimensions().x()])*0.5*h;
-		}
-	}
+//	if (i >0 && j > 0 && i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y())
+//	{
+//		if(cudaDofVector2[gid]*cudaDofVector2[gid+1] <= 0 )
+//		{
+//			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
+//			cudaDofVector2[gid+1] = Sign(cudaDofVector2[gid+1])*0.5*h;
+//		}
+//		if( cudaDofVector2[gid]*cudaDofVector2[gid+Mesh.getDimensions().x()] <= 0 )
+//		{
+//			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
+//			cudaDofVector2[gid+Mesh.getDimensions().x()] = Sign(cudaDofVector2[gid+Mesh.getDimensions().x()])*0.5*h;
+//		}
+//
+//		if(cudaDofVector2[gid]*cudaDofVector2[gid-1] <= 0 )
+//		{
+//			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
+//			cudaDofVector2[gid-1] = Sign(cudaDofVector2[gid-1])*0.5*h;
+//		}
+//		if( cudaDofVector2[gid]*cudaDofVector2[gid-Mesh.getDimensions().x()] <= 0 )
+//		{
+//			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
+//			cudaDofVector2[gid-Mesh.getDimensions().x()] = Sign(cudaDofVector2[gid-Mesh.getDimensions().x()])*0.5*h;
+//		}
+//	}
 
 
 //
@@ -647,7 +648,9 @@ __global__ void initSetupGrid2CUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, in
 //			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
 //								8 - to the east of curve, 	16 - to the west of curve.
 			if(blockIdx.x > 0)
+			{
 				atomicAdd(&(solver->cudaStatusVector[blockIdx.x - 1 + gridDim.x*blockIdx.y]), 16);
+			}
 
 			if(blockIdx.x < gridDim.x - 1)
 				atomicAdd(&(solver->cudaStatusVector[blockIdx.x + 1 + gridDim.x*blockIdx.y]), 8);
@@ -690,26 +693,40 @@ __global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int
 			tnlNeighbourGridEntityGetter<tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
 			double value = solver->cudaDofVector2[Entity.getIndex()];
 			double xf,xb,yf,yb, grad, fu, a,b;
+			a = b = 0.0;
 
 			if( i == 0 || (threadIdx.x == 0 && !(status & 9)) )
-				yb = yf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()] - value;
+			{
+				xb = value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()];
+				xf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()] - value;
+			}
 			else if( i == solver->Mesh.getDimensions().x() - 1 || (threadIdx.x == blockDim.x - 1 && !(status & 17)) )
-				yb = yf = value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()];
+			{
+				xb = value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()];
+				xf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()] - value;
+			}
 			else
 			{
-				yb =  value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()];
-				yf = solver-> cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()] - value;
+				xb =  value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()];
+				xf = solver-> cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()] - value;
 			}
 
 			if( j == 0 || (threadIdx.y == 0 && !(status & 3)) )
-				xb = xf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()] - value;
+			{
+				yb = value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()] ;
+				yf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()] - value;
+			}
 			else if( j == solver->Mesh.getDimensions().y() - 1  || (threadIdx.y == blockDim.y - 1 && !(status & 5)) )
-				xb = xf = value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0, -1 >()];
+			{
+				yb = value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  -1 >()];
+				yf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  -1 >()] - value;
+			}
 			else
 			{
-				xb =  value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0, -1 >()];
-				xf = solver-> cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()] - value;
+				yb = value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0, -1 >()];
+				yf = solver-> cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()] - value;
 			}
+			__syncthreads();
 
 
 
@@ -739,21 +756,21 @@ __global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int
 			   }
 
 
-			 /*  if(xb > xf)
-				   a = xb;
+			   if(xb > xf)
+				   a = xb*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
 			   else
-				   a = xf;
+				   a = xf*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
 
 			   if(yb > yf)
-				   b = yb;
+				   b = yb*solver->Mesh.template getSpaceStepsProducts< 0, -1 >();
 			   else
-				   b = yf;*/
+				   b = yf*solver->Mesh.template getSpaceStepsProducts< 0, -1 >();
 
 
 
-			   grad = sqrt(/*0.5 **/ (xf*xf + xb*xb    +   yf*yf + yb*yb ) )*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
+//			grad = sqrt(0.5 * (xf*xf + xb*xb    +   yf*yf + yb*yb ) )*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
 
-//			   grad = sqrt(/*0.5 **/ (a*a    +   b*b ) )*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
+			grad = sqrt(/*0.5 **/ (a*a    +   b*b ) );
 
 			fu = -1.0 * grad;
 
@@ -762,17 +779,17 @@ __global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int
 				//			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
 				//								8 - to the east of curve, 	16 - to the west of curve.
 
-				if((threadIdx.x == 1 && !(status & 9)) && (blockIdx.x > 0) )
+				if((threadIdx.x == 6 && !(status & 9)) && (blockIdx.x > 0) )
 					atomicMax(solver->reinitialize,1);
-				else if((threadIdx.x == blockDim.x - 2 && !(status & 17)) && (blockIdx.x < gridDim.x - 1) )
+				else if((threadIdx.x == blockDim.x - 7 && !(status & 17)) && (blockIdx.x < gridDim.x - 1) )
 					atomicMax(solver->reinitialize,1);
-				else if((threadIdx.y == 1 && !(status & 3)) && (blockIdx.y > 0) )
+				else if((threadIdx.y == 6 && !(status & 3)) && (blockIdx.y > 0) )
 					atomicMax(solver->reinitialize,1);
-				else if((threadIdx.y == blockDim.y - 2 && !(status & 5)) && (blockIdx.y < gridDim.y - 1) )
+				else if((threadIdx.y == blockDim.y - 7 && !(status & 5)) && (blockIdx.y < gridDim.y - 1) )
 					atomicMax(solver->reinitialize,1);
 			}
 
-			solver->cudaDofVector2[Entity.getIndex()]  = value+tau*fu;
+			solver->cudaDofVector2[Entity.getIndex()]  += tau*fu;
 		}
 	}
 }
diff --git a/examples/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h b/examples/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h
new file mode 100644
index 0000000000..aa9bd61c71
--- /dev/null
+++ b/examples/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h
@@ -0,0 +1,1313 @@
+/***************************************************************************
+                          tnlNarrowBand2D_CUDA_v4_impl.h  -  description
+                             -------------------
+    begin                : Oct 15 , 2015
+    copyright            : (C) 2015 by Tomas Sobotik
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef TNLNARROWBAND2D_IMPL_H_
+#define TNLNARROWBAND2D_IMPL_H_
+
+#define NARROWBAND_SUBGRID_SIZE 16
+
+#include "tnlNarrowBand.h"
+
+__device__
+double fabsMin( double x, double y)
+{
+	double fx = abs(x);
+
+	if(Min(fx,abs(y)) == fx)
+		return x;
+	else
+		return y;
+}
+
+__device__
+double atomicFabsMin(double* address, double val)
+{
+	unsigned long long int* address_as_ull =
+						  (unsigned long long int*)address;
+	unsigned long long int old = *address_as_ull, assumed;
+	do {
+		assumed = old;
+			old = atomicCAS(address_as_ull, assumed,__double_as_longlong( fabsMin(__longlong_as_double(assumed),val) ));
+	} while (assumed != old);
+	return __longlong_as_double(old);
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+#ifdef HAVE_CUDA
+   __device__ __host__
+#endif
+Real tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >:: positivePart(const Real arg) const
+{
+	if(arg > 0.0)
+		return arg;
+	return 0.0;
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+#ifdef HAVE_CUDA
+   __device__ __host__
+#endif
+Real  tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: negativePart(const Real arg) const
+{
+	if(arg < 0.0)
+		return -arg;
+	return 0.0;
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+tnlString tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
+{
+	   return tnlString( "tnlNarrowBand< " ) +
+	          MeshType::getType() + ", " +
+	          ::getType< Real >() + ", " +
+	          ::getType< Index >() + " >";
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: tnlNarrowBand()
+:dofVector(Mesh)
+{
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const tnlParameterContainer& parameters )
+{
+	const tnlString& meshFile = parameters.getParameter< tnlString >( "mesh" );
+
+	if( ! Mesh.load( meshFile ) )
+	{
+		   cerr << "I am not able to load the mesh from the file " << meshFile << "." << endl;
+		   return false;
+	}
+
+
+	const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition");
+	if( ! dofVector.load( initialCondition ) )
+	{
+		   cerr << "I am not able to load the initial condition from the file " << meshFile << "." << endl;
+		   return false;
+	}
+
+	h = Mesh.template getSpaceStepsProducts< 1, 0 >();
+	//Entity.refresh();
+	counter = 0;
+
+	const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" );
+
+	if(exact_input == "no")
+		exactInput=false;
+	else
+		exactInput=true;
+
+	tau = parameters.getParameter< double >( "tau" );
+
+	finalTime = parameters.getParameter< double >( "final-time" );
+
+	statusGridSize = ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE);
+#ifdef HAVE_CUDA
+
+	cudaMalloc(&(cudaDofVector), this->dofVector.getData().getSize()*sizeof(double));
+	cudaMemcpy(cudaDofVector, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&(cudaDofVector2), this->dofVector.getData().getSize()*sizeof(double));
+	cudaMemcpy(cudaDofVector2, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&(cudaStatusVector),  statusGridSize*statusGridSize*sizeof(int));
+//	cudaMemcpy(cudaDofVector, this->dofVector.getData().getData(),  statusGridSize*statusGridSize* sizeof(int)), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&reinitialize, sizeof(int));
+
+
+	cudaMalloc(&(this->cudaSolver), sizeof(tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >));
+	cudaMemcpy(this->cudaSolver, this,sizeof(tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
+
+
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+#endif
+
+	int n = Mesh.getDimensions().x();
+
+	dim3 threadsPerBlock2(NARROWBAND_SUBGRID_SIZE, NARROWBAND_SUBGRID_SIZE);
+	dim3 numBlocks2(statusGridSize ,statusGridSize);
+	initSetupGridCUDA<<<numBlocks2,threadsPerBlock2>>>(this->cudaSolver);
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+	initSetupGrid2CUDA<<<numBlocks2,1>>>(this->cudaSolver);
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+
+
+	/*dim3 threadsPerBlock(16, 16);
+	dim3 numBlocks(n/16 + 1 ,n/16 +1);*/
+	initCUDA<<<numBlocks2,threadsPerBlock2>>>(this->cudaSolver);
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+
+
+	cout << "Solver initialized." << endl;
+	return true;
+}
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
+{
+
+	int n = Mesh.getDimensions().x();
+	dim3 threadsPerBlockFS(1, 512);
+	dim3 numBlocksFS(4,1);
+	dim3 threadsPerBlockNB(NARROWBAND_SUBGRID_SIZE, NARROWBAND_SUBGRID_SIZE);
+	dim3 numBlocksNB(n/NARROWBAND_SUBGRID_SIZE + 1,n/NARROWBAND_SUBGRID_SIZE + 1);
+
+	double time = 0.0;
+	int reinit = 0;
+
+	cout << "Hi!" << endl;
+	runCUDA<<<numBlocksFS,threadsPerBlockFS>>>(this->cudaSolver,0,0);
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+	cout << "Hi2!" << endl;
+	while(time < finalTime)
+	{
+		if(tau+time > finalTime)
+			tau=finalTime-time;
+
+		runNarrowBandCUDA<<<numBlocksNB,threadsPerBlockNB>>>(this->cudaSolver,tau);
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+
+		time += tau;
+
+
+		cudaMemcpy(&reinit, this->reinitialize, sizeof(int), cudaMemcpyDeviceToHost);
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+		if(reinit != 0 /*&& time != finalTime */)
+		{
+			cout << time << endl;
+
+			initSetupGridCUDA<<<numBlocksNB,threadsPerBlockNB>>>(this->cudaSolver);
+			cudaDeviceSynchronize();
+			checkCudaDevice;
+			initSetupGrid2CUDA<<<numBlocksNB,1>>>(this->cudaSolver);
+			cudaDeviceSynchronize();
+			checkCudaDevice;
+			initCUDA<<<numBlocksNB,threadsPerBlockNB>>>(this->cudaSolver);
+			cudaDeviceSynchronize();
+			checkCudaDevice;
+			runCUDA<<<numBlocksFS,threadsPerBlockFS>>>(this->cudaSolver,0,0);
+			cudaDeviceSynchronize();
+			checkCudaDevice;
+		}
+	}
+
+	//data.setLike(dofVector.getData());
+	//cudaMemcpy(data.getData(), cudaDofVector2, this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyDeviceToHost);
+	cudaMemcpy(dofVector.getData().getData(), cudaDofVector2, this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyDeviceToHost);
+	cudaDeviceSynchronize();
+	cudaFree(cudaDofVector);
+	cudaFree(cudaDofVector2);
+	cudaFree(cudaSolver);
+	//data.save("u-00001.tnl");
+	dofVector.save("u-00001.tnl");
+	cudaDeviceSynchronize();
+	return true;
+}
+
+
+
+
+#ifdef HAVE_CUDA
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
+{
+	//			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
+	//								8 - to the east of curve, 	16 - to the west of curve.
+	int subgridID = i/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE);
+	if(/*cudaStatusVector[subgridID] != 0 &&*/ i<Mesh.getDimensions().x() && Mesh.getDimensions().y())
+	{
+		tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+		Entity.setCoordinates(CoordinatesType(i,j));
+		Entity.refresh();
+		tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+		Real value = cudaDofVector2[Entity.getIndex()];
+		Real a,b, tmp;
+
+		if( i == 0 /*|| (i/NARROWBAND_SUBGRID_SIZE == 0 && !(cudaStatusVector[subgridID] & 9)) */)
+			a = cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()];
+		else if( i == Mesh.getDimensions().x() - 1 /*|| (i/NARROWBAND_SUBGRID_SIZE == NARROWBAND_SUBGRID_SIZE - 1 && !(cudaStatusVector[subgridID] & 17)) */)
+			a = cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()];
+		else
+		{
+			a = fabsMin( cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()],
+					 cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()] );
+		}
+
+		if( j == 0/* || (j/NARROWBAND_SUBGRID_SIZE == 0 && !(cudaStatusVector[subgridID] & 3)) */)
+			b = cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()];
+		else if( j == Mesh.getDimensions().y() - 1 /* || (j/NARROWBAND_SUBGRID_SIZE == NARROWBAND_SUBGRID_SIZE - 1 && !(cudaStatusVector[subgridID] & 5))*/ )
+			b = cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  -1 >()];
+		else
+		{
+			b = fabsMin( cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  -1 >()],
+					 cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()] );
+		}
+
+
+		if(abs(a-b) >= h)
+			tmp = fabsMin(a,b) + Sign(value)*h;
+		else
+			tmp = 0.5 * (a + b + Sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
+
+	//	cudaDofVector2[Entity.getIndex()]  = fabsMin(value, tmp);
+		atomicFabsMin(&(cudaDofVector2[Entity.getIndex()]), tmp);
+	}
+
+}
+
+
+__global__ void initCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver)
+{
+
+
+	int gx = threadIdx.x + blockDim.x*blockIdx.x;
+	int gy = blockDim.y*blockIdx.y + threadIdx.y;
+
+
+	if(solver->Mesh.getDimensions().x() > gx  && solver->Mesh.getDimensions().y() > gy)
+	{
+		solver->initGrid();
+	}
+
+
+}
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
+{
+	int i = threadIdx.x + blockDim.x*blockIdx.x;
+	int j = blockDim.y*blockIdx.y + threadIdx.y;
+
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+
+	int gid = Entity.getIndex();
+
+	cudaDofVector2[gid] = INT_MAX*Sign(cudaDofVector2[gid]);
+
+	if (i >0 && j > 0 && i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y())
+	{
+		if(cudaDofVector2[gid]*cudaDofVector2[gid+1] <= 0 )
+		{
+			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
+			cudaDofVector2[gid+1] = Sign(cudaDofVector2[gid+1])*0.5*h;
+		}
+		if( cudaDofVector2[gid]*cudaDofVector2[gid+Mesh.getDimensions().x()] <= 0 )
+		{
+			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
+			cudaDofVector2[gid+Mesh.getDimensions().x()] = Sign(cudaDofVector2[gid+Mesh.getDimensions().x()])*0.5*h;
+		}
+
+		if(cudaDofVector2[gid]*cudaDofVector2[gid-1] <= 0 )
+		{
+			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
+			cudaDofVector2[gid-1] = Sign(cudaDofVector2[gid-1])*0.5*h;
+		}
+		if( cudaDofVector2[gid]*cudaDofVector2[gid-Mesh.getDimensions().x()] <= 0 )
+		{
+			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
+			cudaDofVector2[gid-Mesh.getDimensions().x()] = Sign(cudaDofVector2[gid-Mesh.getDimensions().x()])*0.5*h;
+		}
+	}
+
+
+//
+
+
+
+
+
+
+//	if(i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y() )
+//	{
+//		if(cudaDofVector[Entity.getIndex()] > 0)
+//		{
+//			if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()] > 0)
+//			{
+//				if(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()] > 0)
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare1111(i,j);
+//					else
+//						setupSquare1110(i,j);
+//				}
+//				else
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare1101(i,j);
+//					else
+//						setupSquare1100(i,j);
+//				}
+//			}
+//			else
+//			{
+//				if(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()] > 0)
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare1011(i,j);
+//					else
+//						setupSquare1010(i,j);
+//				}
+//				else
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare1001(i,j);
+//					else
+//						setupSquare1000(i,j);
+//				}
+//			}
+//		}
+//		else
+//		{
+//			if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()] > 0)
+//			{
+//				if(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()] > 0)
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare0111(i,j);
+//					else
+//						setupSquare0110(i,j);
+//				}
+//				else
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare0101(i,j);
+//					else
+//						setupSquare0100(i,j);
+//				}
+//			}
+//			else
+//			{
+//				if(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()] > 0)
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare0011(i,j);
+//					else
+//						setupSquare0010(i,j);
+//				}
+//				else
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare0001(i,j);
+//					else
+//						setupSquare0000(i,j);
+//				}
+//			}
+//		}
+//
+//	}
+
+	return true;
+
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+Real tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
+{
+	Real fx = abs(x);
+	//Real fy = abs(y);
+
+	//Real tmpMin = Min(fx,abs(y));
+
+	if(Min(fx,abs(y)) == fx)
+		return x;
+	else
+		return y;
+
+
+}
+
+
+
+__global__ void runCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int i)
+{
+
+
+	int gx = 0;
+	int gy = threadIdx.y;
+	//if(solver->Mesh.getDimensions().x() <= gx || solver->Mesh.getDimensions().y() <= gy)
+	//	return;
+	int n = solver->Mesh.getDimensions().x();
+	int blockCount = n/blockDim.y +1;
+	//int gid = solver->Mesh.getDimensions().x() * gy + gx;
+	//int max = solver->Mesh.getDimensions().x()*solver->Mesh.getDimensions().x();
+
+	//int id1 = gx+gy;
+	//int id2 = (solver->Mesh.getDimensions().x() - gx - 1) + gy;
+
+	if(blockIdx.x==0)
+	{
+		for(int k = 0; k < n*blockCount + blockDim.y; k++)
+		{
+			if(threadIdx.y  < k+1 && gy < n)
+			{
+				solver->updateValue(gx,gy);
+				gx++;
+				if(gx==n)
+				{
+					gx=0;
+					gy+=blockDim.y;
+				}
+			}
+
+
+			__syncthreads();
+		}
+	}
+	else if(blockIdx.x==1)
+	{
+		gx=n-1;
+		gy=threadIdx.y;
+
+		for(int k = 0; k < n*blockCount + blockDim.y; k++)
+		{
+			if(threadIdx.y  < k+1 && gy < n)
+			{
+				solver->updateValue(gx,gy);
+				gx--;
+				if(gx==-1)
+				{
+					gx=n-1;
+					gy+=blockDim.y;
+				}
+			}
+
+
+			__syncthreads();
+		}
+	}
+	else if(blockIdx.x==2)
+	{
+		gx=0;
+		gy=n-threadIdx.y-1;
+		for(int k = 0; k < n*blockCount + blockDim.y; k++)
+		{
+			if(threadIdx.y  < k+1 && gy > -1)
+			{
+				solver->updateValue(gx,gy);
+				gx++;
+				if(gx==n)
+				{
+					gx=0;
+					gy-=blockDim.y;
+				}
+			}
+
+
+			__syncthreads();
+		}
+	}
+	else if(blockIdx.x==3)
+	{
+		gx=n-1;
+		gy=n-threadIdx.y-1;
+
+		for(int k = 0; k < n*blockCount + blockDim.y; k++)
+		{
+			if(threadIdx.y  < k+1 && gy > -1)
+			{
+				solver->updateValue(gx,gy);
+				gx--;
+				if(gx==-1)
+				{
+					gx=n-1;
+					gy-=blockDim.y;
+				}
+			}
+
+
+			__syncthreads();
+		}
+	}
+
+}
+
+
+
+
+__global__ void initSetupGridCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver)
+{
+	__shared__ double u0;
+	int gx = threadIdx.x + blockDim.x*blockIdx.x;
+	int gy = blockDim.y*blockIdx.y + threadIdx.y;
+
+	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy)
+	{
+
+//		printf("Hello from  block = %d, thread = %d, x = %d, y = %d\n", blockIdx.x + gridDim.x*blockIdx.y,(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x, threadIdx.x, threadIdx.y);
+		if(threadIdx.x+threadIdx.y == 0)
+		{
+//			printf("Hello from  block = %d, thread = %d, x = %d, y = %d\n", blockIdx.x + gridDim.x*blockIdx.y,(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x, threadIdx.x, threadIdx.y);
+
+			if(blockIdx.x+blockIdx.y == 0)
+				*(solver->reinitialize) = 0;
+
+			solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y] = 0;
+
+			u0 = solver->cudaDofVector2[(blockDim.y*blockIdx.y + 0)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + 0];
+		}
+		__syncthreads();
+
+		double u = solver->cudaDofVector2[(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x];
+
+		if(u*u0 <=0.0)
+			atomicMax(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y]),1);
+	}
+//	if(threadIdx.x+threadIdx.y == 0)
+
+//	printf("Bye from  block = %d, thread = %d, x = %d, y = %d\n", blockIdx.x + gridDim.x*blockIdx.y,(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x, threadIdx.x, threadIdx.y);
+
+
+}
+
+
+
+// run this with one thread per block
+__global__ void initSetupGrid2CUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver)
+{
+//	printf("Hello\n");
+	if(solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y] == 1)
+	{
+//			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
+//								8 - to the east of curve, 	16 - to the west of curve.
+			if(blockIdx.x > 0)
+				atomicAdd(&(solver->cudaStatusVector[blockIdx.x - 1 + gridDim.x*blockIdx.y]), 16);
+
+			if(blockIdx.x < gridDim.x - 1)
+				atomicAdd(&(solver->cudaStatusVector[blockIdx.x + 1 + gridDim.x*blockIdx.y]), 8);
+
+			if(blockIdx.y > 0 )
+				atomicAdd(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*(blockIdx.y - 1)]), 4);
+
+			if(blockIdx.y < gridDim.y - 1)
+				atomicAdd(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*(blockIdx.y + 1)]), 2);
+	}
+
+
+}
+
+
+
+
+
+__global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, double tau)
+{
+	int gid = (blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x()+ threadIdx.x;
+	int i = threadIdx.x + blockIdx.x*blockDim.x;
+	int j = threadIdx.y + blockIdx.y*blockDim.y;
+
+//	if(i+j == 0)
+//		printf("Hello\n");
+
+	int blockID = blockIdx.x + blockIdx.y*gridDim.x; /*i/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE);*/
+
+	int status = solver->cudaStatusVector[blockID];
+
+	if(solver->Mesh.getDimensions().x() > i && solver->Mesh.getDimensions().y() > j)
+	{
+
+//		if(status != 0)
+		{
+			tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(solver->Mesh);
+			Entity.setCoordinates(tnlStaticVector<2,double>(i,j));
+			Entity.refresh();
+			tnlNeighbourGridEntityGetter<tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+			double value = solver->cudaDofVector2[Entity.getIndex()];
+			double xf,xb,yf,yb, grad, fu, a,b;
+			a = b = 0.0;
+
+			if( i == 0 /*|| (threadIdx.x == 0 && !(status & 9)) */)
+			{
+				xb = value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()];
+				xf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()] - value;
+			}
+			else if( i == solver->Mesh.getDimensions().x() - 1 /*|| (threadIdx.x == blockDim.x - 1 && !(status & 17)) */)
+			{
+				xb = value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()];
+				xf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()] - value;
+			}
+			else
+			{
+				xb =  value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()];
+				xf = solver-> cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()] - value;
+			}
+
+			if( j == 0/* || (threadIdx.y == 0 && !(status & 3))*/ )
+			{
+				yb = value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()] ;
+				yf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()] - value;
+			}
+			else if( j == solver->Mesh.getDimensions().y() - 1  /*|| (threadIdx.y == blockDim.y - 1 && !(status & 5)) */)
+			{
+				yb = value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  -1 >()];
+				yf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  -1 >()] - value;
+			}
+			else
+			{
+				yb = value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0, -1 >()];
+				yf = solver-> cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()] - value;
+			}
+			__syncthreads();
+
+
+
+
+
+			   if(Sign(value) > 0.0)
+			   {
+				   xf = solver->negativePart(xf);
+
+				   xb = solver->positivePart(xb);
+
+				   yf = solver->negativePart(yf);
+
+				   yb = solver->positivePart(yb);
+
+			   }
+			   else
+			   {
+
+				   xb = solver->negativePart(xb);
+
+				   xf = solver->positivePart(xf);
+
+				   yb = solver->negativePart(yb);
+
+				   yf = solver->positivePart(yf);
+			   }
+
+
+			   if(xb > xf)
+				   a = xb*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
+			   else
+				   a = xf*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
+
+			   if(yb > yf)
+				   b = yb*solver->Mesh.template getSpaceStepsProducts< 0, -1 >();
+			   else
+				   b = yf*solver->Mesh.template getSpaceStepsProducts< 0, -1 >();
+
+
+
+//			grad = sqrt(0.5 * (xf*xf + xb*xb    +   yf*yf + yb*yb ) )*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
+
+			grad = sqrt(/*0.5 **/ (a*a    +   b*b ) );
+
+			fu = -1.0 * grad;
+
+//			if((tau*fu+value)*value <=0 )
+//			{
+//				//			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
+//				//								8 - to the east of curve, 	16 - to the west of curve.
+//
+//				if((threadIdx.x == 1 && !(status & 9)) && (blockIdx.x > 0) )
+//					atomicMax(solver->reinitialize,1);
+//				else if((threadIdx.x == blockDim.x - 2 && !(status & 17)) && (blockIdx.x < gridDim.x - 1) )
+//					atomicMax(solver->reinitialize,1);
+//				else if((threadIdx.y == 1 && !(status & 3)) && (blockIdx.y > 0) )
+//					atomicMax(solver->reinitialize,1);
+//				else if((threadIdx.y == blockDim.y - 2 && !(status & 5)) && (blockIdx.y < gridDim.y - 1) )
+//					atomicMax(solver->reinitialize,1);
+//			}
+
+			solver->cudaDofVector2[Entity.getIndex()]  += tau*fu;
+		}
+	}
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	cudaDofVector2[Entity.getIndex()]=fabsMin(INT_MAX,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	cudaDofVector2[Entity.getIndex()]=fabsMin(-INT_MAX,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1110( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]));
+
+	be=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1101( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]));
+
+	be=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]/
+			(cudaDofVector[Entity.getIndex()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1011( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]/
+			(cudaDofVector[Entity.getIndex()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]));
+
+	be=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0111( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Entity.getIndex()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]-
+			 cudaDofVector[Entity.getIndex()]));
+
+	be=abs(cudaDofVector[Entity.getIndex()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]-
+			 cudaDofVector[Entity.getIndex()]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0001( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]));
+
+	be=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0010( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]));
+
+	be=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]/
+			(cudaDofVector[Entity.getIndex()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0100( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]/
+			(cudaDofVector[Entity.getIndex()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]));
+
+	be=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1000( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Entity.getIndex()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]-
+			 cudaDofVector[Entity.getIndex()]));
+
+	be=abs(cudaDofVector[Entity.getIndex()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]-
+			 cudaDofVector[Entity.getIndex()]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1100( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Entity.getIndex()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]-
+			 cudaDofVector[Entity.getIndex()]));
+
+	be=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]));
+
+	a = al-be;
+	b=1.0;
+	c=-al;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1010( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Entity.getIndex()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]-
+			 cudaDofVector[Entity.getIndex()]));
+
+	be=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]));
+
+	a = al-be;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1001( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	cudaDofVector2[Entity.getIndex()]=fabsMin(cudaDofVector[Entity.getIndex()],cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0011( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Entity.getIndex()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]-
+			 cudaDofVector[Entity.getIndex()]));
+
+	be=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]));
+
+	a = al-be;
+	b=1.0;
+	c=-al;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0101( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Entity.getIndex()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()]-
+			 cudaDofVector[Entity.getIndex()]));
+
+	be=abs(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]/
+			(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()]-
+			 cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]));
+
+	a = al-be;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0110( Index i, Index j)
+{
+	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+	Entity.setCoordinates(CoordinatesType(i,j));
+	Entity.refresh();
+	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+	cudaDofVector2[Entity.getIndex()]=fabsMin(cudaDofVector[Entity.getIndex()],cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+}
+#endif
+
+
+
+
+#endif /* TNLNARROWBAND_IMPL_H_ */
diff --git a/examples/narrow-band/tnlNarrowBand_CUDA.h b/examples/narrow-band/tnlNarrowBand_CUDA.h
index 179ccb9ed3..a0fa59a1ee 100644
--- a/examples/narrow-band/tnlNarrowBand_CUDA.h
+++ b/examples/narrow-band/tnlNarrowBand_CUDA.h
@@ -190,6 +190,7 @@ __global__ void initCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, doubl
 
 __global__ void initSetupGridCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver);
 __global__ void initSetupGrid2CUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver);
+__global__ void initSetupGrid1_2CUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver);
 __global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, double tau);
 //__global__ void initCUDA(tnlNarrowBand< tnlGrid< 3,double, tnlHost, int >, double, int >* solver);
 #endif
diff --git a/src/operators/godunov-eikonal/parallelGodunovMap2D_impl.h b/src/operators/godunov-eikonal/parallelGodunovMap2D_impl.h
index ea66c98316..1c4a1c4dc3 100644
--- a/src/operators/godunov-eikonal/parallelGodunovMap2D_impl.h
+++ b/src/operators/godunov-eikonal/parallelGodunovMap2D_impl.h
@@ -291,11 +291,11 @@ Real parallelGodunovMapScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Real,
 {
 //	int gid = (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + threadIdx.x;
 
-//	RealType signui;
-//	if(boundaryCondition == 0)
-//		signui = sign(u[cellIndex],/*(boundaryCondition != 0) * */this->epsilon);
-//	else
-	RealType signui = Sign(u[cellIndex]);
+	RealType signui;
+	if(boundaryCondition == 0)
+		signui = sign(u[cellIndex],/*(boundaryCondition != 0) * */this->epsilon);
+	else
+		signui = Sign(u[cellIndex]);
 
 //	RealType value;
 //	if(map[cellIndex] == 0.0)
-- 
GitLab