From 56d3cc6043f4cfee2332a3c2902aae0d1ed37ee4 Mon Sep 17 00:00:00 2001
From: Tomas Sobotik <sobotto4@fjfi.cvut.cz>
Date: Wed, 27 Apr 2016 00:22:57 +0200
Subject: [PATCH] Parallel narrow band working, still needs a bit more
 optimisation.

---
 examples/fast-sweeping/main.h                 |   4 +-
 .../tnlFastSweeping2D_CUDA_v4_impl.h          | 159 ++++---
 .../tnlFastSweeping3D_CUDA_impl.h             |   4 +-
 .../tnlParallelEikonalSolver2D_impl.h         |   2 +-
 .../tnlParallelEikonalSolver3D_impl.h         |   2 +-
 examples/narrow-band/narrowBandConfig.h       |   2 +-
 .../tnlNarrowBand2D_CUDA_v4_impl.h            | 440 ++++++++++--------
 examples/narrow-band/tnlNarrowBand_CUDA.h     |   8 +
 8 files changed, 353 insertions(+), 268 deletions(-)

diff --git a/examples/fast-sweeping/main.h b/examples/fast-sweeping/main.h
index 2732bf9c4e..0acd802358 100644
--- a/examples/fast-sweeping/main.h
+++ b/examples/fast-sweeping/main.h
@@ -17,9 +17,9 @@
 
 #include "MainBuildConfig.h"
 	//for HOST versions:
-#include "tnlFastSweeping.h"
+//#include "tnlFastSweeping.h"
 	//for DEVICE versions:
-//#include "tnlFastSweeping_CUDA.h"
+#include "tnlFastSweeping_CUDA.h"
 #include "fastSweepingConfig.h"
 #include <solvers/tnlBuildConfigTags.h>
 
diff --git a/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v4_impl.h b/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v4_impl.h
index ca6bbb5995..03d9a4f9a9 100644
--- a/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v4_impl.h
+++ b/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v4_impl.h
@@ -180,6 +180,7 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 	Entity.setCoordinates(CoordinatesType(i,j));
 	Entity.refresh();
 	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+
 	Real value = cudaDofVector2[Entity.getIndex()];
 	Real a,b, tmp;
 
@@ -243,7 +244,7 @@ bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 
 
 
-	if(i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y() )
+	if(i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y() && !exactInput)
 	{
 		if(cudaDofVector[Entity.getIndex()] > 0)
 		{
@@ -321,6 +322,12 @@ bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 		}
 
 	}
+	if(exactInput)
+	{
+		if(abs(cudaDofVector[gid]) < 1.5*h)
+			cudaDofVector2[gid] = cudaDofVector[gid];
+	}
+
 
 	return true;
 
@@ -516,14 +523,14 @@ template< typename MeshReal,
           typename Index >
 void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j)
 {
-	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(INT_MAX,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+//	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+//	Entity.setCoordinates(CoordinatesType(i,j));
+//	Entity.refresh();
+//	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+//	cudaDofVector2[Entity.getIndex()]=fabsMin(INT_MAX,cudaDofVector2[Entity.getIndex()]);
+//	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+//	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+//	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -535,14 +542,14 @@ template< typename MeshReal,
           typename Index >
 void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j)
 {
-	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
-	Entity.setCoordinates(CoordinatesType(i,j));
-	Entity.refresh();
-	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-INT_MAX,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+//	tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
+//	Entity.setCoordinates(CoordinatesType(i,j));
+//	Entity.refresh();
+//	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+//	cudaDofVector2[Entity.getIndex()]=fabsMin(-INT_MAX,cudaDofVector2[Entity.getIndex()]);
+//	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+//	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+//	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-INT_MAX,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -573,10 +580,10 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 	s= h/sqrt(a*a+b*b);
 
 
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+	cudaDofVector2[Entity.getIndex()]=INT_MAX;	//fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=-0.5*h;	//fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=0.5*h;	//fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -606,10 +613,10 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 	s= h/sqrt(a*a+b*b);
 
 
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+	cudaDofVector2[Entity.getIndex()]=0.5*h;	//fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=0.5*h;	//fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=INT_MAX;	//fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -639,10 +646,10 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 	s= h/sqrt(a*a+b*b);
 
 
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+	cudaDofVector2[Entity.getIndex()]=0.5*h;	//fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=INT_MAX;	//fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=-0.5*h;	//fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -672,10 +679,10 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 	s= h/sqrt(a*a+b*b);
 
 
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+	cudaDofVector2[Entity.getIndex()]=-0.5*h;	//fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=INT_MAX;	//fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=0.5*h;	//fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -706,10 +713,10 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 	s= h/sqrt(a*a+b*b);
 
 
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+	cudaDofVector2[Entity.getIndex()]=-INT_MAX;	//fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=-0.5*h;	//fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=0.5*h;	//fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=-0.5*h;	//fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -739,10 +746,10 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 	s= h/sqrt(a*a+b*b);
 
 
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+	cudaDofVector2[Entity.getIndex()]=-0.5*h;	//fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=-0.5*h;	//fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=-INT_MAX;	//fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -772,10 +779,10 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 	s= h/sqrt(a*a+b*b);
 
 
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+	cudaDofVector2[Entity.getIndex()]=-0.5*h;	//fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=INT_MAX;	//fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=-0.5*h;	//fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=0.5*h;	//fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -805,10 +812,10 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 	s= h/sqrt(a*a+b*b);
 
 
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+	cudaDofVector2[Entity.getIndex()]=0.5*h;	//fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=-0.5*h;	//fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=-INT_MAX;	//fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=-0.5*h;	//fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -842,10 +849,10 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 	s= h/sqrt(a*a+b*b);
 
 
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+	cudaDofVector2[Entity.getIndex()]=0.5*h;	//fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=-0.5*h;	//fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=-0.5*h;	//fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=0.5*h;	//fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -875,10 +882,10 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 	s= h/sqrt(a*a+b*b);
 
 
-	cudaDofVector2[Entity.getIndex()]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+	cudaDofVector2[Entity.getIndex()]=0.5*h;	//fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=-0.5*h;	//fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=-0.5*h;	//fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -893,10 +900,10 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 	Entity.setCoordinates(CoordinatesType(i,j));
 	Entity.refresh();
 	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(cudaDofVector[Entity.getIndex()],cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+	cudaDofVector2[Entity.getIndex()]=0.5*h;	//fabsMin(cudaDofVector[Entity.getIndex()],cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=-0.5*h;	//fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=0.5*h;	//fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=-0.5*h;	//fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -932,10 +939,10 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 	s= h/sqrt(a*a+b*b);
 
 
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+	cudaDofVector2[Entity.getIndex()]=-0.5*h;	//fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=0.5*h;	//fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=-0.5*h;	//fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -960,15 +967,15 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 			 cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()]));
 
 	a = al-be;
-	b=1.0;
-	c=-be;
-	s= h/sqrt(a*a+b*b);
+	b = 1.0;
+	c = -be;
+	s = h/sqrt(a*a+b*b);
 
 
-	cudaDofVector2[Entity.getIndex()]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+	cudaDofVector2[Entity.getIndex()]=-0.5*h;	//fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=-0.5*h;	//fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=0.5*h;	//fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=0.5*h;	//fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 
 }
 
@@ -983,10 +990,10 @@ void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 	Entity.setCoordinates(CoordinatesType(i,j));
 	Entity.refresh();
 	tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
-	cudaDofVector2[Entity.getIndex()]=fabsMin(cudaDofVector[Entity.getIndex()],cudaDofVector2[Entity.getIndex()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
-	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
+	cudaDofVector2[Entity.getIndex()]=-0.5*h;	//fabsMin(cudaDofVector[Entity.getIndex()],cudaDofVector2[Entity.getIndex()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]=0.5*h;	//fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]=-0.5*h;	//fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  1 >()]);
+	cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]=0.5*h;	//fabsMin(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()],cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()]);
 }
 #endif
 
diff --git a/examples/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h b/examples/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h
index dcc3e4e528..7d6f9b4700 100644
--- a/examples/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h
+++ b/examples/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h
@@ -227,8 +227,8 @@ bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > ::
 	Entity.refresh();
 	int gid = Entity.getIndex();
 
-	if(abs(cudaDofVector[gid]) < 1.8*h)
-		cudaDofVector2[gid] = cudaDofVector[gid];
+	if(abs(cudaDofVector[gid]) < 1.0*h)
+		cudaDofVector2[gid] = 0.5*h;//cudaDofVector[gid];
 	else
 		cudaDofVector2[gid] = INT_MAX*Sign(cudaDofVector[gid]);
 
diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h
index 1a0869ab66..9c12d66666 100644
--- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h
+++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h
@@ -25,7 +25,7 @@ template< typename SchemeHost, typename SchemeDevice, typename Device>
 tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver()
 {
 	cout << "a" << endl;
-	this->device = tnlHostDevice;  /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice   ---    vypocet na CPU
+	this->device = tnlCudaDevice;  /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice   ---    vypocet na CPU
 
 #ifdef HAVE_CUDA
 	if(this->device == tnlCudaDevice)
diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
index f8cc71933e..4b3b37306e 100644
--- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
+++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
@@ -25,7 +25,7 @@ template< typename SchemeHost, typename SchemeDevice, typename Device>
 tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver()
 {
 	cout << "a" << endl;
-	this->device = tnlCudaDevice;  /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice   ---    vypocet na CPU
+	this->device = tnlHostDevice;  /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice   ---    vypocet na CPU
 
 #ifdef HAVE_CUDA
 	if(this->device == tnlCudaDevice)
diff --git a/examples/narrow-band/narrowBandConfig.h b/examples/narrow-band/narrowBandConfig.h
index 9ccc1e8cb8..a4c5aee3f9 100644
--- a/examples/narrow-band/narrowBandConfig.h
+++ b/examples/narrow-band/narrowBandConfig.h
@@ -26,7 +26,7 @@ class narrowBandConfig
    public:
       static void configSetup( tnlConfigDescription& config )
       {
-         config.addDelimiter( "Parallel Eikonal solver settings:" );
+         config.addDelimiter( "Narrow Band Solver solver settings:" );
          config.addEntry        < tnlString > ( "problem-name", "This defines particular problem.", "fast-sweeping" );
          config.addRequiredEntry        < tnlString > ( "initial-condition", "Initial condition for solver");
          config.addRequiredEntry        < int > ( "dim", "Dimension of problem.");
diff --git a/examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h b/examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
index 320b02f222..bc5b040508 100644
--- a/examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
+++ b/examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h
@@ -45,6 +45,37 @@ double atomicFabsMin(double* address, double val)
 }
 
 
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+#ifdef HAVE_CUDA
+   __device__ __host__
+#endif
+Real tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >:: positivePart(const Real arg) const
+{
+	if(arg > 0.0)
+		return arg;
+	return 0.0;
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+#ifdef HAVE_CUDA
+   __device__ __host__
+#endif
+Real  tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: negativePart(const Real arg) const
+{
+	if(arg < 0.0)
+		return -arg;
+	return 0.0;
+}
+
 template< typename MeshReal,
           typename Device,
           typename MeshIndex,
@@ -116,7 +147,7 @@ bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: i
 	cudaMalloc(&(cudaDofVector2), this->dofVector.getData().getSize()*sizeof(double));
 	cudaMemcpy(cudaDofVector2, this->dofVector.getData().getData(), this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyHostToDevice);
 
-	cudaMalloc(&(cudaStatusVector),  statusGridSize*statusGridSize* sizeof(int));
+	cudaMalloc(&(cudaStatusVector),  statusGridSize*statusGridSize*sizeof(int));
 //	cudaMemcpy(cudaDofVector, this->dofVector.getData().getData(),  statusGridSize*statusGridSize* sizeof(int)), cudaMemcpyHostToDevice);
 
 	cudaMalloc(&reinitialize, sizeof(int));
@@ -125,12 +156,15 @@ bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: i
 	cudaMalloc(&(this->cudaSolver), sizeof(tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >));
 	cudaMemcpy(this->cudaSolver, this,sizeof(tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
 
+
+	cudaDeviceSynchronize();
+	checkCudaDevice;
 #endif
 
 	int n = Mesh.getDimensions().x();
 
 	dim3 threadsPerBlock2(NARROWBAND_SUBGRID_SIZE, NARROWBAND_SUBGRID_SIZE);
-	dim3 numBlocks2(((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE) ,((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE));
+	dim3 numBlocks2(statusGridSize ,statusGridSize);
 	initSetupGridCUDA<<<numBlocks2,threadsPerBlock2>>>(this->cudaSolver);
 	cudaDeviceSynchronize();
 	checkCudaDevice;
@@ -139,14 +173,14 @@ bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: i
 	checkCudaDevice;
 
 
-	dim3 threadsPerBlock(16, 16);
-	dim3 numBlocks(n/16 + 1 ,n/16 +1);
-	initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
+	/*dim3 threadsPerBlock(16, 16);
+	dim3 numBlocks(n/16 + 1 ,n/16 +1);*/
+	initCUDA<<<numBlocks2,threadsPerBlock2>>>(this->cudaSolver);
 	cudaDeviceSynchronize();
 	checkCudaDevice;
 
 
-
+	cout << "Solver initialized." << endl;
 	return true;
 }
 
@@ -171,9 +205,11 @@ bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: r
 	double time = 0.0;
 	int reinit = 0;
 
+	cout << "Hi!" << endl;
 	runCUDA<<<numBlocksFS,threadsPerBlockFS>>>(this->cudaSolver,0,0);
 	cudaDeviceSynchronize();
 	checkCudaDevice;
+	cout << "Hi2!" << endl;
 	while(time < finalTime)
 	{
 		if(tau+time > finalTime)
@@ -187,12 +223,19 @@ bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: r
 
 
 		cudaMemcpy(&reinit, this->reinitialize, sizeof(int), cudaMemcpyDeviceToHost);
-		if(reinit != 0)
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+		if(reinit != 0 && time != finalTime )
 		{
+			cout << time << endl;
+
 			initSetupGridCUDA<<<numBlocksNB,threadsPerBlockNB>>>(this->cudaSolver);
 			cudaDeviceSynchronize();
 			checkCudaDevice;
-			initSetupGridCUDA<<<numBlocksNB,1>>>(this->cudaSolver);
+			initSetupGrid2CUDA<<<numBlocksNB,1>>>(this->cudaSolver);
+			cudaDeviceSynchronize();
+			checkCudaDevice;
+			initCUDA<<<numBlocksNB,threadsPerBlockNB>>>(this->cudaSolver);
 			cudaDeviceSynchronize();
 			checkCudaDevice;
 			runCUDA<<<numBlocksFS,threadsPerBlockFS>>>(this->cudaSolver,0,0);
@@ -228,7 +271,10 @@ template< typename MeshReal,
 __device__
 void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
 {
-	if(cudaStatusVector[i/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE)] != 0)
+	//			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
+	//								8 - to the east of curve, 	16 - to the west of curve.
+	int subgridID = i/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE);
+	/*if(cudaStatusVector[subgridID] != 0)*/
 	{
 		tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(Mesh);
 		Entity.setCoordinates(CoordinatesType(i,j));
@@ -237,9 +283,9 @@ void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: u
 		Real value = cudaDofVector2[Entity.getIndex()];
 		Real a,b, tmp;
 
-		if( i == 0 )
+		if( i == 0 /*|| (i/NARROWBAND_SUBGRID_SIZE == 0 && !(cudaStatusVector[subgridID] & 9)) */)
 			a = cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()];
-		else if( i == Mesh.getDimensions().x() - 1 )
+		else if( i == Mesh.getDimensions().x() - 1 /*|| (i/NARROWBAND_SUBGRID_SIZE == NARROWBAND_SUBGRID_SIZE - 1 && !(cudaStatusVector[subgridID] & 17)) */)
 			a = cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()];
 		else
 		{
@@ -247,9 +293,9 @@ void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: u
 					 cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()] );
 		}
 
-		if( j == 0 )
+		if( j == 0 /*|| (j/NARROWBAND_SUBGRID_SIZE == 0 && !(cudaStatusVector[subgridID] & 3)) */)
 			b = cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()];
-		else if( j == Mesh.getDimensions().y() - 1 )
+		else if( j == Mesh.getDimensions().y() - 1  /*|| (j/NARROWBAND_SUBGRID_SIZE == NARROWBAND_SUBGRID_SIZE - 1 && !(cudaStatusVector[subgridID] & 5)) */)
 			b = cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  -1 >()];
 		else
 		{
@@ -270,6 +316,24 @@ void tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: u
 }
 
 
+__global__ void initCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver)
+{
+
+
+	int gx = threadIdx.x + blockDim.x*blockIdx.x;
+	int gy = blockDim.y*blockIdx.y + threadIdx.y;
+
+
+	if(solver->Mesh.getDimensions().x() > gx  && solver->Mesh.getDimensions().y() > gy)
+	{
+		solver->initGrid();
+	}
+
+
+}
+
+
+
 template< typename MeshReal,
           typename Device,
           typename MeshIndex,
@@ -289,94 +353,120 @@ bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: i
 
 	int gid = Entity.getIndex();
 
-	cudaDofVector2[gid] = INT_MAX*Sign(cudaDofVector[gid]);
-//
-//	if(abs(cudaDofVector[gid]) < 1.01*h)
-//		cudaDofVector2[gid] = cudaDofVector[gid];
-
+	cudaDofVector2[gid] = INT_MAX*Sign(cudaDofVector2[gid]);
 
-
-
-
-	if(i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y() )
+	if (i >0 && j > 0 && i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y())
 	{
-		if(cudaDofVector[Entity.getIndex()] > 0)
+		if(cudaDofVector2[gid]*cudaDofVector2[gid+1] <= 0 )
 		{
-			if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()] > 0)
-			{
-				if(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()] > 0)
-				{
-					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare1111(i,j);
-					else
-						setupSquare1110(i,j);
-				}
-				else
-				{
-					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare1101(i,j);
-					else
-						setupSquare1100(i,j);
-				}
-			}
-			else
-			{
-				if(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()] > 0)
-				{
-					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare1011(i,j);
-					else
-						setupSquare1010(i,j);
-				}
-				else
-				{
-					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare1001(i,j);
-					else
-						setupSquare1000(i,j);
-				}
-			}
+			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
+			cudaDofVector2[gid+1] = Sign(cudaDofVector2[gid+1])*0.5*h;
 		}
-		else
+		if( cudaDofVector2[gid]*cudaDofVector2[gid+Mesh.getDimensions().x()] <= 0 )
 		{
-			if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()] > 0)
-			{
-				if(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()] > 0)
-				{
-					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare0111(i,j);
-					else
-						setupSquare0110(i,j);
-				}
-				else
-				{
-					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare0101(i,j);
-					else
-						setupSquare0100(i,j);
-				}
-			}
-			else
-			{
-				if(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()] > 0)
-				{
-					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare0011(i,j);
-					else
-						setupSquare0010(i,j);
-				}
-				else
-				{
-					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
-						setupSquare0001(i,j);
-					else
-						setupSquare0000(i,j);
-				}
-			}
+			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
+			cudaDofVector2[gid+Mesh.getDimensions().x()] = Sign(cudaDofVector2[gid+Mesh.getDimensions().x()])*0.5*h;
 		}
 
+		if(cudaDofVector2[gid]*cudaDofVector2[gid-1] <= 0 )
+		{
+			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
+			cudaDofVector2[gid-1] = Sign(cudaDofVector2[gid-1])*0.5*h;
+		}
+		if( cudaDofVector2[gid]*cudaDofVector2[gid-Mesh.getDimensions().x()] <= 0 )
+		{
+			cudaDofVector2[gid] = Sign(cudaDofVector2[gid])*0.5*h;
+			cudaDofVector2[gid-Mesh.getDimensions().x()] = Sign(cudaDofVector2[gid-Mesh.getDimensions().x()])*0.5*h;
+		}
 	}
 
+
+//
+
+
+
+
+
+
+//	if(i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y() )
+//	{
+//		if(cudaDofVector[Entity.getIndex()] > 0)
+//		{
+//			if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()] > 0)
+//			{
+//				if(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()] > 0)
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare1111(i,j);
+//					else
+//						setupSquare1110(i,j);
+//				}
+//				else
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare1101(i,j);
+//					else
+//						setupSquare1100(i,j);
+//				}
+//			}
+//			else
+//			{
+//				if(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()] > 0)
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare1011(i,j);
+//					else
+//						setupSquare1010(i,j);
+//				}
+//				else
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare1001(i,j);
+//					else
+//						setupSquare1000(i,j);
+//				}
+//			}
+//		}
+//		else
+//		{
+//			if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  0 >()] > 0)
+//			{
+//				if(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()] > 0)
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare0111(i,j);
+//					else
+//						setupSquare0110(i,j);
+//				}
+//				else
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare0101(i,j);
+//					else
+//						setupSquare0100(i,j);
+//				}
+//			}
+//			else
+//			{
+//				if(cudaDofVector[neighbourEntities.template getEntityIndex< 0,  1 >()] > 0)
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare0011(i,j);
+//					else
+//						setupSquare0010(i,j);
+//				}
+//				else
+//				{
+//					if(cudaDofVector[neighbourEntities.template getEntityIndex< 1,  1 >()] > 0)
+//						setupSquare0001(i,j);
+//					else
+//						setupSquare0000(i,j);
+//				}
+//			}
+//		}
+//
+//	}
+
 	return true;
 
 }
@@ -506,51 +596,44 @@ __global__ void runCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double
 		}
 	}
 
-
-
-
-
 }
 
 
-__global__ void initCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver)
-{
 
 
+__global__ void initSetupGridCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver)
+{
+	__shared__ double u0;
 	int gx = threadIdx.x + blockDim.x*blockIdx.x;
 	int gy = blockDim.y*blockIdx.y + threadIdx.y;
 
-
 	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy)
 	{
-		solver->initGrid();
-	}
-
-
-}
-
 
+//		printf("Hello from  block = %d, thread = %d, x = %d, y = %d\n", blockIdx.x + gridDim.x*blockIdx.y,(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x, threadIdx.x, threadIdx.y);
+		if(threadIdx.x+threadIdx.y == 0)
+		{
+//			printf("Hello from  block = %d, thread = %d, x = %d, y = %d\n", blockIdx.x + gridDim.x*blockIdx.y,(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x, threadIdx.x, threadIdx.y);
 
+			if(blockIdx.x+blockIdx.y == 0)
+				*(solver->reinitialize) = 0;
 
+			solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y] = 0;
 
-__global__ void initSetupGridCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver)
-{
-	__shared__ double u0;
-	if(threadIdx.x+threadIdx.y == 0)
-	{
-		if(blockIdx.x+blockIdx.y == 0)
-			*(solver->reinitialize) = 0;
+			u0 = solver->cudaDofVector2[(blockDim.y*blockIdx.y + 0)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + 0];
+		}
+		__syncthreads();
 
-		solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y] = 0;
+		double u = solver->cudaDofVector2[(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x];
 
-		u0 = solver->cudaDofVector2[(blockDim.y*blockIdx.y + 0)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + 0];
+		if(u*u0 <=0.0)
+			atomicMax(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y]),1);
 	}
-	__syncthreads();
+//	if(threadIdx.x+threadIdx.y == 0)
+
+//	printf("Bye from  block = %d, thread = %d, x = %d, y = %d\n", blockIdx.x + gridDim.x*blockIdx.y,(blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x() + blockDim.x*blockIdx.x + threadIdx.x, threadIdx.x, threadIdx.y);
 
-	double u = solver->cudaDofVector2[(blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + threadIdx.x];
 
-	if(u*u0 <=0.0)
-		atomicMax(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y]),1);
 }
 
 
@@ -558,6 +641,7 @@ __global__ void initSetupGridCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int
 // run this with one thread per block
 __global__ void initSetupGrid2CUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver)
 {
+//	printf("Hello\n");
 	if(solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y] == 1)
 	{
 //			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
@@ -574,6 +658,8 @@ __global__ void initSetupGrid2CUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, in
 			if(blockIdx.y < gridDim.y - 1)
 				atomicAdd(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*(blockIdx.y + 1)]), 2);
 	}
+
+
 }
 
 
@@ -582,88 +668,72 @@ __global__ void initSetupGrid2CUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, in
 
 __global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, double tau)
 {
-	int gid = (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + threadIdx.x;
+	int gid = (blockDim.y*blockIdx.y + threadIdx.y)*solver->Mesh.getDimensions().x()+ threadIdx.x;
 	int i = threadIdx.x + blockIdx.x*blockDim.x;
 	int j = threadIdx.y + blockIdx.y*blockDim.y;
 
-	int blockID = blockIdx.x + blockIdx.y*gridDim.x;
+//	if(i+j == 0)
+//		printf("Hello\n");
+
+	int blockID = blockIdx.x + blockIdx.y*gridDim.x; /*i/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE);*/
 
+	int status = solver->cudaStatusVector[blockID];
 
-	if(solver->cudaStatusVector[blockID/*i/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE)*/] != 0)
+	if(solver->Mesh.getDimensions().x() > i && solver->Mesh.getDimensions().y() > j)
 	{
-		tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(solver->Mesh);
-		Entity.setCoordinates(tnlStaticVector<2,double>(i,j));
-		Entity.refresh();
-		tnlNeighbourGridEntityGetter<tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
-		double value = solver->cudaDofVector2[Entity.getIndex()];
-		double xf,xb,yf,yb, tmp, fu;
-
-		if( i == 0 )
-			yb = yf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()];
-		else if( i == solver->Mesh.getDimensions().x() - 1 )
-			yb = yf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()];
-		else
-		{
-			yb =  solver->cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()];
-			yf = solver-> cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()];
-		}
 
-		if( j == 0 )
-			xb = xf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()];
-		else if( j == solver->Mesh.getDimensions().y() - 1 )
-			xb = xf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0, -1 >()];
-		else
+		if(status != 0)
 		{
-			xb =  solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0, -1 >()];
-			xf = solver-> cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()];
-		}
-
+			tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(solver->Mesh);
+			Entity.setCoordinates(tnlStaticVector<2,double>(i,j));
+			Entity.refresh();
+			tnlNeighbourGridEntityGetter<tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
+			double value = solver->cudaDofVector2[Entity.getIndex()];
+			double xf,xb,yf,yb, grad, fu, a,b;
+
+			if( i == 0 || (threadIdx.x == 0 && !(status & 9)) )
+				yb = yf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()] - value;
+			else if( i == solver->Mesh.getDimensions().x() - 1 || (threadIdx.x == blockDim.x - 1 && !(status & 17)) )
+				yb = yf = value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()];
+			else
+			{
+				yb =  value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< -1,  0 >()];
+				yf = solver-> cudaDofVector2[neighbourEntities.template getEntityIndex< 1,  0 >()] - value;
+			}
 
-		tmp = sqrt(0.5 * (xf*xf + xb*xb    +   yf*yf + yb*yb ) );
+			if( j == 0 || (threadIdx.y == 0 && !(status & 3)) )
+				xb = xf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()] - value;
+			else if( j == solver->Mesh.getDimensions().y() - 1  || (threadIdx.y == blockDim.y - 1 && !(status & 5)) )
+				xb = xf = value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0, -1 >()];
+			else
+			{
+				xb =  value - solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0, -1 >()];
+				xf = solver-> cudaDofVector2[neighbourEntities.template getEntityIndex< 0,  1 >()] - value;
+			}
 
-		fu = 1.0 * tmp;
 
-		if((tau*fu+value)*value <=0 )
-		{
-			int status = solver->cudaStatusVector[blockID];
-//			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
-//								8 - to the east of curve, 	16 - to the west of curve.
 
-			if(threadIdx.x == 0 && !(status & 8) && (blockIdx.x > 0) )
-				atomicMax(solver->reinitialize,1);
-			else if(threadIdx.x == blockDim.x-1 && !(status & 16) && (blockIdx.x < gridDim.x - 1) )
-				atomicMax(solver->reinitialize,1);
-			else if(threadIdx.y == 0 && !(status & 2) && (blockIdx.y > 0) )
-				atomicMax(solver->reinitialize,1);
-			else if(threadIdx.y == blockDim.y-1 && !(status & 4) && (blockIdx.y < gridDim.y - 1) )
-				atomicMax(solver->reinitialize,1);
+			grad = sqrt(0.5 * (xf*xf + xb*xb    +   yf*yf + yb*yb ) )*solver->Mesh.template getSpaceStepsProducts< -1, 0 >();
 
-//			if(blockIdx.x < gridDim.x - 1)
-//			{
-//				atomicMax( &cudaStatusVector[(i+2)/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE)]  ,1);
-//				if(blockIdx.y < gridDim.y -1)
-//					atomicMax( &cudaStatusVector[(i+2)/NARROWBAND_SUBGRID_SIZE + (j+2/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE)]  ,1);
-//				if(blockIdx.y > 0)
-//					atomicMax( &cudaStatusVector[(i+2)/NARROWBAND_SUBGRID_SIZE + (j-2/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE)]  ,1);
-//
-//			}
-//			if(blockIdx.x > 0)
-//			{
-//				atomicMax( &cudaStatusVector[(i-2)/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE)]  ,1);
-//				if(blockIdx.y < gridDim.y -1)
-//					atomicMax( &cudaStatusVector[(i-2)/NARROWBAND_SUBGRID_SIZE + (j+2/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE)]  ,1);
-//				if(blockIdx.y > 0)
-//					atomicMax( &cudaStatusVector[(i-2)/NARROWBAND_SUBGRID_SIZE + (j-2/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE)]  ,1);
-//
-//			}
-//			if(blockIdx.y < gridDim.y -1)
-//				atomicMax( &cudaStatusVector[i/NARROWBAND_SUBGRID_SIZE + ((j+2)/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE)]  ,1);
-//			if(blockIdx.y > 0)
-//				atomicMax( &cudaStatusVector[i/NARROWBAND_SUBGRID_SIZE + ((j-2)/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE)]  ,1);
-		}
+			fu = -1.0 * grad;
 
+			if((tau*fu+value)*value <=0 )
+			{
+				//			1 - with curve,  	2 - to the north of curve, 	4  - to the south of curve,
+				//								8 - to the east of curve, 	16 - to the west of curve.
+
+				if((threadIdx.x == 1 && !(status & 9)) && (blockIdx.x > 0) )
+					atomicMax(solver->reinitialize,1);
+				else if((threadIdx.x == blockDim.x - 2 && !(status & 17)) && (blockIdx.x < gridDim.x - 1) )
+					atomicMax(solver->reinitialize,1);
+				else if((threadIdx.y == 1 && !(status & 3)) && (blockIdx.y > 0) )
+					atomicMax(solver->reinitialize,1);
+				else if((threadIdx.y == blockDim.y - 2 && !(status & 5)) && (blockIdx.y < gridDim.y - 1) )
+					atomicMax(solver->reinitialize,1);
+			}
 
-		solver->cudaDofVector2[Entity.getIndex()]  = value+tau*fu;
+			solver->cudaDofVector2[Entity.getIndex()]  = value+tau*fu;
+		}
 	}
 }
 
diff --git a/examples/narrow-band/tnlNarrowBand_CUDA.h b/examples/narrow-band/tnlNarrowBand_CUDA.h
index 6a4cd5fd49..179ccb9ed3 100644
--- a/examples/narrow-band/tnlNarrowBand_CUDA.h
+++ b/examples/narrow-band/tnlNarrowBand_CUDA.h
@@ -62,6 +62,14 @@ public:
 	__host__ static tnlString getType();
 	__host__ bool init( const tnlParameterContainer& parameters );
 	__host__ bool run();
+#ifdef HAVE_CUDA
+   __device__ __host__
+#endif
+	RealType positivePart(const RealType arg) const;
+#ifdef HAVE_CUDA
+   __device__ __host__
+#endif
+	RealType negativePart(const RealType arg) const;
 
 #ifdef HAVE_CUDA
 	__device__ bool initGrid();
-- 
GitLab