diff --git a/examples/fast-sweeping/main.h b/examples/fast-sweeping/main.h
index 75850bbf6fb26033017272aad22a0dea3cff8499..aabaf684ca797a09f6c37c6fffc864dc000cc93e 100644
--- a/examples/fast-sweeping/main.h
+++ b/examples/fast-sweeping/main.h
@@ -17,9 +17,9 @@
 
 #include "MainBuildConfig.h"
 	//for HOST versions:
-//#include "tnlFastSweeping.h"
+#include "tnlFastSweeping.h"
 	//for DEVICE versions:
-#include "tnlFastSweeping_CUDA.h"
+//#include "tnlFastSweeping_CUDA.h"
 #include "fastSweepingConfig.h"
 #include <solvers/tnlConfigTags.h>
 
@@ -43,7 +43,7 @@ int main( int argc, char* argv[] )
    if( ! parseCommandLine( argc, argv, configDescription, parameters ) )
       return false;
 
-    tnlFastSweeping<tnlGrid<2,double,tnlHost, int>, double, int> solver;
+    tnlFastSweeping<tnlGrid<3,double,tnlHost, int>, double, int> solver;
     if(!solver.init(parameters))
    {
     	cerr << "Solver failed to initialize." << endl;
diff --git a/examples/fast-sweeping/tnlFastSweeping.h b/examples/fast-sweeping/tnlFastSweeping.h
index 5f0871b764b080ba34c06ee53a79a540a6989fc4..c13a28c710785884839fb2f4f9899bbe04d60875 100644
--- a/examples/fast-sweeping/tnlFastSweeping.h
+++ b/examples/fast-sweeping/tnlFastSweeping.h
@@ -107,9 +107,87 @@ protected:
 
 };
 
+
+
+
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+class tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >
+{
+
+public:
+	typedef Real RealType;
+	typedef Device DeviceType;
+	typedef Index IndexType;
+	typedef tnlGrid< 3, Real, Device, Index > MeshType;
+	typedef tnlVector< RealType, DeviceType, IndexType> DofVectorType;
+	typedef typename MeshType::CoordinatesType CoordinatesType;
+
+
+
+	static tnlString getType();
+	bool init( const tnlParameterContainer& parameters );
+
+	bool initGrid();
+	bool run();
+
+	//for single core version use this implementation:
+	void updateValue(const Index i, const Index j, const Index k);
+	//for parallel version use this one instead:
+//	void updateValue(const Index i, const Index j, DofVectorType* grid);
+
+
+	void setupSquare1000(Index i, Index j);
+	void setupSquare1100(Index i, Index j);
+	void setupSquare1010(Index i, Index j);
+	void setupSquare1001(Index i, Index j);
+	void setupSquare1110(Index i, Index j);
+	void setupSquare1101(Index i, Index j);
+	void setupSquare1011(Index i, Index j);
+	void setupSquare1111(Index i, Index j);
+	void setupSquare0000(Index i, Index j);
+	void setupSquare0100(Index i, Index j);
+	void setupSquare0010(Index i, Index j);
+	void setupSquare0001(Index i, Index j);
+	void setupSquare0110(Index i, Index j);
+	void setupSquare0101(Index i, Index j);
+	void setupSquare0011(Index i, Index j);
+	void setupSquare0111(Index i, Index j);
+
+	Real fabsMin(const Real x, const Real y);
+
+
+protected:
+
+	MeshType Mesh;
+
+	bool exactInput;
+
+	DofVectorType dofVector, dofVector2;
+
+	RealType h;
+
+#ifdef HAVE_OPENMP
+//	omp_lock_t* gridLock;
+#endif
+
+
+};
+
+
 	//for single core version use this implementation:
 #include "tnlFastSweeping2D_impl.h"
 	//for parallel version use this one instead:
 // #include "tnlFastSweeping2D_openMP_impl.h"
 
+#include "tnlFastSweeping3D_impl.h"
+
 #endif /* TNLFASTSWEEPING_H_ */
diff --git a/examples/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h b/examples/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3f8cc216a0361b57d1ebcaca032d75deba5e0ec
--- /dev/null
+++ b/examples/fast-sweeping/tnlFastSweeping3D_CUDA_impl.h
@@ -0,0 +1,1253 @@
+/***************************************************************************
+                          tnlFastSweeping2D_CUDA_v4_impl.h  -  description
+                             -------------------
+    begin                : Oct 15 , 2015
+    copyright            : (C) 2015 by Tomas Sobotik
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef TNLFASTSWEEPING3D_IMPL_H_
+#define TNLFASTSWEEPING3D_IMPL_H_
+
+#include "tnlFastSweeping.h"
+
+//__device__
+//double fabsMin( double x, double y)
+//{
+//	double fx = abs(x);
+//
+//	if(Min(fx,abs(y)) == fx)
+//		return x;
+//	else
+//		return y;
+//}
+//
+//__device__
+//double atomicFabsMin(double* address, double val)
+//{
+//	unsigned long long int* address_as_ull =
+//						  (unsigned long long int*)address;
+//	unsigned long long int old = *address_as_ull, assumed;
+//	do {
+//		assumed = old;
+//			old = atomicCAS(address_as_ull, assumed,__double_as_longlong( fabsMin(assumed,val) ));
+//	} while (assumed != old);
+//	return __longlong_as_double(old);
+//}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+tnlString tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
+{
+	   return tnlString( "tnlFastSweeping< " ) +
+	          MeshType::getType() + ", " +
+	          ::getType< Real >() + ", " +
+	          ::getType< Index >() + " >";
+}
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: init( const tnlParameterContainer& parameters )
+{
+	const tnlString& meshFile = parameters.getParameter< tnlString >( "mesh" );
+
+	if( ! Mesh.load( meshFile ) )
+	{
+		   cerr << "I am not able to load the mesh from the file " << meshFile << "." << endl;
+		   return false;
+	}
+
+
+	const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition");
+	if( ! dofVector.load( initialCondition ) )
+	{
+		   cerr << "I am not able to load the initial condition from the file " << meshFile << "." << endl;
+		   return false;
+	}
+
+	h = Mesh.getHx();
+	counter = 0;
+
+	const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" );
+
+	if(exact_input == "no")
+		exactInput=false;
+	else
+		exactInput=true;
+
+
+#ifdef HAVE_CUDA
+
+	cudaMalloc(&(cudaDofVector), this->dofVector.getSize()*sizeof(double));
+	cudaMemcpy(cudaDofVector, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&(cudaDofVector2), this->dofVector.getSize()*sizeof(double));
+	cudaMemcpy(cudaDofVector2, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
+
+
+	cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >));
+	cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
+
+#endif
+
+	int n = Mesh.getDimensions().x();
+	dim3 threadsPerBlock(8, 8,8);
+	dim3 numBlocks(n/8 + 1, n/8 +1, n/8 +1);
+
+	initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+
+	return true;
+}
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: run()
+{
+//
+//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+//	{
+//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
+//	{
+//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
+//	{
+//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+//	{
+//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//
+//	dofVector.save("u-00001.tnl");
+
+	int n = Mesh.getDimensions().x();
+	dim3 threadsPerBlock(1, 512);
+	dim3 numBlocks(8,1);
+
+
+	runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,0,0);
+
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+
+	cudaMemcpy(this->dofVector.getData(), cudaDofVector2, this->dofVector.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
+	cudaDeviceSynchronize();
+	cudaFree(cudaDofVector);
+	cudaFree(cudaDofVector2);
+	cudaFree(cudaSolver);
+	dofVector.save("u-00001.tnl");
+	cudaDeviceSynchronize();
+	return true;
+}
+
+
+
+
+#ifdef HAVE_CUDA
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j, Index k)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j,k));
+	Real value = cudaDofVector2[index];
+	Real a,b,c, tmp;
+
+	if( i == 0 )
+		a = cudaDofVector2[Mesh.template getCellNextToCell<1,0,0>(index)];
+	else if( i == Mesh.getDimensions().x() - 1 )
+		a = cudaDofVector2[Mesh.template getCellNextToCell<-1,0,0>(index)];
+	else
+	{
+		a = fabsMin( cudaDofVector2[Mesh.template getCellNextToCell<-1,0,0>(index)],
+				 cudaDofVector2[Mesh.template getCellNextToCell<1,0,0>(index)] );
+	}
+
+	if( j == 0 )
+		b = cudaDofVector2[Mesh.template getCellNextToCell<0,1,0>(index)];
+	else if( j == Mesh.getDimensions().y() - 1 )
+		b = cudaDofVector2[Mesh.template getCellNextToCell<0,-1,0>(index)];
+	else
+	{
+		b = fabsMin( cudaDofVector2[Mesh.template getCellNextToCell<0,-1,0>(index)],
+				 cudaDofVector2[Mesh.template getCellNextToCell<0,1,0>(index)] );
+	}
+
+
+	if( k == 0 )
+		c = cudaDofVector2[Mesh.template getCellNextToCell<0,0,1>(index)];
+	else if( k == Mesh.getDimensions().z() - 1 )
+		c = cudaDofVector2[Mesh.template getCellNextToCell<0,0,-1>(index)];
+	else
+	{
+		c = fabsMin( cudaDofVector2[Mesh.template getCellNextToCell<0,0,-1>(index)],
+				 cudaDofVector2[Mesh.template getCellNextToCell<0,0,1>(index)] );
+	}
+
+
+	Real hD= 2.0*(a*a+b*b+c*c-a*b-a*c-b*c);
+
+	if(hD >= 3.0*h*h)
+		tmp = fabsMin(a,fabsMin(b,c)) + Sign(value)*h;
+	else
+		tmp = (1.0/3.0) * ( a + b + c + Sign(value)*sqrt(3.0*h*h-hD) );
+
+
+
+//	cudaDofVector2[index]  = fabsMin(value, tmp);
+	atomicFabsMin(&cudaDofVector2[index],tmp);
+
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
+{
+	int i = threadIdx.x + blockDim.x*blockIdx.x;
+	int j = blockDim.y*blockIdx.y + threadIdx.y;
+	int k = blockDim.z*blockIdx.z + threadIdx.z;
+	int gid = Mesh.getCellIndex(CoordinatesType(i,j,k));
+
+	cudaDofVector2[gid] = INT_MAX*Sign(cudaDofVector[gid]);
+	if(abs(cudaDofVector[gid]) < 1.8*this->h)
+		cudaDofVector2[gid] = cudaDofVector[gid];
+//	if(i+1 < Mesh.getDimensions().x() && j+1 < Mesh.getDimensions().y() )
+//	{
+//		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] > 0)
+//		{
+//			if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))] > 0)
+//			{
+//				if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0)
+//				{
+//					if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//						setupSquare1111(i,j);
+//					else
+//						setupSquare1110(i,j);
+//				}
+//				else
+//				{
+//					if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//						setupSquare1101(i,j);
+//					else
+//						setupSquare1100(i,j);
+//				}
+//			}
+//			else
+//			{
+//				if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0)
+//				{
+//					if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//						setupSquare1011(i,j);
+//					else
+//						setupSquare1010(i,j);
+//				}
+//				else
+//				{
+//					if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//						setupSquare1001(i,j);
+//					else
+//						setupSquare1000(i,j);
+//				}
+//			}
+//		}
+//		else
+//		{
+//			if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))] > 0)
+//			{
+//				if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0)
+//				{
+//					if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//						setupSquare0111(i,j);
+//					else
+//						setupSquare0110(i,j);
+//				}
+//				else
+//				{
+//					if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//						setupSquare0101(i,j);
+//					else
+//						setupSquare0100(i,j);
+//				}
+//			}
+//			else
+//			{
+//				if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0)
+//				{
+//					if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//						setupSquare0011(i,j);
+//					else
+//						setupSquare0010(i,j);
+//				}
+//				else
+//				{
+//					if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//						setupSquare0001(i,j);
+//					else
+//						setupSquare0000(i,j);
+//				}
+//			}
+//		}
+//
+//	}
+
+	return true;
+//
+//	int total = blockDim.x*gridDim.x;
+//
+//
+//
+//	Real tmp = 0.0;
+//	int flag = 0;
+//	counter = 0;
+//	tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+//
+//
+//	if(!exactInput)
+//	{
+//		cudaDofVector[gid]=cudaDofVector[gid]=0.5*h*Sign(cudaDofVector[gid]);
+//	}
+//	__threadfence();
+////	printf("-----------------------------------------------------------------------------------\n");
+//
+//	__threadfence();
+//
+//	if(gx > 0 && gx < Mesh.getDimensions().x()-1)
+//	{
+//		if(gy > 0 && gy < Mesh.getDimensions().y()-1)
+//		{
+//
+//			Index j = gy;
+//			Index i = gx;
+////			 tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+//
+//			if(tmp == 0.0)
+//			{}
+//			else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+//					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+//					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+//					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+//			{}
+//			else
+//				flag=1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+//		}
+//	}
+//
+////	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
+////	printf("****************************************************************\n");
+////	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
+//	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == 0)
+//	{
+////		printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
+//		Index j = 0;
+//		Index i = gx;
+////		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+//
+//
+//		if(tmp == 0.0)
+//		{}
+//		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+//				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+//				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 )
+//		{}
+//		else
+//			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+//	}
+//
+////	printf("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n");
+//	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == Mesh.getDimensions().y() - 1)
+//	{
+//		Index i = gx;
+//		Index j = Mesh.getDimensions().y() - 1;
+////		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+//
+//
+//		if(tmp == 0.0)
+//		{}
+//		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+//				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+//				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+//		{}
+//		else
+//			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+//	}
+//
+////	printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+//	if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == 0)
+//	{
+//		Index j = gy;
+//		Index i = 0;
+////		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+//
+//
+//		if(tmp == 0.0)
+//		{}
+//		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+//				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+//				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+//		{}
+//		else
+//			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+//	}
+////	printf("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n");
+//	if(gy > 0 && gy < Mesh.getDimensions().y()-1  && gx == Mesh.getDimensions().x() - 1)
+//	{
+//		Index j = gy;
+//		Index i = Mesh.getDimensions().x() - 1;
+////		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+//
+//
+//		if(tmp == 0.0)
+//		{}
+//		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+//				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+//				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+//		{}
+//		else
+//			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+//	}
+//
+////	printf("##################################################################################################\n");
+//	if(gx == Mesh.getDimensions().x() - 1 &&
+//	   gy == Mesh.getDimensions().y() - 1)
+//	{
+//
+////		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+//		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
+//				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
+//
+//			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+//	}
+//	if(gx == Mesh.getDimensions().x() - 1 &&
+//	   gy == 0)
+//	{
+//
+////		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+//		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
+//				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
+//
+//			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+//	}
+////	printf("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n");
+//	if(gx == 0 &&
+//	   gy == Mesh.getDimensions().y() - 1)
+//	{
+//
+////		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+//		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
+//				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
+//
+//			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+//	}
+//	if(gx == 0 &&
+//	   gy == 0)
+//	{
+////		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+//		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
+//				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
+//
+//			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+//	}
+//
+//	__threadfence();
+//
+//	if(flag==1)
+//		cudaDofVector[gid] =  tmp*3;
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+Real tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
+{
+	Real fx = abs(x);
+	//Real fy = abs(y);
+
+	//Real tmpMin = Min(fx,abs(y));
+
+	if(Min(fx,abs(y)) == fx)
+		return x;
+	else
+		return y;
+
+
+}
+
+
+
+__global__ void runCUDA(tnlFastSweeping< tnlGrid< 3,double, tnlHost, int >, double, int >* solver, int sweep, int i)
+{
+
+	int gx = 0;
+	int gy = threadIdx.y;
+
+	//if(solver->Mesh.getDimensions().x() <= gx || solver->Mesh.getDimensions().y() <= gy)
+	//	return;
+	int n = solver->Mesh.getDimensions().x();
+	int blockCount = n/blockDim.y +1;
+	//int gid = solver->Mesh.getDimensions().x() * gy + gx;
+	//int max = solver->Mesh.getDimensions().x()*solver->Mesh.getDimensions().x();
+
+	//int id1 = gx+gy;
+	//int id2 = (solver->Mesh.getDimensions().x() - gx - 1) + gy;
+
+
+	if(blockIdx.x==0)
+	{
+		for(int gz = 0; gz < n;gz++)
+		{
+		gx = 0;
+		gy = threadIdx.y;
+		for(int k = 0; k < n*blockCount + blockDim.y; k++)
+		{
+			if(threadIdx.y  < k+1 && gy < n)
+			{
+				solver->updateValue(gx,gy,gz);
+				gx++;
+				if(gx==n)
+				{
+					gx=0;
+					gy+=blockDim.y;
+				}
+			}
+
+
+			__syncthreads();
+		}
+		__syncthreads();
+		}
+	}
+	else if(blockIdx.x==1)
+	{
+		for(int gz = 0; gz < n;gz++)
+		{
+		gx=n-1;
+		gy=threadIdx.y;
+
+		for(int k = 0; k < n*blockCount + blockDim.y; k++)
+		{
+			if(threadIdx.y  < k+1 && gy < n)
+			{
+				solver->updateValue(gx,gy,gz);
+				gx--;
+				if(gx==-1)
+				{
+					gx=n-1;
+					gy+=blockDim.y;
+				}
+			}
+
+
+			__syncthreads();
+		}
+		}
+	}
+	else if(blockIdx.x==2)
+	{
+
+		for(int gz = 0; gz < n;gz++)
+		{
+		gx=0;
+		gy=n-threadIdx.y;
+		for(int k = 0; k < n*blockCount + blockDim.y; k++)
+		{
+			if(threadIdx.y  < k+1 && gy > -1)
+			{
+				solver->updateValue(gx,gy,gz);
+				gx++;
+				if(gx==n)
+				{
+					gx=0;
+					gy-=blockDim.y;
+				}
+			}
+
+
+			__syncthreads();
+		}
+		}
+	}
+	else if(blockIdx.x==3)
+	{
+		for(int gz = 0; gz < n;gz++)
+		{
+		gx=n-1;
+		gy=n-threadIdx.y;
+
+		for(int k = 0; k < n*blockCount + blockDim.y; k++)
+		{
+			if(threadIdx.y  < k+1 && gy > -1)
+			{
+				solver->updateValue(gx,gy,gz);
+				gx--;
+				if(gx==-1)
+				{
+					gx=n-1;
+					gy-=blockDim.y;
+				}
+			}
+
+
+			__syncthreads();
+		}
+		}
+	}
+
+
+
+
+	else if(blockIdx.x==4)
+	{
+		for(int gz = n-1; gz > -1;gz--)
+		{
+		gx = 0;
+		gy = threadIdx.y;
+		for(int k = 0; k < n*blockCount + blockDim.y; k++)
+		{
+			if(threadIdx.y  < k+1 && gy < n)
+			{
+				solver->updateValue(gx,gy,gz);
+				gx++;
+				if(gx==n)
+				{
+					gx=0;
+					gy+=blockDim.y;
+				}
+			}
+
+
+			__syncthreads();
+		}
+		}
+	}
+	else if(blockIdx.x==5)
+	{
+		for(int gz = n-1; gz > -1;gz--)
+		{
+		gx=n-1;
+		gy=threadIdx.y;
+
+		for(int k = 0; k < n*blockCount + blockDim.y; k++)
+		{
+			if(threadIdx.y  < k+1 && gy < n)
+			{
+				solver->updateValue(gx,gy,gz);
+				gx--;
+				if(gx==-1)
+				{
+					gx=n-1;
+					gy+=blockDim.y;
+				}
+			}
+
+
+			__syncthreads();
+		}
+		}
+	}
+	else if(blockIdx.x==6)
+	{
+
+		for(int gz = n-1; gz > -1;gz--)
+		{
+		gx=0;
+		gy=n-threadIdx.y;
+		for(int k = 0; k < n*blockCount + blockDim.y; k++)
+		{
+			if(threadIdx.y  < k+1 && gy > -1)
+			{
+				solver->updateValue(gx,gy,gz);
+				gx++;
+				if(gx==n)
+				{
+					gx=0;
+					gy-=blockDim.y;
+				}
+			}
+
+
+			__syncthreads();
+		}
+		}
+	}
+	else if(blockIdx.x==7)
+	{
+		for(int gz = n-1; gz > -1;gz--)
+		{
+		gx=n-1;
+		gy=n-threadIdx.y;
+
+		for(int k = 0; k < n*blockCount + blockDim.y; k++)
+		{
+			if(threadIdx.y  < k+1 && gy > -1)
+			{
+				solver->updateValue(gx,gy,gz);
+				gx--;
+				if(gx==-1)
+				{
+					gx=n-1;
+					gy-=blockDim.y;
+				}
+			}
+
+
+			__syncthreads();
+		}
+		}
+	}
+
+
+
+
+}
+
+
+__global__ void initCUDA(tnlFastSweeping< tnlGrid< 3,double, tnlHost, int >, double, int >* solver)
+{
+	int gx = threadIdx.x + blockDim.x*blockIdx.x;
+	int gy = blockDim.y*blockIdx.y + threadIdx.y;
+	int gz = blockDim.z*blockIdx.z + threadIdx.z;
+
+	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && solver->Mesh.getDimensions().z() > gz)
+	{
+		solver->initGrid();
+	}
+
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	cudaDofVector2[index]=fabsMin(INT_MAX,cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	cudaDofVector2[index]=fabsMin(-INT_MAX,cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-INT_MAX,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1110( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]));
+
+	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[index]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1101( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
+
+	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[index]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1011( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
+
+	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[index]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0111( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0001( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]));
+
+	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[index]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0010( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
+
+	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[index]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0100( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
+
+	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[index]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1000( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1100( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
+
+	a = al-be;
+	b=1.0;
+	c=-al;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1010( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
+
+	a = al-be;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1001( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	cudaDofVector2[index]=fabsMin(cudaDofVector[index],cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0011( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	be=abs(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]));
+
+	a = al-be;
+	b=1.0;
+	c=-al;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0101( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	be=abs(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]/
+			(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)]));
+
+	a = al-be;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	cudaDofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*1+c)*s,cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0110( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	cudaDofVector2[index]=fabsMin(cudaDofVector[index],cudaDofVector2[(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,1>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)],cudaDofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+}
+#endif
+
+
+
+
+#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/examples/fast-sweeping/tnlFastSweeping3D_impl.h b/examples/fast-sweeping/tnlFastSweeping3D_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f5eeb07764dd7ac2d8137030967c932841fbc84
--- /dev/null
+++ b/examples/fast-sweeping/tnlFastSweeping3D_impl.h
@@ -0,0 +1,946 @@
+/***************************************************************************
+                          tnlFastSweeping2D_impl.h  -  description
+                             -------------------
+    begin                : Oct 15 , 2015
+    copyright            : (C) 2015 by Tomas Sobotik
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef TNLFASTSWEEPING3D_IMPL_H_
+#define TNLFASTSWEEPING3D_IMPL_H_
+
+#include "tnlFastSweeping.h"
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+tnlString tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
+{
+	   return tnlString( "tnlFastSweeping< " ) +
+	          MeshType::getType() + ", " +
+	          ::getType< Real >() + ", " +
+	          ::getType< Index >() + " >";
+}
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: init( const tnlParameterContainer& parameters )
+{
+	const tnlString& meshFile = parameters.getParameter< tnlString >( "mesh" );
+
+	if( ! Mesh.load( meshFile ) )
+	{
+		   cerr << "I am not able to load the mesh from the file " << meshFile << "." << endl;
+		   return false;
+	}
+
+
+	const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition");
+	if( ! dofVector.load( initialCondition ) )
+	{
+		   cerr << "I am not able to load the initial condition from the file " << meshFile << "." << endl;
+		   return false;
+	}
+	dofVector2.setLike(dofVector);
+
+	h = Mesh.getHx();
+
+	const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" );
+
+	if(exact_input == "no")
+		exactInput=false;
+	else
+		exactInput=true;
+	cout << "bla "<<endl;
+	return initGrid();
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
+{
+
+	for(int i=0; i< Mesh.getDimensions().x()*Mesh.getDimensions().y()*Mesh.getDimensions().z();i++)
+	{
+		dofVector2[i]=INT_MAX*Sign(dofVector[i]);
+		if (abs(dofVector[i]) < 1.8*h)
+			dofVector2[i]=dofVector[i];
+	}
+	cout << "bla 2"<<endl;
+//
+//	for(int i = 0 ; i < Mesh.getDimensions().x()-1; i++)
+//	{
+//		for(int j = 0 ; j < Mesh.getDimensions().x()-1; j++)
+//			{
+//				if(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] > 0)
+//				{
+//					if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))] > 0)
+//					{
+//						if(dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0)
+//						{
+//							if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//								setupSquare1111(i,j);
+//							else
+//								setupSquare1110(i,j);
+//						}
+//						else
+//						{
+//							if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//								setupSquare1101(i,j);
+//							else
+//								setupSquare1100(i,j);
+//						}
+//					}
+//					else
+//					{
+//						if(dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0)
+//						{
+//							if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//								setupSquare1011(i,j);
+//							else
+//								setupSquare1010(i,j);
+//						}
+//						else
+//						{
+//							if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//								setupSquare1001(i,j);
+//							else
+//								setupSquare1000(i,j);
+//						}
+//					}
+//				}
+//				else
+//				{
+//					if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))] > 0)
+//					{
+//						if(dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0)
+//						{
+//							if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//								setupSquare0111(i,j);
+//							else
+//								setupSquare0110(i,j);
+//						}
+//						else
+//						{
+//							if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//								setupSquare0101(i,j);
+//							else
+//								setupSquare0100(i,j);
+//						}
+//					}
+//					else
+//					{
+//						if(dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))] > 0)
+//						{
+//							if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//								setupSquare0011(i,j);
+//							else
+//								setupSquare0010(i,j);
+//						}
+//						else
+//						{
+//							if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j+1))] > 0)
+//								setupSquare0001(i,j);
+//							else
+//								setupSquare0000(i,j);
+//						}
+//					}
+//				}
+//
+//			}
+//	}
+
+//	Real tmp = 0.0;
+//	Real ax=0.5/sqrt(2.0);
+//
+//	if(!exactInput)
+//	{
+//		for(Index i = 0; i < Mesh.getDimensions().x()*Mesh.getDimensions().y(); i++)
+//				dofVector[i]=0.5*h*Sign(dofVector[i]);
+//	}
+//
+//
+//	for(Index i = 1; i < Mesh.getDimensions().x()-1; i++)
+//	{
+//		for(Index j = 1; j < Mesh.getDimensions().y()-1; j++)
+//		{
+//			 tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+//
+//			if(tmp == 0.0)
+//			{}
+//			else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+//					dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+//					dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+//					dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+//			{}
+//			else
+//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+//		}
+//	}
+//
+//
+//
+//	for(int i = 1; i < Mesh.getDimensions().x()-1; i++)
+//	{
+//		Index j = 0;
+//		tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+//
+//
+//		if(tmp == 0.0)
+//		{}
+//		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+//				dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 )
+//		{}
+//		else
+//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+//	}
+//
+//	for(int i = 1; i < Mesh.getDimensions().x()-1; i++)
+//	{
+//		Index j = Mesh.getDimensions().y() - 1;
+//		tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+//
+//
+//		if(tmp == 0.0)
+//		{}
+//		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+//				dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+//		{}
+//		else
+//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+//	}
+//
+//	for(int j = 1; j < Mesh.getDimensions().y()-1; j++)
+//	{
+//		Index i = 0;
+//		tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+//
+//
+//		if(tmp == 0.0)
+//		{}
+//		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+//		{}
+//		else
+//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+//	}
+//
+//	for(int j = 1; j < Mesh.getDimensions().y()-1; j++)
+//	{
+//		Index i = Mesh.getDimensions().x() - 1;
+//		tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+//
+//
+//		if(tmp == 0.0)
+//		{}
+//		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+//				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+//		{}
+//		else
+//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+//	}
+//
+//
+//	Index i = Mesh.getDimensions().x() - 1;
+//	Index j = Mesh.getDimensions().y() - 1;
+//
+//	tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+//	if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp > 0.0 &&
+//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp > 0.0)
+//
+//		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+//
+//
+//
+//	j = 0;
+//	tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+//	if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp > 0.0 &&
+//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp > 0.0)
+//
+//		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+//
+//
+//
+//	i = 0;
+//	j = Mesh.getDimensions().y() -1;
+//	tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+//	if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp > 0.0 &&
+//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp > 0.0)
+//
+//		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+//
+//
+//
+//	j = 0;
+//	tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+//	if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp > 0.0 &&
+//			dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp > 0.0)
+//
+//		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+
+
+	dofVector2.save("u-00000.tnl");
+
+	return true;
+}
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: run()
+{
+
+	for(Index k = 0; k < Mesh.getDimensions().z(); k++)
+	{
+		for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+		{
+			for(Index j = 0; j < Mesh.getDimensions().y(); j++)
+			{
+				updateValue(i,j,k);
+			}
+		}
+	}
+
+/*---------------------------------------------------------------------------------------------------------------------------*/
+
+	for(Index k = 0; k < Mesh.getDimensions().z(); k++)
+	{
+		for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
+		{
+			for(Index j = 0; j < Mesh.getDimensions().y(); j++)
+			{
+				updateValue(i,j,k);
+			}
+		}
+	}
+
+/*---------------------------------------------------------------------------------------------------------------------------*/
+
+	for(Index k = 0; k < Mesh.getDimensions().z(); k++)
+	{
+		for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
+		{
+			for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+			{
+				updateValue(i,j,k);
+			}
+		}
+	}
+
+/*---------------------------------------------------------------------------------------------------------------------------*/
+	for(Index k = 0; k < Mesh.getDimensions().z(); k++)
+	{
+		for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+		{
+			for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+			{
+				updateValue(i,j,k);
+			}
+		}
+	}
+
+/*---------------------------------------------------------------------------------------------------------------------------*/
+
+
+
+
+
+
+
+
+	for(Index k = Mesh.getDimensions().z() -1; k > -1; k--)
+	{
+		for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+		{
+			for(Index j = 0; j < Mesh.getDimensions().y(); j++)
+			{
+				updateValue(i,j,k);
+			}
+		}
+	}
+
+/*---------------------------------------------------------------------------------------------------------------------------*/
+
+	for(Index k = Mesh.getDimensions().z() -1; k > -1; k--)
+	{
+		for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
+		{
+			for(Index j = 0; j < Mesh.getDimensions().y(); j++)
+			{
+				updateValue(i,j,k);
+			}
+		}
+	}
+
+/*---------------------------------------------------------------------------------------------------------------------------*/
+
+	for(Index k = Mesh.getDimensions().z() -1; k > -1; k--)
+	{
+		for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
+		{
+			for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+			{
+				updateValue(i,j,k);
+			}
+		}
+	}
+
+/*---------------------------------------------------------------------------------------------------------------------------*/
+	for(Index k = Mesh.getDimensions().z() -1; k > -1; k--)
+	{
+		for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+		{
+			for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+			{
+				updateValue(i,j,k);
+			}
+		}
+	}
+
+/*---------------------------------------------------------------------------------------------------------------------------*/
+
+
+	dofVector2.save("u-00001.tnl");
+
+	cout << "bla 3"<<endl;
+	return true;
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j, Index k)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j,k));
+	Real value = dofVector2[index];
+	Real a,b,c, tmp;
+
+	if( i == 0 )
+		a = dofVector2[Mesh.template getCellNextToCell<1,0,0>(index)];
+	else if( i == Mesh.getDimensions().x() - 1 )
+		a = dofVector2[Mesh.template getCellNextToCell<-1,0,0>(index)];
+	else
+	{
+		a = fabsMin( dofVector2[Mesh.template getCellNextToCell<-1,0,0>(index)],
+				 dofVector2[Mesh.template getCellNextToCell<1,0,0>(index)] );
+	}
+
+	if( j == 0 )
+		b = dofVector2[Mesh.template getCellNextToCell<0,1,0>(index)];
+	else if( j == Mesh.getDimensions().y() - 1 )
+		b = dofVector2[Mesh.template getCellNextToCell<0,-1,0>(index)];
+	else
+	{
+		b = fabsMin( dofVector2[Mesh.template getCellNextToCell<0,-1,0>(index)],
+				 dofVector2[Mesh.template getCellNextToCell<0,1,0>(index)] );
+	}
+
+	if( k == 0 )
+		c = dofVector2[Mesh.template getCellNextToCell<0,0,1>(index)];
+	else if( k == Mesh.getDimensions().z() - 1 )
+		c = dofVector2[Mesh.template getCellNextToCell<0,0,-1>(index)];
+	else
+	{
+		c = fabsMin( dofVector2[Mesh.template getCellNextToCell<0,0,-1>(index)],
+				 dofVector2[Mesh.template getCellNextToCell<0,0,1>(index)] );
+	}
+
+	Real hD= 2.0*(a*a+b*b+c*c-a*b-a*c-b*c);
+
+	if(hD >= 3.0*h)
+		tmp = fabsMin(a,fabsMin(b,c)) + Sign(value)*h;
+	else
+		tmp = (1.0/3.0) * ( a + b + c + Sign(value)*sqrt(3.0*h*h-hD) );
+
+
+	dofVector2[index]  = fabsMin(value, tmp);
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+Real tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
+{
+	Real fx = fabs(x);
+	Real fy = fabs(y);
+
+	Real tmpMin = Min(fx,fy);
+
+	if(tmpMin == fx)
+		return x;
+	else
+		return y;
+
+}
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1111( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	dofVector2[index]=fabsMin(INT_MAX,dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(INT_MAX,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(INT_MAX,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(INT_MAX,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0000( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	dofVector2[index]=fabsMin(-INT_MAX,dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-INT_MAX,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-INT_MAX,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-INT_MAX,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1110( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(dofVector[Mesh.template getCellNextToCell<1,1>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<1,0>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<1,1>(index)]));
+
+	be=abs(dofVector[Mesh.template getCellNextToCell<1,1>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<0,1>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<1,1>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	dofVector2[index]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1101( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(dofVector[Mesh.template getCellNextToCell<0,1>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<0,1>(index)]));
+
+	be=abs(dofVector[Mesh.template getCellNextToCell<0,1>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<0,0>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<0,1>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	dofVector2[index]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1011( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(dofVector[Mesh.template getCellNextToCell<1,0>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<0,0>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<1,0>(index)]));
+
+	be=abs(dofVector[Mesh.template getCellNextToCell<1,0>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<1,0>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	dofVector2[index]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0111( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<0,1>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	be=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<1,0>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	dofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0001( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(dofVector[Mesh.template getCellNextToCell<1,1>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<1,0>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<1,1>(index)]));
+
+	be=abs(dofVector[Mesh.template getCellNextToCell<1,1>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<0,1>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<1,1>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	dofVector2[index]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0010( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(dofVector[Mesh.template getCellNextToCell<0,1>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<0,1>(index)]));
+
+	be=abs(dofVector[Mesh.template getCellNextToCell<0,1>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<0,0>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<0,1>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	dofVector2[index]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0100( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(dofVector[Mesh.template getCellNextToCell<1,0>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<0,0>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<1,0>(index)]));
+
+	be=abs(dofVector[Mesh.template getCellNextToCell<1,0>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<1,0>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	dofVector2[index]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1000( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<0,1>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	be=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<1,0>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	a = be/al;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	dofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1100( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<0,1>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	be=abs(dofVector[Mesh.template getCellNextToCell<1,0>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<1,0>(index)]));
+
+	a = al-be;
+	b=1.0;
+	c=-al;
+	s= h/sqrt(a*a+b*b);
+
+
+	dofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1010( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<1,0>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	be=abs(dofVector[Mesh.template getCellNextToCell<0,1>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<0,1>(index)]));
+
+	a = al-be;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	dofVector2[index]=fabsMin(abs(a*0+b*0+c)*s,dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(-abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare1001( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	dofVector2[index]=fabsMin(dofVector[index],dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(dofVector[Mesh.template getCellNextToCell<0,1>(index)],dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(dofVector[Mesh.template getCellNextToCell<1,1>(index)],dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(dofVector[Mesh.template getCellNextToCell<1,0>(index)],dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0011( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<0,1>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	be=abs(dofVector[Mesh.template getCellNextToCell<1,0>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<1,0>(index)]));
+
+	a = al-be;
+	b=1.0;
+	c=-al;
+	s= h/sqrt(a*a+b*b);
+
+
+	dofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0101( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real al,be, a,b,c,s;
+	al=abs(dofVector[Mesh.template getCellNextToCell<0,0>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<1,0>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<0,0>(index)]));
+
+	be=abs(dofVector[Mesh.template getCellNextToCell<0,1>(index)]/
+			(dofVector[Mesh.template getCellNextToCell<1,1>(index)]-
+			 dofVector[Mesh.template getCellNextToCell<0,1>(index)]));
+
+	a = al-be;
+	b=1.0;
+	c=-be;
+	s= h/sqrt(a*a+b*b);
+
+
+	dofVector2[index]=fabsMin(-abs(a*0+b*0+c)*s,dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(-abs(a*1+b*0+c)*s,dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(abs(a*1+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(abs(a*0+b*1+c)*s,dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+
+}
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index > :: setupSquare0110( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	dofVector2[index]=fabsMin(dofVector[index],dofVector2[(index)]);
+	dofVector2[Mesh.template getCellNextToCell<0,1>(index)]=fabsMin(dofVector[Mesh.template getCellNextToCell<0,1>(index)],dofVector2[Mesh.template getCellNextToCell<0,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,1>(index)]=fabsMin(dofVector[Mesh.template getCellNextToCell<1,1>(index)],dofVector2[Mesh.template getCellNextToCell<1,1>(index)]);
+	dofVector2[Mesh.template getCellNextToCell<1,0>(index)]=fabsMin(dofVector[Mesh.template getCellNextToCell<1,0>(index)],dofVector2[Mesh.template getCellNextToCell<1,0>(index)]);
+}
+
+
+
+
+#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/examples/fast-sweeping/tnlFastSweeping_CUDA.h b/examples/fast-sweeping/tnlFastSweeping_CUDA.h
index 8d47d5cb1c5039fd284b33e6e5e4e7f755d7bf54..4313bbd3a79c1c85e798e0f5e7a04b157a43d12f 100644
--- a/examples/fast-sweeping/tnlFastSweeping_CUDA.h
+++ b/examples/fast-sweeping/tnlFastSweeping_CUDA.h
@@ -105,12 +105,90 @@ protected:
 
 
 
+
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+class tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >
+{
+
+public:
+	typedef Real RealType;
+	typedef Device DeviceType;
+	typedef Index IndexType;
+	typedef tnlGrid< 3, Real, Device, Index > MeshType;
+	typedef tnlVector< RealType, DeviceType, IndexType> DofVectorType;
+	typedef typename MeshType::CoordinatesType CoordinatesType;
+
+
+
+	__host__ static tnlString getType();
+	__host__ bool init( const tnlParameterContainer& parameters );
+	__host__ bool run();
+
+#ifdef HAVE_CUDA
+	__device__ bool initGrid();
+	__device__ void updateValue(const Index i, const Index j, const Index k);
+	__device__ void updateValue(const Index i, const Index j, const Index k, double** sharedMem, const int k3);
+	__device__ Real fabsMin(const Real x, const Real y);
+
+	tnlFastSweeping< tnlGrid< 3,MeshReal, Device, MeshIndex >, Real, Index >* cudaSolver;
+	double* cudaDofVector;
+	double* cudaDofVector2;
+	int counter;
+	__device__ void setupSquare1000(Index i, Index j);
+	__device__ void setupSquare1100(Index i, Index j);
+	__device__ void setupSquare1010(Index i, Index j);
+	__device__ void setupSquare1001(Index i, Index j);
+	__device__ void setupSquare1110(Index i, Index j);
+	__device__ void setupSquare1101(Index i, Index j);
+	__device__ void setupSquare1011(Index i, Index j);
+	__device__ void setupSquare1111(Index i, Index j);
+	__device__ void setupSquare0000(Index i, Index j);
+	__device__ void setupSquare0100(Index i, Index j);
+	__device__ void setupSquare0010(Index i, Index j);
+	__device__ void setupSquare0001(Index i, Index j);
+	__device__ void setupSquare0110(Index i, Index j);
+	__device__ void setupSquare0101(Index i, Index j);
+	__device__ void setupSquare0011(Index i, Index j);
+	__device__ void setupSquare0111(Index i, Index j);
+#endif
+
+	MeshType Mesh;
+
+protected:
+
+
+
+	bool exactInput;
+
+	DofVectorType dofVector;
+
+	RealType h;
+
+
+};
+
+
+
+
+
+
+
 #ifdef HAVE_CUDA
 //template<int sweep_t>
 __global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int i);
-//__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int i);
+__global__ void runCUDA(tnlFastSweeping< tnlGrid< 3,double, tnlHost, int >, double, int >* solver, int sweep, int i);
 
 __global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver);
+__global__ void initCUDA(tnlFastSweeping< tnlGrid< 3,double, tnlHost, int >, double, int >* solver);
 #endif
 
 /*various implementtions.... choose one*/
@@ -119,5 +197,6 @@ __global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, dou
 //#include "tnlFastSweeping2D_CUDA_v3_impl.h"
 #include "tnlFastSweeping2D_CUDA_v4_impl.h"
 //#include "tnlFastSweeping2D_CUDA_v5_impl.h"
+#include "tnlFastSweeping3D_CUDA_impl.h"
 
 #endif /* TNLFASTSWEEPING_H_ */
diff --git a/examples/hamilton-jacobi-parallel/main.h b/examples/hamilton-jacobi-parallel/main.h
index b0edef69159dbfca6cc31fc30dc1d7f7a922e6ee..692a2bc504ab735f69e293132d839d15e07b9743 100644
--- a/examples/hamilton-jacobi-parallel/main.h
+++ b/examples/hamilton-jacobi-parallel/main.h
@@ -57,7 +57,7 @@ int main( int argc, char* argv[] )
 	   typedef tnlHost Device;
 
 
-   	   tnlParallelEikonalSolver<SchemeTypeHost,SchemeTypeDevice, Device> solver;
+   	   tnlParallelEikonalSolver<2,SchemeTypeHost,SchemeTypeDevice, Device> solver;
    	   if(!solver.init(parameters))
    	   {
    		   cerr << "Solver failed to initialize." << endl;
@@ -72,7 +72,7 @@ int main( int argc, char* argv[] )
 	   typedef tnlCuda Device;
   	   //typedef parallelGodunovEikonalScheme< tnlGrid<2,double,Device, int>, double, int > SchemeType;
 
-   	   tnlParallelEikonalSolver<SchemeTypeHost,SchemeTypeDevice, Device> solver;
+   	   tnlParallelEikonalSolver<2,SchemeTypeHost,SchemeTypeDevice, Device> solver;
    	   if(!solver.init(parameters))
    	   {
    		   cerr << "Solver failed to initialize." << endl;
diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h
index 547c58ec28a2c132f0ced721923eba5cc2f41e30..2d049321c003c5464c2f47f0521e7c31ad2ae40e 100644
--- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h
+++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver.h
@@ -34,7 +34,8 @@
 #endif
 
 
-template< typename SchemeHost,
+template< int Dimension,
+		  typename SchemeHost,
 		  typename SchemeDevice,
 		  typename Device,
 		  typename RealType = double,
@@ -42,8 +43,8 @@ template< typename SchemeHost,
 class tnlParallelEikonalSolver
 {};
 
-template< typename SchemeHost, typename SchemeDevice, typename Device>
-class tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >
+template<typename SchemeHost, typename SchemeDevice, typename Device>
+class tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >
 {
 public:
 
@@ -96,7 +97,7 @@ public:
 	SchemeHost schemeHost;
 	SchemeDevice schemeDevice;
 	double delta, tau0, stopTime,cflCondition;
-	int gridRows, gridCols, currentStep, n;
+	int gridRows, gridCols, gridLevels, currentStep, n;
 
 	std::clock_t start;
 	double time_diff;
@@ -104,14 +105,14 @@ public:
 
 	tnlDeviceEnum device;
 
-	tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* getSelf()
+	tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* getSelf()
 	{
 		return this;
 	};
 
 #ifdef HAVE_CUDA
 
-	tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver;
+	tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver;
 
 	double* work_u_cuda;
 
@@ -129,27 +130,27 @@ public:
 	int run_host;
 
 
-	__device__ void getSubgridCUDA( const int i, tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller, double* a);
+	__device__ void getSubgridCUDA2D( const int i, tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller, double* a);
 
-	__device__ void updateSubgridCUDA( const int i, tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller, double* a);
+	__device__ void updateSubgridCUDA2D( const int i, tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller, double* a);
 
-	__device__ void insertSubgridCUDA( double u, const int i );
+	__device__ void insertSubgridCUDA2D( double u, const int i );
 
-	__device__ void runSubgridCUDA( int boundaryCondition, double* u, int subGridID);
+	__device__ void runSubgridCUDA2D( int boundaryCondition, double* u, int subGridID);
 
 	/*__global__ void runCUDA();*/
 
 	//__device__ void synchronizeCUDA();
 
-	__device__ int getOwnerCUDA( int i) const;
+	__device__ int getOwnerCUDA2D( int i) const;
 
-	__device__ int getSubgridValueCUDA( int i ) const;
+	__device__ int getSubgridValueCUDA2D( int i ) const;
 
-	__device__ void setSubgridValueCUDA( int i, int value );
+	__device__ void setSubgridValueCUDA2D( int i, int value );
 
-	__device__ int getBoundaryConditionCUDA( int i ) const;
+	__device__ int getBoundaryConditionCUDA2D( int i ) const;
 
-	__device__ void setBoundaryConditionCUDA( int i, int value );
+	__device__ void setBoundaryConditionCUDA2D( int i, int value );
 
 	//__device__ bool initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
 
@@ -158,23 +159,173 @@ public:
 #endif
 
 };
+
+
+
+
+
+
+
+	template<typename SchemeHost, typename SchemeDevice, typename Device>
+	class tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >
+	{
+	public:
+
+		typedef SchemeDevice SchemeTypeDevice;
+		typedef SchemeHost SchemeTypeHost;
+		typedef Device DeviceType;
+		typedef tnlVector< double, tnlHost, int > VectorType;
+		typedef tnlVector< int, tnlHost, int > IntVectorType;
+		typedef tnlGrid< 3, double, tnlHost, int > MeshType;
+	#ifdef HAVE_CUDA
+		typedef tnlVector< double, tnlHost, int > VectorTypeCUDA;
+		typedef tnlVector< int, tnlHost, int > IntVectorTypeCUDA;
+		typedef tnlGrid< 3, double, tnlHost, int > MeshTypeCUDA;
+	#endif
+		tnlParallelEikonalSolver();
+		bool init( const tnlParameterContainer& parameters );
+		void run();
+
+		void test();
+
+	/*private:*/
+
+
+		void synchronize();
+
+		int getOwner( int i) const;
+
+		int getSubgridValue( int i ) const;
+
+		void setSubgridValue( int i, int value );
+
+		int getBoundaryCondition( int i ) const;
+
+		void setBoundaryCondition( int i, int value );
+
+		void stretchGrid();
+
+		void contractGrid();
+
+		VectorType getSubgrid( const int i ) const;
+
+		void insertSubgrid( VectorType u, const int i );
+
+		VectorType runSubgrid( int boundaryCondition, VectorType u, int subGridID);
+
+
+		VectorType u0, work_u;
+		IntVectorType subgridValues, boundaryConditions, unusedCell, calculationsCount;
+		MeshType mesh, subMesh;
+		SchemeHost schemeHost;
+		SchemeDevice schemeDevice;
+		double delta, tau0, stopTime,cflCondition;
+		int gridRows, gridCols, gridLevels, currentStep, n;
+
+		std::clock_t start;
+		double time_diff;
+
+
+		tnlDeviceEnum device;
+
+		tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* getSelf()
+		{
+			return this;
+		};
+
 #ifdef HAVE_CUDA
+
+	tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver;
+
+	double* work_u_cuda;
+
+	int* subgridValues_cuda;
+	int*boundaryConditions_cuda;
+	int* unusedCell_cuda;
+	int* calculationsCount_cuda;
+	double* tmpw;
+	//MeshTypeCUDA mesh_cuda, subMesh_cuda;
+	//SchemeDevice scheme_cuda;
+	//double delta_cuda, tau0_cuda, stopTime_cuda,cflCondition_cuda;
+	//int gridRows_cuda, gridCols_cuda, currentStep_cuda, n_cuda;
+
+	int* runcuda;
+	int run_host;
+
+
+	__device__ void getSubgridCUDA3D( const int i, tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* caller, double* a);
+
+	__device__ void updateSubgridCUDA3D( const int i, tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* caller, double* a);
+
+	__device__ void insertSubgridCUDA3D( double u, const int i );
+
+	__device__ void runSubgridCUDA3D( int boundaryCondition, double* u, int subGridID);
+
+	/*__global__ void runCUDA();*/
+
+	//__device__ void synchronizeCUDA();
+
+	__device__ int getOwnerCUDA3D( int i) const;
+
+	__device__ int getSubgridValueCUDA3D( int i ) const;
+
+	__device__ void setSubgridValueCUDA3D( int i, int value );
+
+	__device__ int getBoundaryConditionCUDA3D( int i ) const;
+
+	__device__ void setBoundaryConditionCUDA3D( int i, int value );
+
+	//__device__ bool initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
+
+	/*__global__ void initRunCUDA(tnlParallelEikonalSolver<Scheme, double, tnlHost, int >* caller);*/
+
+#endif
+
+};
+
+
+
+
+
+
+#ifdef HAVE_CUDA
+template <typename SchemeHost, typename SchemeDevice, typename Device>
+__global__ void runCUDA2D(tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller);
+
+template <typename SchemeHost, typename SchemeDevice, typename Device>
+__global__ void initRunCUDA2D(tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* caller);
+
+template <typename SchemeHost, typename SchemeDevice, typename Device>
+__global__ void initCUDA2D( tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr, int * ptr2, int* ptr3);
+
+template <typename SchemeHost, typename SchemeDevice, typename Device>
+__global__ void synchronizeCUDA2D(tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
+
+template <typename SchemeHost, typename SchemeDevice, typename Device>
+__global__ void synchronize2CUDA2D(tnlParallelEikonalSolver<2, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
+
+
+
+
+
+
+
 template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void runCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller);
+__global__ void runCUDA3D(tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* caller);
 
 template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void initRunCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller);
+__global__ void initRunCUDA3D(tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* caller);
 
 template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr, int * ptr2, int* ptr3);
+__global__ void initCUDA3D( tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr, int * ptr2, int* ptr3);
 
 template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void synchronizeCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
+__global__ void synchronizeCUDA3D(tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
 
 template <typename SchemeHost, typename SchemeDevice, typename Device>
-__global__ void synchronize2CUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
+__global__ void synchronize2CUDA3D(tnlParallelEikonalSolver<3, SchemeHost, SchemeDevice, Device, double, int >* cudaSolver);
 #endif
 
-#include "tnlParallelEikonalSolver_impl.h"
+#include "tnlParallelEikonalSolver2D_impl.h"
 
 #endif /* TNLPARALLELEIKONALSOLVER_H_ */
diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h
similarity index 80%
rename from examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h
rename to examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h
index a20abadec1a94388e6f6da19fa2be9f89f1204a9..a989361a89da748c5d74c439252bbedd9b7d0ccf 100644
--- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h
+++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver2D_impl.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          tnlParallelEikonalSolver_impl.h  -  description
+                          tnlParallelEikonalSolver2D_impl.h  -  description
                              -------------------
     begin                : Nov 28 , 2014
     copyright            : (C) 2014 by Tomas Sobotik
@@ -14,15 +14,15 @@
  *                                                                         *
  ***************************************************************************/
 
-#ifndef TNLPARALLELEIKONALSOLVER_IMPL_H_
-#define TNLPARALLELEIKONALSOLVER_IMPL_H_
+#ifndef TNLPARALLELEIKONALSOLVER2D_IMPL_H_
+#define TNLPARALLELEIKONALSOLVER2D_IMPL_H_
 
 
 #include "tnlParallelEikonalSolver.h"
 #include <core/mfilename.h>
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
-tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver()
+tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver()
 {
 	cout << "a" << endl;
 	this->device = tnlCudaDevice;  /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice   ---    vypocet na CPU
@@ -38,7 +38,7 @@ tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::tnlPara
 }
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::test()
+void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::test()
 {
 /*
 	for(int i =0; i < this->subgridValues.getSize(); i++ )
@@ -50,7 +50,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::te
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
 
-bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::init( const tnlParameterContainer& parameters )
+bool tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::init( const tnlParameterContainer& parameters )
 {
 	cout << "Initializating solver..." << endl;
 	const tnlString& meshLocation = parameters.getParameter <tnlString>("mesh");
@@ -110,13 +110,13 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in
 	/*cout << "Testing... " << endl;
 	if(this->device == tnlCudaDevice)
 	{
-	if( !initCUDA(parameters, gridRows, gridCols) )
+	if( !initCUDA2D(parameters, gridRows, gridCols) )
 		return false;
 	}*/
 		//cout << "s" << endl;
-	cudaMalloc(&(this->cudaSolver), sizeof(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >));
+	cudaMalloc(&(this->cudaSolver), sizeof(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >));
 	//cout << "s" << endl;
-	cudaMemcpy(this->cudaSolver, this,sizeof(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >), cudaMemcpyHostToDevice);
+	cudaMemcpy(this->cudaSolver, this,sizeof(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >), cudaMemcpyHostToDevice);
 	//cout << "s" << endl;
 	double** tmpdev = NULL;
 	cudaMalloc(&tmpdev, sizeof(double*));
@@ -129,7 +129,7 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in
 	cudaMalloc(&(tmpUC), this->work_u.getSize()*sizeof(int));
 	cudaMemcpy(tmpUC, this->unusedCell.getData(), this->unusedCell.getSize()*sizeof(int), cudaMemcpyHostToDevice);
 
-	initCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver, (this->tmpw), (this->runcuda),tmpUC);
+	initCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver, (this->tmpw), (this->runcuda),tmpUC);
 	cudaDeviceSynchronize();
 	checkCudaDevice;
 	//cout << "s " << endl;
@@ -186,7 +186,7 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in
 		dim3 numBlocks(this->gridCols,this->gridRows);
 		cudaDeviceSynchronize();
 		checkCudaDevice;
-		initRunCUDA<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
+		initRunCUDA2D<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
 		cudaDeviceSynchronize();
 //		cout << "post 1 kernel" << endl;
 
@@ -209,10 +209,10 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in
 
 		checkCudaDevice;
 
-		synchronizeCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
+		synchronizeCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
 		cudaDeviceSynchronize();
 		checkCudaDevice;
-		synchronize2CUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
+		synchronize2CUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
 		cudaDeviceSynchronize();
 		checkCudaDevice;
 		//cout << test[0] << "   " <<test[1] <<"   " << test[2] << "   " <<test[3] << endl;
@@ -230,7 +230,7 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in
 }
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::run()
+void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::run()
 {
 	if(this->device == tnlHostDevice)
 	{
@@ -349,16 +349,16 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru
 			cudaDeviceSynchronize();
 			checkCudaDevice;
 			start = std::clock();
-			runCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
+			runCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
 			//cout << "a" << endl;
 			cudaDeviceSynchronize();
 			time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
 
 			//start = std::clock();
-			synchronizeCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
+			synchronizeCUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
 			cudaDeviceSynchronize();
 			checkCudaDevice;
-			synchronize2CUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
+			synchronize2CUDA2D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
 			cudaDeviceSynchronize();
 			checkCudaDevice;
 			//time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
@@ -407,7 +407,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru
 
 //north - 1, east - 2, west - 4, south - 8
 template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::synchronize() //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now
+void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::synchronize() //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now
 {
 	cout << "Synchronizig..." << endl;
 	int tmp1, tmp2;
@@ -504,38 +504,38 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::sy
 
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
-int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getOwner(int i) const
+int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getOwner(int i) const
 {
 
 	return (i / (this->gridCols*this->n*this->n))*this->gridCols + (i % (this->gridCols*this->n))/this->n;
 }
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
-int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgridValue( int i ) const
+int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgridValue( int i ) const
 {
 	return this->subgridValues[i];
 }
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setSubgridValue(int i, int value)
+void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setSubgridValue(int i, int value)
 {
 	this->subgridValues[i] = value;
 }
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
-int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getBoundaryCondition( int i ) const
+int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getBoundaryCondition( int i ) const
 {
 	return this->boundaryConditions[i];
 }
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setBoundaryCondition(int i, int value)
+void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setBoundaryCondition(int i, int value)
 {
 	this->boundaryConditions[i] = value;
 }
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::stretchGrid()
+void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::stretchGrid()
 {
 	cout << "Stretching grid..." << endl;
 
@@ -612,7 +612,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::st
 }
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::contractGrid()
+void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::contractGrid()
 {
 	cout << "Contracting grid..." << endl;
 	int stretchedSize = this->n*this->n*this->gridCols*this->gridRows;
@@ -637,8 +637,8 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::co
 }
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
-typename tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::VectorType
-tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgrid( const int i ) const
+typename tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::VectorType
+tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgrid( const int i ) const
 {
 	VectorType u;
 	u.setSize(this->n*this->n);
@@ -654,7 +654,7 @@ tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubg
 }
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
-void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::insertSubgrid( VectorType u, const int i )
+void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::insertSubgrid( VectorType u, const int i )
 {
 
 	for( int j = 0; j < this->n*this->n; j++)
@@ -671,8 +671,8 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in
 }
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
-typename tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::VectorType
-tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::runSubgrid( int boundaryCondition, VectorType u, int subGridID)
+typename tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::VectorType
+tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::runSubgrid( int boundaryCondition, VectorType u, int subGridID)
 {
 
 	VectorType fu;
@@ -1020,7 +1020,7 @@ tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::runSubg
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
 __device__
-void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgridCUDA( const int i ,tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
+void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgridCUDA2D( const int i ,tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
 {
 	//int j = threadIdx.x + threadIdx.y * blockDim.x;
 	int th = (blockIdx.y) * caller->n*caller->n*caller->gridCols
@@ -1036,7 +1036,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ge
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
 __device__
-void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::updateSubgridCUDA( const int i ,tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
+void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::updateSubgridCUDA2D( const int i ,tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
 {
 	int j = threadIdx.x + threadIdx.y * blockDim.x;
 	int index = (blockIdx.y) * caller->n*caller->n*caller->gridCols
@@ -1057,7 +1057,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::up
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
 __device__
-void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::insertSubgridCUDA( double u, const int i )
+void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::insertSubgridCUDA2D( double u, const int i )
 {
 
 
@@ -1083,7 +1083,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
 __device__
-void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::runSubgridCUDA( int boundaryCondition, double* u, int subGridID)
+void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::runSubgridCUDA2D( int boundaryCondition, double* u, int subGridID)
 {
 
 	__shared__ int tmp;
@@ -1199,7 +1199,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
 __device__
-int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getOwnerCUDA(int i) const
+int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getOwnerCUDA2D(int i) const
 {
 
 	return ((i / (this->gridCols*this->n*this->n))*this->gridCols
@@ -1209,7 +1209,7 @@ int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::get
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
 __device__
-int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgridValueCUDA( int i ) const
+int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getSubgridValueCUDA2D( int i ) const
 {
 	return this->subgridValues_cuda[i];
 }
@@ -1217,7 +1217,7 @@ int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::get
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
 __device__
-void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setSubgridValueCUDA(int i, int value)
+void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setSubgridValueCUDA2D(int i, int value)
 {
 	this->subgridValues_cuda[i] = value;
 }
@@ -1225,7 +1225,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::se
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
 __device__
-int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getBoundaryConditionCUDA( int i ) const
+int tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::getBoundaryConditionCUDA2D( int i ) const
 {
 	return this->boundaryConditions_cuda[i];
 }
@@ -1233,7 +1233,7 @@ int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::get
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
 __device__
-void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setBoundaryConditionCUDA(int i, int value)
+void tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::setBoundaryConditionCUDA2D(int i, int value)
 {
 	this->boundaryConditions_cuda[i] = value;
 }
@@ -1244,7 +1244,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::se
 
 template <typename SchemeHost, typename SchemeDevice, typename Device>
 __global__
-void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::*/synchronizeCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now
+void /*tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::*/synchronizeCUDA2D(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now
 {
 
 	__shared__ int boundary[4]; // north,east,west,south
@@ -1261,7 +1261,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 
 	if(threadIdx.x+threadIdx.y == 0)
 	{
-		subgridValue = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x);
+		subgridValue = cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x);
 		boundary[0] = 0;
 		boundary[1] = 0;
 		boundary[2] = 0;
@@ -1281,26 +1281,26 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 		if(threadIdx.x == 0 && (blockIdx.x != 0)/* && !(cudaSolver->currentStep & 1)*/)
 		{
 			u_cmp = cudaSolver->work_u_cuda[gid - 1];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x - 1);
+			subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x - 1);
 			boundary_index = 2;
 		}
 
 		if(threadIdx.x == blockDim.x - 1 && (blockIdx.x != gridDim.x - 1)/* && !(cudaSolver->currentStep & 1)*/)
 		{
 			u_cmp = cudaSolver->work_u_cuda[gid + 1];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x + 1);
+			subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x + 1);
 			boundary_index = 1;
 		}
 //		if(threadIdx.y == 0 && (blockIdx.y != 0) && (cudaSolver->currentStep & 1))
 //		{
 //			u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x];
-//			subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y - 1)*gridDim.x + blockIdx.x);
+//			subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D((blockIdx.y - 1)*gridDim.x + blockIdx.x);
 //			boundary_index = 3;
 //		}
 //		if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1) && (cudaSolver->currentStep & 1))
 //		{
 //			u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x];
-//			subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y + 1)*gridDim.x + blockIdx.x);
+//			subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D((blockIdx.y + 1)*gridDim.x + blockIdx.x);
 //			boundary_index = 0;
 //		}
 
@@ -1317,13 +1317,13 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 		if(threadIdx.y == 0 && (blockIdx.y != 0)/* && (cudaSolver->currentStep & 1)*/)
 		{
 			u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y - 1)*gridDim.x + blockIdx.x);
+			subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D((blockIdx.y - 1)*gridDim.x + blockIdx.x);
 			boundary_index = 3;
 		}
 		if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1)/* && (cudaSolver->currentStep & 1)*/)
 		{
 			u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x];
-			subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y + 1)*gridDim.x + blockIdx.x);
+			subgridValue_cmp = cudaSolver->getSubgridValueCUDA2D((blockIdx.y + 1)*gridDim.x + blockIdx.x);
 			boundary_index = 0;
 		}
 
@@ -1341,9 +1341,9 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 	if(threadIdx.x+threadIdx.y == 0)
 	{
 		if(subgridValue == INT_MAX && newSubgridValue !=0)
-			cudaSolver->setSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x, -INT_MAX);
+			cudaSolver->setSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x, -INT_MAX);
 
-		cudaSolver->setBoundaryConditionCUDA(blockIdx.y*gridDim.x + blockIdx.x, 	boundary[0] +
+		cudaSolver->setBoundaryConditionCUDA2D(blockIdx.y*gridDim.x + blockIdx.x, 	boundary[0] +
 																				2 * boundary[1] +
 																				4 * boundary[2] +
 																				8 * boundary[3]);
@@ -1366,32 +1366,32 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 			{
 				tmp1 = cudaSolver->gridCols*cudaSolver->n*((cudaSolver->n-1)+j*cudaSolver->n) + i;
 				tmp2 = cudaSolver->gridCols*cudaSolver->n*((cudaSolver->n)+j*cudaSolver->n) + i;
-				grid1 = cudaSolver->getSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp1));
-				grid2 = cudaSolver->getSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp2));
+				grid1 = cudaSolver->getSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1));
+				grid2 = cudaSolver->getSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2));
 
 				if ((fabs(cudaSolver->work_u_cuda[tmp1]) < fabs(cudaSolver->work_u_cuda[tmp2]) - cudaSolver->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
 				{
-					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA(tmp1),cudaSolver->getOwnerCUDA(tmp2));
+					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA2D(tmp1),cudaSolver->getOwnerCUDA2D(tmp2));
 					cudaSolver->work_u_cuda[tmp2] = cudaSolver->work_u_cuda[tmp1];
 					cudaSolver->unusedCell_cuda[tmp2] = 0;
 					if(grid2 == INT_MAX)
 					{
-						cudaSolver->setSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp2), -INT_MAX);
+						cudaSolver->setSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2), -INT_MAX);
 					}
-					if(! (cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp2)) & 8) )
-						cudaSolver->setBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp2), cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp2))+8);
+					if(! (cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2)) & 8) )
+						cudaSolver->setBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2), cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2))+8);
 				}
 				else if ((fabs(cudaSolver->work_u_cuda[tmp1]) > fabs(cudaSolver->work_u_cuda[tmp2]) + cudaSolver->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
 				{
-					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA(tmp1),cudaSolver->getOwnerCUDA(tmp2));
+					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA2D(tmp1),cudaSolver->getOwnerCUDA2D(tmp2));
 					cudaSolver->work_u_cuda[tmp1] = cudaSolver->work_u_cuda[tmp2];
 					cudaSolver->unusedCell_cuda[tmp1] = 0;
 					if(grid1 == INT_MAX)
 					{
-						cudaSolver->setSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp1), -INT_MAX);
+						cudaSolver->setSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1), -INT_MAX);
 					}
-					if(! (cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp1)) & 1) )
-						cudaSolver->setBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp1), cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp1))+1);
+					if(! (cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1)) & 1) )
+						cudaSolver->setBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1), cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1))+1);
 				}
 			}
 		}
@@ -1408,32 +1408,32 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 
 				tmp1 = cudaSolver->gridCols*cudaSolver->n*j + i*cudaSolver->n - 1;
 				tmp2 = cudaSolver->gridCols*cudaSolver->n*j + i*cudaSolver->n ;
-				grid1 = cudaSolver->getSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp1));
-				grid2 = cudaSolver->getSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp2));
+				grid1 = cudaSolver->getSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1));
+				grid2 = cudaSolver->getSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2));
 
 				if ((fabs(cudaSolver->work_u_cuda[tmp1]) < fabs(cudaSolver->work_u_cuda[tmp2]) - cudaSolver->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
 				{
-					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA(tmp1),cudaSolver->getOwnerCUDA(tmp2));
+					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA2D(tmp1),cudaSolver->getOwnerCUDA2D(tmp2));
 					cudaSolver->work_u_cuda[tmp2] = cudaSolver->work_u_cuda[tmp1];
 					cudaSolver->unusedCell_cuda[tmp2] = 0;
 					if(grid2 == INT_MAX)
 					{
-						cudaSolver->setSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp2), -INT_MAX);
+						cudaSolver->setSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2), -INT_MAX);
 					}
-					if(! (cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp2)) & 4) )
-						cudaSolver->setBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp2), cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp2))+4);
+					if(! (cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2)) & 4) )
+						cudaSolver->setBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2), cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp2))+4);
 				}
 				else if ((fabs(cudaSolver->work_u_cuda[tmp1]) > fabs(cudaSolver->work_u_cuda[tmp2]) + cudaSolver->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
 				{
-					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA(tmp1),cudaSolver->getOwnerCUDA(tmp2));
+					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA2D(tmp1),cudaSolver->getOwnerCUDA2D(tmp2));
 					cudaSolver->work_u_cuda[tmp1] = cudaSolver->work_u_cuda[tmp2];
 					cudaSolver->unusedCell_cuda[tmp1] = 0;
 					if(grid1 == INT_MAX)
 					{
-						cudaSolver->setSubgridValueCUDA(cudaSolver->getOwnerCUDA(tmp1), -INT_MAX);
+						cudaSolver->setSubgridValueCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1), -INT_MAX);
 					}
-					if(! (cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp1)) & 2) )
-						cudaSolver->setBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp1), cudaSolver->getBoundaryConditionCUDA(cudaSolver->getOwnerCUDA(tmp1))+2);
+					if(! (cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1)) & 2) )
+						cudaSolver->setBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1), cudaSolver->getBoundaryConditionCUDA2D(cudaSolver->getOwnerCUDA2D(tmp1))+2);
 				}
 			}
 		}
@@ -1444,15 +1444,15 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 	int stepValue = cudaSolver->currentStep + 4;
 	for (int i = 0; i < cudaSolver->gridRows * cudaSolver->gridCols; i++)
 	{
-		if( cudaSolver->getSubgridValueCUDA(i) == -INT_MAX )
-			cudaSolver->setSubgridValueCUDA(i, stepValue);
+		if( cudaSolver->getSubgridValueCUDA2D(i) == -INT_MAX )
+			cudaSolver->setSubgridValueCUDA2D(i, stepValue);
 	}
 
 	int maxi = 0;
 	for(int q=0; q < cudaSolver->gridRows*cudaSolver->gridCols;q++)
 	{
 		//printf("%d : %d\n", q, cudaSolver->boundaryConditions_cuda[q]);
-		maxi=Max(maxi,cudaSolver->getBoundaryConditionCUDA(q));
+		maxi=Max(maxi,cudaSolver->getBoundaryConditionCUDA2D(q));
 	}
 	//printf("I am not an empty kernel! %d\n", maxi);
 	*(cudaSolver->runcuda) = (maxi > 0);
@@ -1465,7 +1465,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 
 template <typename SchemeHost, typename SchemeDevice, typename Device>
 __global__
-void synchronize2CUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver)
+void synchronize2CUDA2D(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver)
 {
 	if(blockIdx.x+blockIdx.y ==0)
 	{
@@ -1474,10 +1474,10 @@ void synchronize2CUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device,
 	}
 
 	int stepValue = cudaSolver->currentStep + 4;
-	if( cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX )
-			cudaSolver->setSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x, stepValue);
+	if( cudaSolver->getSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX )
+			cudaSolver->setSubgridValueCUDA2D(blockIdx.y*gridDim.x + blockIdx.x, stepValue);
 
-	atomicMax((cudaSolver->runcuda),cudaSolver->getBoundaryConditionCUDA(blockIdx.y*gridDim.x + blockIdx.x));
+	atomicMax((cudaSolver->runcuda),cudaSolver->getBoundaryConditionCUDA2D(blockIdx.y*gridDim.x + blockIdx.x));
 }
 
 
@@ -1489,7 +1489,7 @@ void synchronize2CUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device,
 
 template< typename SchemeHost, typename SchemeDevice, typename Device>
 __global__
-void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::*/initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr , int* ptr2, int* ptr3)
+void /*tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::*/initCUDA2D( tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr , int* ptr2, int* ptr3)
 {
 	//cout << "Initializating solver..." << endl;
 	//const tnlString& meshLocation = parameters.getParameter <tnlString>("mesh");
@@ -1582,7 +1582,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 //extern __shared__ double array[];
 template< typename SchemeHost, typename SchemeDevice, typename Device >
 __global__
-void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::*/initRunCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller)
+void /*tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::*/initRunCUDA2D(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller)
 
 {
 
@@ -1598,7 +1598,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 		containsCurve = 0;
 
 	//double a;
-	caller->getSubgridCUDA(i,caller, &u[l]);
+	caller->getSubgridCUDA2D(i,caller, &u[l]);
 	//printf("%f   %f\n",a , u[l]);
 	//u[l] = a;
 	//printf("Hi %f \n", u[l]);
@@ -1616,13 +1616,13 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 	if(containsCurve == 1)
 	{
 		//printf("have curve \n");
-		caller->runSubgridCUDA(0,u,i);
+		caller->runSubgridCUDA2D(0,u,i);
 		//printf("%d : %f\n", l, u[l]);
 		__syncthreads();
-		caller->insertSubgridCUDA(u[l],i);
+		caller->insertSubgridCUDA2D(u[l],i);
 		__syncthreads();
 		if(l == 0)
-			caller->setSubgridValueCUDA(i, 4);
+			caller->setSubgridValueCUDA2D(i, 4);
 	}
 
 
@@ -1634,59 +1634,59 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 
 template< typename SchemeHost, typename SchemeDevice, typename Device >
 __global__
-void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::*/runCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller)
+void /*tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int>::*/runCUDA2D(tnlParallelEikonalSolver<2,SchemeHost, SchemeDevice, Device, double, int >* caller)
 {
 	extern __shared__ double u[];
 	int i = blockIdx.y * gridDim.x + blockIdx.x;
 	int l = threadIdx.y * blockDim.x + threadIdx.x;
-	int bound = caller->getBoundaryConditionCUDA(i);
+	int bound = caller->getBoundaryConditionCUDA2D(i);
 
-	if(caller->getSubgridValueCUDA(i) != INT_MAX && bound != 0 && caller->getSubgridValueCUDA(i) > 0)
+	if(caller->getSubgridValueCUDA2D(i) != INT_MAX && bound != 0 && caller->getSubgridValueCUDA2D(i) > 0)
 	{
-		caller->getSubgridCUDA(i,caller, &u[l]);
+		caller->getSubgridCUDA2D(i,caller, &u[l]);
 
 		//if(l == 0)
-			//printf("i = %d, bound = %d\n",i,caller->getSubgridValueCUDA(i));
-		if(caller->getSubgridValueCUDA(i) == caller->currentStep+4)
+			//printf("i = %d, bound = %d\n",i,caller->getSubgridValueCUDA2D(i));
+		if(caller->getSubgridValueCUDA2D(i) == caller->currentStep+4)
 		{
 			if(bound & 1)
 			{
-				caller->runSubgridCUDA(1,u,i);
+				caller->runSubgridCUDA2D(1,u,i);
 				//__syncthreads();
-				//caller->insertSubgridCUDA(u[l],i);
+				//caller->insertSubgridCUDA2D(u[l],i);
 				//__syncthreads();
-				//caller->getSubgridCUDA(i,caller, &u[l]);
-				caller->updateSubgridCUDA(i,caller, &u[l]);
+				//caller->getSubgridCUDA2D(i,caller, &u[l]);
+				caller->updateSubgridCUDA2D(i,caller, &u[l]);
 				__syncthreads();
 			}
 			if(bound & 2 )
 			{
-				caller->runSubgridCUDA(2,u,i);
+				caller->runSubgridCUDA2D(2,u,i);
 				//__syncthreads();
-				//caller->insertSubgridCUDA(u[l],i);
+				//caller->insertSubgridCUDA2D(u[l],i);
 				//__syncthreads();
-				//caller->getSubgridCUDA(i,caller, &u[l]);
-				caller->updateSubgridCUDA(i,caller, &u[l]);
+				//caller->getSubgridCUDA2D(i,caller, &u[l]);
+				caller->updateSubgridCUDA2D(i,caller, &u[l]);
 				__syncthreads();
 			}
 			if(bound & 4)
 			{
-				caller->runSubgridCUDA(4,u,i);
+				caller->runSubgridCUDA2D(4,u,i);
 				//__syncthreads();
-				//caller->insertSubgridCUDA(u[l],i);
+				//caller->insertSubgridCUDA2D(u[l],i);
 				//__syncthreads();
-				//caller->getSubgridCUDA(i,caller, &u[l]);
-				caller->updateSubgridCUDA(i,caller, &u[l]);
+				//caller->getSubgridCUDA2D(i,caller, &u[l]);
+				caller->updateSubgridCUDA2D(i,caller, &u[l]);
 				__syncthreads();
 			}
 			if(bound & 8)
 			{
-				caller->runSubgridCUDA(8,u,i);
+				caller->runSubgridCUDA2D(8,u,i);
 				//__syncthreads();
-				//caller->insertSubgridCUDA(u[l],i);
+				//caller->insertSubgridCUDA2D(u[l],i);
 				//__syncthreads();
-				//caller->getSubgridCUDA(i,caller, &u[l]);
-				caller->updateSubgridCUDA(i,caller, &u[l]);
+				//caller->getSubgridCUDA2D(i,caller, &u[l]);
+				caller->updateSubgridCUDA2D(i,caller, &u[l]);
 				__syncthreads();
 			}
 
@@ -1696,42 +1696,42 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 
 			if( ((bound & 3 )))
 				{
-					caller->runSubgridCUDA(3,u,i);
+					caller->runSubgridCUDA2D(3,u,i);
 					//__syncthreads();
-					//caller->insertSubgridCUDA(u[l],i);
+					//caller->insertSubgridCUDA2D(u[l],i);
 					//__syncthreads();
-					//caller->getSubgridCUDA(i,caller, &u[l]);
-					caller->updateSubgridCUDA(i,caller, &u[l]);
+					//caller->getSubgridCUDA2D(i,caller, &u[l]);
+					caller->updateSubgridCUDA2D(i,caller, &u[l]);
 					__syncthreads();
 				}
 				if( ((bound & 5 )))
 				{
-					caller->runSubgridCUDA(5,u,i);
+					caller->runSubgridCUDA2D(5,u,i);
 					//__syncthreads();
-					//caller->insertSubgridCUDA(u[l],i);
+					//caller->insertSubgridCUDA2D(u[l],i);
 					//__syncthreads();
-					//caller->getSubgridCUDA(i,caller, &u[l]);
-					caller->updateSubgridCUDA(i,caller, &u[l]);
+					//caller->getSubgridCUDA2D(i,caller, &u[l]);
+					caller->updateSubgridCUDA2D(i,caller, &u[l]);
 					__syncthreads();
 				}
 				if( ((bound & 10 )))
 				{
-					caller->runSubgridCUDA(10,u,i);
+					caller->runSubgridCUDA2D(10,u,i);
 					//__syncthreads();
-					//caller->insertSubgridCUDA(u[l],i);
+					//caller->insertSubgridCUDA2D(u[l],i);
 					//__syncthreads();
-					//caller->getSubgridCUDA(i,caller, &u[l]);
-					caller->updateSubgridCUDA(i,caller, &u[l]);
+					//caller->getSubgridCUDA2D(i,caller, &u[l]);
+					caller->updateSubgridCUDA2D(i,caller, &u[l]);
 					__syncthreads();
 				}
 				if(   (bound & 12 ))
 				{
-					caller->runSubgridCUDA(12,u,i);
+					caller->runSubgridCUDA2D(12,u,i);
 					//__syncthreads();
-					//caller->insertSubgridCUDA(u[l],i);
+					//caller->insertSubgridCUDA2D(u[l],i);
 					//__syncthreads();
-					//caller->getSubgridCUDA(i,caller, &u[l]);
-					caller->updateSubgridCUDA(i,caller, &u[l]);
+					//caller->getSubgridCUDA2D(i,caller, &u[l]);
+					caller->updateSubgridCUDA2D(i,caller, &u[l]);
 					__syncthreads();
 				}
 
@@ -1755,42 +1755,42 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 
 			if( ((bound == 2)))
 						{
-							caller->runSubgridCUDA(2,u,i);
+							caller->runSubgridCUDA2D(2,u,i);
 							//__syncthreads();
-							//caller->insertSubgridCUDA(u[l],i);
+							//caller->insertSubgridCUDA2D(u[l],i);
 							//__syncthreads();
-							//caller->getSubgridCUDA(i,caller, &u[l]);
-							caller->updateSubgridCUDA(i,caller, &u[l]);
+							//caller->getSubgridCUDA2D(i,caller, &u[l]);
+							caller->updateSubgridCUDA2D(i,caller, &u[l]);
 							__syncthreads();
 						}
 						if( ((bound == 1) ))
 						{
-							caller->runSubgridCUDA(1,u,i);
+							caller->runSubgridCUDA2D(1,u,i);
 							//__syncthreads();
-							//caller->insertSubgridCUDA(u[l],i);
+							//caller->insertSubgridCUDA2D(u[l],i);
 							//__syncthreads();
-							//caller->getSubgridCUDA(i,caller, &u[l]);
-							caller->updateSubgridCUDA(i,caller, &u[l]);
+							//caller->getSubgridCUDA2D(i,caller, &u[l]);
+							caller->updateSubgridCUDA2D(i,caller, &u[l]);
 							__syncthreads();
 						}
 						if( ((bound == 8) ))
 						{
-							caller->runSubgridCUDA(8,u,i);
+							caller->runSubgridCUDA2D(8,u,i);
 							//__syncthreads();
-							//caller->insertSubgridCUDA(u[l],i);
+							//caller->insertSubgridCUDA2D(u[l],i);
 							//__syncthreads();
-							//caller->getSubgridCUDA(i,caller, &u[l]);
-							caller->updateSubgridCUDA(i,caller, &u[l]);
+							//caller->getSubgridCUDA2D(i,caller, &u[l]);
+							caller->updateSubgridCUDA2D(i,caller, &u[l]);
 							__syncthreads();
 						}
 						if(   (bound == 4))
 						{
-							caller->runSubgridCUDA(4,u,i);
+							caller->runSubgridCUDA2D(4,u,i);
 							//__syncthreads();
-							//caller->insertSubgridCUDA(u[l],i);
+							//caller->insertSubgridCUDA2D(u[l],i);
 							//__syncthreads();
-							//caller->getSubgridCUDA(i,caller, &u[l]);
-							caller->updateSubgridCUDA(i,caller, &u[l]);
+							//caller->getSubgridCUDA2D(i,caller, &u[l]);
+							caller->updateSubgridCUDA2D(i,caller, &u[l]);
 							__syncthreads();
 						}
 
@@ -1805,42 +1805,42 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 
 			if( ((bound & 3) ))
 			{
-				caller->runSubgridCUDA(3,u,i);
+				caller->runSubgridCUDA2D(3,u,i);
 				//__syncthreads();
-				//caller->insertSubgridCUDA(u[l],i);
+				//caller->insertSubgridCUDA2D(u[l],i);
 				//__syncthreads();
-				//caller->getSubgridCUDA(i,caller, &u[l]);
-				caller->updateSubgridCUDA(i,caller, &u[l]);
+				//caller->getSubgridCUDA2D(i,caller, &u[l]);
+				caller->updateSubgridCUDA2D(i,caller, &u[l]);
 				__syncthreads();
 			}
 			if( ((bound & 5) ))
 			{
-				caller->runSubgridCUDA(5,u,i);
+				caller->runSubgridCUDA2D(5,u,i);
 				//__syncthreads();
-				//caller->insertSubgridCUDA(u[l],i);
+				//caller->insertSubgridCUDA2D(u[l],i);
 				//__syncthreads();
-				//caller->getSubgridCUDA(i,caller, &u[l]);
-				caller->updateSubgridCUDA(i,caller, &u[l]);
+				//caller->getSubgridCUDA2D(i,caller, &u[l]);
+				caller->updateSubgridCUDA2D(i,caller, &u[l]);
 				__syncthreads();
 			}
 			if( ((bound & 10) ))
 			{
-				caller->runSubgridCUDA(10,u,i);
+				caller->runSubgridCUDA2D(10,u,i);
 				//__syncthreads();
-				//caller->insertSubgridCUDA(u[l],i);
+				//caller->insertSubgridCUDA2D(u[l],i);
 				//__syncthreads();
-				//caller->getSubgridCUDA(i,caller, &u[l]);
-				caller->updateSubgridCUDA(i,caller, &u[l]);
+				//caller->getSubgridCUDA2D(i,caller, &u[l]);
+				caller->updateSubgridCUDA2D(i,caller, &u[l]);
 				__syncthreads();
 			}
 			if(   (bound & 12) )
 			{
-				caller->runSubgridCUDA(12,u,i);
+				caller->runSubgridCUDA2D(12,u,i);
 				//__syncthreads();
-				//caller->insertSubgridCUDA(u[l],i);
+				//caller->insertSubgridCUDA2D(u[l],i);
 				//__syncthreads();
-				//caller->getSubgridCUDA(i,caller, &u[l]);
-				caller->updateSubgridCUDA(i,caller, &u[l]);
+				//caller->getSubgridCUDA2D(i,caller, &u[l]);
+				caller->updateSubgridCUDA2D(i,caller, &u[l]);
 				__syncthreads();
 			}
 
@@ -1858,19 +1858,19 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 		}
 		/*if( bound )
 		{
-			caller->runSubgridCUDA(15,u,i);
+			caller->runSubgridCUDA2D(15,u,i);
 			__syncthreads();
-			//caller->insertSubgridCUDA(u[l],i);
+			//caller->insertSubgridCUDA2D(u[l],i);
 			//__syncthreads();
-			//caller->getSubgridCUDA(i,caller, &u[l]);
-			caller->updateSubgridCUDA(i,caller, &u[l]);
+			//caller->getSubgridCUDA2D(i,caller, &u[l]);
+			caller->updateSubgridCUDA2D(i,caller, &u[l]);
 			__syncthreads();
 		}*/
 
 		if(l==0)
 		{
-			caller->setBoundaryConditionCUDA(i, 0);
-			caller->setSubgridValueCUDA(i, caller->getSubgridValueCUDA(i) - 1 );
+			caller->setBoundaryConditionCUDA2D(i, 0);
+			caller->setSubgridValueCUDA2D(i, caller->getSubgridValueCUDA2D(i) - 1 );
 		}
 
 
@@ -1882,4 +1882,4 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 
 #endif /*HAVE_CUDA*/
 
-#endif /* TNLPARALLELEIKONALSOLVER_IMPL_H_ */
+#endif /* TNLPARALLELEIKONALSOLVER2D_IMPL_H_ */
diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..91ec8afa4f75c2ac031a037d1e9e8e027108609e
--- /dev/null
+++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
@@ -0,0 +1,1885 @@
+/***************************************************************************
+                          tnlParallelEikonalSolver2D_impl.h  -  description
+                             -------------------
+    begin                : Nov 28 , 2014
+    copyright            : (C) 2014 by Tomas Sobotik
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLPARALLELEIKONALSOLVER3D_IMPL_H_
+#define TNLPARALLELEIKONALSOLVER3D_IMPL_H_
+
+
+#include "tnlParallelEikonalSolver.h"
+#include <core/mfilename.h>
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver()
+{
+	cout << "a" << endl;
+	this->device = tnlCudaDevice;  /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice   ---    vypocet na CPU
+
+#ifdef HAVE_CUDA
+	if(this->device == tnlCudaDevice)
+	{
+	run_host = 1;
+	}
+#endif
+
+	cout << "b" << endl;
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::test()
+{
+/*
+	for(int i =0; i < this->subgridValues.getSize(); i++ )
+	{
+		insertSubgrid(getSubgrid(i), i);
+	}
+*/
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+
+bool tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::init( const tnlParameterContainer& parameters )
+{
+	cout << "Initializating solver..." << endl;
+	const tnlString& meshLocation = parameters.getParameter <tnlString>("mesh");
+	this->mesh.load( meshLocation );
+
+	this->n = parameters.getParameter <int>("subgrid-size");
+	cout << "Setting N to " << this->n << endl;
+
+	this->subMesh.setDimensions( this->n, this->n );
+	this->subMesh.setDomain( tnlStaticVector<3,double>(0.0, 0.0),
+							 tnlStaticVector<3,double>(this->mesh.getHx()*(double)(this->n), this->mesh.getHy()*(double)(this->n),this->mesh.getHz()*(double)(this->n)) );
+
+	this->subMesh.save("submesh.tnl");
+
+	const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition");
+	this->u0.load( initialCondition );
+
+	//cout << this->mesh.getCellCenter(0) << endl;
+
+	this->delta = parameters.getParameter <double>("delta");
+	this->delta *= this->mesh.getHx()*this->mesh.getHy();
+
+	cout << "Setting delta to " << this->delta << endl;
+
+	this->tau0 = parameters.getParameter <double>("initial-tau");
+	cout << "Setting initial tau to " << this->tau0 << endl;
+	this->stopTime = parameters.getParameter <double>("stop-time");
+
+	this->cflCondition = parameters.getParameter <double>("cfl-condition");
+	this -> cflCondition *= sqrt(this->mesh.getHx()*this->mesh.getHy());
+	cout << "Setting CFL to " << this->cflCondition << endl;
+
+	stretchGrid();
+	this->stopTime /= (double)(this->gridCols);
+	this->stopTime *= (1.0+1.0/((double)(this->n) - 2.0));
+	cout << "Setting stopping time to " << this->stopTime << endl;
+	//this->stopTime = 1.5*((double)(this->n))*parameters.getParameter <double>("stop-time")*this->mesh.getHx();
+	//cout << "Setting stopping time to " << this->stopTime << endl;
+
+	cout << "Initializating scheme..." << endl;
+	if(!this->schemeHost.init(parameters))
+	{
+		cerr << "SchemeHost failed to initialize." << endl;
+		return false;
+	}
+	cout << "Scheme initialized." << endl;
+
+	test();
+
+	VectorType* tmp = new VectorType[subgridValues.getSize()];
+	bool containsCurve = false;
+
+#ifdef HAVE_CUDA
+
+	if(this->device == tnlCudaDevice)
+	{
+	/*cout << "Testing... " << endl;
+	if(this->device == tnlCudaDevice)
+	{
+	if( !initCUDA3D(parameters, gridRows, gridCols) )
+		return false;
+	}*/
+		//cout << "s" << endl;
+	cudaMalloc(&(this->cudaSolver), sizeof(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >));
+	//cout << "s" << endl;
+	cudaMemcpy(this->cudaSolver, this,sizeof(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >), cudaMemcpyHostToDevice);
+	//cout << "s" << endl;
+	double** tmpdev = NULL;
+	cudaMalloc(&tmpdev, sizeof(double*));
+	//double* tmpw;
+	cudaMalloc(&(this->tmpw), this->work_u.getSize()*sizeof(double));
+	cudaMalloc(&(this->runcuda), sizeof(int));
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+	int* tmpUC;
+	cudaMalloc(&(tmpUC), this->work_u.getSize()*sizeof(int));
+	cudaMemcpy(tmpUC, this->unusedCell.getData(), this->unusedCell.getSize()*sizeof(int), cudaMemcpyHostToDevice);
+
+	initCUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver, (this->tmpw), (this->runcuda),tmpUC);
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+	//cout << "s " << endl;
+	//cudaMalloc(&(cudaSolver->work_u_cuda), this->work_u.getSize()*sizeof(double));
+	double* tmpu = NULL;
+
+	cudaMemcpy(&tmpu, tmpdev,sizeof(double*), cudaMemcpyDeviceToHost);
+	//printf("%p %p \n",tmpu,tmpw);
+	cudaMemcpy((this->tmpw), this->work_u.getData(), this->work_u.getSize()*sizeof(double), cudaMemcpyHostToDevice);
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+	//cout << "s "<< endl;
+
+	}
+#endif
+
+	if(this->device == tnlHostDevice)
+	{
+	for(int i = 0; i < this->subgridValues.getSize(); i++)
+	{
+
+		if(! tmp[i].setSize(this->n * this->n))
+			cout << "Could not allocate tmp["<< i <<"] array." << endl;
+			tmp[i] = getSubgrid(i);
+		containsCurve = false;
+
+		for(int j = 0; j < tmp[i].getSize(); j++)
+		{
+			if(tmp[i][0]*tmp[i][j] <= 0.0)
+			{
+				containsCurve = true;
+				j=tmp[i].getSize();
+			}
+
+		}
+		if(containsCurve)
+		{
+			//cout << "Computing initial SDF on subgrid " << i << "." << endl;
+			insertSubgrid(runSubgrid(0, tmp[i],i), i);
+			setSubgridValue(i, 4);
+			//cout << "Computed initial SDF on subgrid " << i  << "." << endl;
+		}
+		containsCurve = false;
+
+	}
+	}
+#ifdef HAVE_CUDA
+	else if(this->device == tnlCudaDevice)
+	{
+//		cout << "pre 1 kernel" << endl;
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+		dim3 threadsPerBlock(this->n, this->n);
+		dim3 numBlocks(this->gridCols,this->gridRows);
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+		initRunCUDA3D<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
+		cudaDeviceSynchronize();
+//		cout << "post 1 kernel" << endl;
+
+	}
+#endif
+
+
+	this->currentStep = 1;
+	if(this->device == tnlHostDevice)
+		synchronize();
+#ifdef HAVE_CUDA
+	else if(this->device == tnlCudaDevice)
+	{
+		dim3 threadsPerBlock(this->n, this->n);
+		dim3 numBlocks(this->gridCols,this->gridRows);
+		//double * test = (double*)malloc(this->work_u.getSize()*sizeof(double));
+		//cout << test[0] <<"   " << test[1] <<"   " << test[2] <<"   " << test[3] << endl;
+		//cudaMemcpy(/*this->work_u.getData()*/ test, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
+		//cout << this->tmpw << "   " <<  test[0] <<"   " << test[1] << "   " <<test[2] << "   " <<test[3] << endl;
+
+		checkCudaDevice;
+
+		synchronizeCUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+		synchronize2CUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+		//cout << test[0] << "   " <<test[1] <<"   " << test[2] << "   " <<test[3] << endl;
+		//cudaMemcpy(/*this->work_u.getData()*/ test, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
+		//checkCudaDevice;
+		//cout << this->tmpw << "   " <<  test[0] << "   " <<test[1] << "   " <<test[2] <<"   " << test[3] << endl;
+		//free(test);
+
+	}
+
+#endif
+	cout << "Solver initialized." << endl;
+
+	return true;
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::run()
+{
+	if(this->device == tnlHostDevice)
+	{
+
+	bool end = false;
+	while ((this->boundaryConditions.max() > 0 ) || !end)
+	{
+		if(this->boundaryConditions.max() == 0 )
+			end=true;
+		else
+			end=false;
+#ifdef HAVE_OPENMP
+#pragma omp parallel for num_threads(4) schedule(dynamic)
+#endif
+		for(int i = 0; i < this->subgridValues.getSize(); i++)
+		{
+			if(getSubgridValue(i) != INT_MAX)
+			{
+				//cout << "subMesh: " << i << ", BC: " << getBoundaryCondition(i) << endl;
+
+				if(getSubgridValue(i) == currentStep+4)
+				{
+
+				if(getBoundaryCondition(i) & 1)
+				{
+					insertSubgrid( runSubgrid(1, getSubgrid(i),i), i);
+					this->calculationsCount[i]++;
+				}
+				if(getBoundaryCondition(i) & 2)
+				{
+					insertSubgrid( runSubgrid(2, getSubgrid(i),i), i);
+					this->calculationsCount[i]++;
+				}
+				if(getBoundaryCondition(i) & 4)
+				{
+					insertSubgrid( runSubgrid(4, getSubgrid(i),i), i);
+					this->calculationsCount[i]++;
+				}
+				if(getBoundaryCondition(i) & 8)
+				{
+					insertSubgrid( runSubgrid(8, getSubgrid(i),i), i);
+					this->calculationsCount[i]++;
+				}
+				}
+
+				if( ((getBoundaryCondition(i) & 2) )|| (getBoundaryCondition(i) & 1)//)
+					/*	&&(!(getBoundaryCondition(i) & 5) && !(getBoundaryCondition(i) & 10)) */)
+				{
+					//cout << "3 @ " << getBoundaryCondition(i) << endl;
+					insertSubgrid( runSubgrid(3, getSubgrid(i),i), i);
+				}
+				if( ((getBoundaryCondition(i) & 4) )|| (getBoundaryCondition(i) & 1)//)
+					/*	&&(!(getBoundaryCondition(i) & 3) && !(getBoundaryCondition(i) & 12)) */)
+				{
+					//cout << "5 @ " << getBoundaryCondition(i) << endl;
+					insertSubgrid( runSubgrid(5, getSubgrid(i),i), i);
+				}
+				if( ((getBoundaryCondition(i) & 2) )|| (getBoundaryCondition(i) & 8)//)
+					/*	&&(!(getBoundaryCondition(i) & 12) && !(getBoundaryCondition(i) & 3))*/ )
+				{
+					//cout << "10 @ " << getBoundaryCondition(i) << endl;
+					insertSubgrid( runSubgrid(10, getSubgrid(i),i), i);
+				}
+				if(   ((getBoundaryCondition(i) & 4) )|| (getBoundaryCondition(i) & 8)//)
+					/*&&(!(getBoundaryCondition(i) & 10) && !(getBoundaryCondition(i) & 5)) */)
+				{
+					//cout << "12 @ " << getBoundaryCondition(i) << endl;
+					insertSubgrid( runSubgrid(12, getSubgrid(i),i), i);
+				}
+
+
+				/*if(getBoundaryCondition(i))
+				{
+					insertSubgrid( runSubgrid(15, getSubgrid(i),i), i);
+				}*/
+
+				setBoundaryCondition(i, 0);
+
+				setSubgridValue(i, getSubgridValue(i)-1);
+
+			}
+		}
+		synchronize();
+	}
+	}
+#ifdef HAVE_CUDA
+	else if(this->device == tnlCudaDevice)
+	{
+		//cout << "fn" << endl;
+		bool end_cuda = false;
+		dim3 threadsPerBlock(this->n, this->n);
+		dim3 numBlocks(this->gridCols,this->gridRows);
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+		//cudaMalloc(&runcuda,sizeof(bool));
+		//cudaMemcpy(runcuda, &run_host, sizeof(bool), cudaMemcpyHostToDevice);
+		//cout << "fn" << endl;
+		bool* tmpb;
+		//cudaMemcpy(tmpb, &(cudaSolver->runcuda),sizeof(bool*), cudaMemcpyDeviceToHost);
+		//cudaDeviceSynchronize();
+		//checkCudaDevice;
+		cudaMemcpy(&(this->run_host),this->runcuda,sizeof(int), cudaMemcpyDeviceToHost);
+		cudaDeviceSynchronize();
+		checkCudaDevice;
+		//cout << "fn" << endl;
+		int i = 1;
+		time_diff = 0.0;
+		while (run_host || !end_cuda)
+		{
+			cout << "Computing at step "<< i++ << endl;
+			if(run_host != 0 )
+				end_cuda = true;
+			else
+				end_cuda = false;
+			//cout << "a" << endl;
+			cudaDeviceSynchronize();
+			checkCudaDevice;
+			start = std::clock();
+			runCUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
+			//cout << "a" << endl;
+			cudaDeviceSynchronize();
+			time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
+
+			//start = std::clock();
+			synchronizeCUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
+			cudaDeviceSynchronize();
+			checkCudaDevice;
+			synchronize2CUDA3D<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
+			cudaDeviceSynchronize();
+			checkCudaDevice;
+			//time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
+
+
+			//cout << "a" << endl;
+			//run_host = false;
+			//cout << "in kernel loop" << run_host << endl;
+			//cudaMemcpy(tmpb, &(cudaSolver->runcuda),sizeof(bool*), cudaMemcpyDeviceToHost);
+			cudaMemcpy(&run_host, (this->runcuda),sizeof(int), cudaMemcpyDeviceToHost);
+			//cout << "in kernel loop" << run_host << endl;
+		}
+		cout << "Solving time was: " << time_diff << endl;
+		//cout << "b" << endl;
+
+		//double* tmpu;
+		//cudaMemcpy(tmpu, &(cudaSolver->work_u_cuda),sizeof(double*), cudaMemcpyHostToDevice);
+		//cudaMemcpy(this->work_u.getData(), tmpu, this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
+		//cout << this->work_u.getData()[0] << endl;
+
+		//double * test = (double*)malloc(this->work_u.getSize()*sizeof(double));
+		//cout << test[0] << test[1] << test[2] << test[3] << endl;
+		cudaMemcpy(this->work_u.getData()/* test*/, (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
+		//cout << this->tmpw << "   " <<  test[0] << test[1] << test[2] << test[3] << endl;
+		//free(test);
+
+		cudaDeviceSynchronize();
+	}
+#endif
+	contractGrid();
+	this->u0.save("u-00001.tnl");
+	cout << "Maximum number of calculations on one subgrid was " << this->calculationsCount.absMax() << endl;
+	cout << "Average number of calculations on one subgrid was " << ( (double) this->calculationsCount.sum() / (double) this->calculationsCount.getSize() ) << endl;
+	cout << "Solver finished" << endl;
+
+#ifdef HAVE_CUDA
+	if(this->device == tnlCudaDevice)
+	{
+		cudaFree(this->runcuda);
+		cudaFree(this->tmpw);
+		cudaFree(this->cudaSolver);
+	}
+#endif
+
+}
+
+//north - 1, east - 2, west - 4, south - 8
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::synchronize() //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now
+{
+	cout << "Synchronizig..." << endl;
+	int tmp1, tmp2;
+	int grid1, grid2;
+
+	if(this->currentStep & 1)
+	{
+		for(int j = 0; j < this->gridRows - 1; j++)
+		{
+			for (int i = 0; i < this->gridCols*this->n; i++)
+			{
+				tmp1 = this->gridCols*this->n*((this->n-1)+j*this->n) + i;
+				tmp2 = this->gridCols*this->n*((this->n)+j*this->n) + i;
+				grid1 = getSubgridValue(getOwner(tmp1));
+				grid2 = getSubgridValue(getOwner(tmp2));
+				if(getOwner(tmp1)==getOwner(tmp2))
+					cout << "i, j" << i << "," << j << endl;
+				if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
+				{
+					this->work_u[tmp2] = this->work_u[tmp1];
+					this->unusedCell[tmp2] = 0;
+					if(grid2 == INT_MAX)
+					{
+						setSubgridValue(getOwner(tmp2), -INT_MAX);
+					}
+					if(! (getBoundaryCondition(getOwner(tmp2)) & 8) )
+						setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+8);
+				}
+				else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
+				{
+					this->work_u[tmp1] = this->work_u[tmp2];
+					this->unusedCell[tmp1] = 0;
+					if(grid1 == INT_MAX)
+					{
+						setSubgridValue(getOwner(tmp1), -INT_MAX);
+					}
+					if(! (getBoundaryCondition(getOwner(tmp1)) & 1) )
+						setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+1);
+				}
+			}
+		}
+
+	}
+	else
+	{
+		for(int i = 1; i < this->gridCols; i++)
+		{
+			for (int j = 0; j < this->gridRows*this->n; j++)
+			{
+				tmp1 = this->gridCols*this->n*j + i*this->n - 1;
+				tmp2 = this->gridCols*this->n*j + i*this->n ;
+				grid1 = getSubgridValue(getOwner(tmp1));
+				grid2 = getSubgridValue(getOwner(tmp2));
+				if(getOwner(tmp1)==getOwner(tmp2))
+					cout << "i, j" << i << "," << j << endl;
+				if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
+				{
+					this->work_u[tmp2] = this->work_u[tmp1];
+					this->unusedCell[tmp2] = 0;
+					if(grid2 == INT_MAX)
+					{
+						setSubgridValue(getOwner(tmp2), -INT_MAX);
+					}
+					if(! (getBoundaryCondition(getOwner(tmp2)) & 4) )
+						setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+4);
+				}
+				else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
+				{
+					this->work_u[tmp1] = this->work_u[tmp2];
+					this->unusedCell[tmp1] = 0;
+					if(grid1 == INT_MAX)
+					{
+						setSubgridValue(getOwner(tmp1), -INT_MAX);
+					}
+					if(! (getBoundaryCondition(getOwner(tmp1)) & 2) )
+						setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+2);
+				}
+			}
+		}
+	}
+
+
+	this->currentStep++;
+	int stepValue = this->currentStep + 4;
+	for (int i = 0; i < this->subgridValues.getSize(); i++)
+	{
+		if( getSubgridValue(i) == -INT_MAX )
+			setSubgridValue(i, stepValue);
+	}
+
+	cout << "Grid synchronized at step " << (this->currentStep - 1 ) << endl;
+
+}
+
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getOwner(int i) const
+{
+
+	return (i / (this->gridCols*this->n*this->n))*this->gridCols + (i % (this->gridCols*this->n))/this->n;
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgridValue( int i ) const
+{
+	return this->subgridValues[i];
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::setSubgridValue(int i, int value)
+{
+	this->subgridValues[i] = value;
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getBoundaryCondition( int i ) const
+{
+	return this->boundaryConditions[i];
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::setBoundaryCondition(int i, int value)
+{
+	this->boundaryConditions[i] = value;
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::stretchGrid()
+{
+	cout << "Stretching grid..." << endl;
+
+
+	this->gridCols = ceil( ((double)(this->mesh.getDimensions().x()-1)) / ((double)(this->n-1)) );
+	this->gridRows = ceil( ((double)(this->mesh.getDimensions().y()-1)) / ((double)(this->n-1)) );
+
+	//this->gridCols = (this->mesh.getDimensions().x()-1) / (this->n-1) ;
+	//this->gridRows = (this->mesh.getDimensions().y()-1) / (this->n-1) ;
+
+	cout << "Setting gridCols to " << this->gridCols << "." << endl;
+	cout << "Setting gridRows to " << this->gridRows << "." << endl;
+
+	this->subgridValues.setSize(this->gridCols*this->gridRows);
+	this->subgridValues.setValue(0);
+	this->boundaryConditions.setSize(this->gridCols*this->gridRows);
+	this->boundaryConditions.setValue(0);
+	this->calculationsCount.setSize(this->gridCols*this->gridRows);
+	this->calculationsCount.setValue(0);
+
+	for(int i = 0; i < this->subgridValues.getSize(); i++ )
+	{
+		this->subgridValues[i] = INT_MAX;
+		this->boundaryConditions[i] = 0;
+	}
+
+	int stretchedSize = this->n*this->n*this->gridCols*this->gridRows;
+
+	if(!this->work_u.setSize(stretchedSize))
+		cerr << "Could not allocate memory for stretched grid." << endl;
+	if(!this->unusedCell.setSize(stretchedSize))
+		cerr << "Could not allocate memory for supporting stretched grid." << endl;
+	int idealStretch =this->mesh.getDimensions().x() + (this->mesh.getDimensions().x()-2)/(this->n-1);
+	cout << idealStretch << endl;
+
+	for(int i = 0; i < stretchedSize; i++)
+	{
+		this->unusedCell[i] = 1;
+		int diff =(this->n*this->gridCols) - idealStretch ;
+		//cout << "diff = " << diff <<endl;
+		int k = i/this->n - i/(this->n*this->gridCols) + this->mesh.getDimensions().x()*(i/(this->n*this->n*this->gridCols)) + (i/(this->n*this->gridCols))*diff;
+
+		if(i%(this->n*this->gridCols) - idealStretch  >= 0)
+		{
+			//cout << i%(this->n*this->gridCols) - idealStretch +1 << endl;
+			k+= i%(this->n*this->gridCols) - idealStretch +1 ;
+		}
+
+		if(i/(this->n*this->gridCols) - idealStretch + 1  > 0)
+		{
+			//cout << i/(this->n*this->gridCols) - idealStretch + 1  << endl;
+			k+= (i/(this->n*this->gridCols) - idealStretch +1 )* this->mesh.getDimensions().x() ;
+		}
+
+		//cout << "i = " << i << " : i-k = " << i-k << endl;
+		/*int j=(i % (this->n*this->gridCols)) - ( (this->mesh.getDimensions().x() - this->n)/(this->n - 1) + this->mesh.getDimensions().x() - 1)
+				+ (this->n*this->gridCols - this->mesh.getDimensions().x())*(i/(this->n*this->n*this->gridCols)) ;
+
+		if(j > 0)
+			k += j;
+
+		int l = i-k - (this->u0.getSize() - 1);
+		int m = (l % this->mesh.getDimensions().x());
+
+		if(l>0)
+			k+= l + ( (l / this->mesh.getDimensions().x()) + 1 )*this->mesh.getDimensions().x() - (l % this->mesh.getDimensions().x());*/
+
+		this->work_u[i] = this->u0[i-k];
+		//cout << (i-k) <<endl;
+	}
+
+
+	cout << "Grid stretched." << endl;
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::contractGrid()
+{
+	cout << "Contracting grid..." << endl;
+	int stretchedSize = this->n*this->n*this->gridCols*this->gridRows;
+
+	int idealStretch =this->mesh.getDimensions().x() + (this->mesh.getDimensions().x()-2)/(this->n-1);
+	cout << idealStretch << endl;
+
+	for(int i = 0; i < stretchedSize; i++)
+	{
+		int diff =(this->n*this->gridCols) - idealStretch ;
+		int k = i/this->n - i/(this->n*this->gridCols) + this->mesh.getDimensions().x()*(i/(this->n*this->n*this->gridCols)) + (i/(this->n*this->gridCols))*diff;
+
+		if((i%(this->n*this->gridCols) - idealStretch  < 0) && (i/(this->n*this->gridCols) - idealStretch + 1  <= 0))
+		{
+			//cout << i <<" : " <<i-k<< endl;
+			this->u0[i-k] = this->work_u[i];
+		}
+
+	}
+
+	cout << "Grid contracted" << endl;
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+typename tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::VectorType
+tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgrid( const int i ) const
+{
+	VectorType u;
+	u.setSize(this->n*this->n);
+
+	for( int j = 0; j < u.getSize(); j++)
+	{
+		u[j] = this->work_u[ (i / this->gridCols) * this->n*this->n*this->gridCols
+		                     + (i % this->gridCols) * this->n
+		                     + (j/this->n) * this->n*this->gridCols
+		                     + (j % this->n) ];
+	}
+	return u;
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::insertSubgrid( VectorType u, const int i )
+{
+
+	for( int j = 0; j < this->n*this->n; j++)
+	{
+		int index = (i / this->gridCols)*this->n*this->n*this->gridCols + (i % this->gridCols)*this->n + (j/this->n)*this->n*this->gridCols + (j % this->n);
+		//OMP LOCK index
+		if( (fabs(this->work_u[index]) > fabs(u[j])) || (this->unusedCell[index] == 1) )
+		{
+			this->work_u[index] = u[j];
+			this->unusedCell[index] = 0;
+		}
+		//OMP UNLOCK index
+	}
+}
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+typename tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::VectorType
+tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::runSubgrid( int boundaryCondition, VectorType u, int subGridID)
+{
+
+	VectorType fu;
+
+	fu.setLike(u);
+	fu.setValue( 0.0 );
+
+/*
+ *          Insert Euler-Solver Here
+ */
+
+	/**/
+
+	/*for(int i = 0; i < u.getSize(); i++)
+	{
+		int x = this->subMesh.getCellCoordinates(i).x();
+		int y = this->subMesh.getCellCoordinates(i).y();
+
+		if(x == 0 && (boundaryCondition & 4) && y ==0)
+		{
+			if((u[subMesh.getCellYSuccessor( i )] - u[i])/subMesh.getHy() > 1.0)
+			{
+				//cout << "x = 0; y = 0" << endl;
+				u[i] = u[subMesh.getCellYSuccessor( i )] - subMesh.getHy();
+			}
+		}
+		else if(x == 0 && (boundaryCondition & 4) && y == subMesh.getDimensions().y() - 1)
+		{
+			if((u[subMesh.getCellYPredecessor( i )] - u[i])/subMesh.getHy() > 1.0)
+			{
+				//cout << "x = 0; y = n" << endl;
+				u[i] = u[subMesh.getCellYPredecessor( i )] - subMesh.getHy();
+			}
+		}
+
+
+		else if(x == subMesh.getDimensions().x() - 1 && (boundaryCondition & 2) && y ==0)
+		{
+			if((u[subMesh.getCellYSuccessor( i )] - u[i])/subMesh.getHy() > 1.0)
+			{
+				//cout << "x = n; y = 0" << endl;
+				u[i] = u[subMesh.getCellYSuccessor( i )] - subMesh.getHy();
+			}
+		}
+		else if(x == subMesh.getDimensions().x() - 1 && (boundaryCondition & 2) && y == subMesh.getDimensions().y() - 1)
+		{
+			if((u[subMesh.getCellYPredecessor( i )] - u[i])/subMesh.getHy() > 1.0)
+			{
+				//cout << "x = n; y = n" << endl;
+				u[i] = u[subMesh.getCellYPredecessor( i )] - subMesh.getHy();
+			}
+		}
+
+
+		else if(y == 0 && (boundaryCondition & 8) && x ==0)
+		{
+			if((u[subMesh.getCellXSuccessor( i )] - u[i])/subMesh.getHx() > 1.0)
+			{
+				//cout << "y = 0; x = 0" << endl;
+				u[i] = u[subMesh.getCellXSuccessor( i )] - subMesh.getHx();
+			}
+		}
+		else if(y == 0 && (boundaryCondition & 8) && x == subMesh.getDimensions().x() - 1)
+		{
+			if((u[subMesh.getCellXPredecessor( i )] - u[i])/subMesh.getHx() > 1.0)
+			{
+				//cout << "y = 0; x = n" << endl;
+				u[i] = u[subMesh.getCellXPredecessor( i )] - subMesh.getHx();
+			}
+		}
+
+
+		else if(y == subMesh.getDimensions().y() - 1 && (boundaryCondition & 1) && x ==0)
+		{
+			if((u[subMesh.getCellXSuccessor( i )] - u[i])/subMesh.getHx() > 1.0)			{
+				//cout << "y = n; x = 0" << endl;
+				u[i] = u[subMesh.getCellXSuccessor( i )] - subMesh.getHx();
+			}
+		}
+		else if(y == subMesh.getDimensions().y() - 1 && (boundaryCondition & 1) && x == subMesh.getDimensions().x() - 1)
+		{
+			if((u[subMesh.getCellXPredecessor( i )] - u[i])/subMesh.getHx() > 1.0)
+			{
+				//cout << "y = n; x = n" << endl;
+				u[i] = u[subMesh.getCellXPredecessor( i )] - subMesh.getHx();
+			}
+		}
+	}*/
+
+	/**/
+
+
+/*	bool tmp = false;
+	for(int i = 0; i < u.getSize(); i++)
+	{
+		if(u[0]*u[i] <= 0.0)
+			tmp=true;
+	}
+
+
+	if(tmp)
+	{}
+	else if(boundaryCondition == 4)
+	{
+		int i;
+		for(i = 0; i < u.getSize() - subMesh.getDimensions().x() ; i=subMesh.getCellYSuccessor(i))
+		{
+			int j;
+			for(j = i; j < subMesh.getDimensions().x() - 1; j=subMesh.getCellXSuccessor(j))
+			{
+				u[j] = u[i];
+			}
+			u[j] = u[i];
+		}
+		int j;
+		for(j = i; j < subMesh.getDimensions().x() - 1; j=subMesh.getCellXSuccessor(j))
+		{
+			u[j] = u[i];
+		}
+		u[j] = u[i];
+	}
+	else if(boundaryCondition == 8)
+	{
+		int i;
+		for(i = 0; i < subMesh.getDimensions().x() - 1; i=subMesh.getCellXSuccessor(i))
+		{
+			int j;
+			for(j = i; j < u.getSize() - subMesh.getDimensions().x(); j=subMesh.getCellYSuccessor(j))
+			{
+				u[j] = u[i];
+			}
+			u[j] = u[i];
+		}
+		int j;
+		for(j = i; j < u.getSize() - subMesh.getDimensions().x(); j=subMesh.getCellYSuccessor(j))
+		{
+			u[j] = u[i];
+		}
+		u[j] = u[i];
+
+	}
+	else if(boundaryCondition == 2)
+	{
+		int i;
+		for(i = subMesh.getDimensions().x() - 1; i < u.getSize() - subMesh.getDimensions().x() ; i=subMesh.getCellYSuccessor(i))
+		{
+			int j;
+			for(j = i; j > (i-1)*subMesh.getDimensions().x(); j=subMesh.getCellXPredecessor(j))
+			{
+				u[j] = u[i];
+			}
+			u[j] = u[i];
+		}
+		int j;
+		for(j = i; j > (i-1)*subMesh.getDimensions().x(); j=subMesh.getCellXPredecessor(j))
+		{
+			u[j] = u[i];
+		}
+		u[j] = u[i];
+	}
+	else if(boundaryCondition == 1)
+	{
+		int i;
+		for(i = (subMesh.getDimensions().y() - 1)*subMesh.getDimensions().x(); i < u.getSize() - 1; i=subMesh.getCellXSuccessor(i))
+		{
+			int j;
+			for(j = i; j >=subMesh.getDimensions().x(); j=subMesh.getCellYPredecessor(j))
+			{
+				u[j] = u[i];
+			}
+			u[j] = u[i];
+		}
+		int j;
+		for(j = i; j >=subMesh.getDimensions().x(); j=subMesh.getCellYPredecessor(j))
+		{
+			u[j] = u[i];
+		}
+		u[j] = u[i];
+	}
+*/
+	/**/
+
+
+
+	bool tmp = false;
+	for(int i = 0; i < u.getSize(); i++)
+	{
+		if(u[0]*u[i] <= 0.0)
+			tmp=true;
+		int centreGID = (this->n*(subGridID / this->gridRows)+ (this->n >> 1))*(this->n*this->gridCols) + this->n*(subGridID % this->gridRows) + (this->n >> 1);
+		if(this->unusedCell[centreGID] == 0 || boundaryCondition == 0)
+			tmp = true;
+	}
+	//if(this->currentStep + 3 < getSubgridValue(subGridID))
+		//tmp = true;
+
+
+	double value = Sign(u[0]) * u.absMax();
+
+	if(tmp)
+	{}
+
+
+	//north - 1, east - 2, west - 4, south - 8
+	else if(boundaryCondition == 4)
+	{
+		for(int i = 0; i < this->n; i++)
+			for(int j = 1;j < this->n; j++)
+				//if(fabs(u[i*this->n + j]) <  fabs(u[i*this->n]))
+				u[i*this->n + j] = value;// u[i*this->n];
+	}
+	else if(boundaryCondition == 2)
+	{
+		for(int i = 0; i < this->n; i++)
+			for(int j =0 ;j < this->n -1; j++)
+				//if(fabs(u[i*this->n + j]) < fabs(u[(i+1)*this->n - 1]))
+				u[i*this->n + j] = value;// u[(i+1)*this->n - 1];
+	}
+	else if(boundaryCondition == 1)
+	{
+		for(int j = 0; j < this->n; j++)
+			for(int i = 0;i < this->n - 1; i++)
+				//if(fabs(u[i*this->n + j]) < fabs(u[j + this->n*(this->n - 1)]))
+				u[i*this->n + j] = value;// u[j + this->n*(this->n - 1)];
+	}
+	else if(boundaryCondition == 8)
+	{
+		for(int j = 0; j < this->n; j++)
+			for(int i = 1;i < this->n; i++)
+				//if(fabs(u[i*this->n + j]) < fabs(u[j]))
+				u[i*this->n + j] = value;// u[j];
+	}
+
+/*
+
+	else if(boundaryCondition == 5)
+	{
+		for(int i = 0; i < this->n - 1; i++)
+			for(int j = 1;j < this->n; j++)
+				//if(fabs(u[i*this->n + j]) <  fabs(u[i*this->n]))
+				u[i*this->n + j] = value;// u[i*this->n];
+	}
+	else if(boundaryCondition == 10)
+	{
+		for(int i = 1; i < this->n; i++)
+			for(int j =0 ;j < this->n -1; j++)
+				//if(fabs(u[i*this->n + j]) < fabs(u[(i+1)*this->n - 1]))
+				u[i*this->n + j] = value;// u[(i+1)*this->n - 1];
+	}
+	else if(boundaryCondition == 3)
+	{
+		for(int j = 0; j < this->n - 1; j++)
+			for(int i = 0;i < this->n - 1; i++)
+				//if(fabs(u[i*this->n + j]) < fabs(u[j + this->n*(this->n - 1)]))
+				u[i*this->n + j] = value;// u[j + this->n*(this->n - 1)];
+	}
+	else if(boundaryCondition == 12)
+	{
+		for(int j = 1; j < this->n; j++)
+			for(int i = 1;i < this->n; i++)
+				//if(fabs(u[i*this->n + j]) < fabs(u[j]))
+				u[i*this->n + j] = value;// u[j];
+	}
+*/
+
+
+	/**/
+
+	/*if (u.max() > 0.0)
+		this->stopTime *=(double) this->gridCols;*/
+
+
+   double time = 0.0;
+   double currentTau = this->tau0;
+   double finalTime = this->stopTime;// + 3.0*(u.max() - u.min());
+   if( time + currentTau > finalTime ) currentTau = finalTime - time;
+
+   double maxResidue( 1.0 );
+   //double lastResidue( 10000.0 );
+   while( time < finalTime /*|| maxResidue > subMesh.getHx()*/)
+   {
+      /****
+       * Compute the RHS
+       */
+
+      for( int i = 0; i < fu.getSize(); i ++ )
+      {
+    	  fu[ i ] = schemeHost.getValue( this->subMesh, i, this->subMesh.getCellCoordinates(i), u, time, boundaryCondition );
+      }
+      maxResidue = fu. absMax();
+
+
+      if( this -> cflCondition * maxResidue != 0.0)
+    	  currentTau =  this -> cflCondition / maxResidue;
+
+     /* if (maxResidue < 0.05)
+    	  cout << "Max < 0.05" << endl;*/
+      if(currentTau > 1.0 * this->subMesh.getHx())
+      {
+    	  //cout << currentTau << " >= " << 2.0 * this->subMesh.getHx() << endl;
+    	  currentTau = 1.0 * this->subMesh.getHx();
+      }
+      /*if(maxResidue > lastResidue)
+    	  currentTau *=(1.0/10.0);*/
+
+
+      if( time + currentTau > finalTime ) currentTau = finalTime - time;
+//      for( int i = 0; i < fu.getSize(); i ++ )
+//      {
+//    	  //cout << "Too big RHS! i = " << i << ", fu = " << fu[i] << ", u = " << u[i] << endl;
+//    	  if((u[i]+currentTau * fu[ i ])*u[i] < 0.0 && fu[i] != 0.0 && u[i] != 0.0 )
+//    		  currentTau = fabs(u[i]/(2.0*fu[i]));
+//
+//      }
+
+
+      for( int i = 0; i < fu.getSize(); i ++ )
+      {
+    	  double add = u[i] + currentTau * fu[ i ];
+    	  //if( fabs(u[i]) < fabs(add) or (this->subgridValues[subGridID] == this->currentStep +4) )
+    		  u[ i ] = add;
+      }
+      time += currentTau;
+
+      //cout << '\r' << flush;
+     //cout << maxResidue << "   " << currentTau << " @ " << time << flush;
+     //lastResidue = maxResidue;
+   }
+   //cout << "Time: " << time << ", Res: " << maxResidue <<endl;
+	/*if (u.max() > 0.0)
+		this->stopTime /=(double) this->gridCols;*/
+
+	VectorType solution;
+	solution.setLike(u);
+    for( int i = 0; i < u.getSize(); i ++ )
+  	{
+    	solution[i]=u[i];
+   	}
+	return solution;
+}
+
+
+#ifdef HAVE_CUDA
+
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+__device__
+void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgridCUDA3D( const int i ,tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
+{
+	//int j = threadIdx.x + threadIdx.y * blockDim.x;
+	int th = (blockIdx.y) * caller->n*caller->n*caller->gridCols
+            + (blockIdx.x) * caller->n
+            + threadIdx.y * caller->n*caller->gridCols
+            + threadIdx.x;
+	//printf("i= %d,j= %d,th= %d\n",i,j,th);
+	*a = caller->work_u_cuda[th];
+	//printf("Hi %f \n", *a);
+	//return ret;
+}
+
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+__device__
+void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::updateSubgridCUDA3D( const int i ,tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
+{
+	int j = threadIdx.x + threadIdx.y * blockDim.x;
+	int index = (blockIdx.y) * caller->n*caller->n*caller->gridCols
+            + (blockIdx.x) * caller->n
+            + threadIdx.y * caller->n*caller->gridCols
+            + threadIdx.x;
+
+	if( (fabs(caller->work_u_cuda[index]) > fabs(*a)) || (caller->unusedCell_cuda[index] == 1) )
+	{
+		caller->work_u_cuda[index] = *a;
+		caller->unusedCell_cuda[index] = 0;
+
+	}
+
+	*a = caller->work_u_cuda[index];
+}
+
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+__device__
+void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::insertSubgridCUDA3D( double u, const int i )
+{
+
+
+//	int j = threadIdx.x + threadIdx.y * blockDim.x;
+	//printf("j = %d, u = %f\n", j,u);
+
+		int index = (blockIdx.y)*this->n*this->n*this->gridCols
+					+ (blockIdx.x)*this->n
+					+ threadIdx.y*this->n*this->gridCols
+					+ threadIdx.x;
+
+		//printf("i= %d,j= %d,index= %d\n",i,j,index);
+		if( (fabs(this->work_u_cuda[index]) > fabs(u)) || (this->unusedCell_cuda[index] == 1) )
+		{
+			this->work_u_cuda[index] = u;
+			this->unusedCell_cuda[index] = 0;
+
+		}
+
+
+}
+
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+__device__
+void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::runSubgridCUDA3D( int boundaryCondition, double* u, int subGridID)
+{
+
+	__shared__ int tmp;
+	__shared__ double value;
+	//double tmpRes = 0.0;
+	volatile double* sharedTau = &u[blockDim.x*blockDim.y];
+	volatile double* absVal = &u[2*blockDim.x*blockDim.y];
+	int i = threadIdx.x;
+	int j = threadIdx.y;
+	int l = threadIdx.y * blockDim.x + threadIdx.x;
+	bool computeFU = !((i == 0 && (boundaryCondition & 4)) or
+			 (i == blockDim.x - 1 && (boundaryCondition & 2)) or
+			 (j == 0 && (boundaryCondition & 8)) or
+			 (j == blockDim.y - 1  && (boundaryCondition & 1)));
+
+	if(l == 0)
+	{
+		tmp = 0;
+		int centreGID = (blockDim.y*blockIdx.y + (blockDim.y>>1))*(blockDim.x*gridDim.x) + blockDim.x*blockIdx.x + (blockDim.x>>1);
+		if(this->unusedCell_cuda[centreGID] == 0 || boundaryCondition == 0)
+			tmp = 1;
+	}
+	__syncthreads();
+
+	/*if(!tmp && (u[0]*u[l] <= 0.0))
+		atomicMax( &tmp, 1);*/
+
+	__syncthreads();
+	if(tmp !=1)
+	{
+//		if(computeFU)
+//			absVal[l]=0.0;
+//		else
+//			absVal[l] = fabs(u[l]);
+//
+//		__syncthreads();
+//
+//	      if((blockDim.x == 16) && (l < 128))		absVal[l] = Max(absVal[l],absVal[l+128]);
+//	      __syncthreads();
+//	      if((blockDim.x == 16) && (l < 64))		absVal[l] = Max(absVal[l],absVal[l+64]);
+//	      __syncthreads();
+//	      if(l < 32)    							absVal[l] = Max(absVal[l],absVal[l+32]);
+//	      if(l < 16)								absVal[l] = Max(absVal[l],absVal[l+16]);
+//	      if(l < 8)									absVal[l] = Max(absVal[l],absVal[l+8]);
+//	      if(l < 4)									absVal[l] = Max(absVal[l],absVal[l+4]);
+//	      if(l < 2)									absVal[l] = Max(absVal[l],absVal[l+2]);
+//	      if(l < 1)									value   = Sign(u[0])*Max(absVal[l],absVal[l+1]);
+//		__syncthreads();
+//
+//		if(computeFU)
+//			u[l] = value;
+		if(computeFU)
+		{
+			if(boundaryCondition == 4)
+				u[l] = u[threadIdx.y * blockDim.x] + Sign(u[0])*this->subMesh.getHx()*(threadIdx.x) ;//+  2*Sign(u[0])*this->subMesh.getHx()*(threadIdx.x+this->n);
+			else if(boundaryCondition == 2)
+				u[l] = u[threadIdx.y * blockDim.x + blockDim.x - 1] + Sign(u[0])*this->subMesh.getHx()*(this->n - 1 - threadIdx.x);//+ 2*Sign(u[0])*this->subMesh.getHx()*(blockDim.x - threadIdx.x - 1+this->n);
+			else if(boundaryCondition == 8)
+				u[l] = u[threadIdx.x] + Sign(u[0])*this->subMesh.getHx()*(threadIdx.y) ;//+ 2*Sign(u[0])*this->subMesh.getHx()*(threadIdx.y+this->n);
+			else if(boundaryCondition == 1)
+				u[l] = u[(blockDim.y - 1)* blockDim.x + threadIdx.x] + Sign(u[0])*this->subMesh.getHx()*(this->n - 1 - threadIdx.y) ;//+ Sign(u[0])*this->subMesh.getHx()*(blockDim.y - threadIdx.y  - 1 +this->n);
+		}
+	}
+
+   double time = 0.0;
+   __shared__ double currentTau;
+   double cfl = this->cflCondition;
+   double fu = 0.0;
+//   if(threadIdx.x * threadIdx.y == 0)
+//   {
+//	   currentTau = this->tau0;
+//   }
+   double finalTime = this->stopTime;
+   __syncthreads();
+   if( time + currentTau > finalTime ) currentTau = finalTime - time;
+
+
+   while( time < finalTime )
+   {
+	  if(computeFU)
+		  fu = schemeHost.getValueDev( this->subMesh, l, tnlStaticVector<3,int>(i,j)/*this->subMesh.getCellCoordinates(l)*/, u, time, boundaryCondition);
+
+	  sharedTau[l]=cfl/fabs(fu);
+
+      if(l == 0)
+      {
+    	  if(sharedTau[0] > 1.0 * this->subMesh.getHx())	sharedTau[0] = 1.0 * this->subMesh.getHx();
+      }
+      else if(l == blockDim.x*blockDim.y - 1)
+    	  if( time + sharedTau[l] > finalTime )		sharedTau[l] = finalTime - time;
+
+
+
+      if((blockDim.x == 16) && (l < 128))		sharedTau[l] = Min(sharedTau[l],sharedTau[l+128]);
+      __syncthreads();
+      if((blockDim.x == 16) && (l < 64))		sharedTau[l] = Min(sharedTau[l],sharedTau[l+64]);
+      __syncthreads();
+      if(l < 32)    							sharedTau[l] = Min(sharedTau[l],sharedTau[l+32]);
+      if(l < 16)								sharedTau[l] = Min(sharedTau[l],sharedTau[l+16]);
+      if(l < 8)									sharedTau[l] = Min(sharedTau[l],sharedTau[l+8]);
+      if(l < 4)									sharedTau[l] = Min(sharedTau[l],sharedTau[l+4]);
+      if(l < 2)									sharedTau[l] = Min(sharedTau[l],sharedTau[l+2]);
+      if(l < 1)									currentTau   = Min(sharedTau[l],sharedTau[l+1]);
+	__syncthreads();
+
+      u[l] += currentTau * fu;
+      time += currentTau;
+   }
+
+
+}
+
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+__device__
+int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getOwnerCUDA3D(int i) const
+{
+
+	return ((i / (this->gridCols*this->n*this->n))*this->gridCols
+			+ (i % (this->gridCols*this->n))/this->n);
+}
+
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+__device__
+int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgridValueCUDA3D( int i ) const
+{
+	return this->subgridValues_cuda[i];
+}
+
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+__device__
+void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::setSubgridValueCUDA3D(int i, int value)
+{
+	this->subgridValues_cuda[i] = value;
+}
+
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+__device__
+int tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getBoundaryConditionCUDA3D( int i ) const
+{
+	return this->boundaryConditions_cuda[i];
+}
+
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+__device__
+void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::setBoundaryConditionCUDA3D(int i, int value)
+{
+	this->boundaryConditions_cuda[i] = value;
+}
+
+
+
+//north - 1, east - 2, west - 4, south - 8
+
+template <typename SchemeHost, typename SchemeDevice, typename Device>
+__global__
+void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::*/synchronizeCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver) //needs fix ---- maybe not anymore --- but frankly: yeah, it does -- aaaa-and maybe fixed now
+{
+
+	__shared__ int boundary[4]; // north,east,west,south
+	__shared__ int subgridValue;
+	__shared__ int newSubgridValue;
+
+
+	int gid = (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + threadIdx.x;
+	double u = cudaSolver->work_u_cuda[gid];
+	double u_cmp;
+	int subgridValue_cmp=INT_MAX;
+	int boundary_index=0;
+
+
+	if(threadIdx.x+threadIdx.y == 0)
+	{
+		subgridValue = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x);
+		boundary[0] = 0;
+		boundary[1] = 0;
+		boundary[2] = 0;
+		boundary[3] = 0;
+		newSubgridValue = 0;
+		//printf("%d   %d\n", blockDim.x, gridDim.x);
+	}
+	__syncthreads();
+
+
+
+	if(		(threadIdx.x == 0 				/*				&& !(cudaSolver->currentStep & 1)*/) 		||
+			(threadIdx.y == 0 				 	/*			&& (cudaSolver->currentStep & 1)*/) 		||
+			(threadIdx.x == blockDim.x - 1 	 /*	&& !(cudaSolver->currentStep & 1)*/) 		||
+			(threadIdx.y == blockDim.y - 1 	 /*	&& (cudaSolver->currentStep & 1)*/) 		)
+	{
+		if(threadIdx.x == 0 && (blockIdx.x != 0)/* && !(cudaSolver->currentStep & 1)*/)
+		{
+			u_cmp = cudaSolver->work_u_cuda[gid - 1];
+			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x - 1);
+			boundary_index = 2;
+		}
+
+		if(threadIdx.x == blockDim.x - 1 && (blockIdx.x != gridDim.x - 1)/* && !(cudaSolver->currentStep & 1)*/)
+		{
+			u_cmp = cudaSolver->work_u_cuda[gid + 1];
+			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + 1);
+			boundary_index = 1;
+		}
+//		if(threadIdx.y == 0 && (blockIdx.y != 0) && (cudaSolver->currentStep & 1))
+//		{
+//			u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x];
+//			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y - 1)*gridDim.x + blockIdx.x);
+//			boundary_index = 3;
+//		}
+//		if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1) && (cudaSolver->currentStep & 1))
+//		{
+//			u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x];
+//			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y + 1)*gridDim.x + blockIdx.x);
+//			boundary_index = 0;
+//		}
+
+		__threadfence();
+		if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
+		{
+			cudaSolver->unusedCell_cuda[gid] = 0;
+			atomicMax(&newSubgridValue, INT_MAX);
+			atomicMax(&boundary[boundary_index], 1);
+			cudaSolver->work_u_cuda[gid] = u_cmp;
+			u=u_cmp;
+		}
+		//__threadfence();
+		if(threadIdx.y == 0 && (blockIdx.y != 0)/* && (cudaSolver->currentStep & 1)*/)
+		{
+			u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x];
+			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y - 1)*gridDim.x + blockIdx.x);
+			boundary_index = 3;
+		}
+		if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1)/* && (cudaSolver->currentStep & 1)*/)
+		{
+			u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x];
+			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D((blockIdx.y + 1)*gridDim.x + blockIdx.x);
+			boundary_index = 0;
+		}
+
+		__threadfence();
+		if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
+		{
+			cudaSolver->unusedCell_cuda[gid] = 0;
+			atomicMax(&newSubgridValue, INT_MAX);
+			atomicMax(&boundary[boundary_index], 1);
+			cudaSolver->work_u_cuda[gid] = u_cmp;
+		}
+	}
+	__syncthreads();
+
+	if(threadIdx.x+threadIdx.y == 0)
+	{
+		if(subgridValue == INT_MAX && newSubgridValue !=0)
+			cudaSolver->setSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x, -INT_MAX);
+
+		cudaSolver->setBoundaryConditionCUDA3D(blockIdx.y*gridDim.x + blockIdx.x, 	boundary[0] +
+																				2 * boundary[1] +
+																				4 * boundary[2] +
+																				8 * boundary[3]);
+	}
+
+
+	/*
+	//printf("I am not an empty kernel!\n");
+	//cout << "Synchronizig..." << endl;
+	int tmp1, tmp2;
+	int grid1, grid2;
+
+	if(cudaSolver->currentStep & 1)
+	{
+		//printf("I am not an empty kernel! 1\n");
+		for(int j = 0; j < cudaSolver->gridRows - 1; j++)
+		{
+			//printf("I am not an empty kernel! 3\n");
+			for (int i = 0; i < cudaSolver->gridCols*cudaSolver->n; i++)
+			{
+				tmp1 = cudaSolver->gridCols*cudaSolver->n*((cudaSolver->n-1)+j*cudaSolver->n) + i;
+				tmp2 = cudaSolver->gridCols*cudaSolver->n*((cudaSolver->n)+j*cudaSolver->n) + i;
+				grid1 = cudaSolver->getSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1));
+				grid2 = cudaSolver->getSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2));
+
+				if ((fabs(cudaSolver->work_u_cuda[tmp1]) < fabs(cudaSolver->work_u_cuda[tmp2]) - cudaSolver->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
+				{
+					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA3D(tmp1),cudaSolver->getOwnerCUDA3D(tmp2));
+					cudaSolver->work_u_cuda[tmp2] = cudaSolver->work_u_cuda[tmp1];
+					cudaSolver->unusedCell_cuda[tmp2] = 0;
+					if(grid2 == INT_MAX)
+					{
+						cudaSolver->setSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2), -INT_MAX);
+					}
+					if(! (cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2)) & 8) )
+						cudaSolver->setBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2), cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2))+8);
+				}
+				else if ((fabs(cudaSolver->work_u_cuda[tmp1]) > fabs(cudaSolver->work_u_cuda[tmp2]) + cudaSolver->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
+				{
+					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA3D(tmp1),cudaSolver->getOwnerCUDA3D(tmp2));
+					cudaSolver->work_u_cuda[tmp1] = cudaSolver->work_u_cuda[tmp2];
+					cudaSolver->unusedCell_cuda[tmp1] = 0;
+					if(grid1 == INT_MAX)
+					{
+						cudaSolver->setSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1), -INT_MAX);
+					}
+					if(! (cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1)) & 1) )
+						cudaSolver->setBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1), cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1))+1);
+				}
+			}
+		}
+
+	}
+	else
+	{
+		//printf("I am not an empty kernel! 2\n");
+		for(int i = 1; i < cudaSolver->gridCols; i++)
+		{
+			//printf("I am not an empty kernel! 4\n");
+			for (int j = 0; j < cudaSolver->gridRows*cudaSolver->n; j++)
+			{
+
+				tmp1 = cudaSolver->gridCols*cudaSolver->n*j + i*cudaSolver->n - 1;
+				tmp2 = cudaSolver->gridCols*cudaSolver->n*j + i*cudaSolver->n ;
+				grid1 = cudaSolver->getSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1));
+				grid2 = cudaSolver->getSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2));
+
+				if ((fabs(cudaSolver->work_u_cuda[tmp1]) < fabs(cudaSolver->work_u_cuda[tmp2]) - cudaSolver->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
+				{
+					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA3D(tmp1),cudaSolver->getOwnerCUDA3D(tmp2));
+					cudaSolver->work_u_cuda[tmp2] = cudaSolver->work_u_cuda[tmp1];
+					cudaSolver->unusedCell_cuda[tmp2] = 0;
+					if(grid2 == INT_MAX)
+					{
+						cudaSolver->setSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2), -INT_MAX);
+					}
+					if(! (cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2)) & 4) )
+						cudaSolver->setBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2), cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp2))+4);
+				}
+				else if ((fabs(cudaSolver->work_u_cuda[tmp1]) > fabs(cudaSolver->work_u_cuda[tmp2]) + cudaSolver->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
+				{
+					//printf("%d %d %d %d \n",tmp1,tmp2,cudaSolver->getOwnerCUDA3D(tmp1),cudaSolver->getOwnerCUDA3D(tmp2));
+					cudaSolver->work_u_cuda[tmp1] = cudaSolver->work_u_cuda[tmp2];
+					cudaSolver->unusedCell_cuda[tmp1] = 0;
+					if(grid1 == INT_MAX)
+					{
+						cudaSolver->setSubgridValueCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1), -INT_MAX);
+					}
+					if(! (cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1)) & 2) )
+						cudaSolver->setBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1), cudaSolver->getBoundaryConditionCUDA3D(cudaSolver->getOwnerCUDA3D(tmp1))+2);
+				}
+			}
+		}
+	}
+	//printf("I am not an empty kernel! 5 cudaSolver->currentStep : %d \n", cudaSolver->currentStep);
+
+	cudaSolver->currentStep = cudaSolver->currentStep + 1;
+	int stepValue = cudaSolver->currentStep + 4;
+	for (int i = 0; i < cudaSolver->gridRows * cudaSolver->gridCols; i++)
+	{
+		if( cudaSolver->getSubgridValueCUDA3D(i) == -INT_MAX )
+			cudaSolver->setSubgridValueCUDA3D(i, stepValue);
+	}
+
+	int maxi = 0;
+	for(int q=0; q < cudaSolver->gridRows*cudaSolver->gridCols;q++)
+	{
+		//printf("%d : %d\n", q, cudaSolver->boundaryConditions_cuda[q]);
+		maxi=Max(maxi,cudaSolver->getBoundaryConditionCUDA3D(q));
+	}
+	//printf("I am not an empty kernel! %d\n", maxi);
+	*(cudaSolver->runcuda) = (maxi > 0);
+	//printf("I am not an empty kernel! 7 %d\n", cudaSolver->boundaryConditions_cuda[0]);
+	//cout << "Grid synchronized at step " << (this->currentStep - 1 ) << endl;
+*/
+}
+
+
+
+template <typename SchemeHost, typename SchemeDevice, typename Device>
+__global__
+void synchronize2CUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver)
+{
+	if(blockIdx.x+blockIdx.y ==0)
+	{
+		cudaSolver->currentStep = cudaSolver->currentStep + 1;
+		*(cudaSolver->runcuda) = 0;
+	}
+
+	int stepValue = cudaSolver->currentStep + 4;
+	if( cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX )
+			cudaSolver->setSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x, stepValue);
+
+	atomicMax((cudaSolver->runcuda),cudaSolver->getBoundaryConditionCUDA3D(blockIdx.y*gridDim.x + blockIdx.x));
+}
+
+
+
+
+
+
+
+
+template< typename SchemeHost, typename SchemeDevice, typename Device>
+__global__
+void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::*/initCUDA3D( tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr , int* ptr2, int* ptr3)
+{
+	//cout << "Initializating solver..." << endl;
+	//const tnlString& meshLocation = parameters.getParameter <tnlString>("mesh");
+	//this->mesh_cuda.load( meshLocation );
+
+	//this->n_cuda = parameters.getParameter <int>("subgrid-size");
+	//cout << "Setting N << this->n_cuda << endl;
+
+	//this->subMesh_cuda.setDimensions( this->n_cuda, this->n_cuda );
+	//this->subMesh_cuda.setDomain( tnlStaticVector<3,double>(0.0, 0.0),
+							 //tnlStaticVector<3,double>(this->mesh_cuda.getHx()*(double)(this->n_cuda), this->mesh_cuda.getHy()*(double)(this->n_cuda)) );
+
+	//this->subMesh_cuda.save("submesh.tnl");
+
+//	const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition");
+//	this->u0.load( initialCondition );
+
+	//cout << this->mesh.getCellCenter(0) << endl;
+
+	//this->delta_cuda = parameters.getParameter <double>("delta");
+	//this->delta_cuda *= this->mesh_cuda.getHx()*this->mesh_cuda.getHy();
+
+	//cout << "Setting delta to " << this->delta << endl;
+
+	//this->tau0_cuda = parameters.getParameter <double>("initial-tau");
+	//cout << "Setting initial tau to " << this->tau0_cuda << endl;
+	//this->stopTime_cuda = parameters.getParameter <double>("stop-time");
+
+	//this->cflCondition_cuda = parameters.getParameter <double>("cfl-condition");
+	//this -> cflCondition_cuda *= sqrt(this->mesh_cuda.getHx()*this->mesh_cuda.getHy());
+	//cout << "Setting CFL to " << this->cflCondition << endl;
+////
+////
+
+//	this->gridRows_cuda = gridRows;
+//	this->gridCols_cuda = gridCols;
+
+	cudaSolver->work_u_cuda = ptr;//(double*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->n*cudaSolver->n*sizeof(double));
+	cudaSolver->unusedCell_cuda = ptr3;//(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*cudaSolver->n*cudaSolver->n*sizeof(int));
+	cudaSolver->subgridValues_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int));
+	cudaSolver->boundaryConditions_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int));
+	cudaSolver->runcuda = ptr2;//(bool*)malloc(sizeof(bool));
+	*(cudaSolver->runcuda) = 1;
+	cudaSolver->currentStep = 1;
+	//cudaMemcpy(ptr,&(cudaSolver->work_u_cuda), sizeof(double*),cudaMemcpyDeviceToHost);
+	//ptr = cudaSolver->work_u_cuda;
+	printf("GPU memory allocated.\n");
+
+	for(int i = 0; i < cudaSolver->gridCols*cudaSolver->gridRows; i++)
+	{
+		cudaSolver->subgridValues_cuda[i] = INT_MAX;
+		cudaSolver->boundaryConditions_cuda[i] = 0;
+	}
+
+	/*for(long int j = 0; j < cudaSolver->n*cudaSolver->n*cudaSolver->gridCols*cudaSolver->gridRows; j++)
+	{
+		printf("%d\n",j);
+		cudaSolver->unusedCell_cuda[ j] = 1;
+	}*/
+	printf("GPU memory initialized.\n");
+
+
+	//cudaSolver->work_u_cuda[50] = 32.153438;
+////
+////
+	//stretchGrid();
+	//this->stopTime_cuda /= (double)(this->gridCols_cuda);
+	//this->stopTime_cuda *= (1.0+1.0/((double)(this->n_cuda) - 1.0));
+	//cout << "Setting stopping time to " << this->stopTime << endl;
+	//this->stopTime_cuda = 1.5*((double)(this->n_cuda))*parameters.getParameter <double>("stop-time")*this->mesh_cuda.getHx();
+	//cout << "Setting stopping time to " << this->stopTime << endl;
+
+	//cout << "Initializating scheme..." << endl;
+	//if(!this->schemeDevice.init(parameters))
+//	{
+		//cerr << "Scheme failed to initialize." << endl;
+//		return false;
+//	}
+	//cout << "Scheme initialized." << endl;
+
+	//test();
+
+//	this->currentStep_cuda = 1;
+	//return true;
+}
+
+
+
+
+//extern __shared__ double array[];
+template< typename SchemeHost, typename SchemeDevice, typename Device >
+__global__
+void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::*/initRunCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller)
+
+{
+
+
+	extern __shared__ double u[];
+	//printf("%p\n",caller->work_u_cuda);
+
+	int i = blockIdx.y * gridDim.x + blockIdx.x;
+	int l = threadIdx.y * blockDim.x + threadIdx.x;
+
+	__shared__ int containsCurve;
+	if(l == 0)
+		containsCurve = 0;
+
+	//double a;
+	caller->getSubgridCUDA3D(i,caller, &u[l]);
+	//printf("%f   %f\n",a , u[l]);
+	//u[l] = a;
+	//printf("Hi %f \n", u[l]);
+	__syncthreads();
+	//printf("hurewrwr %f \n", u[l]);
+	if(u[0] * u[l] <= 0.0)
+	{
+		//printf("contains %d \n",i);
+		atomicMax( &containsCurve, 1);
+	}
+
+	__syncthreads();
+	//printf("hu");
+	//printf("%d : %f\n", l, u[l]);
+	if(containsCurve == 1)
+	{
+		//printf("have curve \n");
+		caller->runSubgridCUDA3D(0,u,i);
+		//printf("%d : %f\n", l, u[l]);
+		__syncthreads();
+		caller->insertSubgridCUDA3D(u[l],i);
+		__syncthreads();
+		if(l == 0)
+			caller->setSubgridValueCUDA3D(i, 4);
+	}
+
+
+}
+
+
+
+
+
+template< typename SchemeHost, typename SchemeDevice, typename Device >
+__global__
+void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::*/runCUDA3D(tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller)
+{
+	extern __shared__ double u[];
+	int i = blockIdx.y * gridDim.x + blockIdx.x;
+	int l = threadIdx.y * blockDim.x + threadIdx.x;
+	int bound = caller->getBoundaryConditionCUDA3D(i);
+
+	if(caller->getSubgridValueCUDA3D(i) != INT_MAX && bound != 0 && caller->getSubgridValueCUDA3D(i) > 0)
+	{
+		caller->getSubgridCUDA3D(i,caller, &u[l]);
+
+		//if(l == 0)
+			//printf("i = %d, bound = %d\n",i,caller->getSubgridValueCUDA3D(i));
+		if(caller->getSubgridValueCUDA3D(i) == caller->currentStep+4)
+		{
+			if(bound & 1)
+			{
+				caller->runSubgridCUDA3D(1,u,i);
+				//__syncthreads();
+				//caller->insertSubgridCUDA3D(u[l],i);
+				//__syncthreads();
+				//caller->getSubgridCUDA3D(i,caller, &u[l]);
+				caller->updateSubgridCUDA3D(i,caller, &u[l]);
+				__syncthreads();
+			}
+			if(bound & 2 )
+			{
+				caller->runSubgridCUDA3D(2,u,i);
+				//__syncthreads();
+				//caller->insertSubgridCUDA3D(u[l],i);
+				//__syncthreads();
+				//caller->getSubgridCUDA3D(i,caller, &u[l]);
+				caller->updateSubgridCUDA3D(i,caller, &u[l]);
+				__syncthreads();
+			}
+			if(bound & 4)
+			{
+				caller->runSubgridCUDA3D(4,u,i);
+				//__syncthreads();
+				//caller->insertSubgridCUDA3D(u[l],i);
+				//__syncthreads();
+				//caller->getSubgridCUDA3D(i,caller, &u[l]);
+				caller->updateSubgridCUDA3D(i,caller, &u[l]);
+				__syncthreads();
+			}
+			if(bound & 8)
+			{
+				caller->runSubgridCUDA3D(8,u,i);
+				//__syncthreads();
+				//caller->insertSubgridCUDA3D(u[l],i);
+				//__syncthreads();
+				//caller->getSubgridCUDA3D(i,caller, &u[l]);
+				caller->updateSubgridCUDA3D(i,caller, &u[l]);
+				__syncthreads();
+			}
+
+
+
+
+
+			if( ((bound & 3 )))
+				{
+					caller->runSubgridCUDA3D(3,u,i);
+					//__syncthreads();
+					//caller->insertSubgridCUDA3D(u[l],i);
+					//__syncthreads();
+					//caller->getSubgridCUDA3D(i,caller, &u[l]);
+					caller->updateSubgridCUDA3D(i,caller, &u[l]);
+					__syncthreads();
+				}
+				if( ((bound & 5 )))
+				{
+					caller->runSubgridCUDA3D(5,u,i);
+					//__syncthreads();
+					//caller->insertSubgridCUDA3D(u[l],i);
+					//__syncthreads();
+					//caller->getSubgridCUDA3D(i,caller, &u[l]);
+					caller->updateSubgridCUDA3D(i,caller, &u[l]);
+					__syncthreads();
+				}
+				if( ((bound & 10 )))
+				{
+					caller->runSubgridCUDA3D(10,u,i);
+					//__syncthreads();
+					//caller->insertSubgridCUDA3D(u[l],i);
+					//__syncthreads();
+					//caller->getSubgridCUDA3D(i,caller, &u[l]);
+					caller->updateSubgridCUDA3D(i,caller, &u[l]);
+					__syncthreads();
+				}
+				if(   (bound & 12 ))
+				{
+					caller->runSubgridCUDA3D(12,u,i);
+					//__syncthreads();
+					//caller->insertSubgridCUDA3D(u[l],i);
+					//__syncthreads();
+					//caller->getSubgridCUDA3D(i,caller, &u[l]);
+					caller->updateSubgridCUDA3D(i,caller, &u[l]);
+					__syncthreads();
+				}
+
+
+
+
+
+		}
+
+
+		else
+		{
+
+
+
+
+
+
+
+
+
+			if( ((bound == 2)))
+						{
+							caller->runSubgridCUDA3D(2,u,i);
+							//__syncthreads();
+							//caller->insertSubgridCUDA3D(u[l],i);
+							//__syncthreads();
+							//caller->getSubgridCUDA3D(i,caller, &u[l]);
+							caller->updateSubgridCUDA3D(i,caller, &u[l]);
+							__syncthreads();
+						}
+						if( ((bound == 1) ))
+						{
+							caller->runSubgridCUDA3D(1,u,i);
+							//__syncthreads();
+							//caller->insertSubgridCUDA3D(u[l],i);
+							//__syncthreads();
+							//caller->getSubgridCUDA3D(i,caller, &u[l]);
+							caller->updateSubgridCUDA3D(i,caller, &u[l]);
+							__syncthreads();
+						}
+						if( ((bound == 8) ))
+						{
+							caller->runSubgridCUDA3D(8,u,i);
+							//__syncthreads();
+							//caller->insertSubgridCUDA3D(u[l],i);
+							//__syncthreads();
+							//caller->getSubgridCUDA3D(i,caller, &u[l]);
+							caller->updateSubgridCUDA3D(i,caller, &u[l]);
+							__syncthreads();
+						}
+						if(   (bound == 4))
+						{
+							caller->runSubgridCUDA3D(4,u,i);
+							//__syncthreads();
+							//caller->insertSubgridCUDA3D(u[l],i);
+							//__syncthreads();
+							//caller->getSubgridCUDA3D(i,caller, &u[l]);
+							caller->updateSubgridCUDA3D(i,caller, &u[l]);
+							__syncthreads();
+						}
+
+
+
+
+
+
+
+
+
+
+			if( ((bound & 3) ))
+			{
+				caller->runSubgridCUDA3D(3,u,i);
+				//__syncthreads();
+				//caller->insertSubgridCUDA3D(u[l],i);
+				//__syncthreads();
+				//caller->getSubgridCUDA3D(i,caller, &u[l]);
+				caller->updateSubgridCUDA3D(i,caller, &u[l]);
+				__syncthreads();
+			}
+			if( ((bound & 5) ))
+			{
+				caller->runSubgridCUDA3D(5,u,i);
+				//__syncthreads();
+				//caller->insertSubgridCUDA3D(u[l],i);
+				//__syncthreads();
+				//caller->getSubgridCUDA3D(i,caller, &u[l]);
+				caller->updateSubgridCUDA3D(i,caller, &u[l]);
+				__syncthreads();
+			}
+			if( ((bound & 10) ))
+			{
+				caller->runSubgridCUDA3D(10,u,i);
+				//__syncthreads();
+				//caller->insertSubgridCUDA3D(u[l],i);
+				//__syncthreads();
+				//caller->getSubgridCUDA3D(i,caller, &u[l]);
+				caller->updateSubgridCUDA3D(i,caller, &u[l]);
+				__syncthreads();
+			}
+			if(   (bound & 12) )
+			{
+				caller->runSubgridCUDA3D(12,u,i);
+				//__syncthreads();
+				//caller->insertSubgridCUDA3D(u[l],i);
+				//__syncthreads();
+				//caller->getSubgridCUDA3D(i,caller, &u[l]);
+				caller->updateSubgridCUDA3D(i,caller, &u[l]);
+				__syncthreads();
+			}
+
+
+
+
+
+
+
+
+
+
+
+
+		}
+		/*if( bound )
+		{
+			caller->runSubgridCUDA3D(15,u,i);
+			__syncthreads();
+			//caller->insertSubgridCUDA3D(u[l],i);
+			//__syncthreads();
+			//caller->getSubgridCUDA3D(i,caller, &u[l]);
+			caller->updateSubgridCUDA3D(i,caller, &u[l]);
+			__syncthreads();
+		}*/
+
+		if(l==0)
+		{
+			caller->setBoundaryConditionCUDA3D(i, 0);
+			caller->setSubgridValueCUDA3D(i, caller->getSubgridValueCUDA3D(i) - 1 );
+		}
+
+
+	}
+
+
+
+}
+
+#endif /*HAVE_CUDA*/
+
+#endif /* TNLPARALLELEIKONALSOLVER3D_IMPL_H_ */