From 229e25d6c5d42340d79cdbea83c21b3b46d25f09 Mon Sep 17 00:00:00 2001
From: Tomas Sobotik <sobotto4@fjfi.cvut.cz>
Date: Tue, 3 Nov 2015 08:44:55 +0100
Subject: [PATCH] Added several parallel implementations for fast-sweeping
 method

---
 examples/fast-sweeping/fastSweepingConfig.h   |   1 +
 examples/fast-sweeping/main.h                 |   6 +-
 examples/fast-sweeping/tnlFastSweeping.h      |  22 +-
 .../tnlFastSweeping2D_CUDA_impl.h             | 522 ++++++++++
 .../tnlFastSweeping2D_CUDA_v2_impl.h          | 588 +++++++++++
 .../tnlFastSweeping2D_CUDA_v3_impl.h          | 920 ++++++++++++++++++
 .../fast-sweeping/tnlFastSweeping2D_impl.h    |  16 +
 .../tnlFastSweeping2D_openMP_impl.h           | 399 ++++++++
 examples/fast-sweeping/tnlFastSweeping_CUDA.h | 104 ++
 9 files changed, 2574 insertions(+), 4 deletions(-)
 create mode 100644 examples/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h
 create mode 100644 examples/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h
 create mode 100644 examples/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h
 create mode 100644 examples/fast-sweeping/tnlFastSweeping2D_openMP_impl.h
 create mode 100644 examples/fast-sweeping/tnlFastSweeping_CUDA.h

diff --git a/examples/fast-sweeping/fastSweepingConfig.h b/examples/fast-sweeping/fastSweepingConfig.h
index 6b8b7b9a91..ff14036301 100644
--- a/examples/fast-sweeping/fastSweepingConfig.h
+++ b/examples/fast-sweeping/fastSweepingConfig.h
@@ -30,6 +30,7 @@ class fastSweepingConfig
          config.addEntry        < tnlString > ( "problem-name", "This defines particular problem.", "fast-sweeping" );
          config.addRequiredEntry        < tnlString > ( "initial-condition", "Initial condition for solver");
          config.addEntry       < tnlString > ( "mesh", "Name of mesh.", "mesh.tnl" );
+         config.addEntry       < tnlString > ( "exact-input", "Are the function values near the curve equal to the SDF? (yes/no)", "no" );
       }
 };
 
diff --git a/examples/fast-sweeping/main.h b/examples/fast-sweeping/main.h
index c21b591358..75850bbf6f 100644
--- a/examples/fast-sweeping/main.h
+++ b/examples/fast-sweeping/main.h
@@ -16,7 +16,10 @@
 
 
 #include "MainBuildConfig.h"
-#include "tnlFastSweeping.h"
+	//for HOST versions:
+//#include "tnlFastSweeping.h"
+	//for DEVICE versions:
+#include "tnlFastSweeping_CUDA.h"
 #include "fastSweepingConfig.h"
 #include <solvers/tnlConfigTags.h>
 
@@ -46,6 +49,7 @@ int main( int argc, char* argv[] )
     	cerr << "Solver failed to initialize." << endl;
    		return EXIT_FAILURE;
    }
+    checkCudaDevice;
    cout << "-------------------------------------------------------------" << endl;
    cout << "Starting solver..." << endl;
    solver.run();
diff --git a/examples/fast-sweeping/tnlFastSweeping.h b/examples/fast-sweeping/tnlFastSweeping.h
index 1e04925ecb..b18722b2a6 100644
--- a/examples/fast-sweeping/tnlFastSweeping.h
+++ b/examples/fast-sweeping/tnlFastSweeping.h
@@ -24,6 +24,9 @@
 #include <limits.h>
 #include <core/tnlDevice.h>
 #include <ctime>
+#ifdef HAVE_OPENMP
+#include <omp.h>
+#endif
 
 
 
@@ -60,7 +63,12 @@ public:
 
 	bool initGrid();
 	bool run();
-	void updateValue(const Index i, const Index j);
+
+	//for single core version use this implementation:
+	//void updateValue(const Index i, const Index j);
+	//for parallel version use this one instead:
+	void updateValue(const Index i, const Index j, DofVectorType* grid);
+
 	Real fabsMin(const Real x, const Real y);
 
 
@@ -68,14 +76,22 @@ protected:
 
 	MeshType Mesh;
 
+	bool exactInput;
+
 	DofVectorType dofVector;
 
 	RealType h;
 
+#ifdef HAVE_OPENMP
+//	omp_lock_t* gridLock;
+#endif
 
-};
 
+};
 
-#include "tnlFastSweeping2D_impl.h"
+	//for single core version use this implementation:
+//#include "tnlFastSweeping2D_impl.h"
+	//for parallel version use this one instead:
+ #include "tnlFastSweeping2D_openMP_impl.h"
 
 #endif /* TNLFASTSWEEPING_H_ */
diff --git a/examples/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h b/examples/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h
new file mode 100644
index 0000000000..5b90938c2a
--- /dev/null
+++ b/examples/fast-sweeping/tnlFastSweeping2D_CUDA_impl.h
@@ -0,0 +1,522 @@
+/***************************************************************************
+                          tnlFastSweeping_impl.h  -  description
+                             -------------------
+    begin                : Oct 15 , 2015
+    copyright            : (C) 2015 by Tomas Sobotik
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef TNLFASTSWEEPING2D_IMPL_H_
+#define TNLFASTSWEEPING2D_IMPL_H_
+
+#include "tnlFastSweeping.h"
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+tnlString tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
+{
+	   return tnlString( "tnlFastSweeping< " ) +
+	          MeshType::getType() + ", " +
+	          ::getType< Real >() + ", " +
+	          ::getType< Index >() + " >";
+}
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const tnlParameterContainer& parameters )
+{
+	const tnlString& meshFile = parameters.getParameter< tnlString >( "mesh" );
+
+	if( ! Mesh.load( meshFile ) )
+	{
+		   cerr << "I am not able to load the mesh from the file " << meshFile << "." << endl;
+		   return false;
+	}
+
+
+	const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition");
+	if( ! dofVector.load( initialCondition ) )
+	{
+		   cerr << "I am not able to load the initial condition from the file " << meshFile << "." << endl;
+		   return false;
+	}
+
+	h = Mesh.getHx();
+	counter = 0;
+
+	const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" );
+
+	if(exact_input == "no")
+		exactInput=false;
+	else
+		exactInput=true;
+
+
+#ifdef HAVE_CUDA
+
+	cudaMalloc(&(cudaDofVector), this->dofVector.getSize()*sizeof(double));
+	cudaMemcpy(cudaDofVector, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&(cudaDofVector2), this->dofVector.getSize()*sizeof(double));
+	cudaMemcpy(cudaDofVector2, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
+
+
+	cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >));
+	cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
+
+#endif
+
+	int n = Mesh.getDimensions().x();
+	dim3 threadsPerBlock(16, 16);
+	dim3 numBlocks(n/16 + 1 ,n/16 +1);
+
+	initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+
+	return true;
+}
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
+{
+//
+//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+//	{
+//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
+//	{
+//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
+//	{
+//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+//	{
+//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//
+//	dofVector.save("u-00001.tnl");
+
+	int n = Mesh.getDimensions().x();
+	dim3 threadsPerBlock(32, 32);
+	dim3 numBlocks(n/32 + 1 ,n/32 +1);
+
+	for(int i = 2*n - 1; i > -1; i--)
+	{
+		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,4,i);
+		cudaDeviceSynchronize();
+	}
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+	for(int i = 0; i < 2*n ; i++)
+	{
+		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,1,i);
+		cudaDeviceSynchronize();
+	}
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+	for(int i = 0; i < 2*n ; i++)
+	{
+		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,2,i);
+		cudaDeviceSynchronize();
+	}
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+	for(int i = 2*n - 1; i > -1; i--)
+	{
+		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,3,i);
+		cudaDeviceSynchronize();
+	}
+
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+
+	cudaMemcpy(this->dofVector.getData(), cudaDofVector, this->dofVector.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
+	cudaDeviceSynchronize();
+	cudaFree(cudaDofVector);
+	cudaFree(cudaDofVector2);
+	cudaFree(cudaSolver);
+	dofVector.save("u-00001.tnl");
+	cudaDeviceSynchronize();
+	return true;
+}
+
+
+
+
+#ifdef HAVE_CUDA
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real value = cudaDofVector[index];
+	Real a,b, tmp;
+
+	if( i == 0 )
+		a = cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)];
+	else if( i == Mesh.getDimensions().x() - 1 )
+		a = cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)];
+	else
+	{
+		a = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)],
+				 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)] );
+	}
+
+	if( j == 0 )
+		b = cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)];
+	else if( j == Mesh.getDimensions().y() - 1 )
+		b = cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)];
+	else
+	{
+		b = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)],
+				 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)] );
+	}
+
+
+	if(abs(a-b) >= h)
+		tmp = fabsMin(a,b) + Sign(value)*h;
+	else
+		tmp = 0.5 * (a + b + Sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
+
+	cudaDofVector[index]  = fabsMin(value, tmp);
+
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
+{
+	int gx = threadIdx.x + blockDim.x*blockIdx.x;
+	int gy = blockDim.y*blockIdx.y + threadIdx.y;
+	int gid = Mesh.getCellIndex(CoordinatesType(gx,gy));
+
+	int total = blockDim.x*gridDim.x;
+
+
+
+	Real tmp = 0.0;
+	int flag = 0;
+	counter = 0;
+	tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+
+
+	if(!exactInput)
+	{
+		cudaDofVector[gid]=cudaDofVector[gid]=0.5*h*Sign(cudaDofVector[gid]);
+	}
+	__threadfence();
+//	printf("-----------------------------------------------------------------------------------\n");
+
+	__threadfence();
+
+	if(gx > 0 && gx < Mesh.getDimensions().x()-1)
+	{
+		if(gy > 0 && gy < Mesh.getDimensions().y()-1)
+		{
+
+			Index j = gy;
+			Index i = gx;
+//			 tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+			if(tmp == 0.0)
+			{}
+			else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+			{}
+			else
+				flag=1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+		}
+	}
+
+//	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
+//	printf("****************************************************************\n");
+//	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
+	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == 0)
+	{
+//		printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
+		Index j = 0;
+		Index i = gx;
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 )
+		{}
+		else
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+
+//	printf("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n");
+	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == Mesh.getDimensions().y() - 1)
+	{
+		Index i = gx;
+		Index j = Mesh.getDimensions().y() - 1;
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+		{}
+		else
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+
+//	printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+	if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == 0)
+	{
+		Index j = gy;
+		Index i = 0;
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+		{}
+		else
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+//	printf("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n");
+	if(gy > 0 && gy < Mesh.getDimensions().y()-1  && gx == Mesh.getDimensions().x() - 1)
+	{
+		Index j = gy;
+		Index i = Mesh.getDimensions().x() - 1;
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+		{}
+		else
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+
+//	printf("##################################################################################################\n");
+	if(gx == Mesh.getDimensions().x() - 1 &&
+	   gy == Mesh.getDimensions().y() - 1)
+	{
+
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
+
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+	}
+	if(gx == Mesh.getDimensions().x() - 1 &&
+	   gy == 0)
+	{
+
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
+
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+	}
+//	printf("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n");
+	if(gx == 0 &&
+	   gy == Mesh.getDimensions().y() - 1)
+	{
+
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
+
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+	}
+	if(gx == 0 &&
+	   gy == 0)
+	{
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
+
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+	}
+
+	__threadfence();
+
+	if(flag==1)
+		cudaDofVector[gid] =  tmp*3;
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
+{
+	Real fx = abs(x);
+	Real fy = abs(y);
+
+	Real tmpMin = Min(fx,fy);
+
+	if(tmpMin == fx)
+		return x;
+	else
+		return y;
+
+
+}
+
+
+
+__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int i)
+{
+
+	int gx = threadIdx.x + blockDim.x*blockIdx.x;
+	int gy = blockDim.y*blockIdx.y + threadIdx.y;
+	if(solver->Mesh.getDimensions().x() <= gx || solver->Mesh.getDimensions().y() <= gy)
+		return;
+	int total = solver->Mesh.getDimensions().x();
+	//int gid = solver->Mesh.getDimensions().x() * gy + gx;
+	int max = solver->Mesh.getDimensions().x()*solver->Mesh.getDimensions().x();
+
+	int id1 = gx+gy;
+	int id2 = (solver->Mesh.getDimensions().x() - gx - 1) + gy;
+
+	/*---------------------------------------------------------------------------------------------------------------------------*/
+	if(sweep == 1)
+//	for(int i = 0; i < 2*total - 1; i++)
+	{
+		if(id1 == i)
+		{
+			solver->updateValue(gx,gy);
+			return;
+		}
+
+	}
+	/*---------------------------------------------------------------------------------------------------------------------------*/
+	else if(sweep == 2)
+//	for(int i = 0; i < 2*total - 1; i++)
+	{
+		if(id2 == i)
+		{
+			solver->updateValue(gx,gy);
+			return;
+		}
+	}
+	/*---------------------------------------------------------------------------------------------------------------------------*/
+	else if(sweep == 3)
+//	for(int i = 2*total - 2; i > -1; i--)
+	{
+		if(id1 == i)
+		{
+			solver->updateValue(gx,gy);
+			return;
+		}
+	}
+	/*---------------------------------------------------------------------------------------------------------------------------*/
+	else if(sweep == 4)
+//	for(int i = 2*total - 2; i > -1; i--)
+	{
+		if(id2 == i)
+		{
+			solver->updateValue(gx,gy);
+			return;
+		}
+	}
+	/*---------------------------------------------------------------------------------------------------------------------------*/
+
+
+
+
+}
+
+
+__global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver)
+{
+	int gx = threadIdx.x + blockDim.x*blockIdx.x;
+	int gy = blockDim.y*blockIdx.y + threadIdx.y;
+
+	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy)
+	{
+		solver->initGrid();
+	}
+
+
+}
+#endif
+
+
+
+
+#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h b/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h
new file mode 100644
index 0000000000..cbaf200b69
--- /dev/null
+++ b/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v2_impl.h
@@ -0,0 +1,588 @@
+/***************************************************************************
+                          tnlFastSweeping_impl.h  -  description
+                             -------------------
+    begin                : Oct 15 , 2015
+    copyright            : (C) 2015 by Tomas Sobotik
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef TNLFASTSWEEPING2D_IMPL_H_
+#define TNLFASTSWEEPING2D_IMPL_H_
+
+#include "tnlFastSweeping.h"
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+tnlString tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
+{
+	   return tnlString( "tnlFastSweeping< " ) +
+	          MeshType::getType() + ", " +
+	          ::getType< Real >() + ", " +
+	          ::getType< Index >() + " >";
+}
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const tnlParameterContainer& parameters )
+{
+	const tnlString& meshFile = parameters.getParameter< tnlString >( "mesh" );
+
+	if( ! Mesh.load( meshFile ) )
+	{
+		   cerr << "I am not able to load the mesh from the file " << meshFile << "." << endl;
+		   return false;
+	}
+
+
+	const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition");
+	if( ! dofVector.load( initialCondition ) )
+	{
+		   cerr << "I am not able to load the initial condition from the file " << meshFile << "." << endl;
+		   return false;
+	}
+
+	h = Mesh.getHx();
+	counter = 0;
+
+	const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" );
+
+	if(exact_input == "no")
+		exactInput=false;
+	else
+		exactInput=true;
+
+
+#ifdef HAVE_CUDA
+
+	cudaMalloc(&(cudaDofVector), this->dofVector.getSize()*sizeof(double));
+	cudaMemcpy(cudaDofVector, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&(cudaDofVector2), this->dofVector.getSize()*sizeof(double));
+	cudaMemcpy(cudaDofVector2, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
+
+
+	cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >));
+	cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
+
+#endif
+
+	int n = Mesh.getDimensions().x();
+	dim3 threadsPerBlock(16, 16);
+	dim3 numBlocks(n/16 + 1 ,n/16 +1);
+
+	initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+
+	return true;
+}
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
+{
+//
+//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+//	{
+//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
+//	{
+//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
+//	{
+//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+//	{
+//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//
+//	dofVector.save("u-00001.tnl");
+
+	int n = Mesh.getDimensions().x();
+	dim3 threadsPerBlock(27, 27);
+	dim3 numBlocks(1 ,1);
+
+//	for(int i = 2*n - 1; i > -1; i--)
+	{
+		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,4,0);
+		cudaDeviceSynchronize();
+	}
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+////	for(int i = 0; i < 2*n ; i++)
+//	{
+//		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,1,0);
+//		cudaDeviceSynchronize();
+//	}
+//	cudaDeviceSynchronize();
+//	checkCudaDevice;
+////	for(int i = 0; i < 2*n ; i++)
+//	{
+//		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,2,0);
+//		cudaDeviceSynchronize();
+//	}
+//	cudaDeviceSynchronize();
+//	checkCudaDevice;
+////	for(int i = 2*n - 1; i > -1; i--)
+//	{
+//		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,3,0);
+//		cudaDeviceSynchronize();
+//	}
+//
+//	cudaDeviceSynchronize();
+//	checkCudaDevice;
+
+	cudaMemcpy(this->dofVector.getData(), cudaDofVector, this->dofVector.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
+	cudaDeviceSynchronize();
+	cudaFree(cudaDofVector);
+	cudaFree(cudaDofVector2);
+	cudaFree(cudaSolver);
+	dofVector.save("u-00001.tnl");
+	cudaDeviceSynchronize();
+	return true;
+}
+
+
+
+
+#ifdef HAVE_CUDA
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real value = cudaDofVector[index];
+	Real a,b, tmp;
+
+	if( i == 0 )
+		a = cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)];
+	else if( i == Mesh.getDimensions().x() - 1 )
+		a = cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)];
+	else
+	{
+		a = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)],
+				 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)] );
+	}
+
+	if( j == 0 )
+		b = cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)];
+	else if( j == Mesh.getDimensions().y() - 1 )
+		b = cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)];
+	else
+	{
+		b = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)],
+				 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)] );
+	}
+
+
+	if(abs(a-b) >= h)
+		tmp = fabsMin(a,b) + Sign(value)*h;
+	else
+		tmp = 0.5 * (a + b + Sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
+
+	cudaDofVector[index]  = fabsMin(value, tmp);
+
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
+{
+	int gx = threadIdx.x + blockDim.x*blockIdx.x;
+	int gy = blockDim.y*blockIdx.y + threadIdx.y;
+	int gid = Mesh.getCellIndex(CoordinatesType(gx,gy));
+
+	int total = blockDim.x*gridDim.x;
+
+
+
+	Real tmp = 0.0;
+	int flag = 0;
+	counter = 0;
+	tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+
+
+	if(!exactInput)
+	{
+		cudaDofVector[gid]=cudaDofVector[gid]=0.5*h*Sign(cudaDofVector[gid]);
+	}
+	__threadfence();
+//	printf("-----------------------------------------------------------------------------------\n");
+
+	__threadfence();
+
+	if(gx > 0 && gx < Mesh.getDimensions().x()-1)
+	{
+		if(gy > 0 && gy < Mesh.getDimensions().y()-1)
+		{
+
+			Index j = gy;
+			Index i = gx;
+//			 tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+			if(tmp == 0.0)
+			{}
+			else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+			{}
+			else
+				flag=1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+		}
+	}
+
+//	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
+//	printf("****************************************************************\n");
+//	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
+	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == 0)
+	{
+//		printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
+		Index j = 0;
+		Index i = gx;
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 )
+		{}
+		else
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+
+//	printf("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n");
+	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == Mesh.getDimensions().y() - 1)
+	{
+		Index i = gx;
+		Index j = Mesh.getDimensions().y() - 1;
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+		{}
+		else
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+
+//	printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+	if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == 0)
+	{
+		Index j = gy;
+		Index i = 0;
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+		{}
+		else
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+//	printf("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n");
+	if(gy > 0 && gy < Mesh.getDimensions().y()-1  && gx == Mesh.getDimensions().x() - 1)
+	{
+		Index j = gy;
+		Index i = Mesh.getDimensions().x() - 1;
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+		{}
+		else
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+
+//	printf("##################################################################################################\n");
+	if(gx == Mesh.getDimensions().x() - 1 &&
+	   gy == Mesh.getDimensions().y() - 1)
+	{
+
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
+
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+	}
+	if(gx == Mesh.getDimensions().x() - 1 &&
+	   gy == 0)
+	{
+
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
+
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+	}
+//	printf("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n");
+	if(gx == 0 &&
+	   gy == Mesh.getDimensions().y() - 1)
+	{
+
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
+
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+	}
+	if(gx == 0 &&
+	   gy == 0)
+	{
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
+
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+	}
+
+	__threadfence();
+
+	if(flag==1)
+		cudaDofVector[gid] =  tmp*3;
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
+{
+	Real fx = abs(x);
+
+	Real tmpMin = Min(fx,abs(y));
+
+	if(tmpMin == fx)
+		return x;
+	else
+		return y;
+
+
+}
+
+
+
+__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k)
+{
+
+	//int gx = threadIdx.x;
+	//int gy = threadIdx.y;
+	int id1,id2;
+	int nx = solver->Mesh.getDimensions().x()+ threadIdx.x;
+	int ny = solver->Mesh.getDimensions().y()+ threadIdx.y;
+
+	int blockCount = solver->Mesh.getDimensions().x()/blockDim.x + 1;
+
+	for(int gy = threadIdx.y; gy < ny;gy+=blockDim.y)
+	{
+		for(int gx = threadIdx.x; gx < nx;gx+=blockDim.x)
+		{
+//			if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && gy > -1&& gx > -1)
+			{
+				id1 = threadIdx.x+threadIdx.y;
+
+				for(int l = 0; l < 2*blockDim.x - 1; l++)
+				{
+					if(id1 == l)
+					{
+						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+						solver->updateValue(gx,gy);
+					}
+					__syncthreads();
+				}
+
+			}
+			//gx+=blockDim.x;
+			//__syncthreads();
+		}
+		//gx = threadIdx.x;
+		//gy+=blockDim.y;
+		//__syncthreads();
+	}
+			/*---------------------------------------------------------------------------------------------------------------------------*/
+//	gx = blockDim.x*(blockCount-1) + threadIdx.x;
+//	gy = threadIdx.y;
+	for(int gy = threadIdx.y; gy < ny;gy+=blockDim.y)
+	{
+		for(int gx = blockDim.x*(blockCount-1) + threadIdx.x; gx >- 1;gx-=blockDim.x)
+		{
+//			if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && gy > -1&& gx > -1)
+			{
+				id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
+
+				for(int l = 0; l < 2*blockDim.x - 1; l++)
+				{
+					if(id2 == l)
+					{
+						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+						solver->updateValue(gx,gy);
+					}
+					__syncthreads();
+				}
+			}
+			//gx-=blockDim.x;
+			//__syncthreads();
+		}
+		//gx = blockDim.x*(blockCount-1) + threadIdx.x;
+		//gy+=blockDim.y;
+		//__syncthreads();
+	}
+			/*---------------------------------------------------------------------------------------------------------------------------*/
+//	gx = blockDim.x*(blockCount-1) + threadIdx.x;
+//	gy = blockDim.x*(blockCount-1) + threadIdx.y;
+	for(int gy = blockDim.x*(blockCount-1) +threadIdx.y; gy >- 1;gy-=blockDim.y)
+	{
+		for(int gx = blockDim.x*(blockCount-1) + threadIdx.x; gx >- 1;gx-=blockDim.x)
+		{
+//			if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && gy > -1&& gx > -1)
+			{
+				id1 = threadIdx.x+threadIdx.y;
+
+				for(int l = 2*blockDim.x - 2; l > -1; l--)
+				{
+					if(id1 == l)
+					{
+						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+						solver->updateValue(gx,gy);
+					}
+					__syncthreads();
+				}
+			}
+			//gx-=blockDim.x;
+			//__syncthreads();
+		}
+		//gx = blockDim.x*(blockCount-1) + threadIdx.x;
+		//gy-=blockDim.y;
+		//__syncthreads();
+	}
+			/*---------------------------------------------------------------------------------------------------------------------------*/
+	//gx = threadIdx.x;
+	//gy = blockDim.x*(blockCount-1) +threadIdx.y;
+	for(int gy = blockDim.x*(blockCount-1) +threadIdx.y; gy >- 1;gy-=blockDim.y)
+	{
+		for(int gx = threadIdx.x; gx < nx;gx+=blockDim.x)
+		{
+//			if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy && gy > -1&& gx > -1)
+			{
+				id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
+
+				for(int l = 2*blockDim.x - 2; l > -1; l--)
+				{
+					if(id2 == l)
+					{
+						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+						solver->updateValue(gx,gy);
+					}
+					__syncthreads();
+				}
+			}
+			//gx+=blockDim.x;
+			//__syncthreads();
+		}
+		//gx = threadIdx.x;
+		//gy-=blockDim.y;
+		///__syncthreads();
+	}
+			/*---------------------------------------------------------------------------------------------------------------------------*/
+
+
+
+
+
+}
+
+
+__global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver)
+{
+	int gx = threadIdx.x + blockDim.x*blockIdx.x;
+	int gy = blockDim.y*blockIdx.y + threadIdx.y;
+
+	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy)
+	{
+		solver->initGrid();
+	}
+
+
+}
+#endif
+
+
+
+
+#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h b/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h
new file mode 100644
index 0000000000..44aa2fbc59
--- /dev/null
+++ b/examples/fast-sweeping/tnlFastSweeping2D_CUDA_v3_impl.h
@@ -0,0 +1,920 @@
+/***************************************************************************
+                          tnlFastSweeping_impl.h  -  description
+                             -------------------
+    begin                : Oct 15 , 2015
+    copyright            : (C) 2015 by Tomas Sobotik
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef TNLFASTSWEEPING2D_IMPL_H_
+#define TNLFASTSWEEPING2D_IMPL_H_
+
+#include "tnlFastSweeping.h"
+
+
+
+
+    __device__ double atomicSet(double* address, double val)
+    {
+        unsigned long long int* address_as_ull =
+                              (unsigned long long int*)address;
+        unsigned long long int old = *address_as_ull, assumed;
+        do {
+            assumed = old;
+                old = atomicCAS(address_as_ull, assumed,__double_as_longlong(val ));
+        } while (assumed != old);
+        return __longlong_as_double(old);
+    }
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+tnlString tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
+{
+	   return tnlString( "tnlFastSweeping< " ) +
+	          MeshType::getType() + ", " +
+	          ::getType< Real >() + ", " +
+	          ::getType< Index >() + " >";
+}
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const tnlParameterContainer& parameters )
+{
+	const tnlString& meshFile = parameters.getParameter< tnlString >( "mesh" );
+
+	if( ! Mesh.load( meshFile ) )
+	{
+		   cerr << "I am not able to load the mesh from the file " << meshFile << "." << endl;
+		   return false;
+	}
+
+
+	const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition");
+	if( ! dofVector.load( initialCondition ) )
+	{
+		   cerr << "I am not able to load the initial condition from the file " << meshFile << "." << endl;
+		   return false;
+	}
+
+	h = Mesh.getHx();
+	counter = 0;
+
+	const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" );
+
+	if(exact_input == "no")
+		exactInput=false;
+	else
+		exactInput=true;
+
+
+#ifdef HAVE_CUDA
+
+	cudaMalloc(&(cudaDofVector), this->dofVector.getSize()*sizeof(double));
+	cudaMemcpy(cudaDofVector, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&(cudaDofVector2), this->dofVector.getSize()*sizeof(double));
+	cudaMemcpy(cudaDofVector2, this->dofVector.getData(), this->dofVector.getSize()*sizeof(double), cudaMemcpyHostToDevice);
+
+
+	cudaMalloc(&(this->cudaSolver), sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >));
+	cudaMemcpy(this->cudaSolver, this,sizeof(tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
+
+#endif
+
+	int n = Mesh.getDimensions().x();
+	dim3 threadsPerBlock(16, 16);
+	dim3 numBlocks(n/16 + 1 ,n/16 +1);
+
+	initCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
+	cudaDeviceSynchronize();
+	checkCudaDevice;
+
+	return true;
+}
+
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
+{
+//
+//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+//	{
+//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
+//	{
+//		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
+//	{
+//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+//	{
+//		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+//		{
+//			updateValue(i,j);
+//		}
+//	}
+//
+///*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//
+//	dofVector.save("u-00001.tnl");
+
+	int n = Mesh.getDimensions().x();
+	dim3 threadsPerBlock(16, 16);
+	dim3 numBlocks(n/16 +1 ,n/16 +1);
+	int m =n/16 +1;
+
+	for(int i = 0; i < 2*m -1; i++)
+	{
+		runCUDA<15><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,1,i);
+		//cudaDeviceSynchronize();
+	}
+//	cudaDeviceSynchronize();
+//	checkCudaDevice;
+//	for(int i = 0; i < 2*m -1; i++)
+//	{
+//		runCUDA<2><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,2,i);
+//		cudaDeviceSynchronize();
+//	}
+//	cudaDeviceSynchronize();
+//	checkCudaDevice;
+//	for(int i = 0; i < 2*m -1; i++)
+//	{
+//		runCUDA<4><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,4,i);
+//		cudaDeviceSynchronize();
+//	}
+//	cudaDeviceSynchronize();
+//	checkCudaDevice;
+//	for(int i = 0; i < 2*m -1; i++)
+//	{
+//		runCUDA<8><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,8,i);
+//		cudaDeviceSynchronize();
+//	}
+
+
+
+
+//	for(int i = 0; i < (2*m -1)/4 -1; i++)
+//	{
+//		runCUDA<15><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,15,i);//all
+//		cudaDeviceSynchronize();
+//	}
+//	for(int i = (2*m -1)/4 -1; i < (2*m -1)/2 -1; i++)
+//	{
+//		runCUDA<5><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,5,i); //two
+//		cudaDeviceSynchronize();
+//		runCUDA<10><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,10,i); //two
+//		cudaDeviceSynchronize();
+//	}
+//	for(int i = (2*m -1)/2 -1; i < (2*m -1)/2 +1; i++)
+//	{
+//		runCUDA<1><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,1,i); //separate
+//		cudaDeviceSynchronize();
+//		runCUDA<2><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,2,i); //separate
+//		cudaDeviceSynchronize();
+//		runCUDA<4><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,4,i); //separate
+//		cudaDeviceSynchronize();
+//		runCUDA<8><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,8,i); //separate
+//		cudaDeviceSynchronize();
+//	}
+//	for(int i = (2*m -1)/2 +1; i < (2*m -1/4)*3 +1; i++)
+//	{
+//		runCUDA<5><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,5,i); //two
+//		cudaDeviceSynchronize();
+//		runCUDA<10><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,10,i); //two
+//		cudaDeviceSynchronize();
+//	}
+//	for(int i = (2*m -1/4)*3 +1; i < 2*m -1; i++)
+//	{
+//		runCUDA<15><<<numBlocks,threadsPerBlock>>>(this->cudaSolver,15,i);//all
+//		cudaDeviceSynchronize();
+//	}
+cudaDeviceSynchronize();
+	checkCudaDevice;
+
+	cudaMemcpy(this->dofVector.getData(), cudaDofVector, this->dofVector.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
+	cudaDeviceSynchronize();
+	cudaFree(cudaDofVector);
+	cudaFree(cudaDofVector2);
+	cudaFree(cudaSolver);
+	dofVector.save("u-00001.tnl");
+	cudaDeviceSynchronize();
+	return true;
+}
+
+
+
+
+#ifdef HAVE_CUDA
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real value = cudaDofVector[index];
+	Real a,b, tmp;
+
+	if( i == 0 )
+		a = cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)];
+	else if( i == Mesh.getDimensions().x() - 1 )
+		a = cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)];
+	else
+	{
+		a = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<-1,0>(index)],
+				 cudaDofVector[Mesh.template getCellNextToCell<1,0>(index)] );
+	}
+
+	if( j == 0 )
+		b = cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)];
+	else if( j == Mesh.getDimensions().y() - 1 )
+		b = cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)];
+	else
+	{
+		b = fabsMin( cudaDofVector[Mesh.template getCellNextToCell<0,-1>(index)],
+				 cudaDofVector[Mesh.template getCellNextToCell<0,1>(index)] );
+	}
+
+
+	if(abs(a-b) >= h)
+		tmp = fabsMin(a,b) + Sign(value)*h;
+	else
+		tmp = 0.5 * (a + b + Sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
+
+	atomicSet(&cudaDofVector[index],fabsMin(value, tmp));
+
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
+{
+	int gx = threadIdx.x + blockDim.x*blockIdx.x;
+	int gy = blockDim.y*blockIdx.y + threadIdx.y;
+	int gid = Mesh.getCellIndex(CoordinatesType(gx,gy));
+
+	int total = blockDim.x*gridDim.x;
+
+
+
+	Real tmp = 0.0;
+	int flag = 0;
+	counter = 0;
+	tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+
+
+	if(!exactInput)
+	{
+		cudaDofVector[gid]=cudaDofVector[gid]=0.5*h*Sign(cudaDofVector[gid]);
+	}
+	__threadfence();
+//	printf("-----------------------------------------------------------------------------------\n");
+
+	__threadfence();
+
+	if(gx > 0 && gx < Mesh.getDimensions().x()-1)
+	{
+		if(gy > 0 && gy < Mesh.getDimensions().y()-1)
+		{
+
+			Index j = gy;
+			Index i = gx;
+//			 tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+			if(tmp == 0.0)
+			{}
+			else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+					cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+			{}
+			else
+				flag=1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+		}
+	}
+
+//	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
+//	printf("****************************************************************\n");
+//	printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
+	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == 0)
+	{
+//		printf("gx: %d, gy: %d, gid: %d \n", gx, gy,gid);
+		Index j = 0;
+		Index i = gx;
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 )
+		{}
+		else
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+
+//	printf("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n");
+	if(gx > 0 && gx < Mesh.getDimensions().x()-1 && gy == Mesh.getDimensions().y() - 1)
+	{
+		Index i = gx;
+		Index j = Mesh.getDimensions().y() - 1;
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+		{}
+		else
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+
+//	printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+	if(gy > 0 && gy < Mesh.getDimensions().y()-1 && gx == 0)
+	{
+		Index j = gy;
+		Index i = 0;
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+		{}
+		else
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+//	printf("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n");
+	if(gy > 0 && gy < Mesh.getDimensions().y()-1  && gx == Mesh.getDimensions().x() - 1)
+	{
+		Index j = gy;
+		Index i = Mesh.getDimensions().x() - 1;
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+		{}
+		else
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+
+//	printf("##################################################################################################\n");
+	if(gx == Mesh.getDimensions().x() - 1 &&
+	   gy == Mesh.getDimensions().y() - 1)
+	{
+
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
+
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+	}
+	if(gx == Mesh.getDimensions().x() - 1 &&
+	   gy == 0)
+	{
+
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx-1,gy))]*tmp > 0.0 &&
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
+
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+	}
+//	printf("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n");
+	if(gx == 0 &&
+	   gy == Mesh.getDimensions().y() - 1)
+	{
+
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy-1))]*tmp > 0.0)
+
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+	}
+	if(gx == 0 &&
+	   gy == 0)
+	{
+//		tmp = Sign(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))]);
+		if(cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx+1,gy))]*tmp > 0.0 &&
+				cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy+1))]*tmp > 0.0)
+
+			flag = 1;//cudaDofVector[Mesh.getCellIndex(CoordinatesType(gx,gy))] = tmp*INT_MAX;
+	}
+
+	__threadfence();
+
+	if(flag==1)
+		cudaDofVector[gid] =  tmp*3;
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+__device__
+Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
+{
+//	Real fx = abs(x);
+//
+//	Real tmpMin = Min(fx,abs(y));
+
+	if(abs(y) > abs(x))
+		return x;
+	else
+		return y;
+
+
+}
+
+
+template<>
+__global__ void runCUDA<1>(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k)
+{
+
+	if(blockIdx.x+blockIdx.y == k)
+	{
+		int gx = threadIdx.x + blockDim.x*blockIdx.x;
+		int gy = threadIdx.y + blockDim.y*blockIdx.y;
+
+		int id1 = threadIdx.x+threadIdx.y;
+
+						for(int l = 0; l < 2*blockDim.x - 1; l++)
+						{
+							if(id1 == l)
+							{
+								if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+								solver->updateValue(gx,gy);
+							}
+							__syncthreads();
+						}
+
+	}
+			/*---------------------------------------------------------------------------------------------------------------------------*/
+}
+	template<>
+	__global__ void runCUDA<2>(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k)
+	{
+	if((gridDim.x - blockIdx.x - 1)+blockIdx.y == k)
+	{
+		int gx = threadIdx.x + blockDim.x*blockIdx.x;
+		int gy = threadIdx.y + blockDim.y*blockIdx.y;
+
+		int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
+
+				for(int l = 0; l < 2*blockDim.x - 1; l++)
+				{
+					if(id2 == l)
+					{
+						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+						solver->updateValue(gx,gy);
+					}
+					__syncthreads();
+				}
+
+	}
+	}			/*---------------------------------------------------------------------------------------------------------------------------*/
+	template<>
+	__global__ void runCUDA<4>(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k)
+	{
+	if(blockIdx.x+blockIdx.y == gridDim.x+gridDim.y-k-2)
+		{
+		int gx = threadIdx.x + blockDim.x*blockIdx.x;
+		int gy = threadIdx.y + blockDim.y*blockIdx.y;
+
+		int id1 = threadIdx.x+threadIdx.y;
+
+				for(int l = 2*blockDim.x - 2; l > -1; l--)
+				{
+					if(id1 == l)
+					{
+						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+						solver->updateValue(gx,gy);
+						return;
+					}
+					__syncthreads();
+				}
+
+		}
+			/*---------------------------------------------------------------------------------------------------------------------------*/
+
+	}
+
+	template<>
+	__global__ void runCUDA<8>(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k)
+	{
+	if((gridDim.x - blockIdx.x - 1)+blockIdx.y == gridDim.x+gridDim.y-k-2)
+		{
+		int gx = threadIdx.x + blockDim.x*blockIdx.x;
+		int gy = threadIdx.y + blockDim.y*blockIdx.y;
+
+		int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
+
+				for(int l = 2*blockDim.x - 2; l > -1; l--)
+				{
+					if(id2 == l)
+					{
+						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+						solver->updateValue(gx,gy);
+						return;
+					}
+					__syncthreads();
+				}
+
+		}
+			/*---------------------------------------------------------------------------------------------------------------------------*/
+
+
+
+
+
+}
+
+
+	template<>
+		__global__ void runCUDA<5>(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k)
+		{
+
+			if(blockIdx.x+blockIdx.y == k)
+			{
+				int gx = threadIdx.x + blockDim.x*blockIdx.x;
+				int gy = threadIdx.y + blockDim.y*blockIdx.y;
+
+				int id1 = threadIdx.x+threadIdx.y;
+
+								for(int l = 0; l < 2*blockDim.x - 1; l++)
+								{
+									if(id1 == l)
+									{
+										if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+										solver->updateValue(gx,gy);
+										return;
+									}
+									__syncthreads();
+								}
+
+			}
+			else if(blockIdx.x+blockIdx.y == gridDim.x+gridDim.y-k-2)
+				{
+				int gx = threadIdx.x + blockDim.x*blockIdx.x;
+				int gy = threadIdx.y + blockDim.y*blockIdx.y;
+
+				int id1 = threadIdx.x+threadIdx.y;
+
+						for(int l = 2*blockDim.x - 2; l > -1; l--)
+						{
+							if(id1 == l)
+							{
+								if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+								solver->updateValue(gx,gy);
+								return;
+							}
+							__syncthreads();
+						}
+
+				}
+		}
+
+
+	template<>
+		__global__ void runCUDA<10>(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k)
+		{
+			if((gridDim.x - blockIdx.x - 1)+blockIdx.y == k)
+			{
+				int gx = threadIdx.x + blockDim.x*blockIdx.x;
+				int gy = threadIdx.y + blockDim.y*blockIdx.y;
+
+				int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
+
+						for(int l = 0; l < 2*blockDim.x - 1; l++)
+						{
+							if(id2 == l)
+							{
+								if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+								solver->updateValue(gx,gy);
+								return;
+							}
+							__syncthreads();
+						}
+
+			}
+
+			else if((gridDim.x - blockIdx.x - 1)+blockIdx.y == gridDim.x+gridDim.y-k-2)
+				{
+				int gx = threadIdx.x + blockDim.x*blockIdx.x;
+				int gy = threadIdx.y + blockDim.y*blockIdx.y;
+
+				int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
+
+						for(int l = 2*blockDim.x - 2; l > -1; l--)
+						{
+							if(id2 == l)
+							{
+								if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+								solver->updateValue(gx,gy);
+								return;
+							}
+							__syncthreads();
+						}
+
+				}
+
+		}
+
+
+
+	template<>
+	__global__ void runCUDA<15>(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k)
+	{
+
+		if(blockIdx.x+blockIdx.y == k)
+		{
+			int gx = threadIdx.x + blockDim.x*blockIdx.x;
+			int gy = threadIdx.y + blockDim.y*blockIdx.y;
+
+			int id1 = threadIdx.x+threadIdx.y;
+
+							for(int l = 0; l < 2*blockDim.x - 1; l++)
+							{
+								if(id1 == l)
+								{
+									if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+									solver->updateValue(gx,gy);
+									return;
+								}
+								__syncthreads();
+							}
+
+		}
+				/*---------------------------------------------------------------------------------------------------------------------------*/
+
+		if((gridDim.x - blockIdx.x - 1)+blockIdx.y == k)
+		{
+			int gx = threadIdx.x + blockDim.x*blockIdx.x;
+			int gy = threadIdx.y + blockDim.y*blockIdx.y;
+
+			int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
+
+					for(int l = 0; l < 2*blockDim.x - 1; l++)
+					{
+						if(id2 == l)
+						{
+							if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+							solver->updateValue(gx,gy);
+							return;
+						}
+						__syncthreads();
+					}
+
+		}
+				/*---------------------------------------------------------------------------------------------------------------------------*/
+
+		if(blockIdx.x+blockIdx.y == gridDim.x+gridDim.y-k-2)
+			{
+			int gx = threadIdx.x + blockDim.x*blockIdx.x;
+			int gy = threadIdx.y + blockDim.y*blockIdx.y;
+
+			int id1 = threadIdx.x+threadIdx.y;
+
+					for(int l = 2*blockDim.x - 2; l > -1; l--)
+					{
+						if(id1 == l)
+						{
+							if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+							solver->updateValue(gx,gy);
+							return;
+						}
+						__syncthreads();
+					}
+
+			}
+				/*---------------------------------------------------------------------------------------------------------------------------*/
+
+		if((gridDim.x - blockIdx.x - 1)+blockIdx.y == gridDim.x+gridDim.y-k-2)
+			{
+			int gx = threadIdx.x + blockDim.x*blockIdx.x;
+			int gy = threadIdx.y + blockDim.y*blockIdx.y;
+
+			int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
+
+					for(int l = 2*blockDim.x - 2; l > -1; l--)
+					{
+						if(id2 == l)
+						{
+							if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+							solver->updateValue(gx,gy);
+							return;
+						}
+						__syncthreads();
+					}
+
+			}
+				/*---------------------------------------------------------------------------------------------------------------------------*/
+
+
+
+
+
+	}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+__global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver)
+{
+	int gx = threadIdx.x + blockDim.x*blockIdx.x;
+	int gy = blockDim.y*blockIdx.y + threadIdx.y;
+
+	if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy)
+	{
+		solver->initGrid();
+	}
+
+
+}
+#endif
+
+
+
+
+
+
+//__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int k)
+//{
+//
+//	if(sweep==1 && blockIdx.x+blockIdx.y == k)
+//	{
+//		int gx = threadIdx.x + blockDim.x*blockIdx.x;
+//		int gy = threadIdx.y + blockDim.y*blockIdx.y;
+//
+//		int id1 = threadIdx.x+threadIdx.y;
+//
+//						for(int l = 0; l < 2*blockDim.x - 1; l++)
+//						{
+//							if(id1 == l)
+//							{
+//								if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+//								solver->updateValue(gx,gy);
+//							}
+//							__syncthreads();
+//						}
+//
+//	}
+//			/*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//	else if(sweep==2 && (gridDim.x - blockIdx.x - 1)+blockIdx.y == k)
+//	{
+//		int gx = threadIdx.x + blockDim.x*blockIdx.x;
+//		int gy = threadIdx.y + blockDim.y*blockIdx.y;
+//
+//		int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
+//
+//				for(int l = 0; l < 2*blockDim.x - 1; l++)
+//				{
+//					if(id2 == l)
+//					{
+//						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+//						solver->updateValue(gx,gy);
+//					}
+//					__syncthreads();
+//				}
+//
+//	}
+//			/*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//	else if(sweep==4 && blockIdx.x+blockIdx.y == gridDim.x+gridDim.y-k-2)
+//		{
+//		int gx = threadIdx.x + blockDim.x*blockIdx.x;
+//		int gy = threadIdx.y + blockDim.y*blockIdx.y;
+//
+//		int id1 = threadIdx.x+threadIdx.y;
+//
+//				for(int l = 2*blockDim.x - 2; l > -1; l--)
+//				{
+//					if(id1 == l)
+//					{
+//						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+//						solver->updateValue(gx,gy);
+//						return;
+//					}
+//					__syncthreads();
+//				}
+//
+//		}
+//			/*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//	else if(sweep==8 && (gridDim.x - blockIdx.x - 1)+blockIdx.y == gridDim.x+gridDim.y-k-2)
+//		{
+//		int gx = threadIdx.x + blockDim.x*blockIdx.x;
+//		int gy = threadIdx.y + blockDim.y*blockIdx.y;
+//
+//		int id2 = (blockDim.x - threadIdx.x - 1) + threadIdx.y;
+//
+//				for(int l = 2*blockDim.x - 2; l > -1; l--)
+//				{
+//					if(id2 == l)
+//					{
+//						if(solver->Mesh.getDimensions().x() > gx && solver->Mesh.getDimensions().y() > gy /*&& gy > -1&& gx > -1*/)
+//						solver->updateValue(gx,gy);
+//						return;
+//					}
+//					__syncthreads();
+//				}
+//
+//		}
+//			/*---------------------------------------------------------------------------------------------------------------------------*/
+//
+//
+//
+//
+//
+//}
+
+
+#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/examples/fast-sweeping/tnlFastSweeping2D_impl.h b/examples/fast-sweeping/tnlFastSweeping2D_impl.h
index 0abf197618..a2197864bb 100644
--- a/examples/fast-sweeping/tnlFastSweeping2D_impl.h
+++ b/examples/fast-sweeping/tnlFastSweeping2D_impl.h
@@ -59,6 +59,13 @@ bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 
 	h = Mesh.getHx();
 
+	const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" );
+
+	if(exact_input == "no")
+		exactInput=false;
+	else
+		exactInput=true;
+
 	return initGrid();
 }
 
@@ -72,6 +79,15 @@ bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > ::
 {
 
 	Real tmp = 0.0;
+	Real ax=0.5/sqrt(2.0);
+
+	if(!exactInput)
+	{
+		for(Index i = 0; i < Mesh.getDimensions().x()*Mesh.getDimensions().y(); i++)
+				dofVector[i]=0.5*h*Sign(dofVector[i]);
+	}
+
+
 	for(Index i = 1; i < Mesh.getDimensions().x()-1; i++)
 	{
 		for(Index j = 1; j < Mesh.getDimensions().y()-1; j++)
diff --git a/examples/fast-sweeping/tnlFastSweeping2D_openMP_impl.h b/examples/fast-sweeping/tnlFastSweeping2D_openMP_impl.h
new file mode 100644
index 0000000000..1bf8504439
--- /dev/null
+++ b/examples/fast-sweeping/tnlFastSweeping2D_openMP_impl.h
@@ -0,0 +1,399 @@
+/***************************************************************************
+                          tnlFastSweeping_impl.h  -  description
+                             -------------------
+    begin                : Oct 15 , 2015
+    copyright            : (C) 2015 by Tomas Sobotik
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef TNLFASTSWEEPING2D_IMPL_H_
+#define TNLFASTSWEEPING2D_IMPL_H_
+
+#include "tnlFastSweeping.h"
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+tnlString tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: getType()
+{
+	   return tnlString( "tnlFastSweeping< " ) +
+	          MeshType::getType() + ", " +
+	          ::getType< Real >() + ", " +
+	          ::getType< Index >() + " >";
+}
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: init( const tnlParameterContainer& parameters )
+{
+	const tnlString& meshFile = parameters.getParameter< tnlString >( "mesh" );
+
+	if( ! Mesh.load( meshFile ) )
+	{
+		   cerr << "I am not able to load the mesh from the file " << meshFile << "." << endl;
+		   return false;
+	}
+
+
+	const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition");
+	if( ! dofVector.load( initialCondition ) )
+	{
+		   cerr << "I am not able to load the initial condition from the file " << meshFile << "." << endl;
+		   return false;
+	}
+
+	h = Mesh.getHx();
+
+
+	const tnlString& exact_input = parameters.getParameter< tnlString >( "exact-input" );
+
+	if(exact_input == "no")
+		exactInput=false;
+	else
+		exactInput=true;
+
+#ifdef HAVE_OPENMP
+//	gridLock = (omp_lock_t*) malloc(sizeof(omp_lock_t)*Mesh.getDimensions().x()*Mesh.getDimensions().y());
+//
+//	for(int i = 0; i < Mesh.getDimensions().x()*Mesh.getDimensions().y(); i++)
+//			omp_init_lock(&gridLock[i]);
+#endif
+
+	return initGrid();
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: initGrid()
+{
+
+	Real tmp = 0.0;
+
+	if(!exactInput)
+	{
+		for(Index i = 0; i < Mesh.getDimensions().x()*Mesh.getDimensions().y(); i++)
+				dofVector[i]=0.5*h*Sign(dofVector[i]);
+	}
+
+
+	for(Index i = 1; i < Mesh.getDimensions().x()-1; i++)
+	{
+		for(Index j = 1; j < Mesh.getDimensions().y()-1; j++)
+		{
+			 tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+			if(tmp == 0.0)
+			{}
+			else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+					dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+					dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+					dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+			{}
+			else
+				dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+		}
+	}
+
+
+
+	for(int i = 1; i < Mesh.getDimensions().x()-1; i++)
+	{
+		Index j = 0;
+		tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+				dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 )
+		{}
+		else
+			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+
+	for(int i = 1; i < Mesh.getDimensions().x()-1; i++)
+	{
+		Index j = Mesh.getDimensions().y() - 1;
+		tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+				dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+		{}
+		else
+			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+
+	for(int j = 1; j < Mesh.getDimensions().y()-1; j++)
+	{
+		Index i = 0;
+		tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp < 0.0 ||
+				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+		{}
+		else
+			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+
+	for(int j = 1; j < Mesh.getDimensions().y()-1; j++)
+	{
+		Index i = Mesh.getDimensions().x() - 1;
+		tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+
+
+		if(tmp == 0.0)
+		{}
+		else if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp < 0.0 ||
+				dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp < 0.0 ||
+				dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp < 0.0 )
+		{}
+		else
+			dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+	}
+
+
+	Index i = Mesh.getDimensions().x() - 1;
+	Index j = Mesh.getDimensions().y() - 1;
+
+	tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+	if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp > 0.0 &&
+			dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp > 0.0)
+
+		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+
+
+
+	j = 0;
+	tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+	if(dofVector[Mesh.getCellIndex(CoordinatesType(i-1,j))]*tmp > 0.0 &&
+			dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp > 0.0)
+
+		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+
+
+
+	i = 0;
+	j = Mesh.getDimensions().y() -1;
+	tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+	if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp > 0.0 &&
+			dofVector[Mesh.getCellIndex(CoordinatesType(i,j-1))]*tmp > 0.0)
+
+		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+
+
+
+	j = 0;
+	tmp = Sign(dofVector[Mesh.getCellIndex(CoordinatesType(i,j))]);
+	if(dofVector[Mesh.getCellIndex(CoordinatesType(i+1,j))]*tmp > 0.0 &&
+			dofVector[Mesh.getCellIndex(CoordinatesType(i,j+1))]*tmp > 0.0)
+
+		dofVector[Mesh.getCellIndex(CoordinatesType(i,j))] = tmp*INT_MAX;
+
+
+	dofVector.save("u-00000.tnl");
+
+	return true;
+}
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+bool tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: run()
+{
+
+	DofVectorType d2,d3,d4;
+	d2.setLike(dofVector);
+	d2=dofVector;
+	d3.setLike(dofVector);
+	d3=dofVector;
+	d4.setLike(dofVector);
+	d4=dofVector;
+
+
+#ifdef HAVE_OPENMP
+#pragma omp parallel sections num_threads(4)
+	{
+	{
+#endif
+
+	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+	{
+		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
+		{
+			updateValue(i,j,&dofVector);
+		}
+	}
+
+/*---------------------------------------------------------------------------------------------------------------------------*/
+#ifdef HAVE_OPENMP
+	}
+#pragma omp section
+	{
+#endif
+	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
+	{
+		for(Index j = 0; j < Mesh.getDimensions().y(); j++)
+		{
+			updateValue(i,j,&d2);
+		}
+	}
+
+/*---------------------------------------------------------------------------------------------------------------------------*/
+#ifdef HAVE_OPENMP
+	}
+#pragma omp section
+	{
+#endif
+	for(Index i = Mesh.getDimensions().x() - 1; i > -1; i--)
+	{
+		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+		{
+			updateValue(i,j, &d3);
+		}
+	}
+
+/*---------------------------------------------------------------------------------------------------------------------------*/
+#ifdef HAVE_OPENMP
+	}
+#pragma omp section
+	{
+#endif
+	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+	{
+		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+		{
+			updateValue(i,j, &d4);
+		}
+	}
+
+/*---------------------------------------------------------------------------------------------------------------------------*/
+#ifdef HAVE_OPENMP
+	}
+	}
+#endif
+
+
+#ifdef HAVE_OPENMP
+#pragma omp parallel for num_threads(4) schedule(dynamic)
+#endif
+	for(Index i = 0; i < Mesh.getDimensions().x(); i++)
+	{
+		for(Index j = Mesh.getDimensions().y() - 1; j > -1; j--)
+		{
+			int index = Mesh.getCellIndex(CoordinatesType(i,j));
+			dofVector[index] = fabsMin(dofVector[index], d2[index]);
+			dofVector[index] = fabsMin(dofVector[index], d3[index]);
+			dofVector[index] = fabsMin(dofVector[index], d4[index]);
+		}
+	}
+
+	dofVector.save("u-00001.tnl");
+
+	return true;
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+void tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: updateValue( Index i, Index j, DofVectorType* grid)
+{
+	Index index = Mesh.getCellIndex(CoordinatesType(i,j));
+	Real value = (*grid)[index];
+	Real a,b, tmp;
+
+	if( i == 0 )
+		a = (*grid)[Mesh.template getCellNextToCell<1,0>(index)];
+	else if( i == Mesh.getDimensions().x() - 1 )
+		a = (*grid)[Mesh.template getCellNextToCell<-1,0>(index)];
+	else
+	{
+		a = fabsMin( (*grid)[Mesh.template getCellNextToCell<-1,0>(index)],
+				 (*grid)[Mesh.template getCellNextToCell<1,0>(index)] );
+	}
+
+	if( j == 0 )
+		b = (*grid)[Mesh.template getCellNextToCell<0,1>(index)];
+	else if( j == Mesh.getDimensions().y() - 1 )
+		b = (*grid)[Mesh.template getCellNextToCell<0,-1>(index)];
+	else
+	{
+		b = fabsMin( (*grid)[Mesh.template getCellNextToCell<0,-1>(index)],
+				 (*grid)[Mesh.template getCellNextToCell<0,1>(index)] );
+	}
+
+
+	if(fabs(a-b) >= h)
+		tmp = fabsMin(a,b) + Sign(value)*h;
+	else
+		tmp = 0.5 * (a + b + Sign(value)*sqrt(2.0 * h * h - (a - b) * (a - b) ) );
+
+#ifdef HAVE_OPENMP
+//	omp_set_lock(&gridLock[index]);
+#endif
+	(*grid)[index]  = fabsMin(value, tmp);
+#ifdef HAVE_OPENMP
+//	omp_unset_lock(&gridLock[index]);
+#endif
+}
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+Real tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: fabsMin( Real x, Real y)
+{
+	Real fx = fabs(x);
+	Real fy = fabs(y);
+
+	Real tmpMin = Min(fx,fy);
+
+	if(tmpMin == fx)
+		return x;
+	else
+		return y;
+
+
+}
+
+
+
+
+#endif /* TNLFASTSWEEPING_IMPL_H_ */
diff --git a/examples/fast-sweeping/tnlFastSweeping_CUDA.h b/examples/fast-sweeping/tnlFastSweeping_CUDA.h
new file mode 100644
index 0000000000..e5023b7c32
--- /dev/null
+++ b/examples/fast-sweeping/tnlFastSweeping_CUDA.h
@@ -0,0 +1,104 @@
+/***************************************************************************
+                          tnlFastSweeping.h  -  description
+                             -------------------
+    begin                : Oct 15 , 2015
+    copyright            : (C) 2015 by Tomas Sobotik
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef TNLFASTSWEEPING_H_
+#define TNLFASTSWEEPING_H_
+
+#include <config/tnlParameterContainer.h>
+#include <core/vectors/tnlVector.h>
+#include <core/vectors/tnlStaticVector.h>
+#include <core/tnlHost.h>
+#include <mesh/tnlGrid.h>
+#include <limits.h>
+#include <core/tnlDevice.h>
+#include <ctime>
+
+
+
+
+
+template< typename Mesh,
+		  typename Real,
+		  typename Index >
+class tnlFastSweeping
+{};
+
+
+
+
+template< typename MeshReal,
+          typename Device,
+          typename MeshIndex,
+          typename Real,
+          typename Index >
+class tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >
+{
+
+public:
+	typedef Real RealType;
+	typedef Device DeviceType;
+	typedef Index IndexType;
+	typedef tnlGrid< 2, Real, Device, Index > MeshType;
+	typedef tnlVector< RealType, DeviceType, IndexType> DofVectorType;
+	typedef typename MeshType::CoordinatesType CoordinatesType;
+
+
+
+	__host__ static tnlString getType();
+	__host__ bool init( const tnlParameterContainer& parameters );
+	__host__ bool run();
+
+#ifdef HAVE_CUDA
+	__device__ bool initGrid();
+	__device__ void updateValue(const Index i, const Index j);
+	__device__ Real fabsMin(const Real x, const Real y);
+
+	tnlFastSweeping< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >* cudaSolver;
+	double* cudaDofVector;
+	double* cudaDofVector2;
+	int counter;
+#endif
+
+	MeshType Mesh;
+
+protected:
+
+
+
+	bool exactInput;
+
+	DofVectorType dofVector;
+
+	RealType h;
+
+
+};
+
+
+
+#ifdef HAVE_CUDA
+template<int sweep_t>
+__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int i);
+//__global__ void runCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, int sweep, int i);
+
+__global__ void initCUDA(tnlFastSweeping< tnlGrid< 2,double, tnlHost, int >, double, int >* solver);
+#endif
+
+/*various implementtions.... choose one*/
+//#include "tnlFastSweeping2D_CUDA_impl.h"
+//#include "tnlFastSweeping2D_CUDA_v2_impl.h"
+#include "tnlFastSweeping2D_CUDA_v3_impl.h"
+
+#endif /* TNLFASTSWEEPING_H_ */
-- 
GitLab