Commit 69063c92 authored by Tomas Sobotik's avatar Tomas Sobotik

Merging done

parent 4b6d0617
/***************************************************************************
tnlParallelEikonalSolver_impl.h - description
-------------------
begin : Nov 28, 2014
copyright : (C) 2014 by Tomas Sobotik
***************************************************************************/
/***************************************************************************
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
***************************************************************************/
#ifndef TNLPARALLELEIKONALSOLVER_IMPL_H_
#define TNLPARALLELEIKONALSOLVER_IMPL_H_
//north - 1, east - 2, west - 4, south - 8
#define FROM_NORTH 1
#define FROM_EAST 2
#define FROM_WEST 4
#define FROM_SOUTH 8
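// Boundary conditions are combined as a bitmask: a subgrid receiving
// information from both north and west carries FROM_NORTH | FROM_WEST == 5.
// Diagonal sweeps are encoded as sums of two directions, e.g.
// FROM_NORTH + FROM_EAST == 3.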
#include "tnlParallelEikonalSolver.h"
#include <core/mfilename.h>
template< typename SchemeHost, typename SchemeDevice, typename Device>
tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver()
{
cout << "a" << endl;
this->device = tnlCudaDevice; /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice --- vypocet na CPU
#ifdef HAVE_CUDA
if(this->device == tnlCudaDevice)
{
run_host = 1;
}
#endif
cout << "b" << endl;
}
template< typename SchemeHost, typename SchemeDevice, typename Device>
void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::test()
{
}
template< typename SchemeHost, typename SchemeDevice, typename Device>
bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::init( const tnlParameterContainer& parameters )
{
cout << "Initializating solver..." << endl;
const tnlString& meshLocation = parameters.getParameter <tnlString>("mesh");
this->mesh.load( meshLocation );
this->n = parameters.getParameter <int>("subgrid-size");
cout << "Setting N to " << this->n << endl;
this->subMesh.setDimensions( this->n, this->n );
this->subMesh.setDomain( tnlStaticVector<2,double>(0.0, 0.0),
tnlStaticVector<2,double>(this->mesh.getHx()*(double)(this->n), this->mesh.getHy()*(double)(this->n)) );
this->subMesh.save("submesh.tnl");
const tnlString& initialCondition = parameters.getParameter <tnlString>("initial-condition");
this->u0.load( initialCondition );
this->delta = parameters.getParameter <double>("delta");
this->delta *= this->mesh.getHx()*this->mesh.getHy();
cout << "Setting delta to " << this->delta << endl;
this->tau0 = parameters.getParameter <double>("initial-tau");
cout << "Setting initial tau to " << this->tau0 << endl;
this->stopTime = parameters.getParameter <double>("stop-time");
this->cflCondition = parameters.getParameter <double>("cfl-condition");
this->cflCondition *= sqrt(this->mesh.getHx()*this->mesh.getHy());
cout << "Setting CFL to " << this->cflCondition << endl;
stretchGrid();
this->stopTime /= (double)(this->gridCols);
this->stopTime *= (1.0+1.0/((double)(this->n) - 2.0));
cout << "Setting stopping time to " << this->stopTime << endl;
//this->stopTime = 1.5*((double)(this->n))*parameters.getParameter <double>("stop-time")*this->mesh.getHx();
//cout << "Setting stopping time to " << this->stopTime << endl;
cout << "Initializating scheme..." << endl;
if(!this->schemeHost.init(parameters))
{
cerr << "SchemeHost failed to initialize." << endl;
return false;
}
cout << "Scheme initialized." << endl;
test();
VectorType* tmp = new VectorType[subgridValues.getSize()];
bool containsCurve = false;
#ifdef HAVE_CUDA
if(this->device == tnlCudaDevice)
{
cudaMalloc(&(this->cudaSolver), sizeof(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >));
cudaMemcpy(this->cudaSolver, this,sizeof(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >), cudaMemcpyHostToDevice);
cudaMalloc(&(this->tmpw), this->work_u.getSize()*sizeof(double));
cudaMalloc(&(this->runcuda), sizeof(int));
cudaDeviceSynchronize();
checkCudaDevice;
int* tmpUC;
cudaMalloc(&(tmpUC), this->work_u.getSize()*sizeof(int));
cudaMemcpy(tmpUC, this->unusedCell.getData(), this->unusedCell.getSize()*sizeof(int), cudaMemcpyHostToDevice);
initCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<1,1>>>(this->cudaSolver, (this->tmpw), (this->runcuda),tmpUC);
cudaDeviceSynchronize();
checkCudaDevice;
cudaMemcpy((this->tmpw), this->work_u.getData(), this->work_u.getSize()*sizeof(double), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
checkCudaDevice;
}
#endif
if(this->device == tnlHostDevice)
{
for(int i = 0; i < this->subgridValues.getSize(); i++)
{
if(! tmp[i].setSize(this->n * this->n))
cout << "Could not allocate tmp["<< i <<"] array." << endl;
tmp[i] = getSubgrid(i);
containsCurve = false;
for(int j = 0; j < tmp[i].getSize(); j++)
{
if(tmp[i][0]*tmp[i][j] <= 0.0)
{
containsCurve = true;
break;
}
}
if(containsCurve)
{
insertSubgrid(runSubgrid(0, tmp[i],i), i);
setSubgridValue(i, 4);
}
containsCurve = false;
}
}
#ifdef HAVE_CUDA
else if(this->device == tnlCudaDevice)
{
cudaDeviceSynchronize();
checkCudaDevice;
dim3 threadsPerBlock(this->n, this->n);
dim3 numBlocks(this->gridCols,this->gridRows);
cudaDeviceSynchronize();
checkCudaDevice;
initRunCUDA<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
cudaDeviceSynchronize();
}
#endif
this->currentStep = 1;
if(this->device == tnlHostDevice)
synchronize();
#ifdef HAVE_CUDA
else if(this->device == tnlCudaDevice)
{
dim3 threadsPerBlock(this->n, this->n);
dim3 numBlocks(this->gridCols,this->gridRows);
checkCudaDevice;
synchronizeCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
cudaDeviceSynchronize();
checkCudaDevice;
synchronize2CUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
cudaDeviceSynchronize();
checkCudaDevice;
}
#endif
cout << "Solver initialized." << endl;
return true;
}
template< typename SchemeHost, typename SchemeDevice, typename Device>
void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::run()
{
if(this->device == tnlHostDevice)
{
bool end = false;
while ((this->boundaryConditions.max() > 0 ) || !end)
{
if(this->boundaryConditions.max() == 0 )
end=true;
else
end=false;
#ifdef HAVE_OPENMP
#pragma omp parallel for num_threads(4) schedule(dynamic)
#endif
for(int i = 0; i < this->subgridValues.getSize(); i++)
{
if(getSubgridValue(i) != INT_MAX)
{
if(getSubgridValue(i) == currentStep+4)
{
if(getBoundaryCondition(i) & FROM_NORTH)
{
insertSubgrid( runSubgrid(FROM_NORTH, getSubgrid(i),i), i);
this->calculationsCount[i]++;
}
if(getBoundaryCondition(i) & FROM_EAST)
{
insertSubgrid( runSubgrid(FROM_EAST, getSubgrid(i),i), i);
this->calculationsCount[i]++;
}
if(getBoundaryCondition(i) & FROM_WEST)
{
insertSubgrid( runSubgrid(FROM_WEST, getSubgrid(i),i), i);
this->calculationsCount[i]++;
}
if(getBoundaryCondition(i) & FROM_SOUTH)
{
insertSubgrid( runSubgrid(FROM_SOUTH, getSubgrid(i),i), i);
this->calculationsCount[i]++;
}
}
if( (getBoundaryCondition(i) & FROM_EAST) || (getBoundaryCondition(i) & FROM_NORTH) )
{
insertSubgrid( runSubgrid(FROM_NORTH + FROM_EAST, getSubgrid(i),i), i);
}
if( (getBoundaryCondition(i) & FROM_WEST) || (getBoundaryCondition(i) & FROM_NORTH) )
{
insertSubgrid( runSubgrid(FROM_NORTH + FROM_WEST, getSubgrid(i),i), i);
}
if( (getBoundaryCondition(i) & FROM_EAST) || (getBoundaryCondition(i) & FROM_SOUTH) )
{
insertSubgrid( runSubgrid(FROM_SOUTH + FROM_EAST, getSubgrid(i),i), i);
}
if( (getBoundaryCondition(i) & FROM_WEST) || (getBoundaryCondition(i) & FROM_SOUTH) )
{
insertSubgrid( runSubgrid(FROM_SOUTH + FROM_WEST, getSubgrid(i),i), i);
}
setBoundaryCondition(i, 0);
setSubgridValue(i, getSubgridValue(i)-1);
}
}
synchronize();
}
}
#ifdef HAVE_CUDA
else if(this->device == tnlCudaDevice)
{
bool end_cuda = false;
dim3 threadsPerBlock(this->n, this->n);
dim3 numBlocks(this->gridCols,this->gridRows);
cudaDeviceSynchronize();
checkCudaDevice;
cudaMemcpy(&(this->run_host),this->runcuda,sizeof(int), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
checkCudaDevice;
int i = 1;
time_diff = 0.0;
while (run_host || !end_cuda)
{
cout << "Computing at step "<< i++ << endl;
if(run_host != 0 )
end_cuda = true;
else
end_cuda = false;
cudaDeviceSynchronize();
checkCudaDevice;
start = std::clock();
runCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
cudaDeviceSynchronize();
time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
//start = std::clock();
synchronizeCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
cudaDeviceSynchronize();
checkCudaDevice;
synchronize2CUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,1>>>(this->cudaSolver);
cudaDeviceSynchronize();
checkCudaDevice;
//time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
cudaMemcpy(&run_host, (this->runcuda),sizeof(int), cudaMemcpyDeviceToHost);
}
cout << "Solving time was: " << time_diff << endl;
cudaMemcpy(this->work_u.getData(), (this->tmpw), this->work_u.getSize()*sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
}
#endif
contractGrid();
this->u0.save("u-00001.tnl");
cout << "Maximum number of calculations on one subgrid was " << this->calculationsCount.absMax() << endl;
cout << "Average number of calculations on one subgrid was " << ( (double) this->calculationsCount.sum() / (double) this->calculationsCount.getSize() ) << endl;
cout << "Solver finished" << endl;
#ifdef HAVE_CUDA
if(this->device == tnlCudaDevice)
{
cudaFree(this->runcuda);
cudaFree(this->tmpw);
cudaFree(this->cudaSolver);
}
#endif
}
//north - 1, east - 2, west - 4, south - 8
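/****
 * Copies improved values across subgrid interfaces and marks the receiving
 * subgrids for recomputation. Odd steps sweep the horizontal interfaces
 * (setting FROM_SOUTH/FROM_NORTH), even steps the vertical ones (setting
 * FROM_WEST/FROM_EAST). A value is copied whenever it is smaller in absolute
 * value, by more than delta, than its neighbour across the interface.
 */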
template< typename SchemeHost, typename SchemeDevice, typename Device>
void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::synchronize()
{
cout << "Synchronizig..." << endl;
int tmp1, tmp2;
int grid1, grid2;
if(this->currentStep & 1)
{
for(int j = 0; j < this->gridRows - 1; j++)
{
for (int i = 0; i < this->gridCols*this->n; i++)
{
tmp1 = this->gridCols*this->n*((this->n-1)+j*this->n) + i;
tmp2 = this->gridCols*this->n*((this->n)+j*this->n) + i;
grid1 = getSubgridValue(getOwner(tmp1));
grid2 = getSubgridValue(getOwner(tmp2));
if(getOwner(tmp1)==getOwner(tmp2))
cout << "Warning: interface cells share an owner at i, j = " << i << ", " << j << endl;
if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
{
this->work_u[tmp2] = this->work_u[tmp1];
this->unusedCell[tmp2] = 0;
if(grid2 == INT_MAX)
{
setSubgridValue(getOwner(tmp2), -INT_MAX);
}
if(! (getBoundaryCondition(getOwner(tmp2)) & FROM_SOUTH) )
setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+FROM_SOUTH);
}
else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
{
this->work_u[tmp1] = this->work_u[tmp2];
this->unusedCell[tmp1] = 0;
if(grid1 == INT_MAX)
{
setSubgridValue(getOwner(tmp1), -INT_MAX);
}
if(! (getBoundaryCondition(getOwner(tmp1)) & FROM_NORTH) )
setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+FROM_NORTH);
}
}
}
}
else
{
for(int i = 1; i < this->gridCols; i++)
{
for (int j = 0; j < this->gridRows*this->n; j++)
{
tmp1 = this->gridCols*this->n*j + i*this->n - 1;
tmp2 = this->gridCols*this->n*j + i*this->n ;
grid1 = getSubgridValue(getOwner(tmp1));
grid2 = getSubgridValue(getOwner(tmp2));
if(getOwner(tmp1)==getOwner(tmp2))
cout << "Warning: interface cells share an owner at i, j = " << i << ", " << j << endl;
if ((fabs(this->work_u[tmp1]) < fabs(this->work_u[tmp2]) - this->delta || grid2 == INT_MAX || grid2 == -INT_MAX) && (grid1 != INT_MAX && grid1 != -INT_MAX))
{
this->work_u[tmp2] = this->work_u[tmp1];
this->unusedCell[tmp2] = 0;
if(grid2 == INT_MAX)
{
setSubgridValue(getOwner(tmp2), -INT_MAX);
}
if(! (getBoundaryCondition(getOwner(tmp2)) & FROM_WEST) )
setBoundaryCondition(getOwner(tmp2), getBoundaryCondition(getOwner(tmp2))+FROM_WEST);
}
else if ((fabs(this->work_u[tmp1]) > fabs(this->work_u[tmp2]) + this->delta || grid1 == INT_MAX || grid1 == -INT_MAX) && (grid2 != INT_MAX && grid2 != -INT_MAX))
{
this->work_u[tmp1] = this->work_u[tmp2];
this->unusedCell[tmp1] = 0;
if(grid1 == INT_MAX)
{
setSubgridValue(getOwner(tmp1), -INT_MAX);
}
if(! (getBoundaryCondition(getOwner(tmp1)) & FROM_EAST) )
setBoundaryCondition(getOwner(tmp1), getBoundaryCondition(getOwner(tmp1))+FROM_EAST);
}
}
}
}
this->currentStep++;
int stepValue = this->currentStep + 4;
for (int i = 0; i < this->subgridValues.getSize(); i++)
{
if( getSubgridValue(i) == -INT_MAX )
setSubgridValue(i, stepValue);
}
cout << "Grid synchronized at step " << (this->currentStep - 1 ) << endl;
}
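/****
 * Maps a global cell index i of the stretched row-major grid to the index of
 * the owning subgrid. Worked example for n = 16, gridCols = 4: i = 1040 lies
 * in global row 1040/64 = 16, i.e. subgrid row 16/16 = 1, and in global
 * column 1040%64 = 16, i.e. subgrid column 16/16 = 1, so the owner is
 * 1*4 + 1 = 5.
 */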
template< typename SchemeHost, typename SchemeDevice, typename Device>
int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getOwner(int i) const
{
return (i / (this->gridCols*this->n*this->n))*this->gridCols + (i % (this->gridCols*this->n))/this->n;
}
template< typename SchemeHost, typename SchemeDevice, typename Device>
int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgridValue( int i ) const
{
return this->subgridValues[i];
}
template< typename SchemeHost, typename SchemeDevice, typename Device>
void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setSubgridValue(int i, int value)
{
this->subgridValues[i] = value;
}
template< typename SchemeHost, typename SchemeDevice, typename Device>
int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getBoundaryCondition( int i ) const
{
return this->boundaryConditions[i];
}
template< typename SchemeHost, typename SchemeDevice, typename Device>
void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setBoundaryCondition(int i, int value)
{
this->boundaryConditions[i] = value;
}
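/****
 * Pads the mesh so that it decomposes exactly into n x n subgrids; adjacent
 * subgrids get their own copy of the shared interface rows/columns (hence
 * the division by n-1 when counting subgrids). The initial condition u0 is
 * copied into the stretched working array work_u.
 */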
template< typename SchemeHost, typename SchemeDevice, typename Device>
void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::stretchGrid()
{
cout << "Stretching grid..." << endl;
this->gridCols = ceil( ((double)(this->mesh.getDimensions().x()-1)) / ((double)(this->n-1)) );
this->gridRows = ceil( ((double)(this->mesh.getDimensions().y()-1)) / ((double)(this->n-1)) );
cout << "Setting gridCols to " << this->gridCols << "." << endl;
cout << "Setting gridRows to " << this->gridRows << "." << endl;
this->subgridValues.setSize(this->gridCols*this->gridRows);
this->subgridValues.setValue(0);
this->boundaryConditions.setSize(this->gridCols*this->gridRows);
this->boundaryConditions.setValue(0);
this->calculationsCount.setSize(this->gridCols*this->gridRows);
this->calculationsCount.setValue(0);
for(int i = 0; i < this->subgridValues.getSize(); i++ )
{
this->subgridValues[i] = INT_MAX;
this->boundaryConditions[i] = 0;
}
int stretchedSize = this->n*this->n*this->gridCols*this->gridRows;
if(!this->work_u.setSize(stretchedSize))
cerr << "Could not allocate memory for stretched grid." << endl;
if(!this->unusedCell.setSize(stretchedSize))
cerr << "Could not allocate memory for supporting stretched grid." << endl;
int idealStretch = this->mesh.getDimensions().x() + (this->mesh.getDimensions().x()-2)/(this->n-1);
cout << "Ideal stretch: " << idealStretch << endl;
for(int i = 0; i < stretchedSize; i++)
{
this->unusedCell[i] = 1;
int diff =(this->n*this->gridCols) - idealStretch ;
int k = i/this->n - i/(this->n*this->gridCols) + this->mesh.getDimensions().x()*(i/(this->n*this->n*this->gridCols)) + (i/(this->n*this->gridCols))*diff;
if(i%(this->n*this->gridCols) - idealStretch >= 0)
{
k+= i%(this->n*this->gridCols) - idealStretch +1 ;
}
if(i/(this->n*this->gridCols) - idealStretch + 1 > 0)
{
k+= (i/(this->n*this->gridCols) - idealStretch +1 )* this->mesh.getDimensions().x() ;
}
this->work_u[i] = this->u0[i-k];
}
cout << "Grid stretched." << endl;
}
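/****
 * Inverse of stretchGrid(): copies the stretched working array back into u0,
 * skipping the duplicated interface rows/columns and the padding.
 */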
template< typename SchemeHost, typename SchemeDevice, typename Device>
void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::contractGrid()
{
cout << "Contracting grid..." << endl;
int stretchedSize = this->n*this->n*this->gridCols*this->gridRows;
int idealStretch = this->mesh.getDimensions().x() + (this->mesh.getDimensions().x()-2)/(this->n-1);
cout << "Ideal stretch: " << idealStretch << endl;
for(int i = 0; i < stretchedSize; i++)
{
int diff =(this->n*this->gridCols) - idealStretch ;
int k = i/this->n - i/(this->n*this->gridCols) + this->mesh.getDimensions().x()*(i/(this->n*this->n*this->gridCols)) + (i/(this->n*this->gridCols))*diff;
if((i%(this->n*this->gridCols) - idealStretch < 0) && (i/(this->n*this->gridCols) - idealStretch + 1 <= 0))
{
this->u0[i-k] = this->work_u[i];
}
}
cout << "Grid contracted" << endl;
}
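/****
 * Extracts subgrid i from the row-major stretched array work_u into a dense
 * vector of n*n values.
 */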
template< typename SchemeHost, typename SchemeDevice, typename Device>
typename tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::VectorType
tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgrid( const int i ) const
{
VectorType u;
u.setSize(this->n*this->n);
for( int j = 0; j < u.getSize(); j++)
{
u[j] = this->work_u[ (i / this->gridCols) * this->n*this->n*this->gridCols
+ (i % this->gridCols) * this->n
+ (j/this->n) * this->n*this->gridCols
+ (j % this->n) ];
}
return u;
}
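/****
 * Writes subgrid i back into work_u; a cell is overwritten only if the new
 * value is smaller in absolute value or the cell has not been touched yet.
 */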
template< typename SchemeHost, typename SchemeDevice, typename Device>
void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::insertSubgrid( VectorType u, const int i )
{
for( int j = 0; j < this->n*this->n; j++)
{
int index = (i / this->gridCols)*this->n*this->n*this->gridCols + (i % this->gridCols)*this->n + (j/this->n)*this->n*this->gridCols + (j % this->n);
//OMP LOCK index
if( (fabs(this->work_u[index]) > fabs(u[j])) || (this->unusedCell[index] == 1) )
{
this->work_u[index] = u[j];
this->unusedCell[index] = 0;
}
//OMP UNLOCK index
}
}
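/****
 * Evolves one subgrid by explicit Euler steps. The cells that carry no
 * incoming information (selected by boundaryCondition) are first seeded with
 * Sign(u[0])*max|u|, then the scheme is integrated up to stopTime with the
 * step bounded by the CFL condition and capped at one mesh step hx.
 */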
template< typename SchemeHost, typename SchemeDevice, typename Device>
typename tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::VectorType
tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::runSubgrid( int boundaryCondition, VectorType u, int subGridID)
{
VectorType fu;
fu.setLike(u);
fu.setValue( 0.0 );
bool tmp = false;
for(int i = 0; i < u.getSize(); i++)
{
if(u[0]*u[i] <= 0.0)
{
tmp = true;
break;
}
}
// Centre of subgrid subGridID in the stretched grid; row index comes from
// division by gridCols, matching getOwner() and the CUDA variant below.
int centreGID = (this->n*(subGridID / this->gridCols) + (this->n >> 1))*(this->n*this->gridCols) + this->n*(subGridID % this->gridCols) + (this->n >> 1);
if(this->unusedCell[centreGID] == 0 || boundaryCondition == 0)
tmp = true;
double value = Sign(u[0]) * u.absMax();
if(tmp)
{}
//north - 1, east - 2, west - 4, south - 8
else if(boundaryCondition == FROM_WEST)
{
for(int i = 0; i < this->n; i++)
for(int j = 1;j < this->n; j++)
u[i*this->n + j] = value;// u[i*this->n];
}
else if(boundaryCondition == FROM_EAST)
{
for(int i = 0; i < this->n; i++)
for(int j =0 ;j < this->n -1; j++)
u[i*this->n + j] = value;// u[(i+1)*this->n - 1];
}
else if(boundaryCondition == FROM_NORTH)
{
for(int j = 0; j < this->n; j++)
for(int i = 0;i < this->n - 1; i++)
u[i*this->n + j] = value;// u[j + this->n*(this->n - 1)];
}
else if(boundaryCondition == FROM_SOUTH)
{
for(int j = 0; j < this->n; j++)
for(int i = 1;i < this->n; i++)
u[i*this->n + j] = value;// u[j];
}
double time = 0.0;
double currentTau = this->tau0;
double finalTime = this->stopTime;// + 3.0*(u.max() - u.min());
if( time + currentTau > finalTime ) currentTau = finalTime - time;
double maxResidue( 1.0 );
while( time < finalTime /*|| maxResidue > subMesh.getHx()*/)
{
/****
* Compute the RHS
*/
for( int i = 0; i < fu.getSize(); i ++ )
{
fu[ i ] = schemeHost.getValue( this->subMesh, i, this->subMesh.getCellCoordinates(i), u, time, boundaryCondition );
}
maxResidue = fu. absMax();
if( this->cflCondition * maxResidue != 0.0)
currentTau = this->cflCondition / maxResidue;
if(currentTau > 1.0 * this->subMesh.getHx())
{
currentTau = 1.0 * this->subMesh.getHx();
}
if( time + currentTau > finalTime ) currentTau = finalTime - time;
// Shorten the step if an update would flip the sign of u, i.e. jump across
// the interface.
for( int i = 0; i < fu.getSize(); i ++ )
{
if((u[i]+currentTau * fu[ i ])*u[i] < 0.0 && fu[i] != 0.0 && u[i] != 0.0 )
currentTau = fabs(u[i]/(2.0*fu[i]));
}
for( int i = 0; i < fu.getSize(); i ++ )
u[ i ] += currentTau * fu[ i ];
time += currentTau;
}
VectorType solution;
solution.setLike(u);
for( int i = 0; i < u.getSize(); i ++ )
{
solution[i]=u[i];
}
return solution;
}
#ifdef HAVE_CUDA
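/****
 * Device-side counterpart of getSubgrid(): each thread of the block loads
 * the value of its own cell of subgrid i from the global working array into
 * *a (one block per subgrid, one thread per cell).
 */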
template< typename SchemeHost, typename SchemeDevice, typename Device>
__device__
void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgridCUDA( const int i ,tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
{
int th = (blockIdx.y) * caller->n*caller->n*caller->gridCols
+ (blockIdx.x) * caller->n
+ threadIdx.y * caller->n*caller->gridCols
+ threadIdx.x;
*a = caller->work_u_cuda[th];
}
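/****
 * Device-side merge step: stores *a into the global working array if it
 * improves the cell (smaller |u|) or the cell is still unused, then loads
 * the winning value back into *a.
 */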
template< typename SchemeHost, typename SchemeDevice, typename Device>
__device__
void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::updateSubgridCUDA( const int i ,tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
{
int index = (blockIdx.y) * caller->n*caller->n*caller->gridCols
+ (blockIdx.x) * caller->n
+ threadIdx.y * caller->n*caller->gridCols
+ threadIdx.x;
if( (fabs(caller->work_u_cuda[index]) > fabs(*a)) || (caller->unusedCell_cuda[index] == 1) )
{
caller->work_u_cuda[index] = *a;
caller->unusedCell_cuda[index] = 0;
}
*a = caller->work_u_cuda[index];
}
template< typename SchemeHost, typename SchemeDevice, typename Device>
__device__
void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::insertSubgridCUDA( double u, const int i )
{
int index = (blockIdx.y)*this->n*this->n*this->gridCols
+ (blockIdx.x)*this->n
+ threadIdx.y*this->n*this->gridCols
+ threadIdx.x;
if( (fabs(this->work_u_cuda[index]) > fabs(u)) || (this->unusedCell_cuda[index] == 1) )
{
this->work_u_cuda[index] = u;
this->unusedCell_cuda[index] = 0;
}
}
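/****
 * Device-side counterpart of runSubgrid(). The shared-memory array u holds
 * the subgrid values followed by a scratch segment (sharedTau) used for the
 * parallel min-reduction of the time step.
 */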
template< typename SchemeHost, typename SchemeDevice, typename Device>
__device__
void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::runSubgridCUDA( int boundaryCondition, double* u, int subGridID)
{
__shared__ int tmp;
volatile double* sharedTau = &u[blockDim.x*blockDim.y];
int i = threadIdx.x;
int j = threadIdx.y;
int l = threadIdx.y * blockDim.x + threadIdx.x;
bool computeFU = !((i == 0 && (boundaryCondition & FROM_WEST)) ||
(i == blockDim.x - 1 && (boundaryCondition & FROM_EAST)) ||
(j == 0 && (boundaryCondition & FROM_SOUTH)) ||
(j == blockDim.y - 1 && (boundaryCondition & FROM_NORTH)));
if(l == 0)
{
tmp = 0;
int centreGID = (blockDim.y*blockIdx.y + (blockDim.y>>1))*(blockDim.x*gridDim.x) + blockDim.x*blockIdx.x + (blockDim.x>>1);
if(this->unusedCell_cuda[centreGID] == 0 || boundaryCondition == 0)
tmp = 1;
}
__syncthreads();
if(tmp !=1)
{
if(computeFU)
{
// Seed the cells that carry no incoming information: extend from the
// incoming boundary row/column with signed-distance increments of hx.
if(boundaryCondition == FROM_WEST)
u[l] = u[threadIdx.y * blockDim.x] + Sign(u[0])*this->subMesh.getHx()*(threadIdx.x);
else if(boundaryCondition == FROM_EAST)
u[l] = u[threadIdx.y * blockDim.x + blockDim.x - 1] + Sign(u[0])*this->subMesh.getHx()*(this->n - 1 - threadIdx.x);
else if(boundaryCondition == FROM_SOUTH)
u[l] = u[threadIdx.x] + Sign(u[0])*this->subMesh.getHx()*(threadIdx.y);
else if(boundaryCondition == FROM_NORTH)
u[l] = u[(blockDim.y - 1)* blockDim.x + threadIdx.x] + Sign(u[0])*this->subMesh.getHx()*(this->n - 1 - threadIdx.y);
}
}
double time = 0.0;
__shared__ double currentTau;
double cfl = this->cflCondition;
double fu = 0.0;
if(l == 0)
currentTau = this->tau0;
double finalTime = this->stopTime;
__syncthreads();
if( time + currentTau > finalTime ) currentTau = finalTime - time;
while( time < finalTime )
{
if(computeFU)
fu = schemeHost.getValueDev( this->subMesh, l, tnlStaticVector<2,int>(i,j)/*this->subMesh.getCellCoordinates(l)*/, u, time, boundaryCondition);
sharedTau[l]=cfl/fabs(fu);
if(l == 0)
{
if(sharedTau[0] > 1.0 * this->subMesh.getHx()) sharedTau[0] = 1.0 * this->subMesh.getHx();
}
else if(l == blockDim.x*blockDim.y - 1)
if( time + sharedTau[l] > finalTime ) sharedTau[l] = finalTime - time;
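// Tree reduction over the block: currentTau becomes min(sharedTau[]).
// The first two steps are needed only for 16x16 blocks (256 threads);
// below 32 threads the code relies on warp-synchronous execution.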
if((blockDim.x == 16) && (l < 128)) sharedTau[l] = Min(sharedTau[l],sharedTau[l+128]);
__syncthreads();
if((blockDim.x == 16) && (l < 64)) sharedTau[l] = Min(sharedTau[l],sharedTau[l+64]);
__syncthreads();
if(l < 32) sharedTau[l] = Min(sharedTau[l],sharedTau[l+32]);
if(l < 16) sharedTau[l] = Min(sharedTau[l],sharedTau[l+16]);
if(l < 8) sharedTau[l] = Min(sharedTau[l],sharedTau[l+8]);
if(l < 4) sharedTau[l] = Min(sharedTau[l],sharedTau[l+4]);
if(l < 2) sharedTau[l] = Min(sharedTau[l],sharedTau[l+2]);
if(l < 1) currentTau = Min(sharedTau[l],sharedTau[l+1]);
__syncthreads();
u[l] += currentTau * fu;
time += currentTau;
}
}
template< typename SchemeHost, typename SchemeDevice, typename Device>
__device__
int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getOwnerCUDA(int i) const
{
return ((i / (this->gridCols*this->n*this->n))*this->gridCols
+ (i % (this->gridCols*this->n))/this->n);
}
template< typename SchemeHost, typename SchemeDevice, typename Device>
__device__
int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getSubgridValueCUDA( int i ) const
{
return this->subgridValues_cuda[i];
}
template< typename SchemeHost, typename SchemeDevice, typename Device>
__device__
void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setSubgridValueCUDA(int i, int value)
{
this->subgridValues_cuda[i] = value;
}
template< typename SchemeHost, typename SchemeDevice, typename Device>
__device__
int tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::getBoundaryConditionCUDA( int i ) const
{
return this->boundaryConditions_cuda[i];
}
template< typename SchemeHost, typename SchemeDevice, typename Device>
__device__
void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::setBoundaryConditionCUDA(int i, int value)
{
this->boundaryConditions_cuda[i] = value;
}
//north - 1, east - 2, west - 4, south - 8
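/****
 * Device-side counterpart of synchronize(): each boundary thread compares
 * its cell with the neighbouring subgrid across the block edge and copies
 * the value with smaller |u| (tolerance delta). Thread 0 of the block then
 * folds the incoming directions into the subgrid's boundary-condition
 * bitmask.
 */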
template <typename SchemeHost, typename SchemeDevice, typename Device>
__global__
void synchronizeCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver)
{
__shared__ int boundary[4]; // north,east,west,south
__shared__ int subgridValue;
__shared__ int newSubgridValue;
int gid = (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x + blockDim.x*blockIdx.x + threadIdx.x;
double u = cudaSolver->work_u_cuda[gid];
double u_cmp;
int subgridValue_cmp=INT_MAX;
int boundary_index=0;
if(threadIdx.x+threadIdx.y == 0)
{
subgridValue = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x);
boundary[0] = 0;
boundary[1] = 0;
boundary[2] = 0;
boundary[3] = 0;
newSubgridValue = 0;
}
__syncthreads();
if( (threadIdx.x == 0 /* && !(cudaSolver->currentStep & 1)*/) ||
(threadIdx.y == 0 /* && (cudaSolver->currentStep & 1)*/) ||
(threadIdx.x == blockDim.x - 1 /* && !(cudaSolver->currentStep & 1)*/) ||
(threadIdx.y == blockDim.y - 1 /* && (cudaSolver->currentStep & 1)*/) )
{
if(threadIdx.x == 0 && (blockIdx.x != 0)/* && !(cudaSolver->currentStep & 1)*/)
{
u_cmp = cudaSolver->work_u_cuda[gid - 1];
subgridValue_cmp = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x - 1);
boundary_index = 2;
}
if(threadIdx.x == blockDim.x - 1 && (blockIdx.x != gridDim.x - 1)/* && !(cudaSolver->currentStep & 1)*/)
{
u_cmp = cudaSolver->work_u_cuda[gid + 1];
subgridValue_cmp = cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x + 1);
boundary_index = 1;
}
__threadfence();
if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
{
cudaSolver->unusedCell_cuda[gid] = 0;
atomicMax(&newSubgridValue, INT_MAX);
atomicMax(&boundary[boundary_index], 1);
cudaSolver->work_u_cuda[gid] = u_cmp;
u=u_cmp;
}
if(threadIdx.y == 0 && (blockIdx.y != 0)/* && (cudaSolver->currentStep & 1)*/)
{
u_cmp = cudaSolver->work_u_cuda[gid - blockDim.x*gridDim.x];
subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y - 1)*gridDim.x + blockIdx.x);
boundary_index = 3;
}
if(threadIdx.y == blockDim.y - 1 && (blockIdx.y != gridDim.y - 1)/* && (cudaSolver->currentStep & 1)*/)
{
u_cmp = cudaSolver->work_u_cuda[gid + blockDim.x*gridDim.x];
subgridValue_cmp = cudaSolver->getSubgridValueCUDA((blockIdx.y + 1)*gridDim.x + blockIdx.x);
boundary_index = 0;
}
__threadfence();
if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
{
cudaSolver->unusedCell_cuda[gid] = 0;
atomicMax(&newSubgridValue, INT_MAX);
atomicMax(&boundary[boundary_index], 1);
cudaSolver->work_u_cuda[gid] = u_cmp;
}
}
__syncthreads();
if(threadIdx.x+threadIdx.y == 0)
{
if(subgridValue == INT_MAX && newSubgridValue !=0)
cudaSolver->setSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x, -INT_MAX);
cudaSolver->setBoundaryConditionCUDA(blockIdx.y*gridDim.x + blockIdx.x, FROM_NORTH * boundary[0] +
FROM_EAST * boundary[1] +
FROM_WEST * boundary[2] +
FROM_SOUTH * boundary[3]);
}
}
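/****
 * One thread per subgrid: advances the step counter, re-activates subgrids
 * that received new values (-INT_MAX becomes currentStep+4) and raises the
 * run flag while any subgrid still has a pending boundary condition.
 */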
template <typename SchemeHost, typename SchemeDevice, typename Device>
__global__
void synchronize2CUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver)
{
if(blockIdx.x+blockIdx.y ==0)
{
cudaSolver->currentStep = cudaSolver->currentStep + 1;
*(cudaSolver->runcuda) = 0;
}
int stepValue = cudaSolver->currentStep + 4;
if( cudaSolver->getSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x) == -INT_MAX )
cudaSolver->setSubgridValueCUDA(blockIdx.y*gridDim.x + blockIdx.x, stepValue);
atomicMax((cudaSolver->runcuda),cudaSolver->getBoundaryConditionCUDA(blockIdx.y*gridDim.x + blockIdx.x));
}
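/****
 * Runs once with <<<1,1>>>: wires the device-side pointers (work_u_cuda,
 * unusedCell_cuda, the run flag) to host-allocated buffers and allocates the
 * per-subgrid value and boundary-condition arrays from the device heap.
 */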
template< typename SchemeHost, typename SchemeDevice, typename Device>
__global__
void initCUDA( tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* cudaSolver, double* ptr , int* ptr2, int* ptr3)
{
cudaSolver->work_u_cuda = ptr;
cudaSolver->unusedCell_cuda = ptr3;
cudaSolver->subgridValues_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int));
cudaSolver->boundaryConditions_cuda =(int*)malloc(cudaSolver->gridCols*cudaSolver->gridRows*sizeof(int));
cudaSolver->runcuda = ptr2;
*(cudaSolver->runcuda) = 1;
cudaSolver->currentStep = 1;
printf("GPU memory allocated.\n");
for(int i = 0; i < cudaSolver->gridCols*cudaSolver->gridRows; i++)
{
cudaSolver->subgridValues_cuda[i] = INT_MAX;
cudaSolver->boundaryConditions_cuda[i] = 0;
}
printf("GPU memory initialized.\n");
}
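/****
 * Device-side counterpart of the host initialization loop: each block loads
 * its subgrid into shared memory, tests for a sign change against u[0]
 * (i.e. the zero level set passes through the subgrid) and, if found, runs
 * the local solver and marks the subgrid with the value 4.
 */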
template< typename SchemeHost, typename SchemeDevice, typename Device >
__global__
void initRunCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller)
{
extern __shared__ double u[];
int i = blockIdx.y * gridDim.x + blockIdx.x;
int l = threadIdx.y * blockDim.x + threadIdx.x;
__shared__ int containsCurve;
if(l == 0)
containsCurve = 0;
caller->getSubgridCUDA(i,caller, &u[l]);
__syncthreads();
if(u[0] * u[l] <= 0.0)
{
atomicMax( &containsCurve, 1);
}
__syncthreads();
if(containsCurve == 1)
{
caller->runSubgridCUDA(0,u,i);
__syncthreads();
caller->insertSubgridCUDA(u[l],i);
__syncthreads();
if(l == 0)
caller->setSubgridValueCUDA(i, 4);
}
}
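/****
 * One sweep of the main device loop: a freshly activated subgrid (value ==
 * currentStep+4) is re-solved once for every single incoming direction in
 * its bitmask; afterwards the diagonal combinations are applied, the bitmask
 * is cleared and the subgrid value is decremented.
 */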
template< typename SchemeHost, typename SchemeDevice, typename Device >
__global__
void runCUDA(tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int >* caller)
{
extern __shared__ double u[];
int i = blockIdx.y * gridDim.x + blockIdx.x;
int l = threadIdx.y * blockDim.x + threadIdx.x;
int bound = caller->getBoundaryConditionCUDA(i);
if(caller->getSubgridValueCUDA(i) != INT_MAX && bound != 0 && caller->getSubgridValueCUDA(i) > 0)
{
caller->getSubgridCUDA(i,caller, &u[l]);
if(caller->getSubgridValueCUDA(i) == caller->currentStep+4)
{
if(bound & FROM_NORTH)
{
caller->runSubgridCUDA(FROM_NORTH,u,i);
__syncthreads();
caller->updateSubgridCUDA(i,caller, &u[l]);
__syncthreads();
}
if(bound & FROM_EAST)
{
caller->runSubgridCUDA(FROM_EAST,u,i);
__syncthreads();
caller->updateSubgridCUDA(i,caller, &u[l]);
__syncthreads();
}
if(bound & FROM_WEST)
{
caller->runSubgridCUDA(FROM_WEST,u,i);
__syncthreads();
caller->updateSubgridCUDA(i,caller, &u[l]);
__syncthreads();
}
if(bound & FROM_SOUTH)
{
caller->runSubgridCUDA(FROM_SOUTH,u,i);
__syncthreads();
caller->updateSubgridCUDA(i,caller, &u[l]);
__syncthreads();
}
}
if( (bound & FROM_EAST) || (bound & FROM_NORTH) )
{
caller->runSubgridCUDA(FROM_NORTH + FROM_EAST,u,i);
__syncthreads();
caller->updateSubgridCUDA(i,caller, &u[l]);
__syncthreads();
}
if( (bound & FROM_WEST) || (bound & FROM_NORTH) )
{
caller->runSubgridCUDA(FROM_NORTH + FROM_WEST,u,i);
__syncthreads();
caller->updateSubgridCUDA(i,caller, &u[l]);
__syncthreads();
}
if( (bound & FROM_EAST) || (bound & FROM_SOUTH) )
{
caller->runSubgridCUDA(FROM_SOUTH + FROM_EAST,u,i);
__syncthreads();
caller->updateSubgridCUDA(i,caller, &u[l]);
__syncthreads();
}
if( (bound & FROM_WEST) || (bound & FROM_SOUTH) )
{
caller->runSubgridCUDA(FROM_SOUTH + FROM_WEST,u,i);
__syncthreads();
caller->updateSubgridCUDA(i,caller, &u[l]);
__syncthreads();
}
if(l==0)
{
caller->setBoundaryConditionCUDA(i, 0);
caller->setSubgridValueCUDA(i, caller->getSubgridValueCUDA(i) - 1 );
}
}
}
#endif /*HAVE_CUDA*/
#endif /* TNLPARALLELEIKONALSOLVER_IMPL_H_ */