diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h
index d11941e251e2b282fc2090f723f4947301f81e3f..7dfc31f6923fe059b6b905fc3286dcf444ae29d5 100644
--- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h
+++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h
@@ -186,7 +186,7 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in
 		dim3 numBlocks(this->gridCols,this->gridRows);
 		cudaDeviceSynchronize();
 		checkCudaDevice;
-		initRunCUDA<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
+		initRunCUDA<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,2*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
 		cudaDeviceSynchronize();
 //		cout << "post 1 kernel" << endl;
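
The launch now requests 2*n*n doubles of dynamic shared memory instead of 3*n*n, matching the rewritten kernel, which keeps only the subgrid values u and the sharedTau buffer carved out of the same allocation (the separate sharedRes buffer is gone). A minimal sketch of that partitioning, assuming a block of n*n threads; the kernel and variable names are illustrative, not the solver's actual symbols.

```cuda
// Sketch: one dynamic shared-memory allocation split into two n*n arrays.
// Launched with <<<grid, block, 2 * n * n * sizeof(double)>>>.
__global__ void partitionedSharedSketch(int n)
{
    extern __shared__ double buffer[];               // 2 * n * n doubles
    double* u         = buffer;                      // first n*n: subgrid values
    double* sharedTau = &buffer[n * n];              // next n*n: per-thread time steps

    int l = threadIdx.y * blockDim.x + threadIdx.x;  // linear thread index, l < n*n
    u[l]         = 0.0;
    sharedTau[l] = 1.0e10;
}
```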
 
@@ -345,11 +345,11 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru
 			//cout << "a" << endl;
 			cudaDeviceSynchronize();
 			checkCudaDevice;
-			//start = std::clock();
-			runCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
-
+			start = std::clock();
+			runCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,2*this->n*this->n*sizeof(double)>>>(this->cudaSolver);
 			//cout << "a" << endl;
 			cudaDeviceSynchronize();
+			time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);
 
 			//start = std::clock();
 			synchronizeCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
@@ -368,7 +368,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru
 			cudaMemcpy(&run_host, (this->runcuda),sizeof(int), cudaMemcpyDeviceToHost);
 			//cout << "in kernel loop" << run_host << endl;
 		}
-		//cout << "Solving time was: " << time_diff << endl;
+		cout << "Solving time was: " << time_diff << endl;
 		//cout << "b" << endl;
 
 		//double* tmpu;
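
The patch re-enables the timing around the runCUDA launch: std::clock() is read before the launch and the difference is accumulated into time_diff after cudaDeviceSynchronize(). Note that std::clock() measures host processor time, so the synchronize call is what makes the number meaningful. A hedged sketch of the same pattern next to the cudaEvent-based alternative; someKernel and timeSolve are placeholders, not solver functions.

```cuda
#include <cuda_runtime.h>
#include <ctime>
#include <cstdio>

__global__ void someKernel() {}   // placeholder for runCUDA<...>

void timeSolve()
{
    double time_diff = 0.0;

    // Host-clock variant (what the patch does): valid only because
    // cudaDeviceSynchronize() blocks until the kernel has finished.
    std::clock_t start = std::clock();
    someKernel<<<1, 1>>>();
    cudaDeviceSynchronize();
    time_diff += (std::clock() - start) / (double)CLOCKS_PER_SEC;

    // Event variant: timed on the device, unaffected by host scheduling.
    cudaEvent_t beg, end;
    cudaEventCreate(&beg);
    cudaEventCreate(&end);
    cudaEventRecord(beg);
    someKernel<<<1, 1>>>();
    cudaEventRecord(end);
    cudaEventSynchronize(end);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, beg, end);
    cudaEventDestroy(beg);
    cudaEventDestroy(end);

    std::printf("Solving time was: %f s (events: %f ms)\n", time_diff, ms);
}
```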
@@ -1080,30 +1080,30 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru
 {
 
 	__shared__ int tmp;
-	double* sharedTau = &u[blockDim.x*blockDim.y];
-	double* sharedRes = &sharedTau[blockDim.x*blockDim.y];
+	//double tmpRes = 0.0;
+	volatile double* sharedTau = &u[blockDim.x*blockDim.y];
 	int i = threadIdx.x;
 	int j = threadIdx.y;
 	int l = threadIdx.y * blockDim.x + threadIdx.x;
+	bool computeFU = !((i == 0 && (boundaryCondition & 4)) or
+			 (i == blockDim.x - 1 && (boundaryCondition & 2)) or
+			 (j == 0 && (boundaryCondition & 8)) or
+			 (j == blockDim.y - 1  && (boundaryCondition & 1)));
 
 	if(l == 0)
 	{
 		tmp = 0;
-		/*for(int o = 0; o < this->n * this->n; o++)
-			printf("%d : %f\n", o, u[o]);*/
+		if(this->getSubgridValueCUDA(subGridID) != this->currentStep + 4)
+			tmp = 1;
 	}
 
 	__syncthreads();
 
 	if(u[0]*u[l] <= 0.0)
 		atomicMax( &tmp, 1);
-	//__syncthreads();
-	//printf("tmp = %d", tmp);
-
-
-
 
 	__shared__ double value;
+
 	if(l == 0)
 		value = 0.0;
 	__syncthreads();
@@ -1111,166 +1111,67 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru
 	{
 		if(l == o)
 			value=Max(value,fabs(u[l]));
 		__syncthreads();
 	}
-	//__syncthreads();
-	//atomicMax(&value,fabs(u[l]))
 
 	if(l == 0)
 		value *= Sign(u[0]);
 
-
-
 	__syncthreads();
-	if(tmp == 1)
-	{}
-	//north - 1, east - 2, west - 4, south - 8
-	else if(boundaryCondition == 4)
-	{
-		if(j > 0)
-				u[i*this->n + j] = value;
-	}
-	else if(boundaryCondition == 2)
-	{
-		if(j < this->n - 1)
-			u[i*this->n + j] = value;
-	}
-	else if(boundaryCondition == 1)
-	{
-		if(i < this->n - 1)
-			u[i*this->n + j] = value;
-	}
-	else if(boundaryCondition == 8)
-	{
-		if(i > 0)
-			u[i*this->n + j] = value;
-	}
-	//__syncthreads();
+	if(tmp != 1 && computeFU)
+		u[l] = value;
 
    double time = 0.0;
    __shared__ double currentTau;
-   __shared__ double maxResidue;
+   double cfl = this->cflCondition;
    double fu = 0.0;
    if(threadIdx.x * threadIdx.y == 0)
    {
 	   currentTau = this->tau0;
-	   maxResidue = 0.0;//10.0 * this->subMesh.getHx();
    }
    double finalTime = this->stopTime;
+   __syncthreads();
    if( time + currentTau > finalTime ) currentTau = finalTime - time;
 
-   __syncthreads();
 
-	//printf("%d : %f\n", l, u[l]);
    while( time < finalTime )
    {
 
-      /****
-       * Compute the RHS
-       */
-	   //__syncthreads();
-
+	  if(computeFU)
+		  fu = schemeHost.getValueDev( this->subMesh, l, tnlStaticVector<2,int>(i,j)/*this->subMesh.getCellCoordinates(l)*/, u, time, boundaryCondition);
 
-	   fu = schemeHost.getValueDev( this->subMesh, l, this->subMesh.getCellCoordinates(l), u, time, boundaryCondition);
-	   //__syncthreads();
-      //printf("%d : %f\n", l, fu);
-
-
-      //atomicMax(&maxResidue,fabs(fu));//maxResidue = fu. absMax();
-     // sharedRes[l]=fabs(fu);
-  /*  if(l == 0)
-    	maxResidue = 0.0;
-    __syncthreads();
-    																												//start reduction
-  	for(int o = 0; o < blockDim.x*blockDim.y; o++)
-  	{
-  		if(l == o)
-  			maxResidue=Max(maxResidue,sharedRes[l]);
-  		__syncthreads();
-  	}*/
-  	//__syncthreads();
-  																													//end reduction
-      sharedTau[l]=this -> cflCondition/fabs(fu);//sharedRes[l];
-      if((u[l]+sharedTau[l] * fu)*u[l] < 0.0)
-    	  sharedTau[l] = fabs(u[l]/(2.0*fu));
+	  sharedTau[l]=cfl/fabs(fu);
 
       if(l == 0)
-    	  if(sharedTau[l] > 1.0 * this->subMesh.getHx())
-    		  sharedTau[l] = 1.0 * this->subMesh.getHx();
+    	  if(sharedTau[0] > 1.0 * this->subMesh.getHx())	sharedTau[0] = 1.0 * this->subMesh.getHx();
 
-      if(l == 1)
-    	  if( time + sharedTau[l] > finalTime ) sharedTau[l] = finalTime - time;
+      if(l == blockDim.x*blockDim.y - 1)
+    	  if( time + sharedTau[l] > finalTime )		sharedTau[l] = finalTime - time;
       __syncthreads();
 
-   //   __syncthreads();
-  	/*for(unsigned int s = blockDim.x*blockDim.y/2; s>0; s>>=1)
-  	{
-  		if( l < s )
-  			sharedTau[l] = Min(sharedTau[l],sharedTau[l+s]) ;
-  		__syncthreads();
-  	}
-  	if(l==0) currentTau=sharedTau[l];
-  	__syncthreads();*/
-
-
-	for(unsigned int s = blockDim.x*blockDim.y/2; s>0; s>>=1)
-	{
-		//if( l < s )
-		//	sharedRes[l] = Max(sharedRes[l],sharedRes[l+s]);
-		if(l >= blockDim.x*blockDim.y - s)
-			sharedTau[l] = Min(sharedTau[l],sharedTau[l-s]);
-		__syncthreads();
-	}
-	if(l==0)
-	{
-		//maxResidue=sharedRes[l];
-		currentTau=sharedTau[blockDim.x*blockDim.y - 1];
-		/*if( this -> cflCondition * maxResidue != 0.0)
-			currentTau = Min(this -> cflCondition / maxResidue, currentTau);*/
-	}
-	__syncthreads();
-
 
-
-      //__syncthreads();
- //
-
-      //double tau2 = finalTime;
-
-
-
-      	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  //start reduction
-
-      //atomicMin(&currentTau, tau2);
-     /* if(l==0) currentTau=sharedTau[l];
+      if((blockDim.x == 16) && (l < 128))		sharedTau[l] = Min(sharedTau[l],sharedTau[l+128]);
       __syncthreads();
-    	for(int o = 0; o < blockDim.x*blockDim.y; o++)
-    	{
-    		if(l == o)
-    			currentTau=Min(currentTau,sharedTau[l]);
-    		__syncthreads();
-    	}*/
-    																											//end reduction
-
-
-
-
-
-
-//
-
-
+      if((blockDim.x == 16) && (l < 64))		sharedTau[l] = Min(sharedTau[l],sharedTau[l+64]);
+      __syncthreads();
+      if(l < 32)    							sharedTau[l] = Min(sharedTau[l],sharedTau[l+32]);
       //__syncthreads();
-      u[l] += currentTau * fu;
-      //if(l==0)
-    	 // printf("ct %f\n",currentTau);
-
-
+      if(l < 16)								sharedTau[l] = Min(sharedTau[l],sharedTau[l+16]);
+      //__syncthreads();
+      if(l < 8)									sharedTau[l] = Min(sharedTau[l],sharedTau[l+8]);
+     // __syncthreads();
+      if(l < 4)									sharedTau[l] = Min(sharedTau[l],sharedTau[l+4]);
+      //__syncthreads();
+      if(l < 2)									sharedTau[l] = Min(sharedTau[l],sharedTau[l+2]);
+      //__syncthreads();
+      if(l < 1)									currentTau   = Min(sharedTau[l],sharedTau[l+1]);
+      __syncthreads();
 
+      u[l] += currentTau * fu;
       time += currentTau;
       __syncthreads();
    }
-	//printf("%d : %f\n", l, u[l]);
+
 
 }
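
The rewritten time loop replaces the generic strided reduction over sharedTau with a fully unrolled minimum reduction for the CFL-limited time step: two synchronized steps bring 256 entries down to 64, and the remaining steps run without __syncthreads(), relying on warp-synchronous execution of a single warp together with the volatile declaration of sharedTau. A standalone sketch of that pattern; the kernel name and the fixed 16x16 block are assumptions, and on Volta or newer GPUs each sync-free step would additionally need __syncwarp().

```cuda
// Sketch of the unrolled shared-memory min reduction used for the CFL time step.
// Assumes a 16x16 block (256 threads); names are illustrative only.
__global__ void minReduceTau(const double* tauIn, double* tauOut)
{
    __shared__ volatile double sharedTau[256];   // volatile: required by the sync-free tail
    int l = threadIdx.y * blockDim.x + threadIdx.x;
    sharedTau[l] = tauIn[blockIdx.x * 256 + l];
    __syncthreads();

    if (l < 128) sharedTau[l] = fmin(sharedTau[l], sharedTau[l + 128]);
    __syncthreads();
    if (l <  64) sharedTau[l] = fmin(sharedTau[l], sharedTau[l +  64]);
    __syncthreads();

    // From here at most one warp is active; the original code omits barriers
    // (fine on pre-Volta hardware, needs __syncwarp() on newer GPUs).
    if (l < 32) sharedTau[l] = fmin(sharedTau[l], sharedTau[l + 32]);
    if (l < 16) sharedTau[l] = fmin(sharedTau[l], sharedTau[l + 16]);
    if (l <  8) sharedTau[l] = fmin(sharedTau[l], sharedTau[l +  8]);
    if (l <  4) sharedTau[l] = fmin(sharedTau[l], sharedTau[l +  4]);
    if (l <  2) sharedTau[l] = fmin(sharedTau[l], sharedTau[l +  2]);
    if (l <  1) tauOut[blockIdx.x] = fmin(sharedTau[0], sharedTau[1]);
}
```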
 
@@ -1697,10 +1598,11 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 	if(caller->getSubgridValueCUDA(i) != INT_MAX)
 	{
 		caller->getSubgridCUDA(i,caller, &u[l]);
-		u[2*blockDim.x*blockDim.y +l] = u[l];
 		int bound = caller->getBoundaryConditionCUDA(i);
 		//if(l == 0)
 			//printf("i = %d, bound = %d\n",i,caller->getSubgridValueCUDA(i));
+		//if(caller->getSubgridValueCUDA(i) == caller->currentStep+4)
+		//{
 		if(bound & 1)
 		{
 			caller->runSubgridCUDA(1,u,i);
@@ -1711,7 +1613,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 			caller->updateSubgridCUDA(i,caller, &u[l]);
 			__syncthreads();
 		}
 		if(bound & 2)
 		{
 			caller->runSubgridCUDA(2,u,i);
 			__syncthreads();
@@ -1741,10 +1643,11 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 			caller->updateSubgridCUDA(i,caller, &u[l]);
 			__syncthreads();
 		}
+		//}
 
 
 
-		if( ((bound & 2) ))
+		if( ((bound & 2) || (bound & 1) ))
 		{
 			caller->runSubgridCUDA(3,u,i);
 			__syncthreads();
@@ -1754,7 +1657,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 			caller->updateSubgridCUDA(i,caller, &u[l]);
 			__syncthreads();
 		}
-		if( ((bound & 4) ))
+		if( ((bound & 4) || (bound & 1) ))
 		{
 			caller->runSubgridCUDA(5,u,i);
 			__syncthreads();
@@ -1764,7 +1667,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 			caller->updateSubgridCUDA(i,caller, &u[l]);
 			__syncthreads();
 		}
-		if( ((bound & 2) ))
+		if( ((bound & 2) || (bound & 8) ))
 		{
 			caller->runSubgridCUDA(10,u,i);
 			__syncthreads();
@@ -1774,7 +1677,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
 			caller->updateSubgridCUDA(i,caller, &u[l]);
 			__syncthreads();
 		}
-		if(   (bound & 4) )
+		if(   (bound & 4) || (bound & 8) )
 		{
 			caller->runSubgridCUDA(12,u,i);
 			__syncthreads();
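
These hunks widen the sweep dispatch in the caller: the diagonal sweeps (codes 3, 5, 10, 12 passed to runSubgridCUDA) now also fire when either of the two adjacent edge bits is set, not only one of them. The bit convention is the one documented in runSubgridCUDA (north = 1, east = 2, west = 4, south = 8), so each diagonal code is simply the OR of two edge bits. A small illustration of that convention; the enum and constant names are mine, not the solver's API.

```cpp
// Boundary-condition bitmask as documented in runSubgridCUDA:
// north = 1, east = 2, west = 4, south = 8. Diagonal sweep codes OR two edges.
enum BoundaryBit { North = 1, East = 2, West = 4, South = 8 };

const int NorthEast = North | East;   //  3, run when (bound & East) || (bound & North)
const int NorthWest = North | West;   //  5, run when (bound & West) || (bound & North)
const int SouthEast = South | East;   // 10, run when (bound & East) || (bound & South)
const int SouthWest = South | West;   // 12, run when (bound & West) || (bound & South)
```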
diff --git a/src/operators/godunov-eikonal/parallelGodunovEikonal.h b/src/operators/godunov-eikonal/parallelGodunovEikonal.h
index 8233c9278ab1c8836c8198f1ecdec6382696c3fd..aa3e34e6f48b4246a854d2e796c77b5e4ad75632 100644
--- a/src/operators/godunov-eikonal/parallelGodunovEikonal.h
+++ b/src/operators/godunov-eikonal/parallelGodunovEikonal.h
@@ -171,8 +171,8 @@ protected:
 
     DofVectorType dofVector;
 
-    RealType hx;
-    RealType hy;
+    RealType hx, ihx;
+    RealType hy, ihy;
 
     RealType epsilon;
 
diff --git a/src/operators/godunov-eikonal/parallelGodunovEikonal2D_impl.h b/src/operators/godunov-eikonal/parallelGodunovEikonal2D_impl.h
index a5b23b9f1bbe1b8ebf3cf422b83b9c183bd5f45f..628d65d68867fe11bb97d0bec29c27f9475060e3 100644
--- a/src/operators/godunov-eikonal/parallelGodunovEikonal2D_impl.h
+++ b/src/operators/godunov-eikonal/parallelGodunovEikonal2D_impl.h
@@ -45,7 +45,7 @@ template< typename MeshReal,
 Real  parallelGodunovEikonalScheme< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: negativePart(const Real arg) const
 {
 	if(arg < 0.0)
-		return arg;
+		return -arg;
 	return 0.0;
 }
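
negativePart previously returned arg itself for negative arguments; it now returns -arg, so both helpers yield non-negative magnitudes and satisfy a = a+ - a-. The later `xb - xf` comparisons and the sum of squared parts depend on this. A two-line sketch of the intended pair, assuming positivePart is the usual max(arg, 0):

```cpp
#include <algorithm>

// a+ = max(a, 0), a- = max(-a, 0); both non-negative, and a = a+ - a-.
inline double positivePart(double a) { return std::max(a, 0.0); }
inline double negativePart(double a) { return std::max(-a, 0.0); }
// e.g. a = -3.0  ->  positivePart(a) = 0.0, negativePart(a) = 3.0
```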
 
@@ -62,13 +62,13 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2,MeshReal, Device, MeshIndex >, Rea
 {
 	if(x > eps)
 		return 1.0;
-	else if (x < -eps)
+	if (x < -eps)
 		return (-1.0);
 
 	if ( eps == 0.0)
 		return 0.0;
 
-	return sin((M_PI*x)/(2.0*eps));
+	return sin(/*(M_PI*x)/(2.0*eps)	*/(M_PI/2.0)*(x/eps));
 }
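
The smoothed sign is unchanged mathematically: sin((pi*x)/(2*eps)) is rewritten as sin((pi/2)*(x/eps)), dividing by eps before multiplying by pi, and the redundant else is dropped. A self-contained sketch of the same mollified sign, keeping the original's eps == 0 convention:

```cpp
#include <cmath>

// Mollified sign: +/-1 outside the band |x| <= eps, a sine ramp inside it.
inline double smoothedSign(double x, double eps)
{
    if (x >  eps) return  1.0;
    if (x < -eps) return -1.0;
    if (eps == 0.0) return 0.0;                 // exact sign convention at eps = 0
    return std::sin((M_PI / 2.0) * (x / eps));  // same value as sin(pi*x / (2*eps))
}
```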
 
 
@@ -98,7 +98,9 @@ bool parallelGodunovEikonalScheme< tnlGrid< 2,MeshReal, Device, MeshIndex >, Rea
 
 
 	   hx = originalMesh.getHx();
+	   ihx = 1.0/hx;
 	   hy = originalMesh.getHy();
+	   ihy = 1.0/hy;
 
 	   epsilon = parameters. getParameter< double >( "epsilon" );
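
init() now also stores the reciprocals ihx = 1/hx and ihy = 1/hy (declared in the header above), so the per-cell scheme can multiply by a precomputed factor instead of dividing, which is typically cheaper on both host and device; the commented-out `xb *= ihx` lines and the final `* ihx` in getValueDev are where they are meant to be used. A minimal sketch of the idea outside the class, with illustrative names:

```cpp
// Precompute reciprocals of the mesh steps once, reuse them in hot loops.
struct GridSpacing
{
    double hx, hy;    // mesh steps
    double ihx, ihy;  // 1/hx, 1/hy

    void init(double hx_, double hy_)
    {
        hx = hx_;  ihx = 1.0 / hx_;
        hy = hy_;  ihy = 1.0 / hy_;
    }
};

// usage: backward difference  xb = (uC - uW) * spacing.ihx;   // instead of / spacing.hx
```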
 
@@ -207,12 +209,12 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
 		   else
 			   yb = positivePart((u[cellIndex] - u[mesh.template getCellNextToCell<0,-1>( cellIndex )])/hy);
 
-		   if(xb + xf > 0.0)
+		   if(xb - xf > 0.0)
 			   xf = 0.0;
 		   else
 			   xb = 0.0;
 
-		   if(yb + yf > 0.0)
+		   if(yb - yf > 0.0)
 			   yf = 0.0;
 		   else
 			   yb = 0.0;
@@ -262,15 +264,15 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
 			   yb = negativePart((u[cellIndex] - u[mesh.template getCellNextToCell<0,-1>( cellIndex )])/hy);
 
 
-		   if(xb + xf > 0.0)
-			   xb = 0.0;
-		   else
+		   if(xb - xf > 0.0)
 			   xf = 0.0;
-
-		   if(yb + yf > 0.0)
-			   yb = 0.0;
 		   else
+			   xb = 0.0;
+
+		   if(yb - yf > 0.0)
 			   yf = 0.0;
+		   else
+			   yb = 0.0;
 
 		   nabla = sqrt (xf*xf + xb*xb + yf*yf + yb*yb );
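
Because positivePart/negativePart now both return non-negative magnitudes, the old selection test `xb + xf > 0.0` would hold almost always and effectively discard xf; comparing the difference `xb - xf` keeps whichever one-sided slope magnitude is larger and zeroes the other, which is the Godunov upwind choice per axis. A one-line sketch of the corrected selection, with illustrative names:

```cpp
// Godunov upwind choice per axis: keep the larger of the two non-negative
// one-sided slope magnitudes (already passed through positive/negative part).
inline double upwindSlope(double xb, double xf)
{
    return (xb - xf > 0.0) ? xb : xf;   // equivalent to max(xb, xf)
}
```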
 
@@ -310,7 +312,7 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
           	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	 const IndexType boundaryCondition) const
 {
 
-
+/*
 	if ( ((coordinates.x() == 0 && (boundaryCondition & 4)) or
 		 (coordinates.x() == mesh.getDimensions().x() - 1 && (boundaryCondition & 2)) or
 		 (coordinates.y() == 0 && (boundaryCondition & 8)) or
@@ -320,10 +322,10 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
 		return 0.0;
 	}
 
-
+*/
 	//RealType acc = hx*hy*hx*hy;
 
-	RealType nabla,  signui;
+	RealType signui;
 	signui = sign(u[cellIndex],epsilon);
 
 #ifdef HAVE_CUDA
@@ -334,6 +336,7 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
 	RealType xf = -u[cellIndex];
 	RealType yb = u[cellIndex];
 	RealType yf = -u[cellIndex];
+	RealType a,b;
 
 
 	   if(coordinates.x() == mesh.getDimensions().x() - 1)
@@ -357,10 +360,10 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
 		   yb -= u[mesh.template getCellNextToCell<0,-1>( cellIndex )];
 
 
-	   xb /= hx;
-	   xf /= hx;
-	   yb /= hy;
-	   yf /= hy;
+	   //xb *= ihx;
+	   //xf *= ihx;
+	   //yb *= ihy;
+	   //yf *= ihy;
 
 	   if(signui > 0.0)
 	   {
@@ -372,16 +375,6 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
 
 		   yb = positivePart(yb);
 
-		   if(xb + xf > 0.0)
-			   xf = 0.0;
-		   else
-			   xb = 0.0;
-
-		   if(yb + yf > 0.0)
-			   yf = 0.0;
-		   else
-			   yb = 0.0;
-
 	   }
 	   else if(signui < 0.0)
 	   {
@@ -393,27 +386,20 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
 		   yb = negativePart(yb);
 
 		   yf = positivePart(yf);
-
-		   if(xb + xf > 0.0)
-			   xb = 0.0;
-		   else
-			   xf = 0.0;
-
-		   if(yb + yf > 0.0)
-			   yb = 0.0;
-		   else
-			   yf = 0.0;
-
 	   }
 
 
+	   if(xb - xf > 0.0)
+		   a = xb;
+	   else
+		   a = xf;
 
-	   nabla = sqrt (xf*xf + xb*xb + yf*yf + yb*yb );
-	   return signui*(1.0 - nabla);
-
-
-
+	   if(yb - yf > 0.0)
+		   b = yb;
+	   else
+		   b = yf;
 
+	   return signui*(1.0 - sqrt(a*a+b*b)*ihx);
 }
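
With the per-axis upwind slopes a and b selected, getValueDev now returns sign(u)*(1 - |grad u|) with |grad u| approximated by sqrt(a*a + b*b)*ihx; since the raw differences are no longer divided by the step (those lines are commented out) and only ihx is applied at the end, this appears to assume a square mesh (hx == hy). A compact sketch of the same right-hand side under that assumption; the names are illustrative, eps > 0 is assumed, and uW/uE/uS/uN stand for the four neighbour values.

```cpp
#include <algorithm>
#include <cmath>

// Right-hand side of the eikonal time-marching scheme as assembled in getValueDev:
//   F(u) = sign(u) * (1 - |grad u|),   |grad u| ~ sqrt(a*a + b*b) / h,  h = hx = hy.
inline double eikonalRhs(double uC, double uW, double uE, double uS, double uN,
                         double h, double eps)
{
    // Mollified sign of the centre value (eps > 0 assumed here).
    double signU = (uC > eps) ? 1.0
                 : (uC < -eps) ? -1.0
                 : std::sin((M_PI / 2.0) * (uC / eps));

    // One-sided differences, not yet divided by h (mirroring the patch).
    double xb = uC - uW, xf = uE - uC;
    double yb = uC - uS, yf = uN - uC;

    // Upwind slope magnitudes: which parts are kept depends on the sign of u.
    double a, b;
    if (signU > 0.0) {
        a = std::max(std::max(xb, 0.0), std::max(-xf, 0.0));
        b = std::max(std::max(yb, 0.0), std::max(-yf, 0.0));
    } else {
        a = std::max(std::max(-xb, 0.0), std::max(xf, 0.0));
        b = std::max(std::max(-yb, 0.0), std::max(yf, 0.0));
    }

    return signU * (1.0 - std::sqrt(a * a + b * b) / h);
}
```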