From 4dc3933564a1692a80d56f84fb97b8f653861e8b Mon Sep 17 00:00:00 2001
From: Tomas Sobotik <sobotto4@fjfi.cvut.cz>
Date: Sun, 24 Apr 2016 12:22:14 +0200
Subject: [PATCH] Fixed Parallel 3D solver on GPU

---
 .../tnlParallelEikonalSolver3D_impl.h         |  64 +++++---
 .../parallelGodunovEikonal3D_impl.h           | 147 ++++++++----------
 2 files changed, 110 insertions(+), 101 deletions(-)

diff --git a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
index 6dcae4f3f5..f8cc71933e 100644
--- a/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
+++ b/examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver3D_impl.h
@@ -25,7 +25,7 @@ template< typename SchemeHost, typename SchemeDevice, typename Device>
 tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver()
 {
 	cout << "a" << endl;
-	this->device = tnlHostDevice;  /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice   ---    vypocet na CPU
+	this->device = tnlCudaDevice;  /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice   ---    vypocet na CPU
 
 #ifdef HAVE_CUDA
 	if(this->device == tnlCudaDevice)
@@ -101,7 +101,7 @@ bool tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
 	test();
 
 	VectorType* tmp = new VectorType[subgridValues.getSize()];
-	bool containsCurve = false;
+
 
 #ifdef HAVE_CUDA
 
@@ -148,8 +148,12 @@ bool tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
 
 	if(this->device == tnlHostDevice)
 	{
+#ifdef HAVE_OPENMP
+#pragma omp parallel for num_threads(4) schedule(dynamic)
+#endif
 		for(int i = 0; i < this->subgridValues.getSize(); i++)
 		{
+			bool containsCurve = false;
 //			cout << "Working on subgrid " << i <<" --- check 1" << endl;
 
 			if(! tmp[i].setSize(this->n*this->n*this->n))
@@ -246,9 +250,9 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
 	{
 
 	bool end = false;
-	while ((this->boundaryConditions.max() > 0 ) || !end)
+	while (/*(this->boundaryConditions.max() > 0 ) ||*/ !end)
 	{
-		if(this->boundaryConditions.max() == 0 )
+		if(this->boundaryConditions.max() == 0 || this->subgridValues.max() < 0)
 			end=true;
 		else
 			end=false;
@@ -1012,11 +1016,17 @@ __device__
 void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgridCUDA3D( const int i ,tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
 {
 	//int j = threadIdx.x + threadIdx.y * blockDim.x;
-	int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
-			 + (blockIdx.y) * this->n*this->n*this->gridCols
-             + (blockIdx.x) * this->n
-             + threadIdx.y * this->n*this->gridCols
-             + threadIdx.x;
+//	int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
+//			 + (blockIdx.y) * this->n*this->n*this->gridCols
+//             + (blockIdx.x) * this->n
+//             + threadIdx.y * this->n*this->gridCols
+//             + threadIdx.x;
+
+
+	int index =  blockDim.x*blockIdx.x + threadIdx.x +
+			  (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x +
+			  (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
+
 	//printf("i= %d,j= %d,th= %d\n",i,j,th);
 	*a = caller->work_u_cuda[index];
 	//printf("Hi %f \n", *a);
@@ -1029,11 +1039,15 @@ __device__
 void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::updateSubgridCUDA3D( const int i ,tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
 {
 //	int j = threadIdx.x + threadIdx.y * blockDim.x;
-	int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
-			 + (blockIdx.y) * this->n*this->n*this->gridCols
-             + (blockIdx.x) * this->n
-             + threadIdx.y * this->n*this->gridCols
-             + threadIdx.x;
+//	int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
+//			 + (blockIdx.y) * this->n*this->n*this->gridCols
+//             + (blockIdx.x) * this->n
+//             + threadIdx.y * this->n*this->gridCols
+//             + threadIdx.x;
+
+	int index =  blockDim.x*blockIdx.x + threadIdx.x +
+			  (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x +
+			  (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
 
 	if( (fabs(caller->work_u_cuda[index]) > fabs(*a)) || (caller->unusedCell_cuda[index] == 1) )
 	{
@@ -1055,11 +1069,15 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
 //	int j = threadIdx.x + threadIdx.y * blockDim.x;
 	//printf("j = %d, u = %f\n", j,u);
 
-		int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
-				 + (blockIdx.y) * this->n*this->n*this->gridCols
-	             + (blockIdx.x) * this->n
-	             + threadIdx.y * this->n*this->gridCols
-	             + threadIdx.x;
+//		int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
+//				 + (blockIdx.y) * this->n*this->n*this->gridCols
+//	             + (blockIdx.x) * this->n
+//	             + threadIdx.y * this->n*this->gridCols
+//	             + threadIdx.x;
+
+		int index =  blockDim.x*blockIdx.x + threadIdx.x +
+				  (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x +
+				  (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
 
 		//printf("i= %d,j= %d,index= %d\n",i,j,index);
 		if( (fabs(this->work_u_cuda[index]) > fabs(u)) || (this->unusedCell_cuda[index] == 1) )
@@ -1181,7 +1199,7 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
 //   }
    double finalTime = this->stopTime;
    __syncthreads();
-//   if( boundaryCondition == 0 ) finalTime *= 2.0;
+   if( boundaryCondition == 0 ) finalTime *= 2.0;
 
    tnlGridEntity<MeshType, 3, tnlGridEntityNoStencilStorage > Entity(subMesh);
    tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage >,3> neighbourEntities(Entity);
@@ -1210,7 +1228,7 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
     	  if( time + sharedTau[l] > finalTime )		sharedTau[l] = finalTime - time;
       }
 
-
+      __syncthreads();
       if(l < 256)								sharedTau[l] = Min(sharedTau[l],sharedTau[l+256]);
       __syncthreads();
       if(l < 128)								sharedTau[l] = Min(sharedTau[l],sharedTau[l+128]);
@@ -1387,6 +1405,8 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>
 			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + (blockIdx.z + 1)*gridDim.x*gridDim.y);
 			boundary_index = 4;
 		}
+		__threadfence();
+
 		if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
 		{
 			cudaSolver->unusedCell_cuda[gid] = 0;
@@ -1402,7 +1422,7 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>
 	if(threadIdx.x+threadIdx.y+threadIdx.z == 0)
 	{
 
-		if(subgridValue == INT_MAX && newSubgridValue !=0)
+		if(subgridValue == INT_MAX && newSubgridValue != 0)
 			cudaSolver->setSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y, -INT_MAX);
 
 		cudaSolver->setBoundaryConditionCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + blockIdx.z*gridDim.x*gridDim.y, 	1  * boundary[0] +
diff --git a/src/operators/godunov-eikonal/parallelGodunovEikonal3D_impl.h b/src/operators/godunov-eikonal/parallelGodunovEikonal3D_impl.h
index ee2db1e2a4..3010b711ba 100644
--- a/src/operators/godunov-eikonal/parallelGodunovEikonal3D_impl.h
+++ b/src/operators/godunov-eikonal/parallelGodunovEikonal3D_impl.h
@@ -266,115 +266,104 @@ Real parallelGodunovEikonalScheme< tnlGrid< 3, MeshReal, Device, MeshIndex >, Re
           	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	 ) const
 {
 	RealType signui;
-	if(boundaryCondition == 0)
-		signui = sign(u[cellIndex],this->epsilon);
-	else
-		signui = Sign(u[cellIndex]);
+		signui = sign(u[cellIndex], this->epsilon);
 
 
-	RealType xb = u[cellIndex];
-	RealType xf = -u[cellIndex];
-	RealType yb = u[cellIndex];
-	RealType yf = -u[cellIndex];
-	RealType zb = u[cellIndex];
-	RealType zf = -u[cellIndex];
-	RealType a,b,c,d;
-//	if(threadIdx.x+threadIdx.y+threadIdx.z == 0)
-//		printf("x = %d, y = %d, z = %d\n",mesh.getDimensions().x() - 1,mesh.getDimensions().y() - 1,mesh.getDimensions().z() - 1);
+		RealType xb = u[cellIndex];
+		RealType xf = -u[cellIndex];
+		RealType yb = u[cellIndex];
+		RealType yf = -u[cellIndex];
+		RealType zb = u[cellIndex];
+		RealType zf = -u[cellIndex];
+		RealType a,b,c,d;
 
 
-	   if(coordinates.x() == mesh.getDimensions().x() - 1)
-		   xf += u[neighbourEntities.template getEntityIndex< -1,  0,  0 >()];
-	   else
-		   xf += u[neighbourEntities.template getEntityIndex< 1,  0,  0 >()];
+		   if(coordinates.x() == mesh.getDimensions().x() - 1)
+			   xf += u[neighbourEntities.template getEntityIndex< -1,  0,  0 >()];
+		   else
+			   xf += u[neighbourEntities.template getEntityIndex< 1,  0,  0 >()];
 
-	   if(coordinates.x() == 0)
-		   xb -= u[neighbourEntities.template getEntityIndex< 1,  0,  0 >()];
-	   else
-		   xb -= u[neighbourEntities.template getEntityIndex< -1,  0,  0 >()];
-
-	   if(coordinates.y() == mesh.getDimensions().y() - 1)
-		   yf += u[neighbourEntities.template getEntityIndex< 0,  -1,  0 >()];
-	   else
-		   yf += u[neighbourEntities.template getEntityIndex< 0,  1,  0 >()];
-
-	   if(coordinates.y() == 0)
-		   yb -= u[neighbourEntities.template getEntityIndex< 0,  1,  0 >()];
-	   else
-		   yb -= u[neighbourEntities.template getEntityIndex< 0,  -1,  0 >()];
+		   if(coordinates.x() == 0)
+			   xb -= u[neighbourEntities.template getEntityIndex< 1,  0,  0 >()];
+		   else
+			   xb -= u[neighbourEntities.template getEntityIndex< -1,  0,  0 >()];
 
+		   if(coordinates.y() == mesh.getDimensions().y() - 1)
+			   yf += u[neighbourEntities.template getEntityIndex< 0,  -1,  0 >()];
+		   else
+			   yf += u[neighbourEntities.template getEntityIndex< 0,  1,  0 >()];
 
-	   if(coordinates.z() == mesh.getDimensions().z() - 1)
-		   zf += u[neighbourEntities.template getEntityIndex< 0,  0,  -1 >()];
-	   else
-		   zf += u[neighbourEntities.template getEntityIndex< 0,  0,  1 >()];
+		   if(coordinates.y() == 0)
+			   yb -= u[neighbourEntities.template getEntityIndex< 0,  1,  0 >()];
+		   else
+			   yb -= u[neighbourEntities.template getEntityIndex< 0,  -1,  0 >()];
 
-	   if(coordinates.z() == 0)
-		   zb -= u[neighbourEntities.template getEntityIndex< 0,  0,  1 >()];
-	   else
-		   zb -= u[neighbourEntities.template getEntityIndex< 0,  0,  -1 >()];
 
+		   if(coordinates.z() == mesh.getDimensions().z() - 1)
+			   zf += u[neighbourEntities.template getEntityIndex< 0,  0,  -1 >()];
+		   else
+			   zf += u[neighbourEntities.template getEntityIndex< 0,  0,  1 >()];
 
-	   //xb *= ihx;
-	   //xf *= ihx;
-	  // yb *= ihy;
-	   //yf *= ihy;
+		   if(coordinates.z() == 0)
+			   zb -= u[neighbourEntities.template getEntityIndex< 0,  0,  1 >()];
+		   else
+			   zb -= u[neighbourEntities.template getEntityIndex< 0,  0,  -1 >()];
 
-	   if(signui > 0.0)
-	   {
-		   xf = negativePart(xf);
+		   if(signui > 0.0)
+		   {
+			   xf = negativePart(xf);
 
-		   xb = positivePart(xb);
+			   xb = positivePart(xb);
 
-		   yf = negativePart(yf);
+			   yf = negativePart(yf);
 
-		   yb = positivePart(yb);
+			   yb = positivePart(yb);
 
-		   zf = negativePart(zf);
+			   zf = negativePart(zf);
 
-		   zb = positivePart(zb);
+			   zb = positivePart(zb);
 
-	   }
-	   else if(signui < 0.0)
-	   {
+		   }
+		   else if(signui < 0.0)
+		   {
 
-		   xb = negativePart(xb);
+			   xb = negativePart(xb);
 
-		   xf = positivePart(xf);
+			   xf = positivePart(xf);
 
-		   yb = negativePart(yb);
+			   yb = negativePart(yb);
 
-		   yf = positivePart(yf);
+			   yf = positivePart(yf);
 
-		   zb = negativePart(zb);
+			   zb = negativePart(zb);
 
-		   zf = positivePart(zf);
-	   }
+			   zf = positivePart(zf);
+		   }
 
 
-	   if(xb - xf > 0.0)
-		   a = xb;
-	   else
-		   a = xf;
+		   if(xb - xf > 0.0)
+			   a = xb;
+		   else
+			   a = xf;
 
-	   if(yb - yf > 0.0)
-		   b = yb;
-	   else
-		   b = yf;
+		   if(yb - yf > 0.0)
+			   b = yb;
+		   else
+			   b = yf;
 
-	   if(zb - zf > 0.0)
-		   c = zb;
-	   else
-		   c = zf;
+		   if(zb - zf > 0.0)
+			   c = zb;
+		   else
+			   c = zf;
 
-	   d = ( 1.0 - sqrt(a*a + b*b + c*c)*ihx );
+		   d = ( 1.0 - sqrt(a*a + b*b + c*c)*ihx );
 
-//	   d = 1.0 - sqrt(xf*xf + xb*xb + yf*yf + yb*yb + zf*zf + zb*zb)*ihx; /*upwind*/
+	//	   d = 1.0 - sqrt(xf*xf + xb*xb + yf*yf + yb*yb + zf*zf + zb*zb)*ihx; /*upwind*/
 
-//	   if(Sign(d) > 0.0 )
-//		   return Sign(u[cellIndex])*d;
-//	   else
-		   return signui*d;
+		   if(Sign(d) > 0.0 )
+			   return Sign(u[cellIndex])*d;
+		   else
+			   return signui*d;
 }
 
 
-- 
GitLab