Commit 4dc39335 authored by Tomas Sobotik's avatar Tomas Sobotik
Browse files

Fixed Parallel 3D solver on GPU

parent dbfc942e
Loading
Loading
Loading
Loading
+42 −22
Original line number Diff line number Diff line
@@ -25,7 +25,7 @@ template< typename SchemeHost, typename SchemeDevice, typename Device>
tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::tnlParallelEikonalSolver()
{
	cout << "a" << endl;
	this->device = tnlHostDevice;  /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice   ---    vypocet na CPU
	this->device = tnlCudaDevice;  /////////////// tnlCuda Device --- vypocet na GPU, tnlHostDevice   ---    vypocet na CPU

#ifdef HAVE_CUDA
	if(this->device == tnlCudaDevice)
@@ -101,7 +101,7 @@ bool tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
	test();

	VectorType* tmp = new VectorType[subgridValues.getSize()];
	bool containsCurve = false;


#ifdef HAVE_CUDA

@@ -148,8 +148,12 @@ bool tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::

	if(this->device == tnlHostDevice)
	{
#ifdef HAVE_OPENMP
#pragma omp parallel for num_threads(4) schedule(dynamic)
#endif
		for(int i = 0; i < this->subgridValues.getSize(); i++)
		{
			bool containsCurve = false;
//			cout << "Working on subgrid " << i <<" --- check 1" << endl;

			if(! tmp[i].setSize(this->n*this->n*this->n))
@@ -246,9 +250,9 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
	{

	bool end = false;
	while ((this->boundaryConditions.max() > 0 ) || !end)
	while (/*(this->boundaryConditions.max() > 0 ) ||*/ !end)
	{
		if(this->boundaryConditions.max() == 0 )
		if(this->boundaryConditions.max() == 0 || this->subgridValues.max() < 0)
			end=true;
		else
			end=false;
@@ -1012,11 +1016,17 @@ __device__
void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::getSubgridCUDA3D( const int i ,tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
{
	//int j = threadIdx.x + threadIdx.y * blockDim.x;
	int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
			 + (blockIdx.y) * this->n*this->n*this->gridCols
             + (blockIdx.x) * this->n
             + threadIdx.y * this->n*this->gridCols
             + threadIdx.x;
//	int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
//			 + (blockIdx.y) * this->n*this->n*this->gridCols
//             + (blockIdx.x) * this->n
//             + threadIdx.y * this->n*this->gridCols
//             + threadIdx.x;


	int index =  blockDim.x*blockIdx.x + threadIdx.x +
			  (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x +
			  (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y;

	//printf("i= %d,j= %d,th= %d\n",i,j,th);
	*a = caller->work_u_cuda[index];
	//printf("Hi %f \n", *a);
@@ -1029,11 +1039,15 @@ __device__
void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::updateSubgridCUDA3D( const int i ,tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int >* caller, double* a)
{
//	int j = threadIdx.x + threadIdx.y * blockDim.x;
	int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
			 + (blockIdx.y) * this->n*this->n*this->gridCols
             + (blockIdx.x) * this->n
             + threadIdx.y * this->n*this->gridCols
             + threadIdx.x;
//	int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
//			 + (blockIdx.y) * this->n*this->n*this->gridCols
//             + (blockIdx.x) * this->n
//             + threadIdx.y * this->n*this->gridCols
//             + threadIdx.x;

	int index =  blockDim.x*blockIdx.x + threadIdx.x +
			  (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x +
			  (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y;

	if( (fabs(caller->work_u_cuda[index]) > fabs(*a)) || (caller->unusedCell_cuda[index] == 1) )
	{
@@ -1055,11 +1069,15 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
//	int j = threadIdx.x + threadIdx.y * blockDim.x;
	//printf("j = %d, u = %f\n", j,u);

		int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
				 + (blockIdx.y) * this->n*this->n*this->gridCols
	             + (blockIdx.x) * this->n
	             + threadIdx.y * this->n*this->gridCols
	             + threadIdx.x;
//		int index = (blockIdx.z*this->n + threadIdx.z) * this->n*this->n*this->gridCols*this->gridRows
//				 + (blockIdx.y) * this->n*this->n*this->gridCols
//	             + (blockIdx.x) * this->n
//	             + threadIdx.y * this->n*this->gridCols
//	             + threadIdx.x;

		int index =  blockDim.x*blockIdx.x + threadIdx.x +
				  (blockDim.y*blockIdx.y + threadIdx.y)*blockDim.x*gridDim.x +
				  (blockDim.z*blockIdx.z + threadIdx.z)*blockDim.x*gridDim.x*blockDim.y*gridDim.y;

		//printf("i= %d,j= %d,index= %d\n",i,j,index);
		if( (fabs(this->work_u_cuda[index]) > fabs(u)) || (this->unusedCell_cuda[index] == 1) )
@@ -1181,7 +1199,7 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
//   }
   double finalTime = this->stopTime;
   __syncthreads();
//   if( boundaryCondition == 0 ) finalTime *= 2.0;
   if( boundaryCondition == 0 ) finalTime *= 2.0;

   tnlGridEntity<MeshType, 3, tnlGridEntityNoStencilStorage > Entity(subMesh);
   tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 3, tnlGridEntityNoStencilStorage >,3> neighbourEntities(Entity);
@@ -1210,7 +1228,7 @@ void tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>::
    	  if( time + sharedTau[l] > finalTime )		sharedTau[l] = finalTime - time;
      }


      __syncthreads();
      if(l < 256)								sharedTau[l] = Min(sharedTau[l],sharedTau[l+256]);
      __syncthreads();
      if(l < 128)								sharedTau[l] = Min(sharedTau[l],sharedTau[l+128]);
@@ -1387,6 +1405,8 @@ void /*tnlParallelEikonalSolver<3,SchemeHost, SchemeDevice, Device, double, int>
			subgridValue_cmp = cudaSolver->getSubgridValueCUDA3D(blockIdx.y*gridDim.x + blockIdx.x + (blockIdx.z + 1)*gridDim.x*gridDim.y);
			boundary_index = 4;
		}
		__threadfence();

		if((subgridValue == INT_MAX || fabs(u_cmp) + cudaSolver->delta < fabs(u) ) && (subgridValue_cmp != INT_MAX && subgridValue_cmp != -INT_MAX))
		{
			cudaSolver->unusedCell_cuda[gid] = 0;
+68 −79
Original line number Diff line number Diff line
@@ -266,10 +266,7 @@ Real parallelGodunovEikonalScheme< tnlGrid< 3, MeshReal, Device, MeshIndex >, Re
          	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	 ) const
{
	RealType signui;
	if(boundaryCondition == 0)
		signui = sign(u[cellIndex], this->epsilon);
	else
		signui = Sign(u[cellIndex]);


		RealType xb = u[cellIndex];
@@ -279,8 +276,6 @@ Real parallelGodunovEikonalScheme< tnlGrid< 3, MeshReal, Device, MeshIndex >, Re
		RealType zb = u[cellIndex];
		RealType zf = -u[cellIndex];
		RealType a,b,c,d;
//	if(threadIdx.x+threadIdx.y+threadIdx.z == 0)
//		printf("x = %d, y = %d, z = %d\n",mesh.getDimensions().x() - 1,mesh.getDimensions().y() - 1,mesh.getDimensions().z() - 1);


		   if(coordinates.x() == mesh.getDimensions().x() - 1)
@@ -314,12 +309,6 @@ Real parallelGodunovEikonalScheme< tnlGrid< 3, MeshReal, Device, MeshIndex >, Re
		   else
			   zb -= u[neighbourEntities.template getEntityIndex< 0,  0,  -1 >()];


	   //xb *= ihx;
	   //xf *= ihx;
	  // yb *= ihy;
	   //yf *= ihy;

		   if(signui > 0.0)
		   {
			   xf = negativePart(xf);
@@ -371,9 +360,9 @@ Real parallelGodunovEikonalScheme< tnlGrid< 3, MeshReal, Device, MeshIndex >, Re

	//	   d = 1.0 - sqrt(xf*xf + xb*xb + yf*yf + yb*yb + zf*zf + zb*zb)*ihx; /*upwind*/

//	   if(Sign(d) > 0.0 )
//		   return Sign(u[cellIndex])*d;
//	   else
		   if(Sign(d) > 0.0 )
			   return Sign(u[cellIndex])*d;
		   else
			   return signui*d;
}