More optimalizations (acfcdee4) · Commits · TNL / tnl-dev

examples/hamilton-jacobi-parallel/tnlParallelEikonalSolver_impl.h

+49 −146

Original line number	Diff line number	Diff line
		@@ -186,7 +186,7 @@ bool tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::in
		dim3 numBlocks(this->gridCols,this->gridRows);
		cudaDeviceSynchronize();
		checkCudaDevice;
		initRunCUDA<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3this->nthis->n*sizeof(double)>>>(this->cudaSolver);
		initRunCUDA<SchemeTypeHost,SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,2this->nthis->n*sizeof(double)>>>(this->cudaSolver);
		cudaDeviceSynchronize();
		// cout << "post 1 kernel" << endl;

		@@ -345,11 +345,11 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru
		//cout << "a" << endl;
		cudaDeviceSynchronize();
		checkCudaDevice;
		//start = std::clock();
		runCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,3this->nthis->n*sizeof(double)>>>(this->cudaSolver);

		start = std::clock();
		runCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock,2this->nthis->n*sizeof(double)>>>(this->cudaSolver);
		//cout << "a" << endl;
		cudaDeviceSynchronize();
		time_diff += (std::clock() - start) / (double)(CLOCKS_PER_SEC);

		//start = std::clock();
		synchronizeCUDA<SchemeTypeHost, SchemeTypeDevice, DeviceType><<<numBlocks,threadsPerBlock>>>(this->cudaSolver);
		@@ -368,7 +368,7 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru
		cudaMemcpy(&run_host, (this->runcuda),sizeof(int), cudaMemcpyDeviceToHost);
		//cout << "in kernel loop" << run_host << endl;
		}
		//cout << "Solving time was: " << time_diff << endl;
		cout << "Solving time was: " << time_diff << endl;
		//cout << "b" << endl;

		//double* tmpu;
		@@ -1080,30 +1080,30 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru
		{

		__shared__ int tmp;
		double* sharedTau = &u[blockDim.x*blockDim.y];
		double* sharedRes = &sharedTau[blockDim.x*blockDim.y];
		//double tmpRes = 0.0;
		volatile double* sharedTau = &u[blockDim.x*blockDim.y];
		int i = threadIdx.x;
		int j = threadIdx.y;
		int l = threadIdx.y * blockDim.x + threadIdx.x;
		bool computeFU = !((i == 0 && (boundaryCondition & 4)) or
		(i == blockDim.x - 1 && (boundaryCondition & 2)) or
		(j == 0 && (boundaryCondition & 8)) or
		(j == blockDim.y - 1 && (boundaryCondition & 1)));

		if(l == 0)
		{
		tmp = 0;
		/for(int o = 0; o < this->n this->n; o++)
		printf("%d : %f\n", o, u[o]);*/
		if(this->getSubgridValueCUDA(subGridID) != this->currentStep + 4)
		tmp = 1;
		}

		__syncthreads();

		if(u[0]*u[l] <= 0.0)
		atomicMax( &tmp, 1);
		//__syncthreads();
		//printf("tmp = %d", tmp);




		__shared__ double value;

		if(l == 0)
		value = 0.0;
		__syncthreads();
		@@ -1113,164 +1113,65 @@ void tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::ru
		value=Max(value,fabs(u[l]));
		__syncthreads();
		}
		//__syncthreads();
		//atomicMax(&value,fabs(u[l]))

		if(l == 0)
		value *= Sign(u[0]);



		__syncthreads();
		if(tmp == 1)
		{}
		//north - 1, east - 2, west - 4, south - 8
		else if(boundaryCondition == 4)
		{
		if(j > 0)
		u[i*this->n + j] = value;
		}
		else if(boundaryCondition == 2)
		{
		if(j < this->n - 1)
		u[i*this->n + j] = value;
		}
		else if(boundaryCondition == 1)
		{
		if(i < this->n - 1)
		u[i*this->n + j] = value;
		}
		else if(boundaryCondition == 8)
		{
		if(i > 0)
		u[i*this->n + j] = value;
		}
		//__syncthreads();
		if(tmp !=1 && computeFU)
		u[l] = value;

		double time = 0.0;
		__shared__ double currentTau;
		__shared__ double maxResidue;
		double cfl = this->cflCondition;
		double fu = 0.0;
		if(threadIdx.x * threadIdx.y == 0)
		{
		currentTau = this->tau0;
		maxResidue = 0.0;//10.0 * this->subMesh.getHx();
		}
		double finalTime = this->stopTime;
		__syncthreads();
		if( time + currentTau > finalTime ) currentTau = finalTime - time;

		__syncthreads();

		//printf("%d : %f\n", l, u[l]);
		while( time < finalTime )
		{

		/****
		* Compute the RHS
		*/
		//__syncthreads();


		fu = schemeHost.getValueDev( this->subMesh, l, this->subMesh.getCellCoordinates(l), u, time, boundaryCondition);
		//__syncthreads();
		//printf("%d : %f\n", l, fu);

		if(computeFU)
		fu = schemeHost.getValueDev( this->subMesh, l, tnlStaticVector<2,int>(i,j)/this->subMesh.getCellCoordinates(l)/, u, time, boundaryCondition);

		//atomicMax(&maxResidue,fabs(fu));//maxResidue = fu. absMax();
		// sharedRes[l]=fabs(fu);
		/* if(l == 0)
		maxResidue = 0.0;
		__syncthreads();
		//start reduction
		for(int o = 0; o < blockDim.x*blockDim.y; o++)
		{
		if(l == o)
		maxResidue=Max(maxResidue,sharedRes[l]);
		__syncthreads();
		}*/
		//__syncthreads();
		//end reduction
		sharedTau[l]=this -> cflCondition/fabs(fu);//sharedRes[l];
		if((u[l]+sharedTau[l] * fu)*u[l] < 0.0)
		sharedTau[l] = fabs(u[l]/(2.0*fu));
		sharedTau[l]=cfl/fabs(fu);

		if(l == 0)
		if(sharedTau[l] > 1.0 * this->subMesh.getHx())
		sharedTau[l] = 1.0 * this->subMesh.getHx();
		if(sharedTau[0] > 1.0 * this->subMesh.getHx()) sharedTau[0] = 1.0 * this->subMesh.getHx();

		if(l == 1)
		if(l == blockDim.x*blockDim.y - 1)
		if( time + sharedTau[l] > finalTime ) sharedTau[l] = finalTime - time;
		__syncthreads();

		// __syncthreads();
		/for(unsigned int s = blockDim.xblockDim.y/2; s>0; s>>=1)
		{
		if( l < s )
		sharedTau[l] = Min(sharedTau[l],sharedTau[l+s]) ;
		__syncthreads();
		}
		if(l==0) currentTau=sharedTau[l];
		__syncthreads();*/


		for(unsigned int s = blockDim.x*blockDim.y/2; s>0; s>>=1)
		{
		//if( l < s )
		// sharedRes[l] = Max(sharedRes[l],sharedRes[l+s]);
		if(l >= blockDim.x*blockDim.y - s)
		sharedTau[l] = Min(sharedTau[l],sharedTau[l-s]);
		if((blockDim.x == 16) && (l < 128)) sharedTau[l] = Min(sharedTau[l],sharedTau[l+128]);
		__syncthreads();
		}
		if(l==0)
		{
		//maxResidue=sharedRes[l];
		currentTau=sharedTau[blockDim.x*blockDim.y - 1];
		/if( this -> cflCondition maxResidue != 0.0)
		currentTau = Min(this -> cflCondition / maxResidue, currentTau);*/
		}
		if((blockDim.x == 16) && (l < 64)) sharedTau[l] = Min(sharedTau[l],sharedTau[l+64]);
		__syncthreads();



		if(l < 32) sharedTau[l] = Min(sharedTau[l],sharedTau[l+32]);
		//__syncthreads();
		//

		//double tau2 = finalTime;



		//start reduction

		//atomicMin(&currentTau, tau2);
		/* if(l==0) currentTau=sharedTau[l];
		__syncthreads();
		for(int o = 0; o < blockDim.x*blockDim.y; o++)
		{
		if(l == o)
		currentTau=Min(currentTau,sharedTau[l]);
		if(l < 16) sharedTau[l] = Min(sharedTau[l],sharedTau[l+16]);
		//__syncthreads();
		if(l < 8) sharedTau[l] = Min(sharedTau[l],sharedTau[l+8]);
		// __syncthreads();
		if(l < 4) sharedTau[l] = Min(sharedTau[l],sharedTau[l+4]);
		//__syncthreads();
		if(l < 2) sharedTau[l] = Min(sharedTau[l],sharedTau[l+2]);
		//__syncthreads();
		if(l < 1) currentTau = Min(sharedTau[l],sharedTau[l+1]);
		__syncthreads();
		}*/
		//end reduction






		//


		//__syncthreads();
		u[l] += currentTau * fu;
		//if(l==0)
		// printf("ct %f\n",currentTau);



		time += currentTau;
		__syncthreads();
		}
		//printf("%d : %f\n", l, u[l]);


		}

		@@ -1697,10 +1598,11 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
		if(caller->getSubgridValueCUDA(i) != INT_MAX)
		{
		caller->getSubgridCUDA(i,caller, &u[l]);
		u[2blockDim.xblockDim.y +l] = u[l];
		int bound = caller->getBoundaryConditionCUDA(i);
		//if(l == 0)
		//printf("i = %d, bound = %d\n",i,caller->getSubgridValueCUDA(i));
		//if(caller->getSubgridValueCUDA(i) == caller->currentStep+4)
		//{
		if(bound & 1)
		{
		caller->runSubgridCUDA(1,u,i);
		@@ -1741,10 +1643,11 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
		caller->updateSubgridCUDA(i,caller, &u[l]);
		__syncthreads();
		}
		//}



		if( ((bound & 2) ))
		if( ((bound & 2) \|\| (bound & 1) ))
		{
		caller->runSubgridCUDA(3,u,i);
		__syncthreads();
		@@ -1754,7 +1657,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
		caller->updateSubgridCUDA(i,caller, &u[l]);
		__syncthreads();
		}
		if( ((bound & 4) ))
		if( ((bound & 4) \|\| (bound & 1) ))
		{
		caller->runSubgridCUDA(5,u,i);
		__syncthreads();
		@@ -1764,7 +1667,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
		caller->updateSubgridCUDA(i,caller, &u[l]);
		__syncthreads();
		}
		if( ((bound & 2) ))
		if( ((bound & 2) \|\| (bound & 8) ))
		{
		caller->runSubgridCUDA(10,u,i);
		__syncthreads();
		@@ -1774,7 +1677,7 @@ void /*tnlParallelEikonalSolver<SchemeHost, SchemeDevice, Device, double, int>::
		caller->updateSubgridCUDA(i,caller, &u[l]);
		__syncthreads();
		}
		if( (bound & 4) )
		if( (bound & 4) \|\| (bound & 8) )
		{
		caller->runSubgridCUDA(12,u,i);
		__syncthreads();

src/operators/godunov-eikonal/parallelGodunovEikonal.h

+2 −2

Original line number	Diff line number	Diff line
		@@ -171,8 +171,8 @@ protected:

		DofVectorType dofVector;

		RealType hx;
		RealType hy;
		RealType hx, ihx;
		RealType hy, ihy;

		RealType epsilon;

src/operators/godunov-eikonal/parallelGodunovEikonal2D_impl.h

+30 −44

Original line number	Diff line number	Diff line
		@@ -45,7 +45,7 @@ template< typename MeshReal,
		Real parallelGodunovEikonalScheme< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: negativePart(const Real arg) const
		{
		if(arg < 0.0)
		return arg;
		return -arg;
		return 0.0;
		}

		@@ -62,13 +62,13 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2,MeshReal, Device, MeshIndex >, Rea
		{
		if(x > eps)
		return 1.0;
		else if (x < -eps)
		if (x < -eps)
		return (-1.0);

		if ( eps == 0.0)
		return 0.0;

		return sin((M_PIx)/(2.0eps));
		return sin(/(M_PIx)/(2.0eps) /(M_PI/2.0)*(x/eps));
		}


		@@ -98,7 +98,9 @@ bool parallelGodunovEikonalScheme< tnlGrid< 2,MeshReal, Device, MeshIndex >, Rea


		hx = originalMesh.getHx();
		ihx = 1.0/hx;
		hy = originalMesh.getHy();
		ihy = 1.0/hy;

		epsilon = parameters. getParameter< double >( "epsilon" );

		@@ -207,12 +209,12 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
		else
		yb = positivePart((u[cellIndex] - u[mesh.template getCellNextToCell<0,-1>( cellIndex )])/hy);

		if(xb + xf > 0.0)
		if(xb - xf > 0.0)
		xf = 0.0;
		else
		xb = 0.0;

		if(yb + yf > 0.0)
		if(yb - yf > 0.0)
		yf = 0.0;
		else
		yb = 0.0;
		@@ -262,15 +264,15 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
		yb = negativePart((u[cellIndex] - u[mesh.template getCellNextToCell<0,-1>( cellIndex )])/hy);


		if(xb + xf > 0.0)
		xb = 0.0;
		else
		if(xb - xf > 0.0)
		xf = 0.0;

		if(yb + yf > 0.0)
		yb = 0.0;
		else
		xb = 0.0;

		if(yb - yf > 0.0)
		yf = 0.0;
		else
		yb = 0.0;

		nabla = sqrt (xfxf + xbxb + yfyf + ybyb );

		@@ -310,7 +312,7 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
		const IndexType boundaryCondition) const
		{


		/*
		if ( ((coordinates.x() == 0 && (boundaryCondition & 4)) or
		(coordinates.x() == mesh.getDimensions().x() - 1 && (boundaryCondition & 2)) or
		(coordinates.y() == 0 && (boundaryCondition & 8)) or
		@@ -320,10 +322,10 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
		return 0.0;
		}


		*/
		//RealType acc = hxhyhx*hy;

		RealType nabla, signui;
		RealType signui;
		signui = sign(u[cellIndex],epsilon);

		#ifdef HAVE_CUDA
		@@ -334,6 +336,7 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
		RealType xf = -u[cellIndex];
		RealType yb = u[cellIndex];
		RealType yf = -u[cellIndex];
		RealType a,b;


		if(coordinates.x() == mesh.getDimensions().x() - 1)
		@@ -357,10 +360,10 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
		yb -= u[mesh.template getCellNextToCell<0,-1>( cellIndex )];


		xb /= hx;
		xf /= hx;
		yb /= hy;
		yf /= hy;
		//xb *= ihx;
		//xf *= ihx;
		// yb *= ihy;
		//yf *= ihy;

		if(signui > 0.0)
		{
		@@ -372,16 +375,6 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re

		yb = positivePart(yb);

		if(xb + xf > 0.0)
		xf = 0.0;
		else
		xb = 0.0;

		if(yb + yf > 0.0)
		yf = 0.0;
		else
		yb = 0.0;

		}
		else if(signui < 0.0)
		{
		@@ -393,27 +386,20 @@ Real parallelGodunovEikonalScheme< tnlGrid< 2, MeshReal, Device, MeshIndex >, Re
		yb = negativePart(yb);

		yf = positivePart(yf);

		if(xb + xf > 0.0)
		xb = 0.0;
		else
		xf = 0.0;

		if(yb + yf > 0.0)
		yb = 0.0;
		else
		yf = 0.0;

		}


		if(xb - xf > 0.0)
		a = xb;
		else
		a = xf;

		nabla = sqrt (xfxf + xbxb + yfyf + ybyb );
		return signui*(1.0 - nabla);



		if(yb - yf > 0.0)
		b = yb;
		else
		b = yf;

		return signui(1.0 - sqrt(aa+bb)ihx );
		}