Narrow Band (4d189588) · Commits · TNL / tnl-dev

examples/CMakeLists.txt

+1 −0

Original line number	Diff line number	Diff line
		@@ -6,3 +6,4 @@ add_subdirectory( hamilton-jacobi-parallel )
		add_subdirectory( fast-sweeping )
		add_subdirectory( hamilton-jacobi-parallel-map )
		add_subdirectory( fast-sweeping-map )
		add_subdirectory( narrow-band )

examples/fast-sweeping-map/tnlFastSweepingMap2D_impl.h

+1 −4

Original line number	Diff line number	Diff line
		@@ -268,11 +268,8 @@ bool tnlFastSweepingMap< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >
		}


		// data.setLike(dofVector2.getData());
		// data = dofVector2.getData();
		// cout << data.getType() << endl;

		dofVector2.save("u-00001.tnl");
		//dofVector2.getData().save("u-00001.tnl");

		return true;
		}

examples/narrow-band/CMakeLists.txt

+10 −10

Original line number	Diff line number	Diff line
		set( tnl_fast_sweeping_SOURCES
		set( tnl_narrow_band_SOURCES
		# MainBuildConfig.h
		# tnlFastSweeping2D_impl.h
		# tnlFastSweeping.h
		# fastSweepingConfig.h
		# tnlNarrowBand2D_impl.h
		# tnlNarrowBand.h
		# narrowBandConfig.h
		main.cpp)


		IF( BUILD_CUDA )
		CUDA_ADD_EXECUTABLE(fast-sweeping${debugExt} main.cu)
		CUDA_ADD_EXECUTABLE(narrow-band${debugExt} main.cu)
		ELSE( BUILD_CUDA )
		ADD_EXECUTABLE(fast-sweeping${debugExt} main.cpp)
		ADD_EXECUTABLE(narrow-band${debugExt} main.cpp)
		ENDIF( BUILD_CUDA )
		target_link_libraries (fast-sweeping${debugExt} tnl${debugExt}-${tnlVersion} )
		target_link_libraries (narrow-band${debugExt} tnl${debugExt}-${tnlVersion} )


		INSTALL( TARGETS fast-sweeping${debugExt}
		INSTALL( TARGETS narrow-band${debugExt}
		RUNTIME DESTINATION bin
		PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )

		#INSTALL( FILES ${tnl_fast_sweeping_SOURCES}
		# DESTINATION share/tnl-${tnlVersion}/examples/fast-sweeping )
		#INSTALL( FILES ${tnl_narrow_band_SOURCES}
		# DESTINATION share/tnl-${tnlVersion}/examples/narrow-band )

examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h

+100 −20

Original line number	Diff line number	Diff line
		@@ -119,6 +119,8 @@ bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: i
		cudaMalloc(&(cudaStatusVector), statusGridSizestatusGridSize sizeof(int));
		// cudaMemcpy(cudaDofVector, this->dofVector.getData().getData(), statusGridSizestatusGridSize sizeof(int)), cudaMemcpyHostToDevice);

		cudaMalloc(&reinitialize, sizeof(int));


		cudaMalloc(&(this->cudaSolver), sizeof(tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >));
		cudaMemcpy(this->cudaSolver, this,sizeof(tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index >), cudaMemcpyHostToDevice);
		@@ -127,9 +129,12 @@ bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: i

		int n = Mesh.getDimensions().x();

		dim3 threadsPerBlock2(1, 1);
		dim3 threadsPerBlock2(NARROWBAND_SUBGRID_SIZE, NARROWBAND_SUBGRID_SIZE);
		dim3 numBlocks2(((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE) ,((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE));
		initStatusGridCUDA<<<numBlocks2,threadsPerBlock2>>>(this->cudaSolver);
		initSetupGridCUDA<<<numBlocks2,threadsPerBlock2>>>(this->cudaSolver);
		cudaDeviceSynchronize();
		checkCudaDevice;
		initSetupGrid2CUDA<<<numBlocks2,1>>>(this->cudaSolver);
		cudaDeviceSynchronize();
		checkCudaDevice;

		@@ -158,25 +163,43 @@ bool tnlNarrowBand< tnlGrid< 2,MeshReal, Device, MeshIndex >, Real, Index > :: r
		{

		int n = Mesh.getDimensions().x();
		dim3 threadsPerBlock(1, 512);
		dim3 numBlocks(4,1);
		dim3 threadsPerBlockFS(1, 512);
		dim3 numBlocksFS(4,1);
		dim3 threadsPerBlockNB(NARROWBAND_SUBGRID_SIZE, NARROWBAND_SUBGRID_SIZE);
		dim3 numBlocksNB(n/NARROWBAND_SUBGRID_SIZE + 1,n/NARROWBAND_SUBGRID_SIZE + 1);

		double time = 0.0;
		int reinit = 0;

		runCUDA<<<numBlocksFS,threadsPerBlockFS>>>(this->cudaSolver,0,0);
		cudaDeviceSynchronize();
		checkCudaDevice;
		while(time < finalTime)
		{
		if(tau+time > finalTime)
		tau=finalTime-time;

		runNarrowBandCUDA(this->cudaSolver,tau);
		runNarrowBandCUDA<<<numBlocksNB,threadsPerBlockNB>>>(this->cudaSolver,tau);
		cudaDeviceSynchronize();
		checkCudaDevice;

		time += tau;
		}

		runCUDA<<<numBlocks,threadsPerBlock>>>(this->cudaSolver,0,0);

		cudaMemcpy(&reinit, this->reinitialize, sizeof(int), cudaMemcpyDeviceToHost);
		if(reinit != 0)
		{
		initSetupGridCUDA<<<numBlocksNB,threadsPerBlockNB>>>(this->cudaSolver);
		cudaDeviceSynchronize();
		checkCudaDevice;
		initSetupGridCUDA<<<numBlocksNB,1>>>(this->cudaSolver);
		cudaDeviceSynchronize();
		checkCudaDevice;
		runCUDA<<<numBlocksFS,threadsPerBlockFS>>>(this->cudaSolver,0,0);
		cudaDeviceSynchronize();
		checkCudaDevice;
		}
		}

		//data.setLike(dofVector.getData());
		//cudaMemcpy(data.getData(), cudaDofVector2, this->dofVector.getData().getSize()*sizeof(double), cudaMemcpyDeviceToHost);
		@@ -512,8 +535,50 @@ __global__ void initCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, doubl

		__global__ void initSetupGridCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver)
		{
		cudaStatusVector[blockIdx.x + blockDim.x*blockIdx.y] = 0;
		__shared__ double u0;
		if(threadIdx.x+threadIdx.y == 0)
		{
		if(blockIdx.x+blockIdx.y == 0)
		*(solver->reinitialize) = 0;

		solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y] = 0;

		u0 = solver->cudaDofVector2[(blockDim.yblockIdx.y + 0)blockDim.xgridDim.x + blockDim.xblockIdx.x + 0];
		}
		__syncthreads();

		double u = solver->cudaDofVector2[(blockDim.yblockIdx.y + threadIdx.y)blockDim.xgridDim.x + blockDim.xblockIdx.x + threadIdx.x];

		if(u*u0 <=0.0)
		atomicMax(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y]),1);
		}



		// run this with one thread per block
		__global__ void initSetupGrid2CUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver)
		{
		if(solver->cudaStatusVector[blockIdx.x + gridDim.x*blockIdx.y] == 1)
		{
		// 1 - with curve, 2 - to the north of curve, 4 - to the south of curve,
		// 8 - to the east of curve, 16 - to the west of curve.
		if(blockIdx.x > 0)
		atomicAdd(&(solver->cudaStatusVector[blockIdx.x - 1 + gridDim.x*blockIdx.y]), 16);

		if(blockIdx.x < gridDim.x - 1)
		atomicAdd(&(solver->cudaStatusVector[blockIdx.x + 1 + gridDim.x*blockIdx.y]), 8);

		if(blockIdx.y > 0 )
		atomicAdd(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*(blockIdx.y - 1)]), 4);

		if(blockIdx.y < gridDim.y - 1)
		atomicAdd(&(solver->cudaStatusVector[blockIdx.x + gridDim.x*(blockIdx.y + 1)]), 2);
		}
		}





		__global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, double tau)
		{
		@@ -521,19 +586,21 @@ __global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int
		int i = threadIdx.x + blockIdx.x*blockDim.x;
		int j = threadIdx.y + blockIdx.y*blockDim.y;

		int blockID = blockIdx.x + blockIdx.y*gridDim.x;

		if(cudaStatusVector[i/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE)] != 0)

		if(solver->cudaStatusVector[blockID/i/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE)*/] != 0)
		{
		tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage > Entity(solver->Mesh);
		Entity.setCoordinates(CoordinatesType(i,j));
		Entity.setCoordinates(tnlStaticVector<2,double>(i,j));
		Entity.refresh();
		tnlNeighbourGridEntityGetter<tnlGridEntity< MeshType, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
		Real value = cudaDofVector2[Entity.getIndex()];
		Real xf,xb,yf,yb, tmp, fu;
		tnlNeighbourGridEntityGetter<tnlGridEntity< tnlGrid< 2,double, tnlHost, int >, 2, tnlGridEntityNoStencilStorage >,2> neighbourEntities(Entity);
		double value = solver->cudaDofVector2[Entity.getIndex()];
		double xf,xb,yf,yb, tmp, fu;

		if( i == 0 )
		yb = yf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 1, 0 >()];
		else if( i == Mesh.getDimensions().x() - 1 )
		else if( i == solver->Mesh.getDimensions().x() - 1 )
		yb = yf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< -1, 0 >()];
		else
		{
		@@ -543,7 +610,7 @@ __global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int

		if( j == 0 )
		xb = xf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0, 1 >()];
		else if( j == Mesh.getDimensions().y() - 1 )
		else if( j == solver->Mesh.getDimensions().y() - 1 )
		xb = xf = solver->cudaDofVector2[neighbourEntities.template getEntityIndex< 0, -1 >()];
		else
		{
		@@ -556,8 +623,21 @@ __global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int

		fu = 1.0 * tmp;

		if((taufu+u)u <=0 )
		if((taufu+value)value <=0 )
		{
		int status = solver->cudaStatusVector[blockID];
		// 1 - with curve, 2 - to the north of curve, 4 - to the south of curve,
		// 8 - to the east of curve, 16 - to the west of curve.

		if(threadIdx.x == 0 && !(status & 8) && (blockIdx.x > 0) )
		atomicMax(solver->reinitialize,1);
		else if(threadIdx.x == blockDim.x-1 && !(status & 16) && (blockIdx.x < gridDim.x - 1) )
		atomicMax(solver->reinitialize,1);
		else if(threadIdx.y == 0 && !(status & 2) && (blockIdx.y > 0) )
		atomicMax(solver->reinitialize,1);
		else if(threadIdx.y == blockDim.y-1 && !(status & 4) && (blockIdx.y < gridDim.y - 1) )
		atomicMax(solver->reinitialize,1);

		// if(blockIdx.x < gridDim.x - 1)
		// {
		// atomicMax( &cudaStatusVector[(i+2)/NARROWBAND_SUBGRID_SIZE + (j/NARROWBAND_SUBGRID_SIZE) * ((Mesh.getDimensions().x() + NARROWBAND_SUBGRID_SIZE-1 ) / NARROWBAND_SUBGRID_SIZE)] ,1);
		@@ -583,7 +663,7 @@ __global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int
		}


		cudaDofVector2[Entity.getIndex()] = u+tau*fu;
		solver->cudaDofVector2[Entity.getIndex()] = value+tau*fu;
		}
		}

examples/narrow-band/tnlNarrowBand_CUDA.h

+2 −0

Original line number	Diff line number	Diff line
		@@ -74,6 +74,7 @@ public:
		double* cudaDofVector2;
		int* cudaStatusVector;
		int counter;
		int* reinitialize;
		__device__ void setupSquare1000(Index i, Index j);
		__device__ void setupSquare1100(Index i, Index j);
		__device__ void setupSquare1010(Index i, Index j);
		@@ -180,6 +181,7 @@ __global__ void runCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double
		__global__ void initCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver);

		__global__ void initSetupGridCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver);
		__global__ void initSetupGrid2CUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver);
		__global__ void runNarrowBandCUDA(tnlNarrowBand< tnlGrid< 2,double, tnlHost, int >, double, int >* solver, double tau);
		//__global__ void initCUDA(tnlNarrowBand< tnlGrid< 3,double, tnlHost, int >, double, int >* solver);
		#endif