FIM implemented in 3D (e3c970e1) · Commits · TNL / tnl-dev

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h

+8 −2

Original line number	Diff line number	Diff line
		@@ -160,11 +160,17 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap );

		template < typename Real, typename Device, typename Index >
		template < int sizeSArray, typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
		const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
		TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );

		template < typename Index >
		__global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
		TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,
		int numBlockX, int numBlockY, int numBlockZ );
		#endif

		#include "tnlDirectEikonalMethodsBase_impl.h"

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h

+7 −9

Original line number	Diff line number	Diff line
		@@ -85,7 +85,7 @@ solve( const MeshPointer& mesh,
		{
		if( std::is_same< DeviceType, Devices::Host >::value )
		{
		int numThreadsPerBlock = 1024;
		int numThreadsPerBlock = 16;


		int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0);
		@@ -115,13 +115,13 @@ solve( const MeshPointer& mesh,
		}
		std::cout<<std::endl;*/
		unsigned int numWhile = 0;
		while( IsCalculationDone )
		while( IsCalculationDone && numWhile < 1 )
		{
		IsCalculationDone = 0;
		helpFunc1 = auxPtr;
		auxPtr = helpFunc;
		helpFunc = helpFunc1;
		this->template updateBlocks< 1026 >( interfaceMap, auxPtr, helpFunc, BlockIterHost, numThreadsPerBlock/, sArray/ );
		this->template updateBlocks< 18 >( interfaceMap, auxPtr, helpFunc, BlockIterHost, numThreadsPerBlock/, sArray/ );

		//Reduction
		for( int i = 0; i < BlockIterHost.getSize(); i++ ){
		@@ -394,9 +394,7 @@ solve( const MeshPointer& mesh,
		numIter ++;
		}
		if( numIter == 1 ){
		helpFunc1 = auxPtr;
		auxPtr = helpFunc;
		helpFunc = helpFunc1;
		}
		/*cudaFree( BlockIterDevice );
		cudaFree( dBlock );
		@@ -535,10 +533,10 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
		__syncthreads();
		/*-----------------------------------------/
		const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
		__shared__ volatile int dimX;
		__shared__ volatile int dimY;
		__shared__ volatile Real hx;
		__shared__ volatile Real hy;
		__shared__ int dimX;
		__shared__ int dimY;
		__shared__ Real hx;
		__shared__ Real hy;
		if( thri==0 && thrj == 0)
		{
		dimX = mesh.getDimensions().x();

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h

+463 −418

Original line number	Diff line number	Diff line
		@@ -264,6 +264,9 @@ solve( const MeshPointer& mesh,
		TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
		BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ );
		BlockIterDevice.setValue( 1 );
		TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
		BlockIterPom.setSize( numBlocksX * numBlocksY * numBlocksZ );
		BlockIterPom.setValue( 0 );
		/int BlockIterDevice;
		cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/
		int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
		@@ -271,18 +274,38 @@ solve( const MeshPointer& mesh,
		TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
		dBlock.setSize( nBlocks );
		dBlock.setValue( 0 );

		int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0);
		/int dBlock;
		cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/
		MeshFunctionPointer helpFunc1( mesh );
		MeshFunctionPointer helpFunc( mesh );

		helpFunc1 = auxPtr;
		auxPtr = helpFunc;
		helpFunc = helpFunc1;
		int numIter = 0;

		while( BlockIterD )
		{
		CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
		helpFunc1 = auxPtr;
		auxPtr = helpFunc;
		helpFunc = helpFunc1;
		TNL_CHECK_CUDA_DEVICE;

		CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
		interfaceMapPtr.template getData< Device >(),
		auxPtr.template modifyData< Device>(),
		auxPtr.template getData< Device>(),
		helpFunc.template modifyData< Device>(),
		BlockIterDevice );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;

		GetNeighbours3D<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY, numBlocksZ );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		BlockIterDevice = BlockIterPom;

		CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		@@ -291,11 +314,14 @@ solve( const MeshPointer& mesh,
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);

		numIter++;
		/for( int i = 1; i < numBlocksX numBlocksY; i++ )
		BlockIter[ 0 ] = BlockIter[ 0 ] \|\| BlockIter[ i ];*/

		}
		if( numIter == 1 ){
		auxPtr = helpFunc;
		}
		//cudaFree( BlockIterDevice );
		//cudaFree( dBlock );
		cudaDeviceSynchronize();
		@@ -313,10 +339,43 @@ solve( const MeshPointer& mesh,
		}

		#ifdef HAVE_CUDA
		template < typename Real, typename Device, typename Index >
		template < typename Index >
		__global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
		TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,
		int numBlockX, int numBlockY, int numBlockZ )
		{
		int i = blockIdx.x * 1024 + threadIdx.x;

		if( i < numBlockX * numBlockY * numBlockZ )
		{
		int pom = 0;//BlockIterPom[ i ] = 0;
		int m=0, l=0, k=0;
		l = i/( numBlockX * numBlockY );
		k = (i-lnumBlockX numBlockY )/(numBlockX );
		m = (i-lnumBlockX numBlockY )%( numBlockX );
		if( m > 0 && BlockIterDevice[ i - 1 ] ){
		pom = 1;//BlockIterPom[ i ] = 1;
		}else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){
		pom = 1;//BlockIterPom[ i ] = 1;
		}else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){
		pom = 1;// BlockIterPom[ i ] = 1;
		}else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){
		pom = 1;//BlockIterPom[ i ] = 1;
		}else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){
		pom = 1;
		}else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){
		pom = 1;
		}

		BlockIterPom[ i ] = pom;//BlockIterPom[ i ];
		}
		}

		template < int sizeSArray, typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
		const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc,
		TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
		{
		int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
		@@ -326,62 +385,54 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
		int k = blockDim.z*blockIdx.z + threadIdx.z;
		int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;

		__shared__ volatile bool changed[888];
		changed[ currentIndex ] = false;
		if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] )
		{
		__syncthreads();

		__shared__ volatile bool changed[ (sizeSArray - 2)(sizeSArray - 2)(sizeSArray - 2)];

		changed[ currentIndex ] = false;
		if( thrj == 0 && thri == 0 && thrk == 0 )
		changed[ 0 ] = true;

		const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
		__shared__ Real hx;
		__shared__ Real hy;
		__shared__ Real hz;
		__shared__ Real hx; __shared__ int dimX;
		__shared__ Real hy; __shared__ int dimY;
		__shared__ Real hz; __shared__ int dimZ;

		if( thrj == 1 && thri == 1 && thrk == 1 )
		{
		hx = mesh.getSpaceSteps().x();
		hy = mesh.getSpaceSteps().y();
		hz = mesh.getSpaceSteps().z();
		dimX = mesh.getDimensions().x();
		dimY = mesh.getDimensions().y();
		dimZ = mesh.getDimensions().z();
		BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0;
		}
		__shared__ volatile Real sArray[10][10][10];
		sArray[thrk][thrj][thri] = std::numeric_limits< Real >::max();
		if(thri == 0 )
		{
		sArray[8][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
		sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
		sArray[thrk+1][thrj+1][8] = std::numeric_limits< Real >::max();
		sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max();
		sArray[thrj+1][8][thrk+1] = std::numeric_limits< Real >::max();
		sArray[thrj+1][9][thrk+1] = std::numeric_limits< Real >::max();
		}
		__shared__ volatile Real sArray[sizeSArray][sizeSArray][sizeSArray];
		sArray[thrk+1][thrj+1][thri+1] = std::numeric_limits< Real >::max();

		//filling sArray edges
		int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
		int dimZ = mesh.getDimensions().z();
		__shared__ volatile int numOfBlockx;
		__shared__ volatile int numOfBlocky;
		__shared__ volatile int numOfBlockz;
		__shared__ int xkolik;
		__shared__ int ykolik;
		__shared__ int zkolik;
		if( thri == 0 && thrj == 0 && thrk == 0 )
		{
		int numOfBlockx;
		int numOfBlocky;
		int numOfBlockz;
		int xkolik;
		int ykolik;
		int zkolik;
		xkolik = blockDim.x + 1;
		ykolik = blockDim.y + 1;
		zkolik = blockDim.z + 1;
		numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
		numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
		numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0);
		numOfBlockx = gridDim.x;
		numOfBlocky = gridDim.y;
		numOfBlockz = gridDim.z;

		if( numOfBlockx - 1 == blIdx )
		xkolik = dimX - (blIdx)*blockDim.x+1;

		if( numOfBlocky -1 == blIdy )
		ykolik = dimY - (blIdy)*blockDim.y+1;
		if( numOfBlockz-1 == blIdz )
		zkolik = dimZ - (blIdz)*blockDim.z+1;

		BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0;
		}
		__syncthreads();

		if( thri == 0 )
		@@ -430,12 +481,10 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
		sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max();
		}

		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
		if( i < dimX && j < dimY && k < dimZ )
		{
		sArray[thrk+1][thrj+1][thri+1] = aux[ kdimXdimY + j*dimX + i ];
		}
		__shared__ volatile int loopcounter;
		loopcounter = 0;
		__syncthreads();
		while( changed[ 0 ] )
		{
		@@ -510,17 +559,13 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
		}*/
		if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
		{
		//loopcounter++;
		BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1;
		}
		__syncthreads();
		/*if(thri == 0 && thrj==0 && thrk==0)
		printf("%i \n",loopcounter);
		if(loopcounter == 500)
		break;*/
		}

		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ kdimXdimY+j * mesh.getDimensions().x() + i ]) )
		aux[ kdimXdimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
		if( i < dimX && j < dimY && k < dimZ )
		helpFunc[ kdimXdimY + j * dimX + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
		}
		}
		#endif