FIM method is now faster than chess method but some random error occurs. (444e01c4) · Commits · TNL / tnl-dev

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h

+18 −6

Original line number	Diff line number	Diff line
		@@ -61,6 +61,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
		typedef Index IndexType;
		typedef Functions::MeshFunction< MeshType > MeshFunctionType;
		typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType;
		typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer;
		using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >;
		using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >;

		@@ -76,6 +77,11 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
		__cuda_callable__ bool updateCell( volatile Real sArray[18][18],
		int thri, int thrj, const Real hx, const Real hy,
		const Real velocity = 1.0 );
		void updateBlocks( InterfaceMapType interfaceMap,
		MeshFunctionType aux,
		ArrayContainer BlockIterHost, int numThreadsPerBlock );

		void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY );
		};

		template< typename Real,
		@@ -113,6 +119,8 @@ T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double
		template < typename T1 >
		__cuda_callable__ void sortMinims( T1 pom[] );

		template < typename Index >
		void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY );

		#ifdef HAVE_CUDA
		template < typename Real, typename Device, typename Index >
		@@ -130,11 +138,15 @@ template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
		int *BlockIterDevice, int oddEvenBlock);
		__global__ void CudaParallelReduc( int BlockIterDevice, int dBlock, int nBlocks );
		TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 );

		template < typename Index >
		__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
		TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks );

		/*template < typename Real, typename Device, typename Index >
		__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real dAux, int a );/
		template < typename Index >
		__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
		/TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,/ int numBlockX, int numBlockY );

		template < typename Real, typename Device, typename Index >
		__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input,
		@@ -150,7 +162,7 @@ template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
		int *BlockIterDevice );
		TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
		#endif

		#include "tnlDirectEikonalMethodsBase_impl.h"

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h

+193 −0

Original line number	Diff line number	Diff line
		@@ -89,6 +89,199 @@ initInterface( const MeshFunctionPointer& _input,
		}
		}

		template< typename Real,
		typename Device,
		typename Index >
		void
		tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
		updateBlocks( InterfaceMapType interfaceMap,
		MeshFunctionType aux,
		ArrayContainer BlockIterHost, int numThreadsPerBlock )
		{
		for( int i = 0; i < BlockIterHost.getSize(); i++ )
		{
		if( BlockIterHost[ i ] )
		{
		MeshType mesh = interfaceMap.template getMesh< Devices::Host >();

		int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
		int numOfBlockx = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
		int numOfBlocky = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
		int xkolik = numThreadsPerBlock + 1;
		int ykolik = numThreadsPerBlock + 1;

		int blIdx = i%numOfBlockx;
		int blIdy = i/numOfBlocky;

		if( numOfBlockx - 1 == blIdx )
		xkolik = dimX - (blIdx)*numThreadsPerBlock+1;

		if( numOfBlocky -1 == blIdy )
		ykolik = dimY - (blIdy)*numThreadsPerBlock+1;


		/bool changed[numThreadsPerBlocknumThreadsPerBlock];
		changed[ 0 ] = 1;*/
		Real hx = mesh.getSpaceSteps().x();
		Real hy = mesh.getSpaceSteps().y();

		Real changed1[ 16*16 ];
		/Real changed2[ 1616 ];
		Real changed3[ 16*16 ];
		Real changed4[ 1616 ];/
		Real sArray[18][18];

		for( int thri = 0; thri < numThreadsPerBlock + 2; thri++ )
		for( int thrj = 0; thrj < numThreadsPerBlock + 2; thrj++ )
		sArray[thrj][thri] = std::numeric_limits< Real >::max();

		BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;

		for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
		{
		if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik )
		sArray[thrj+1][xkolik] = aux[ blIdynumThreadsPerBlockdimX - dimX + blIdxnumThreadsPerBlock - 1 + (thrj+1)dimX + xkolik ];
		else
		sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();


		if( blIdx != 0 && thrj+1 < ykolik )
		sArray[thrj+1][0] = aux[ blIdynumThreadsPerBlockdimX - dimX + blIdxnumThreadsPerBlock - 1 + (thrj+1)dimX ];
		else
		sArray[thrj+1][0] = std::numeric_limits< Real >::max();

		if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik )
		sArray[ykolik][thrj+1] = aux[ blIdynumThreadsPerBlockdimX - dimX + blIdxnumThreadsPerBlock - 1 + ykolikdimX + thrj+1 ];
		else
		sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();

		if( blIdy != 0 && thrj+1 < xkolik )
		sArray[0][thrj+1] = aux[ blIdynumThreadsPerBlockdimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
		else
		sArray[0][thrj+1] = std::numeric_limits< Real >::max();
		}

		for( int k = 0; k < numThreadsPerBlock; k++ )
		for( int l = 0; l < numThreadsPerBlock; l++ )
		sArray[k+1][l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ];

		for( int k = 0; k < numThreadsPerBlock; k++ )
		for( int l = 0; l < numThreadsPerBlock; l++ ){
		changed1[ k*numThreadsPerBlock + l ] = 0;
		/changed2[ knumThreadsPerBlock + l ] = 0;
		changed3[ k*numThreadsPerBlock + l ] = 0;
		changed4[ knumThreadsPerBlock + l ] = 0;/
		if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + kdimX + l < dimXdimY )
		{
		if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] )
		{
		changed1[ k*numThreadsPerBlock + l ] = this->updateCell( sArray, l+1, k+1, hx,hy);
		}
		}
		}

		for( int k = numThreadsPerBlock-1; k > -1; k-- )
		for( int l = 0; l < numThreadsPerBlock; l++ ) {
		if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + kdimX + l < dimXdimY )
		{
		if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] )
		{
		/changed2[ knumThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
		}
		}
		}

		for( int k = 0; k < numThreadsPerBlock; k++ )
		for( int l = numThreadsPerBlock-1; l >-1; l-- ) {
		if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + kdimX + l < dimXdimY )
		{
		if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] )
		{
		/changed3[ knumThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
		}
		}
		}

		for( int k = numThreadsPerBlock-1; k > -1; k-- )
		for( int l = numThreadsPerBlock-1; l >-1; l-- ) {
		if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + kdimX + l < dimXdimY )
		{
		if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] )
		{
		/changed4[ knumThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
		}
		}
		}

		for( int k = numThreadsPerBlock-1; k > -1; k-- )
		for( int l = numThreadsPerBlock-1; l >-1; l-- ){
		changed1[ 0 ] = changed1[ 0 ] \|\| changed1[ k*numThreadsPerBlock + l ];
		/changed2[ 0 ] = changed2[ 0 ] \|\| changed2[ knumThreadsPerBlock + l ];
		changed3[ 0 ] = changed3[ 0 ] \|\| changed3[ k*numThreadsPerBlock + l ];
		changed4[ 0 ] = changed4[ 0 ] \|\| changed4[ knumThreadsPerBlock + l ];/
		}

		if( changed1[ 0 ] /\|\| changed2[ 0 ] \|\|changed3[ 0 ] \|\|changed4[ 0 ]/ )
		BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1;

		for( int k = 0; k < numThreadsPerBlock; k++ ){
		for( int l = 0; l < numThreadsPerBlock; l++ ) {
		if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + kdimX + l < dimXdimY &&
		(!interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]) )
		aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray[ k + 1 ][ l + 1 ];
		//std::cout<< sArray[k+1][l+1];
		}
		//std::cout<<std::endl;
		}
		}
		}
		}

		template< typename Real,
		typename Device,
		typename Index >
		void
		tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
		getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )
		{
		int* BlockIterPom;
		BlockIterPom = new int [numBlockX * numBlockY];

		for(int i = 0; i < numBlockX * numBlockY; i++)
		{
		BlockIterPom[ i ] = 0;
		if( BlockIterHost[ i ] )
		{
		// i = k*numBlockY + m;
		int m=0, k=0;
		m = i%numBlockX;
		k = i/numBlockX;
		if( k > 0 )
		BlockIterPom[i - numBlockX] = 1;
		if( k < numBlockY - 1 )
		BlockIterPom[i + numBlockX] = 1;

		if( m < numBlockX - 1 )
		BlockIterPom[ i+1 ] = 1;
		if( m > 0 )
		BlockIterPom[ i-1 ] = 1;
		}
		}
		for(int i = 0; i < numBlockX * numBlockY; i++ )
		//if( !BlockIter[ i ] )
		BlockIterHost[ i ] = BlockIterPom[ i ];
		/*else
		BlockIter[ i ] = 0;*/
		/*for( int i = numBlockX-1; i > -1; i-- )
		{
		for( int j = 0; j< numBlockY; j++ )
		std::cout << BlockIterHost[ i*numBlockY + j ];
		std::cout << std::endl;
		}
		std::cout << std::endl;*/
		delete[] BlockIterPom;
		}

		template< typename Real,
		typename Device,
		typename Index >

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h

+2 −1

Original line number	Diff line number	Diff line
		@@ -89,6 +89,7 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >
		using typename BaseType::MeshFunctionType;
		using typename BaseType::InterfaceMapPointer;
		using typename BaseType::MeshFunctionPointer;
		using typename BaseType::ArrayContainer;

		FastSweepingMethod();

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h

+421 −425

File changed.

Preview size limit exceeded, changes collapsed.

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h

+23 −8

Original line number	Diff line number	Diff line
		@@ -258,13 +258,21 @@ solve( const MeshPointer& mesh,

		tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;

		int *BlockIterDevice;

		int BlockIterD = 1;

		cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );
		TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
		BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ );
		BlockIterDevice.setValue( 1 );
		/int BlockIterDevice;
		cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/
		int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
		int *dBlock;
		cudaMalloc(&dBlock, nBlocks * sizeof( int ) );

		TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
		dBlock.setSize( nBlocks );
		dBlock.setValue( 0 );
		/int dBlock;
		cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/

		while( BlockIterD )
		{
		@@ -272,17 +280,24 @@ solve( const MeshPointer& mesh,
		interfaceMapPtr.template getData< Device >(),
		auxPtr.template modifyData< Device>(),
		BlockIterDevice );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;

		CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
		CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;

		CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);

		/for( int i = 1; i < numBlocksX numBlocksY; i++ )
		BlockIter[ 0 ] = BlockIter[ 0 ] \|\| BlockIter[ i ];*/

		}
		cudaFree( BlockIterDevice );
		cudaFree( dBlock );
		//cudaFree( BlockIterDevice );
		//cudaFree( dBlock );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		aux = *auxPtr;
		@@ -302,7 +317,7 @@ template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
		int *BlockIterDevice )
		TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
		{
		int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
		int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;