FIM method implemented for GPU and FIM-FSM implemented for CPU (parallel). (d40a55e3) · Commits · TNL / tnl-dev

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h

+10 −8

Original line number	Diff line number	Diff line
		@@ -74,12 +74,16 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
		const MeshEntity& cell,
		const RealType velocity = 1.0 );

		__cuda_callable__ bool updateCell( volatile Real sArray[18][18],
		template< int sizeSArray >
		__cuda_callable__ bool updateCell( volatile Real *sArray,
		int thri, int thrj, const Real hx, const Real hy,
		const Real velocity = 1.0 );

		template< int sizeSArray >
		void updateBlocks( InterfaceMapType interfaceMap,
		MeshFunctionType aux,
		ArrayContainer BlockIterHost, int numThreadsPerBlock );
		MeshFunctionType helpFunc,
		ArrayContainer BlockIterHost, int numThreadsPerBlock/, Real sArray/ );

		void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY );
		};
		@@ -119,9 +123,6 @@ T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double
		template < typename T1 >
		__cuda_callable__ void sortMinims( T1 pom[] );

		template < typename Index >
		void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY );

		#ifdef HAVE_CUDA
		template < typename Real, typename Device, typename Index >
		__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input,
		@@ -134,11 +135,12 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
		Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux,
		bool *BlockIterDevice );

		template < typename Real, typename Device, typename Index >
		template < int sizeSArray, typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
		TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 );
		const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc,
		TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock =0);

		template < typename Index >
		__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h

+1036 −1009

Original line number	Diff line number	Diff line
		@@ -92,12 +92,15 @@ initInterface( const MeshFunctionPointer& _input,
		template< typename Real,
		typename Device,
		typename Index >
		template< int sizeSArray >
		void
		tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
		updateBlocks( InterfaceMapType interfaceMap,
		MeshFunctionType aux,
		ArrayContainer BlockIterHost, int numThreadsPerBlock )
		MeshFunctionType helpFunc,
		ArrayContainer BlockIterHost, int numThreadsPerBlock/, Real sArray/ )
		{
		#pragma omp parallel for schedule( dynamic )
		for( int i = 0; i < BlockIterHost.getSize(); i++ )
		{
		if( BlockIterHost[ i ] )
		@@ -105,19 +108,23 @@ updateBlocks( InterfaceMapType interfaceMap,
		MeshType mesh = interfaceMap.template getMesh< Devices::Host >();

		int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
		int numOfBlockx = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
		int numOfBlocky = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
		//std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl;
		int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0);
		int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0);
		//std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl;
		int xkolik = numThreadsPerBlock + 1;
		int ykolik = numThreadsPerBlock + 1;

		int blIdx = i%numOfBlockx;
		int blIdy = i/numOfBlocky;
		int blIdy = i/numOfBlockx;
		//std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl;

		if( numOfBlockx - 1 == blIdx )
		xkolik = dimX - (blIdx)*numThreadsPerBlock+1;

		if( numOfBlocky -1 == blIdy )
		ykolik = dimY - (blIdy)*numThreadsPerBlock+1;
		//std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl;


		/bool changed[numThreadsPerBlocknumThreadsPerBlock];
		@@ -125,114 +132,133 @@ updateBlocks( InterfaceMapType interfaceMap,
		Real hx = mesh.getSpaceSteps().x();
		Real hy = mesh.getSpaceSteps().y();

		Real changed1[ 16*16 ];
		/Real changed2[ 1616 ];
		Real changed3[ 16*16 ];
		Real changed4[ 1616 ];/
		Real sArray[18][18];
		bool changed = false;

		for( int thri = 0; thri < numThreadsPerBlock + 2; thri++ )
		for( int thrj = 0; thrj < numThreadsPerBlock + 2; thrj++ )
		sArray[thrj][thri] = std::numeric_limits< Real >::max();

		RealType *sArray;
		sArray = new Real[ sizeSArray * sizeSArray ];
		if( sArray == nullptr )
		std::cout << "Error while allocating memory for sArray." << std::endl;

		for( int thri = 0; thri < sizeSArray; thri++ ){
		for( int thrj = 0; thrj < sizeSArray; thrj++ )
		sArray/[i]/[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
		}

		BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;

		for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
		{
		if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik )
		sArray[thrj+1][xkolik] = aux[ blIdynumThreadsPerBlockdimX - dimX + blIdxnumThreadsPerBlock - 1 + (thrj+1)dimX + xkolik ];
		else
		sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
		sArray/[i]/[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdynumThreadsPerBlockdimX - dimX + blIdxnumThreadsPerBlock - 1 + (thrj+1)dimX + xkolik ];


		if( blIdx != 0 && thrj+1 < ykolik )
		sArray[thrj+1][0] = aux[ blIdynumThreadsPerBlockdimX - dimX + blIdxnumThreadsPerBlock - 1 + (thrj+1)dimX ];
		else
		sArray[thrj+1][0] = std::numeric_limits< Real >::max();
		sArray/[i]/[(thrj+1)* sizeSArray] = aux[ blIdynumThreadsPerBlockdimX - dimX + blIdxnumThreadsPerBlock - 1 + (thrj+1)dimX ];

		if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik )
		sArray[ykolik][thrj+1] = aux[ blIdynumThreadsPerBlockdimX - dimX + blIdxnumThreadsPerBlock - 1 + ykolikdimX + thrj+1 ];
		else
		sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
		sArray/[i]/[ykolik * sizeSArray + thrj+1] = aux[ blIdynumThreadsPerBlockdimX - dimX + blIdxnumThreadsPerBlock - 1 + ykolikdimX + thrj+1 ];

		if( blIdy != 0 && thrj+1 < xkolik )
		sArray[0][thrj+1] = aux[ blIdynumThreadsPerBlockdimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
		else
		sArray[0][thrj+1] = std::numeric_limits< Real >::max();
		sArray/[i]/[thrj+1] = aux[ blIdynumThreadsPerBlockdimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
		}

		for( int k = 0; k < numThreadsPerBlock; k++ )
		for( int k = 0; k < numThreadsPerBlock; k++ ){
		for( int l = 0; l < numThreadsPerBlock; l++ )
		sArray[k+1][l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ];

		for( int k = 0; k < numThreadsPerBlock; k++ )
		if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
		sArray/[i]/[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ];
		}
		bool pom = false;
		for( int k = 0; k < numThreadsPerBlock; k++ ){
		for( int l = 0; l < numThreadsPerBlock; l++ ){
		changed1[ k*numThreadsPerBlock + l ] = 0;
		/changed2[ knumThreadsPerBlock + l ] = 0;
		changed3[ k*numThreadsPerBlock + l ] = 0;
		changed4[ knumThreadsPerBlock + l ] = 0;/
		if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + kdimX + l < dimXdimY )
		{
		if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){
		//std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
		if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] )
		{
		changed1[ k*numThreadsPerBlock + l ] = this->updateCell( sArray, l+1, k+1, hx,hy);
		pom = this->updateCell<sizeSArray>( sArray/[i]/, l+1, k+1, hx,hy);
		changed = changed \|\| pom;
		}
		}
		}
		}
		/*aux.save( "aux-1pruch.tnl" );
		for( int k = 0; k < sizeSArray; k++ ){
		for( int l = 0; l < sizeSArray; l++ ) {
		std::cout << sArray[ k * sizeSArray + l] << " ";
		}
		std::cout << std::endl;
		}*/

		for( int k = numThreadsPerBlock-1; k > -1; k-- )
		for( int l = 0; l < numThreadsPerBlock; l++ ) {
		if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + kdimX + l < dimXdimY )
		for( int k = 0; k < numThreadsPerBlock; k++ )
		for( int l = numThreadsPerBlock-1; l >-1; l-- ) {
		if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
		{
		if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] )
		{
		/changed2[ knumThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
		this->updateCell<sizeSArray>( sArray/[i]/, l+1, k+1, hx,hy);
		}
		}
		}
		/*aux.save( "aux-2pruch.tnl" );
		for( int k = 0; k < sizeSArray; k++ ){
		for( int l = 0; l < sizeSArray; l++ ) {
		std::cout << sArray[ k * sizeSArray + l] << " ";
		}
		std::cout << std::endl;
		}*/

		for( int k = 0; k < numThreadsPerBlock; k++ )
		for( int l = numThreadsPerBlock-1; l >-1; l-- ) {
		if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + kdimX + l < dimXdimY )
		for( int k = numThreadsPerBlock-1; k > -1; k-- )
		for( int l = 0; l < numThreadsPerBlock; l++ ) {
		if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
		{
		if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] )
		{
		/changed3[ knumThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
		this->updateCell<sizeSArray>( sArray/[i]/, l+1, k+1, hx,hy);
		}
		}
		}
		/*aux.save( "aux-3pruch.tnl" );
		for( int k = 0; k < sizeSArray; k++ ){
		for( int l = 0; l < sizeSArray; l++ ) {
		std::cout << sArray[ k * sizeSArray + l] << " ";
		}
		std::cout << std::endl;
		}*/

		for( int k = numThreadsPerBlock-1; k > -1; k-- )
		for( int k = numThreadsPerBlock-1; k > -1; k-- ){
		for( int l = numThreadsPerBlock-1; l >-1; l-- ) {
		if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + kdimX + l < dimXdimY )
		if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
		{
		if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] )
		{
		/changed4[ knumThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy);
		this->updateCell<sizeSArray>( sArray/[i]/, l+1, k+1, hx,hy);
		}
		}
		}

		for( int k = numThreadsPerBlock-1; k > -1; k-- )
		for( int l = numThreadsPerBlock-1; l >-1; l-- ){
		changed1[ 0 ] = changed1[ 0 ] \|\| changed1[ k*numThreadsPerBlock + l ];
		/changed2[ 0 ] = changed2[ 0 ] \|\| changed2[ knumThreadsPerBlock + l ];
		changed3[ 0 ] = changed3[ 0 ] \|\| changed3[ k*numThreadsPerBlock + l ];
		changed4[ 0 ] = changed4[ 0 ] \|\| changed4[ knumThreadsPerBlock + l ];/
		}
		/*aux.save( "aux-4pruch.tnl" );
		for( int k = 0; k < sizeSArray; k++ ){
		for( int l = 0; l < sizeSArray; l++ ) {
		std::cout << sArray[ k * sizeSArray + l] << " ";
		}
		std::cout << std::endl;
		}*/

		if( changed1[ 0 ] /\|\| changed2[ 0 ] \|\|changed3[ 0 ] \|\|changed4[ 0 ]/ )

		if( changed ){
		BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1;
		}


		for( int k = 0; k < numThreadsPerBlock; k++ ){
		for( int l = 0; l < numThreadsPerBlock; l++ ) {
		if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + kdimX + l < dimXdimY &&
		(!interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]) )
		aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray[ k + 1 ][ l + 1 ];
		if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
		helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + kdimX + l ] = sArray/[i]/[ (k + 1) sizeSArray + l + 1 ];
		//std::cout<< sArray[k+1][l+1];
		}
		//std::cout<<std::endl;
		}
		//delete []sArray;
		}
		}
		}
		@@ -249,27 +275,27 @@ getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY )

		for(int i = 0; i < numBlockX * numBlockY; i++)
		{
		BlockIterPom[ i ] = 0;
		if( BlockIterHost[ i ] )
		{
		// i = k*numBlockY + m;
		BlockIterPom[ i ] = 0;//BlockIterPom[ i ] = 0;
		int m=0, k=0;
		m = i%numBlockX;
		k = i/numBlockX;
		if( k > 0 )
		BlockIterPom[i - numBlockX] = 1;
		if( k < numBlockY - 1 )
		BlockIterPom[i + numBlockX] = 1;

		if( m < numBlockX - 1 )
		BlockIterPom[ i+1 ] = 1;
		if( m > 0 )
		BlockIterPom[ i-1 ] = 1;
		if( m > 0 && BlockIterHost[ i - 1 ] ){
		BlockIterPom[ i ] = 1;
		}else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){
		BlockIterPom[ i ] = 1;
		}else if( k > 0 && BlockIterHost[ i - numBlockX ] ){
		BlockIterPom[ i ] = 1;
		}else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){
		BlockIterPom[ i ] = 1;
		}
		//BlockIterPom[ i ];
		}

		for(int i = 0; i < numBlockX * numBlockY; i++)
		//if( !BlockIter[ i ] )
		{
		if( !BlockIterHost[ i ] )
		BlockIterHost[ i ] = BlockIterPom[ i ];
		}
		/*else
		BlockIter[ i ] = 0;*/
		/*for( int i = numBlockX-1; i > -1; i-- )
		@@ -353,11 +379,11 @@ initInterface( const MeshFunctionPointer& _input,

		for (int i = 0; i < 320; i++)
		for (int j = 0; j < 320; j++)
		fileInit >> A[i][j];
		fileInit >> A[j];
		fileInit.close();
		for (int i = 0; i < 320; i++)
		for (int j = 0; j < 320; j++)
		input[i320 + j] = A[i][j];/
		input[i320 + j] = A[j];/


		MeshFunctionType& output = _output.modifyData();
		@@ -1110,20 +1136,21 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
		template< typename Real,
		typename Device,
		typename Index >
		template< int sizeSArray >
		__cuda_callable__
		bool
		tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
		updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy,
		updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy,
		const Real v )
		{
		const RealType value = sArray[ thrj ][ thri ];
		const RealType value = sArray[ thrj * sizeSArray + thri ];
		RealType a, b, tmp = std::numeric_limits< RealType >::max();

		b = TNL::argAbsMin( sArray[ thrj+1 ][ thri ],
		sArray[ thrj-1 ][ thri ] );
		b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ],
		sArray[ (thrj-1) * sizeSArray + thri ] );

		a = TNL::argAbsMin( sArray[ thrj ][ thri+1 ],
		sArray[ thrj ][ thri-1 ] );
		a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ],
		sArray[ thrj * sizeSArray + thri-1 ] );

		if( fabs( a ) == std::numeric_limits< RealType >::max() &&
		fabs( b ) == std::numeric_limits< RealType >::max() )
		@@ -1136,8 +1163,8 @@ updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, con

		if( fabs( tmp ) < fabs( pom[ 1 ] ) )
		{
		sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
		tmp = value - sArray[ thrj ][ thri ];
		sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
		tmp = value - sArray[ thrj * sizeSArray + thri ];
		if ( fabs( tmp ) > 0.001*hx )
		return true;
		else
		@@ -1148,8 +1175,8 @@ updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, con
		tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] +
		TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) -
		( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
		sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
		tmp = value - sArray[ thrj ][ thri ];
		sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp );
		tmp = value - sArray[ thrj * sizeSArray + thri ];
		if ( fabs( tmp ) > 0.001*hx )
		return true;
		else

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h

+394 −226

File changed.

Preview size limit exceeded, changes collapsed.