pyramid reduction in 2D with loopCounter (8a8b083e) · Commits · TNL / tnl-dev

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h

+3 −2

Original line number	Diff line number	Diff line
		@@ -69,7 +69,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >
		const MeshEntity& cell,
		const RealType velocity = 1.0 );

		__cuda_callable__ void updateCell( Real sArray[18][18],
		__cuda_callable__ bool updateCell( volatile Real sArray[18][18],
		int thri, int thrj, const Real hx, const Real hy,
		const Real velocity = 1.0 );

		@@ -116,7 +116,8 @@ __cuda_callable__ void sortMinims( T1 pom[] );
		template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux );
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
		bool *BlockIterDevice );

		template < typename Real, typename Device, typename Index >
		__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input,

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h

+13 −128

Original line number	Diff line number	Diff line
		@@ -685,123 +685,8 @@ setsArray( MeshFunctionType& aux, Real sArray[18][18], int dimX, int dimY, int b
		sArray[0][i] = aux[ blockIdy16dimX - dimX + blockIdx16 - 1 + 0dimX + i ];
		}
		}

		/*int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0);
		int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0);
		int xkolik = 18;
		int ykolik = 18;

		if( numOfBlockx - 1 == blockIdx )
		xkolik = dimX - (numOfBlockx-1)*16+1;

		if( numOfBlocky -1 == blockIdy )
		ykolik = dimY - (numOfBlocky-1)*16+1;


		if( numOfBlockx == 0 \|\| numOfBlocky == 0 )
		return;
		else
		{
		if( blockIdx == 0 )
		{
		if( blockIdy == 0 )
		{
		for( int j = 0; j < ykolik-1; j++ )
		for( int i = 0; i < xkolik-1; i++ )
		{
		sArray[ j+1 ][ i+1 ] = aux[ j*dimX + i ];
		}

		//vypis( *sArray );
		}
		else
		{
		for( int j = 0; j < ykolik; j++ )
		for( int i = 0; i < xkolik-1; i++ )
		{
		sArray[ j ][ i+1 ] = aux[ blockIdy16dimX - dimX + j*dimX + i ];
		}
		}
		}
		else if( blockIdy == 0 )
		{
		for( int j = 0; j < ykolik-1; j++ )
		for( int i = 0; i < xkolik; i++ )
		{
		sArray[ j+1 ][ i ] = aux[ blockIdx16 - 1 + jdimX + i ];
		}
		}
		else
		{
		for( int j = 0; j < ykolik; j++ )
		for( int i = 0; i < xkolik; i++ )
		{
		//if( dimXdimY-1 > blockIdy16dimX - dimX + blockIdx16 - 1 + j*dimX + i )
		sArray[ j ][ i ] = aux[ blockIdy16dimX - dimX + blockIdx16 - 1 + jdimX + i ];
		}
		}
		}*/
		}

		/*template< typename Real,
		typename Device,
		typename Index >
		__cuda_callable__ void
		tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
		getsArray( MeshFunctionType& aux, Real sArray[18][18], int dimX, int dimY, int blockIdx, int blockIdy )
		{
		int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0);
		int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0);
		int xkolik = 18;
		int ykolik = 18;

		if( numOfBlockx - 1 == blockIdx )
		xkolik = dimX - (numOfBlockx-1)*16+2;

		if( numOfBlocky -1 == blockIdy )
		ykolik = dimY - (numOfBlocky-1)*16+2;

		if( numOfBlockx == 0 \|\| numOfBlocky == 0 )
		return;
		else
		{
		if( blockIdx == 0 )
		{
		if( blockIdy == 0 )
		{
		for( int j = 0; j < ykolik-2; j++ )
		for( int i = 0; i < xkolik-2; i++ )
		{
		aux[ j*dimX + i ] = sArray[ j+1 ][ i+1 ];
		}
		}
		else
		{
		for( int j = 1; j < ykolik-1; j++ )
		for( int i = 0; i < xkolik-2; i++ )
		{
		aux[ blockIdy16dimX - dimX + j*dimX + i ] = sArray[ j ][ i+1 ];
		}
		}
		}
		else if( blockIdy == 0 )
		{
		for( int j = 0; j < ykolik-2; j++ )
		for( int i = 1; i < xkolik-1; i++ )
		{
		aux[ blockIdx16 - 1 + jdimX + i ] = sArray[ j+1 ][ i ];
		}
		}
		else
		{
		for( int j = 1; j < ykolik-1; j++ )
		for( int i = 1; i < xkolik-1; i++ )
		{
		aux[ blockIdy16dimX - dimX + blockIdx16 - 1 + jdimX + i ] = sArray[ j ][ i ];
		}
		}
		}
		}*/


		#ifdef HAVE_CUDA
		@@ -974,9 +859,9 @@ template< typename Real,
		typename Device,
		typename Index >
		__cuda_callable__
		void
		bool
		tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::
		updateCell( Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy,
		updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy,
		const Real v )
		{

		@@ -992,38 +877,38 @@ updateCell( Real sArray[18][18], int thri, int thrj, const Real hx, const Real h
		const RealType value = sArray[ thrj ][ thri ];//u( cell );
		RealType a, b, tmp = TypeInfo< RealType >::getMaxValue();

		/*if( value != u[ cell.getIndex() ] )
		return;*/

		b = TNL::argAbsMin( sArray[ thrj+1 ][ thri ],
		sArray[ thrj-1 ][ thri ] );

		a = TNL::argAbsMin( sArray[ thrj ][ thri+1 ],
		sArray[ thrj ][ thri-1 ] );



		if( fabs( a ) == TypeInfo< RealType >::getMaxValue() &&
		fabs( b ) == TypeInfo< RealType >::getMaxValue() )
		return;
		return false;

		RealType pom[6] = { a, b, TypeInfo< Real >::getMaxValue(), (RealType)hx, (RealType)hy, 0.0 };
		RealType pom[6] = { a, b, TypeInfo< Real >::getMaxValue(), (RealType)hx, (RealType)hy, TypeInfo< Real >::getMaxValue() };
		sortMinims( pom );
		tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v;


		if( fabs( tmp ) < fabs( pom[ 1 ] ) )
		{
		//u[ cell.getIndex() ]= argAbsMin( value, tmp );
		sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
		if ( fabs( value - sArray[ thrj ][ thri ] ) > 0.001 )
		return true;
		return false;
		}
		else
		{
		tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] +
		TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) -
		( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
		//u[ cell.getIndex() ]= argAbsMin( value, tmp );
		sArray[ thrj ][ thri ] = argAbsMin( value, tmp );
		if ( fabs( value - sArray[ thrj ][ thri ] ) > 0.001 )
		return true;
		return false;
		}
		return false;
		}
		#endif

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h

+161 −23

Original line number	Diff line number	Diff line
		@@ -227,12 +227,47 @@ solve( const MeshPointer& mesh,
		tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
		int nBlockIter = numBlocksX > numBlocksY ? numBlocksX : numBlocksY;
		nBlockIter = numBlocksX == numBlocksY ? nBlockIter + 1 : nBlockIter;
		for( int k = 0; k < nBlockIter; k++)

		bool isNotDone = true;
		bool* BlockIter = (bool)malloc( ( numBlocksX numBlocksY ) * sizeof( bool ) );

		bool *BlockIterDevice;
		cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ) );

		while( isNotDone )
		{
		for( int i = 0; i < numBlocksX * numBlocksY; i++ )
		BlockIter[ i ] = false;

		isNotDone = false;

		CudaUpdateCellCaller<<< gridSize, blockSize, 18 * 18 * sizeof( Real ) >>>( ptr,
		interfaceMapPtr.template getData< Device >(),
		auxPtr.template modifyData< Device>() );
		auxPtr.template modifyData< Device>(),
		BlockIterDevice );
		cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ), cudaMemcpyDeviceToHost);
		for( int i = 0; i < numBlocksX; i++ )
		{ for( int j = 0; j < numBlocksY; j++ )
		{
		if( BlockIter[ j * numBlocksY + i ] )
		std::cout << "true." << "\t";
		else
		std::cout << "false." << "\t";
		}
		std::cout << std::endl;
		}
		std::cout << std::endl;

		for( int i = 0; i < numBlocksX * numBlocksY; i++ )
		isNotDone = isNotDone \|\| BlockIter[ i ];

		}
		delete[] BlockIter;
		cudaFree( BlockIterDevice );
		cudaDeviceSynchronize();

		TNL_CHECK_CUDA_DEVICE;

		aux = *auxPtr;
		interfaceMap = *interfaceMapPtr;
		#endif
		@@ -246,46 +281,149 @@ solve( const MeshPointer& mesh,
		template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux )
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
		bool *BlockIterDevice )
		{
		int thri = threadIdx.x; int thrj = threadIdx.y; // nelze ke stejnym pristupovat znovu pres threadIdx (vzdy da jine hodnoty)
		int blIdx = blockIdx.x; int blIdy = blockIdx.y;
		int i = thri + blockDim.x*blIdx;
		int j = blockDim.y*blIdy + thrj;

		__shared__ volatile bool changed[256];
		changed[ thrj * blockDim.x + thri ] = false;
		__shared__ volatile bool SmallCycleBoolin;
		if( thrj == 0 && thri == 0 )
		SmallCycleBoolin = true;


		const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
		const Real hx = mesh.getSpaceSteps().x();
		const Real hy = mesh.getSpaceSteps().y();

		__shared__ Real sArray[ 18 ][ 18 ];
		for( int k = 0; k < 18; k++ )
		for( int l = 0; l < 18; l++ )
		sArray[ k ][ l ] = TypeInfo< Real >::getMaxValue();
		__syncthreads();
		/*//filling shared array
		ptr.setsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy );
		__syncthreads();*/
		__shared__ volatile Real sArray[ 18 ][ 18 ];
		sArray[thrj][thri] = TypeInfo< Real >::getMaxValue();

		//filling sArray edges
		int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
		int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0);
		int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0);
		int xkolik = 17;
		int ykolik = 17;


		if( numOfBlockx - 1 == blIdx )
		xkolik = dimX - (blIdx)*16+1;

		if( numOfBlocky -1 == blIdy )
		ykolik = dimY - (blIdy)*16+1;


		if( thri == 0 )
		{
		if( dimX > (blIdx+1) * 16 && thrj+1 < ykolik )
		sArray[thrj+1][xkolik] = aux[ blIdy16dimX - dimX + blIdx16 - 1 + (thrj+1)dimX + 17 ];
		else
		sArray[thrj+1][xkolik] = TypeInfo< Real >::getMaxValue();
		}

		if( thri == 1 )
		{
		if( blIdx != 0 && thrj+1 < ykolik )
		sArray[thrj+1][0] = aux[ blIdy16dimX - dimX + blIdx16 - 1 + (thrj+1)dimX + 0];
		else
		sArray[thrj+1][0] = TypeInfo< Real >::getMaxValue();
		}

		if( thrj == 0 )
		{
		if( dimY > (blIdy+1) * 16 && thri+1 < xkolik )
		sArray[ykolik][thri+1] = aux[ blIdy16dimX - dimX + blIdx16 - 1 + 17dimX + thri+1 ];
		else
		sArray[ykolik][thri+1] = TypeInfo< Real >::getMaxValue();
		}

		if( thrj == 1 )
		{
		if( blIdy != 0 && thri+1 < xkolik )
		sArray[0][thri+1] = aux[ blIdy16dimX - dimX + blIdx16 - 1 + 0dimX + thri+1 ];
		else
		sArray[0][thri+1] = TypeInfo< Real >::getMaxValue();
		}

		//filling BlockIterDevice
		BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = false;


		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
		{
		sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ];
		ptr.setsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy ); //fill edges of sArray
		__syncthreads();
		}

		__shared__ int loopcounter;
		if( thri == 0 && thrj == 0 )
		loopcounter= 0;
		//if( blIdx == 5 && blIdy == 4 )
		while( SmallCycleBoolin )
		{
		if( thri == 1 && thrj == 1 )
		SmallCycleBoolin = false;

		changed[ thrj * 16 + thri ] = false;
		__syncthreads();

		//calculation of update cell
		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
		{
		if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
		{
		for( int k = 0; k < 17; k++ )
		changed[ thrj * 16 + thri ] = ptr.updateCell( sArray, thri+1, thrj+1, hx, hy );
		__syncthreads();
		}
		}

		//pyramid reduction
		if( blockDim.x*blockDim.y >= 256 )
		{
		if( (thrj * 16 + thri) < 128 )
		{
		ptr.updateCell( sArray, thri+1, thrj+1, hx, hy );
		changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] \|\| changed[ thrj * 16 + thri + 128 ];
		}
		__syncthreads();
		}
		if( blockDim.x*blockDim.y >= 128 )
		{
		if( (thrj * 16 + thri) < 64 )
		{
		changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] \|\| changed[ thrj * 16 + thri + 64 ];
		}
		__syncthreads();
		}
		if( (thrj * 16 + thri) < 32 )
		{
		changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] \|\| changed[ thrj * 16 + thri + 32 ];
		changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] \|\| changed[ thrj * 16 + thri + 16 ];
		changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] \|\| changed[ thrj * 16 + thri + 8 ];
		changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] \|\| changed[ thrj * 16 + thri + 4 ];
		changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] \|\| changed[ thrj * 16 + thri + 2 ];
		changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] \|\| changed[ thrj * 16 + thri + 1 ];
		}
		__syncthreads();

		if( thrj == 1 && thri == 1 )
		{
		loopcounter++;
		if( loopcounter > 1000 )
		break;
		SmallCycleBoolin = changed[ 0 ];
		//if( SmallCycleBoolin )
		BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = SmallCycleBoolin;
		}
		/*for( int k = 0; k < mesh.getDimensions().x(); k++)
		for( int l = 0; l < mesh.getDimensions().y(); l++)
		aux[ kmesh.getDimensions().x() + l ] = TypeInfo< Real >::getMaxValue();/
		aux[j*mesh.getDimensions().x() + i] = sArray[thrj+1][thri+1];
		__syncthreads();
		}
		//ptr.getsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy );

		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
		aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];

		}
		#endif