Merge branch 'hamilton-jacobi' of jlk.fjfi.cvut.cz:mmg/tnl-dev into hamilton-jacobi (3f14cbd0) · Commits · TNL / tnl-dev

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h

+4 −0

Original line number	Diff line number	Diff line
		@@ -138,7 +138,11 @@ template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
		<<<<<<< HEAD
		TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 );
		=======
		TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice );
		>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf

		template < typename Index >
		__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h

+283 −1

Original line number	Diff line number	Diff line
		@@ -187,6 +187,7 @@ solve( const MeshPointer& mesh,
		{
		// TODO: CUDA code
		#ifdef HAVE_CUDA
		<<<<<<< HEAD
		TNL_CHECK_CUDA_DEVICE;
		const int cudaBlockSize( 16 );
		int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
		@@ -271,6 +272,86 @@ solve( const MeshPointer& mesh,
		/for( int i = 1; i < numBlocksX numBlocksY; i++ )
		BlockIter[ 0 ] = BlockIter[ 0 ] \|\| BlockIter[ i ];*/
		//pocIter ++;
		=======
		const int cudaBlockSize( 16 );
		int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
		int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
		dim3 blockSize( cudaBlockSize, cudaBlockSize );
		dim3 gridSize( numBlocksX, numBlocksY );

		tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;

		TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter;
		BlockIter.setSize( numBlocksX * numBlocksY );
		BlockIter.setValue( 0 );
		/int BlockIter = (int)malloc( ( numBlocksX numBlocksY ) * sizeof( int ) );
		for( int i = 0; i < numBlocksX*numBlocksY +1; i++)
		BlockIter[i] = 1;*/

		int BlockIterD = 1;

		TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice;
		BlockIterDevice.setSize( numBlocksX * numBlocksY );
		BlockIterDevice.setValue( 1 );
		/int BlockIterDevice;
		cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );
		cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/

		int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);

		TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock;
		dBlock.setSize( nBlocks );
		dBlock.setValue( 0 );
		/int dBlock;
		cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/

		while( BlockIterD )
		{
		/for( int i = 0; i < numBlocksX numBlocksY; i++ )
		BlockIter[ i ] = false;*/

		CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
		interfaceMapPtr.template getData< Device >(),
		auxPtr.template modifyData< Device>(),
		BlockIterDevice );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;

		BlockIter = BlockIterDevice;
		//cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost);
		GetNeighbours( BlockIter, numBlocksX, numBlocksY );

		BlockIterDevice = BlockIter;
		//cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;


		CudaParallelReduc<<< nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;

		CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;

		cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);

		/for( int i = 1; i < numBlocksX numBlocksY; i++ )
		BlockIter[ 0 ] = BlockIter[ 0 ] \|\| BlockIter[ i ];*/

		}
		/*cudaFree( BlockIterDevice );
		cudaFree( dBlock );
		delete BlockIter;*/
		cudaDeviceSynchronize();

		TNL_CHECK_CUDA_DEVICE;

		aux = *auxPtr;
		interfaceMap = *interfaceMapPtr;
		#endif
		>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
		}
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		@@ -285,6 +366,7 @@ solve( const MeshPointer& mesh,
		}
		aux.save("aux-final.tnl");
		}
		<<<<<<< HEAD

		#ifdef HAVE_CUDA
		template < typename Index >
		@@ -318,10 +400,60 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index
		}
		}

		=======
		template < typename Index >
		void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY )
		{
		TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom;
		BlockIterPom.setSize( numBlockX * numBlockY );
		BlockIterPom.setValue( 0 );
		/int BlockIterPom;
		BlockIterPom = new int[numBlockX * numBlockY];*/
		/for(int i = 0; i < numBlockX numBlockY; i++)
		BlockIterPom[ i ] = 0;*/
		for(int i = 0; i < numBlockX * numBlockY; i++)
		{

		if( BlockIter[ i ] )
		{
		// i = k*numBlockY + m;
		int m=0, k=0;
		m = i%numBlockY;
		k = i/numBlockY;
		if( k > 0 && numBlockY > 1 )
		BlockIterPom[i - numBlockX] = 1;
		if( k < numBlockY-1 && numBlockY > 1 )
		BlockIterPom[i + numBlockX] = 1;

		if( m >= 0 && m < numBlockX - 1 && numBlockX > 1 )
		BlockIterPom[ i+1 ] = 1;
		if( m <= numBlockX -1 && m > 0 && numBlockX > 1 )
		BlockIterPom[ i-1 ] = 1;
		}
		}
		for(int i = 0; i < numBlockX * numBlockY; i++ ){
		/// if( !BlockIter[ i ] )
		BlockIter[ i ] = BlockIterPom[ i ];
		/// else
		/// BlockIter[ i ] = 0;
		}
		/*for( int i = numBlockX-1; i > -1; i-- )
		{
		for( int j = 0; j< numBlockY; j++ )
		std::cout << BlockIter[ i*numBlockY + j ];
		std::cout << std::endl;
		}
		std::cout << std::endl;*/
		//delete[] BlockIterPom;
		}

		#ifdef HAVE_CUDA
		>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
		template < typename Index >
		__global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
		TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks )
		{
		<<<<<<< HEAD
		int i = threadIdx.x;
		int blId = blockIdx.x;
		__shared__ volatile int sArray[ 512 ];
		@@ -363,6 +495,57 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I

		if( i == 0 )
		dBlock[ blId ] = sArray[ 0 ];
		=======
		int i = threadIdx.x;
		int blId = blockIdx.x;
		/*if ( i == 0 && blId == 0 ){
		printf( "nBlocks = %d \n", nBlocks );
		for( int j = nBlocks-1; j > -1 ; j--){
		printf( "cislo = %d \n", BlockIterDevice[ j ] );
		}
		}*/
		__shared__ volatile int sArray[ 512 ];
		sArray[ i ] = 0;
		if( blId * 512 + i < nBlocks )
		sArray[ i ] = BlockIterDevice[ blId * 512 + i ];
		__syncthreads();

		if (blockDim.x == 1024) {
		if (i < 512)
		sArray[ i ] += sArray[ i + 512 ];
		}
		__syncthreads();
		if (blockDim.x >= 512) {
		if (i < 256) {
		sArray[ i ] += sArray[ i + 256 ];
		}
		}
		__syncthreads();
		if (blockDim.x >= 256) {
		if (i < 128) {
		sArray[ i ] += sArray[ i + 128 ];
		}
		}
		__syncthreads();
		if (blockDim.x >= 128) {
		if (i < 64) {
		sArray[ i ] += sArray[ i + 64 ];
		}
		}
		__syncthreads();
		if (i < 32 )
		{
		if( blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ];
		if( blockDim.x >= 32 ) sArray[ i ] += sArray[ i + 16 ];
		if( blockDim.x >= 16 ) sArray[ i ] += sArray[ i + 8 ];
		if( blockDim.x >= 8 ) sArray[ i ] += sArray[ i + 4 ];
		if( blockDim.x >= 4 ) sArray[ i ] += sArray[ i + 2 ];
		if( blockDim.x >= 2 ) sArray[ i ] += sArray[ i + 1 ];
		}

		if( i == 0 )
		dBlock[ blId ] = sArray[ 0 ];
		>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
		}


		@@ -371,7 +554,11 @@ template < typename Real, typename Device, typename Index >
		__global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
		const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
		Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
		<<<<<<< HEAD
		TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne )
		=======
		TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice )
		>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
		{
		int thri = threadIdx.x; int thrj = threadIdx.y;
		int blIdx = blockIdx.x; int blIdy = blockIdx.y;
		@@ -408,7 +595,8 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
		int i = thri + blockDim.x*blIdx;
		int j = blockDim.y*blIdy + thrj;
		int currentIndex = thrj * blockDim.x + thri;

		if( BlockIterDevice[ blIdy * gridDim.x + blIdx] )
		{
		//__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
		__shared__ volatile bool changed[16*16];
		changed[ currentIndex ] = false;
		@@ -444,14 +632,24 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
		sArray[thrj+1][0] = std::numeric_limits< Real >::max();
		}

		<<<<<<< HEAD
		if( thri == 2 )
		{
		if( dimY > (blIdy+1) * blockDim.y && thri+1 < xkolik && NE == 1 )
		sArray[ykolik][thrj+1] = aux[ blIdyblockDim.ydimX - dimX + blIdxblockDim.x - 1 + ykolikdimX + thrj+1 ];
		else
		sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max();
		=======
		if( numOfBlockx - 1 == blIdx )
		xkolik = dimX - (blIdx)*blockDim.x+1;

		if( numOfBlocky -1 == blIdy )
		ykolik = dimY - (blIdy)*blockDim.y+1;
		//BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
		>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
		}

		<<<<<<< HEAD
		if( thri == 3 )
		{
		if( blIdy != 0 && thrj+1 < xkolik && NE == 1 )
		@@ -477,6 +675,20 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
		{
		if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
		=======
		if(thri == 0 && thrj == 0 )
		BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;

		if( thri == 0 )
		{
		if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik )
		sArray[thrj+1][xkolik] = aux[ blIdyblockDim.ydimX - dimX + blIdxblockDim.x - 1 + (thrj+1)dimX + xkolik ];
		else
		sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max();
		}

		if( thri == 1 )
		>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
		{
		changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
		}
		@@ -512,6 +724,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
		{
		if( currentIndex < 64 )
		{
		<<<<<<< HEAD
		changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 64 ];
		}
		}
		@@ -533,5 +746,74 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
		aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
		}
		=======
		__syncthreads();

		changed[ currentIndex] = false;

		//calculation of update cell
		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() )
		{
		if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
		{
		changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
		}
		}
		__syncthreads();

		//pyramid reduction
		if( blockDim.x*blockDim.y == 1024 )
		{
		if( currentIndex < 512 )
		{
		changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 512 ];
		}
		}
		__syncthreads();
		if( blockDim.x*blockDim.y >= 512 )
		{
		if( currentIndex < 256 )
		{
		changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 256 ];
		}
		}
		__syncthreads();
		if( blockDim.x*blockDim.y >= 256 )
		{
		if( currentIndex < 128 )
		{
		changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 128 ];
		}
		}
		__syncthreads();
		if( blockDim.x*blockDim.y >= 128 )
		{
		if( currentIndex < 64 )
		{
		changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 64 ];
		}
		}
		__syncthreads();
		if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU
		{
		if( true ) changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 32 ];
		if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 16 ];
		if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 8 ];
		if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 4 ];
		if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 2 ];
		if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] \|\| changed[ currentIndex + 1 ];
		}
		if( changed[ 0 ] && thri == 0 && thrj == 0 ){
		BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
		}
		__syncthreads();
		}

		if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) )
		aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ];
		}
		/*if( thri == 0 && thrj == 0 )
		printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/
		>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf
		}
		#endif

src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h

+5 −0

Original line number	Diff line number	Diff line
		@@ -280,12 +280,17 @@ solve( const MeshPointer& mesh,
		interfaceMapPtr.template getData< Device >(),
		auxPtr.template modifyData< Device>(),
		BlockIterDevice );
		<<<<<<< HEAD
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;

		CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		=======
		//CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
		//CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
		>>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf

		CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
		cudaDeviceSynchronize();