From b95d6c7b98234dd1e93f245e0d2076ac91ba14bc Mon Sep 17 00:00:00 2001
From: Fencl <fenclmat@fjfi.cvut.cz>
Date: Thu, 1 Nov 2018 16:26:36 +0100
Subject: [PATCH] Last repair of FIM for GPU.

---
 .../tnlDirectEikonalMethodsBase.h      |   2 +-
 .../tnlDirectEikonalMethodsBase_impl.h |  72 ++++----
 .../tnlFastSweepingMethod2D_impl.h     | 165 ++++++++++--------
 3 files changed, 125 insertions(+), 114 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index cbb1a1ff6a..ccbae8abe8 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -148,7 +148,7 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I
 
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                               /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY );
+                               TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index 500d1bf03e..5083544e2b 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -134,6 +134,7 @@ updateBlocks( InterfaceMapType interfaceMap,
    Real hy = mesh.getSpaceSteps().y();
 
    bool changed = false;
+   BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
 
    Real *sArray;
@@ -143,53 +144,52 @@
    for( int thri = 0; thri < sizeSArray; thri++ ){
       for( int thrj = 0; thrj < sizeSArray; thrj++ )
-         sArray/*[i]*/[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
+         sArray[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max();
    }
-   BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0;
 
    for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ )
    {
      if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik )
-       sArray/*[i]*/[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
+       sArray[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ];
     if( blIdx != 0 && thrj+1 < ykolik )
-       sArray/*[i]*/[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
+       sArray[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ];
     if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik )
-       sArray/*[i]*/[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
+       sArray[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ];
     if( blIdy != 0 && thrj+1 < xkolik )
-       sArray/*[i]*/[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
+       sArray[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ];
   }
 
   for( int k = 0; k < numThreadsPerBlock; k++ ){
    for( int l = 0; l < numThreadsPerBlock; l++ )
     if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
-      sArray/*[i]*/[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ];
+      sArray[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ];
  }
-  bool pom = false;
+
  for( int k = 0; k < numThreadsPerBlock; k++ ){
   for( int l = 0; l < numThreadsPerBlock; l++ ){
    if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){
    //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl;
     if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] )
     {
-      pom = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy);
-      changed = changed || pom;
+      changed = this->template updateCell< sizeSArray >( sArray, l+1, k+1, hx,hy) || changed;
+
     }
    }
   }
 }
 /*aux.save( "aux-1pruch.tnl" );
-   for( int k = 0; k < sizeSArray; k++ ){
-    for( int l = 0; l < sizeSArray; l++ ) {
-      std::cout << sArray[ k * sizeSArray + l] << " ";
-    }
-    std::cout << std::endl;
-   }*/
-
+  for( int k = 0; k < sizeSArray; k++ ){
+   for( int l = 0; l < sizeSArray; l++ ) {
+     std::cout << sArray[ k * sizeSArray + l] << " ";
+   }
+   std::cout << std::endl;
+  }*/
+
 for( int k = 0; k < numThreadsPerBlock; k++ )
  for( int l = numThreadsPerBlock-1; l >-1; l-- ) {
   if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
@@ -201,12 +201,12 @@ updateBlocks( InterfaceMapType interfaceMap,
   }
 }
 /*aux.save( "aux-2pruch.tnl" );
-   for( int k = 0; k < sizeSArray; k++ ){
-    for( int l = 0; l < sizeSArray; l++ ) {
-      std::cout << sArray[ k * sizeSArray + l] << " ";
-    }
-    std::cout << std::endl;
-   }*/
+  for( int k = 0; k < sizeSArray; k++ ){
+   for( int l = 0; l < sizeSArray; l++ ) {
+     std::cout << sArray[ k * sizeSArray + l] << " ";
+   }
+   std::cout << std::endl;
+  }*/
 
 for( int k = numThreadsPerBlock-1; k > -1; k-- )
  for( int l = 0; l < numThreadsPerBlock; l++ ) {
@@ -219,12 +219,12 @@ updateBlocks( InterfaceMapType interfaceMap,
   }
 }
 /*aux.save( "aux-3pruch.tnl" );
-   for( int k = 0; k < sizeSArray; k++ ){
-    for( int l = 0; l < sizeSArray; l++ ) {
-      std::cout << sArray[ k * sizeSArray + l] << " ";
-    }
-    std::cout << std::endl;
-   }*/
+  for( int k = 0; k < sizeSArray; k++ ){
+   for( int l = 0; l < sizeSArray; l++ ) {
+     std::cout << sArray[ k * sizeSArray + l] << " ";
+   }
+   std::cout << std::endl;
+  }*/
 
 for( int k = numThreadsPerBlock-1; k > -1; k-- ){
  for( int l = numThreadsPerBlock-1; l >-1; l-- ) {
@@ -238,12 +238,12 @@ updateBlocks( InterfaceMapType interfaceMap,
  }
 }
 /*aux.save( "aux-4pruch.tnl" );
-   for( int k = 0; k < sizeSArray; k++ ){
-    for( int l = 0; l < sizeSArray; l++ ) {
-      std::cout << sArray[ k * sizeSArray + l] << " ";
-    }
-    std::cout << std::endl;
-   }*/
+  for( int k = 0; k < sizeSArray; k++ ){
+   for( int l = 0; l < sizeSArray; l++ ) {
+     std::cout << sArray[ k * sizeSArray + l] << " ";
+   }
+   std::cout << std::endl;
+  }*/
 
 if( changed ){
@@ -254,7 +254,7 @@ updateBlocks( InterfaceMapType interfaceMap,
   for( int k = 0; k < numThreadsPerBlock; k++ ){
    for( int l = 0; l < numThreadsPerBlock; l++ ) {
     if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX )
-     helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray/*[i]*/[ (k + 1)* sizeSArray + l + 1 ];
+     helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray[ (k + 1)* sizeSArray + l + 1 ];
     //std::cout<< sArray[k+1][l+1];
    }
   //std::cout<<std::endl;
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index e29421bb13..bc82b7a2c5 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -123,6 +123,7 @@ solve( const MeshPointer& mesh,
         helpFunc = helpFunc1;
         this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ );
 
+        //Reduction
         for( int i = 0; i < BlockIterHost.getSize(); i++ ){
           if( IsCalculationDone == 0 ){
             IsCalculationDone = IsCalculationDone || BlockIterHost[ i ];
@@ -130,6 +131,7 @@ solve( const MeshPointer& mesh,
           }
         }
         numWhile++;
+        std::cout <<"numWhile = "<< numWhile <<std::endl;
 
         for( int j = numBlocksY-1; j>-1; j-- ){
           for( int i = 0; i < numBlocksX; i++ )
@@ -146,7 +148,6 @@ solve( const MeshPointer& mesh,
             std::cout << std::endl;
           }
           std::cout << std::endl;*/
-        //Reduction
 
         //std::cout<<std::endl;
         string s( "aux-"+ std::to_string(numWhile) + ".tnl");
@@ -171,7 +172,7 @@ solve( const MeshPointer& mesh,
               if( ! interfaceMap( cell ) )
                 this->updateCell( aux, cell );
             }
-          }
+        }
 
       //aux.save( "aux-1.tnl" );
@@ -261,12 +262,12 @@ solve( const MeshPointer& mesh,
       TNL_CHECK_CUDA_DEVICE;
 
-      /*TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
-      BlockIterPom.setSize( numBlocksX * numBlocksY );
-      BlockIterPom.setValue( 0 );*/
+      TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom;
+      BlockIterPom.setSize( numBlocksX * numBlocksY );
+      BlockIterPom.setValue( 0 );
       /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1;
-      BlockIterPom1.setSize( numBlocksX * numBlocksY );
-      BlockIterPom1.setValue( 0 );*/
+      BlockIterPom1.setSize( numBlocksX * numBlocksY );
+      BlockIterPom1.setValue( 0 );*/
       /*int *BlockIterDevice;
       cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/
       int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0);
@@ -284,9 +285,7 @@ solve( const MeshPointer& mesh,
      cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/
 
-      MeshFunctionPointer helpFunc1;
-      helpFunc1->setMesh(mesh);
-
+      MeshFunctionPointer helpFunc1( mesh );
       MeshFunctionPointer helpFunc( mesh );
 
       helpFunc1 = auxPtr;
@@ -301,83 +300,94 @@ solve( const MeshPointer& mesh,
         /** HERE IS CHESS METHOD **/
         /*auxPtr = helpFunc;
+
+        CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+                                                             interfaceMapPtr.template getData< Device >(),
+                                                             auxPtr.template getData< Device>(),
+                                                             helpFunc.template modifyData< Device>(),
+                                                             BlockIterDevice,
+                                                             oddEvenBlock );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        auxPtr = helpFunc;
+
+        oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+
+        CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
+                                                             interfaceMapPtr.template getData< Device >(),
+                                                             auxPtr.template getData< Device>(),
+                                                             helpFunc.template modifyData< Device>(),
+                                                             BlockIterDevice,
+                                                             oddEvenBlock );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        auxPtr = helpFunc;
+
+        oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+
+        CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+        CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
+
+        BlockIterD = dBlock.getElement( 0 );*/
+
+        /**------------------------------------------------------------------------------------------------*/
+
+
+        /** HERE IS FIM **/
+        helpFunc1 = auxPtr;
+        auxPtr = helpFunc;
+        helpFunc = helpFunc1;
+        TNL_CHECK_CUDA_DEVICE;
+
+        //int pocBloku = 0;
+        Devices::Cuda::synchronizeDevice();
         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
                                                              interfaceMapPtr.template getData< Device >(),
-                                                             auxPtr.template getData< Device>(),
+                                                             auxPtr.template modifyData< Device>(),
                                                              helpFunc.template modifyData< Device>(),
-                                                             BlockIterDevice,
-                                                             oddEvenBlock );
+                                                             BlockIterDevice );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
-        auxPtr = helpFunc;
-        oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
+        //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl;
+        //BlockIterPom1 = BlockIterDevice;
        ///for( int i =0; i< numBlocksX; i++ ){
        //   for( int j = 0; j < numBlocksY; j++ )
        //   {
        //      std::cout << BlockIterPom1[j*numBlocksX + i];
        //   }
        //   std::cout << std::endl;
        //}
        //std::cout << std::endl;
 
-        CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-                                                             interfaceMapPtr.template getData< Device >(),
-                                                             auxPtr.template getData< Device>(),
-                                                             helpFunc.template modifyData< Device>(),
-                                                             BlockIterDevice,
-                                                             oddEvenBlock );
+        GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
-        auxPtr = helpFunc;
+        BlockIterDevice = BlockIterPom;
+
+        //std::cout<< "Probehlo" << std::endl;
+
+        //TNL::swap( auxPtr, helpFunc );
 
-        oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
         CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
-        cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
+        CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
-        cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
 
-        BlockIterD = dBlock.getElement( 0 );*/
-        /**------------------------------------------------------------------------------------------------*/
+        BlockIterD = dBlock.getElement( 0 );
+        //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+        cudaDeviceSynchronize();
+        TNL_CHECK_CUDA_DEVICE;
 
-        /** HERE IS FIM **/
-
-        helpFunc1 = auxPtr;
-        auxPtr = helpFunc;
-        helpFunc = helpFunc1;
-
-        //int pocBloku = 0;
-        Devices::Cuda::synchronizeDevice();
-        CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
-                                                             interfaceMapPtr.template getData< Device >(),
-                                                             auxPtr.template modifyData< Device>(),
-                                                             helpFunc.template modifyData< Device>(),
-                                                             BlockIterDevice );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
-
-        //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl;
-
-        GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, numBlocksX, numBlocksY );
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
-
-        //std::cout<< "Probehlo" << std::endl;
-
-        //TNL::swap( auxPtr, helpFunc );
-
-        CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
-        TNL_CHECK_CUDA_DEVICE;
-
-        CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
-        TNL_CHECK_CUDA_DEVICE;
-
-        BlockIterD = dBlock.getElement( 0 );
-        //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
-        cudaDeviceSynchronize();
-        TNL_CHECK_CUDA_DEVICE;
-
         /**-----------------------------------------------------------------------------------------------------------*/
         /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
           BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
@@ -392,7 +402,6 @@ solve( const MeshPointer& mesh,
       cudaFree( dBlock );
       delete BlockIter;*/
       cudaDeviceSynchronize();
-      TNL_CHECK_CUDA_DEVICE;
 
       aux = *auxPtr;
@@ -410,7 +419,7 @@ solve( const MeshPointer& mesh,
 
 template < typename Index >
 __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice,
-                               /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY )
+                               TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, int numBlockX, int numBlockY )
 {
   int i = blockIdx.x * 1024 + threadIdx.x;
 
@@ -430,7 +439,7 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index
       pom = 1;//BlockIterPom[ i ] = 1;
     }
 
-    BlockIterDevice[ i ] = pom;//BlockIterPom[ i ];
+    BlockIterPom[ i ] = pom;//BlockIterPom[ i ];
   }
 }
 
@@ -514,14 +523,16 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
   int i = threadIdx.x + blockDim.x*blockIdx.x;
   int j = blockDim.y*blockIdx.y + threadIdx.y;
   /** FOR CHESS METHOD */
-  if( (blockIdx.y%2 + blockIdx.x) % 2 == oddEvenBlock )
-  {
-  /**-----------------------------------------*/
-
+  //if( (blockIdx.y%2 + blockIdx.x) % 2 == oddEvenBlock )
+  //{
+  /**------------------------------------------*/
+
+
+  /** FOR FIM METHOD */
 
-  /** FOR FIM METHOD */
-  /*if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x] )
-  {*/
+  if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] )
+  {
+    __syncthreads();
   /**-----------------------------------------*/
   const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
   __shared__ volatile int dimX;
-- 
GitLab
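
Note on the pattern this patch settles on: each FIM iteration updates the cells of every active block (CudaUpdateCellCaller), then GetNeighbours writes the next block-activity mask into the spare array BlockIterPom rather than into BlockIterDevice itself, the mask is adopted with "BlockIterDevice = BlockIterPom", and CudaParallelReduc reduces it so the host while-loop knows whether another iteration is needed. The sketch below is only a simplified, self-contained illustration of that double-buffered active-block loop; it is not TNL code. The kernel names, the toy "converged" mask, and the host-side reduction are hypothetical stand-ins, and the real cell update is reduced to a comment.

// fim_active_blocks.cu: minimal sketch of a double-buffered active-block loop (hypothetical names).
#include <cuda_runtime.h>
#include <utility>
#include <vector>

// Marks blocks that were active in this round as "done"
// (stands in for a block whose cell updates have converged).
__global__ void markDone( const int* active, int* done, int n )
{
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   if( i < n && active[ i ] )
      done[ i ] = 1;
}

// Plays the role of GetNeighbours: reads the current mask and writes ONLY into the
// spare mask, so no thread overwrites a flag another thread may still be reading.
__global__ void activateNeighbours( const int* active, const int* done, int* activeNext,
                                    int nBlocksX, int nBlocksY )
{
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   if( i >= nBlocksX * nBlocksY )
      return;
   int bx = i % nBlocksX;
   int by = i / nBlocksX;
   int flag = 0;
   if( bx > 0            && active[ i - 1 ] )        flag = 1;
   if( bx < nBlocksX - 1 && active[ i + 1 ] )        flag = 1;
   if( by > 0            && active[ i - nBlocksX ] ) flag = 1;
   if( by < nBlocksY - 1 && active[ i + nBlocksX ] ) flag = 1;
   // blocks that already converged in this toy model are never re-activated
   activeNext[ i ] = flag && ! done[ i ];
}

int main()
{
   const int nBlocksX = 8, nBlocksY = 8, n = nBlocksX * nBlocksY;
   const int threads = 256, grid = ( n + threads - 1 ) / threads;

   int *dActive, *dActiveNext, *dDone;
   cudaMalloc( &dActive, n * sizeof( int ) );
   cudaMalloc( &dActiveNext, n * sizeof( int ) );
   cudaMalloc( &dDone, n * sizeof( int ) );
   cudaMemset( dActive, 0, n * sizeof( int ) );
   cudaMemset( dActiveNext, 0, n * sizeof( int ) );
   cudaMemset( dDone, 0, n * sizeof( int ) );

   const int one = 1;   // seed: one active block, e.g. the one containing the interface
   cudaMemcpy( dActive, &one, sizeof( int ), cudaMemcpyHostToDevice );

   std::vector< int > host( n );
   int stillActive = 1;
   while( stillActive )
   {
      // 1) a real FIM solver would sweep the cells of every active block here (CudaUpdateCellCaller)
      markDone<<< grid, threads >>>( dActive, dDone, n );
      // 2) propagate activity into the spare mask, then adopt it (BlockIterPom in the patch)
      activateNeighbours<<< grid, threads >>>( dActive, dDone, dActiveNext, nBlocksX, nBlocksY );
      cudaDeviceSynchronize();
      std::swap( dActive, dActiveNext );
      // 3) reduce the new mask to decide whether to iterate again
      //    (the patch does this on the GPU with CudaParallelReduc; a host loop keeps the sketch short)
      cudaMemcpy( host.data(), dActive, n * sizeof( int ), cudaMemcpyDeviceToHost );
      stillActive = 0;
      for( int v : host )
         stillActive |= v;
   }

   cudaFree( dActive );
   cudaFree( dActiveNext );
   cudaFree( dDone );
   return 0;
}

Writing the new flags into a second array before adopting it avoids the hazard of marking neighbours in the same array the kernel is still reading, which appears to be the point of the BlockIterPom parameter of GetNeighbours and the subsequent "BlockIterDevice = BlockIterPom" assignment introduced in this commit.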