From 68f3276ebc650ddf8f6b1259583bfd0a8e3a2caf Mon Sep 17 00:00:00 2001
From: fencl <fenclmat@fjfi.cvut.cz>
Date: Thu, 5 Jul 2018 19:27:34 +0200
Subject: [PATCH] Parallel reduction grid level.

---
 .../tnlDirectEikonalMethodsBase.h             |  3 +-
 .../tnlFastSweepingMethod2D_impl.h            | 86 +++++++++++++++----
 2 files changed, 73 insertions(+), 16 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index fe7d4b9ba8..74393f1fd6 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -126,7 +126,8 @@ template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      bool *BlockIterDevice );
+                                      int *BlockIterDevice );
+__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks );
 
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input,
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 83333b190d..efdb13da83 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -27,7 +27,7 @@ template< typename Real, typename Anisotropy >
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 FastSweepingMethod()
-: maxIterations( 2 )
+: maxIterations( 1 )
 {
 }
@@ -246,29 +246,37 @@ solve( const MeshPointer& mesh,
-      bool* BlockIter = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) );
+      //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) );
+
+      int *BlockIterDevice;
+      int BlockIterD = 1;
-      bool *BlockIterDevice;
-      cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ) );
+      cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );
+      int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0);
+      int *dBlock;
+      cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
-      while( BlockIter[ 0 ] )
+      while( BlockIterD )
       {
-         for( int i = 0; i < numBlocksX * numBlocksY; i++ )
-            BlockIter[ i ] = false;
-         cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( bool ), cudaMemcpyHostToDevice);
+         /*for( int i = 0; i < numBlocksX * numBlocksY; i++ )
+            BlockIter[ i ] = false;*/
          CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
                                                           interfaceMapPtr.template getData< Device >(),
                                                           auxPtr.template modifyData< Device>(),
                                                           BlockIterDevice );
-         cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ), cudaMemcpyDeviceToHost);
+
+         CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
+         CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+
+         cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
-         for( int i = 1; i < numBlocksX * numBlocksY; i++ )
-            BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];
+         /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
+            BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
       }
-      delete[] BlockIter;
       cudaFree( BlockIterDevice );
+      cudaFree( dBlock );
       cudaDeviceSynchronize();
 
       TNL_CHECK_CUDA_DEVICE;
@@ -283,11 +291,58 @@ solve( const MeshPointer& mesh,
 }
 
 #ifdef HAVE_CUDA
+__global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks )
+{
+   int i = threadIdx.x;
+   int blId = blockIdx.x;
+   __shared__ volatile int sArray[ 512 ];
+   sArray[ i ] = false;
+   if(blId * 1024 + i < nBlocks )
+      sArray[ i ] = BlockIterDevice[ blId * 1024 + i ];
+
+   if (blockDim.x * blockDim.y == 1024) {
+      if (i < 512)
+         sArray[ i ] += sArray[ i ];
+   }
+   __syncthreads();
+   if (blockDim.x * blockDim.y >= 512) {
+      if (i < 256) {
+         sArray[ i ] += sArray[ i ];
+      }
+   }
+   if (blockDim.x * blockDim.y >= 256) {
+      if (i < 128) {
+         sArray[ i ] += sArray[ i + 128 ];
+      }
+   }
+   __syncthreads();
+   if (blockDim.x * blockDim.y >= 128) {
+      if (i < 64) {
+         sArray[ i ] += sArray[ i + 64 ];
+      }
+   }
+   __syncthreads();
+   if (i < 32 )
+   {
+      if( blockDim.x * blockDim.y >= 64 ) sArray[ i ] += sArray[ i + 32 ];
+      if( blockDim.x * blockDim.y >= 32 ) sArray[ i ] += sArray[ i + 16 ];
+      if( blockDim.x * blockDim.y >= 16 ) sArray[ i ] += sArray[ i + 8 ];
+      if( blockDim.x * blockDim.y >= 8 ) sArray[ i ] += sArray[ i + 4 ];
+      if( blockDim.x * blockDim.y >= 4 ) sArray[ i ] += sArray[ i + 2 ];
+      if( blockDim.x * blockDim.y >= 2 ) sArray[ i ] += sArray[ i + 1 ];
+   }
+
+   if( i == 0 )
+      dBlock[ blId ] = sArray[ 0 ];
+}
+
+
+
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
                                       Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      bool *BlockIterDevice )
+                                      int *BlockIterDevice )
 {
    int thri = threadIdx.x; int thrj = threadIdx.y;
    int blIdx = blockIdx.x; int blIdy = blockIdx.y;
@@ -331,6 +386,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       if( numOfBlocky -1 == blIdy )
          ykolik = dimY - (blIdy)*blockDim.y+1;
+      BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0;
    }
    __syncthreads();
@@ -360,7 +416,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
       if( thri == 3 )
       {
-         if( blIdy != 0 && thri+1 < xkolik )
+         if( blIdy != 0 && thrj+1 < xkolik )
            sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ];
         else
            sArray[0][thrj+1] = TypeInfo< Real >::getMaxValue();
@@ -427,7 +483,7 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
          if( currentIndex < 1 )
             changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
       }
       if( changed[ 0 ] && thri == 0 && thrj == 0 )
-         BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
+         BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1;
       __syncthreads();
    }
-- 
GitLab