From 8a8b083e31f59bce56fefdff27375b7c04027ab8 Mon Sep 17 00:00:00 2001 From: fencl <fenclmat@fjfi.cvut.cz> Date: Thu, 17 May 2018 08:54:40 +0200 Subject: [PATCH] pyramid reduction in 2D with loopCounter --- .../tnlDirectEikonalMethodsBase.h | 5 +- .../tnlDirectEikonalMethodsBase_impl.h | 141 ++------------ .../tnlFastSweepingMethod2D_impl.h | 184 +++++++++++++++--- 3 files changed, 177 insertions(+), 153 deletions(-) diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h index 69fce1467f..979c70e7b5 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h @@ -69,7 +69,7 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > const MeshEntity& cell, const RealType velocity = 1.0 ); - __cuda_callable__ void updateCell( Real sArray[18][18], + __cuda_callable__ bool updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy, const Real velocity = 1.0 ); @@ -116,7 +116,8 @@ __cuda_callable__ void sortMinims( T1 pom[] ); template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux ); + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, + bool *BlockIterDevice ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h index e52f2fe1c6..a3af357c94 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h @@ -684,124 +684,9 @@ setsArray( MeshFunctionType& aux, Real sArray[18][18], int dimX, int dimY, int b { sArray[0][i] = aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + 0*dimX + i ]; } - } - - /*int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0); - int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0); - int xkolik = 18; - int ykolik = 18; - - if( numOfBlockx - 1 == blockIdx ) - xkolik = dimX - (numOfBlockx-1)*16+1; - - if( numOfBlocky -1 == blockIdy ) - ykolik = dimY - (numOfBlocky-1)*16+1; - - - if( numOfBlockx == 0 || numOfBlocky == 0 ) - return; - else - { - if( blockIdx == 0 ) - { - if( blockIdy == 0 ) - { - for( int j = 0; j < ykolik-1; j++ ) - for( int i = 0; i < xkolik-1; i++ ) - { - sArray[ j+1 ][ i+1 ] = aux[ j*dimX + i ]; - } - - //vypis( *sArray ); - } - else - { - for( int j = 0; j < ykolik; j++ ) - for( int i = 0; i < xkolik-1; i++ ) - { - sArray[ j ][ i+1 ] = aux[ blockIdy*16*dimX - dimX + j*dimX + i ]; - } - } - } - else if( blockIdy == 0 ) - { - for( int j = 0; j < ykolik-1; j++ ) - for( int i = 0; i < xkolik; i++ ) - { - sArray[ j+1 ][ i ] = aux[ blockIdx*16 - 1 + j*dimX + i ]; - } - } - else - { - for( int j = 0; j < ykolik; j++ ) - for( int i = 0; i < xkolik; i++ ) - { - //if( dimX*dimY-1 > blockIdy*16*dimX - dimX + blockIdx*16 - 1 + j*dimX + i ) - sArray[ j ][ i ] = aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + j*dimX + i ]; - } - } - }*/ + } } -/*template< typename Real, - typename Device, - typename Index > -__cuda_callable__ void -tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -getsArray( MeshFunctionType& aux, Real sArray[18][18], int dimX, int dimY, int blockIdx, int blockIdy ) -{ - int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0); - int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0); - int xkolik = 18; - int ykolik = 18; - - if( numOfBlockx - 1 == blockIdx ) - xkolik = dimX - (numOfBlockx-1)*16+2; - - if( numOfBlocky -1 == blockIdy ) - ykolik = dimY - (numOfBlocky-1)*16+2; - - if( numOfBlockx == 0 || numOfBlocky == 0 ) - return; - else - { - if( blockIdx == 0 ) - { - if( blockIdy == 0 ) - { - for( int j = 0; j < ykolik-2; j++ ) - for( int i = 0; i < xkolik-2; i++ ) - { - aux[ j*dimX + i ] = sArray[ j+1 ][ i+1 ]; - } - } - else - { - for( int j = 1; j < ykolik-1; j++ ) - for( int i = 0; i < xkolik-2; i++ ) - { - aux[ blockIdy*16*dimX - dimX + j*dimX + i ] = sArray[ j ][ i+1 ]; - } - } - } - else if( blockIdy == 0 ) - { - for( int j = 0; j < ykolik-2; j++ ) - for( int i = 1; i < xkolik-1; i++ ) - { - aux[ blockIdx*16 - 1 + j*dimX + i ] = sArray[ j+1 ][ i ]; - } - } - else - { - for( int j = 1; j < ykolik-1; j++ ) - for( int i = 1; i < xkolik-1; i++ ) - { - aux[ blockIdy*16*dimX - dimX + blockIdx*16 - 1 + j*dimX + i ] = sArray[ j ][ i ]; - } - } - } -}*/ #ifdef HAVE_CUDA @@ -974,9 +859,9 @@ template< typename Real, typename Device, typename Index > __cuda_callable__ -void +bool tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -updateCell( Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy, +updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy, const Real v ) { @@ -991,39 +876,39 @@ updateCell( Real sArray[18][18], int thri, int thrj, const Real hx, const Real h const RealType& hy = mesh.getSpaceSteps().y();*/ const RealType value = sArray[ thrj ][ thri ];//u( cell ); RealType a, b, tmp = TypeInfo< RealType >::getMaxValue(); - - /*if( value != u[ cell.getIndex() ] ) - return;*/ - + b = TNL::argAbsMin( sArray[ thrj+1 ][ thri ], sArray[ thrj-1 ][ thri ] ); a = TNL::argAbsMin( sArray[ thrj ][ thri+1 ], sArray[ thrj ][ thri-1 ] ); - - if( fabs( a ) == TypeInfo< RealType >::getMaxValue() && fabs( b ) == TypeInfo< RealType >::getMaxValue() ) - return; + return false; - RealType pom[6] = { a, b, TypeInfo< Real >::getMaxValue(), (RealType)hx, (RealType)hy, 0.0 }; + RealType pom[6] = { a, b, TypeInfo< Real >::getMaxValue(), (RealType)hx, (RealType)hy, TypeInfo< Real >::getMaxValue() }; sortMinims( pom ); tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; if( fabs( tmp ) < fabs( pom[ 1 ] ) ) { - //u[ cell.getIndex() ]= argAbsMin( value, tmp ); sArray[ thrj ][ thri ] = argAbsMin( value, tmp ); + if ( fabs( value - sArray[ thrj ][ thri ] ) > 0.001 ) + return true; + return false; } else { tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); - //u[ cell.getIndex() ]= argAbsMin( value, tmp ); sArray[ thrj ][ thri ] = argAbsMin( value, tmp ); + if ( fabs( value - sArray[ thrj ][ thri ] ) > 0.001 ) + return true; + return false; } + return false; } #endif diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index 1279e3437f..d4cb1f63be 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -227,12 +227,47 @@ solve( const MeshPointer& mesh, tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; int nBlockIter = numBlocksX > numBlocksY ? numBlocksX : numBlocksY; nBlockIter = numBlocksX == numBlocksY ? nBlockIter + 1 : nBlockIter; - for( int k = 0; k < nBlockIter; k++) + + bool isNotDone = true; + bool* BlockIter = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) ); + + bool *BlockIterDevice; + cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ) ); + + while( isNotDone ) + { + for( int i = 0; i < numBlocksX * numBlocksY; i++ ) + BlockIter[ i ] = false; + + isNotDone = false; + CudaUpdateCellCaller<<< gridSize, blockSize, 18 * 18 * sizeof( Real ) >>>( ptr, interfaceMapPtr.template getData< Device >(), - auxPtr.template modifyData< Device>() ); + auxPtr.template modifyData< Device>(), + BlockIterDevice ); + cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( bool ), cudaMemcpyDeviceToHost); + for( int i = 0; i < numBlocksX; i++ ) + { for( int j = 0; j < numBlocksY; j++ ) + { + if( BlockIter[ j * numBlocksY + i ] ) + std::cout << "true." << "\t"; + else + std::cout << "false." << "\t"; + } + std::cout << std::endl; + } + std::cout << std::endl; + + for( int i = 0; i < numBlocksX * numBlocksY; i++ ) + isNotDone = isNotDone || BlockIter[ i ]; + + } + delete[] BlockIter; + cudaFree( BlockIterDevice ); cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + aux = *auxPtr; interfaceMap = *interfaceMapPtr; #endif @@ -246,46 +281,149 @@ solve( const MeshPointer& mesh, template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux ) + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, + bool *BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; // nelze ke stejnym pristupovat znovu pres threadIdx (vzdy da jine hodnoty) int blIdx = blockIdx.x; int blIdy = blockIdx.y; int i = thri + blockDim.x*blIdx; int j = blockDim.y*blIdy + thrj; + + __shared__ volatile bool changed[256]; + changed[ thrj * blockDim.x + thri ] = false; + __shared__ volatile bool SmallCycleBoolin; + if( thrj == 0 && thri == 0 ) + SmallCycleBoolin = true; + + const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); const Real hx = mesh.getSpaceSteps().x(); const Real hy = mesh.getSpaceSteps().y(); - __shared__ Real sArray[ 18 ][ 18 ]; - for( int k = 0; k < 18; k++ ) - for( int l = 0; l < 18; l++ ) - sArray[ k ][ l ] = TypeInfo< Real >::getMaxValue(); - __syncthreads(); - /*//filling shared array - ptr.setsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy ); - __syncthreads();*/ + __shared__ volatile Real sArray[ 18 ][ 18 ]; + sArray[thrj][thri] = TypeInfo< Real >::getMaxValue(); - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + //filling sArray edges + int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); + int numOfBlockx = dimX/16 + ((dimX%16 != 0) ? 1:0); + int numOfBlocky = dimY/16 + ((dimY%16 != 0) ? 1:0); + int xkolik = 17; + int ykolik = 17; + + + if( numOfBlockx - 1 == blIdx ) + xkolik = dimX - (blIdx)*16+1; + + if( numOfBlocky -1 == blIdy ) + ykolik = dimY - (blIdy)*16+1; + + + if( thri == 0 ) + { + if( dimX > (blIdx+1) * 16 && thrj+1 < ykolik ) + sArray[thrj+1][xkolik] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + (thrj+1)*dimX + 17 ]; + else + sArray[thrj+1][xkolik] = TypeInfo< Real >::getMaxValue(); + } + + if( thri == 1 ) + { + if( blIdx != 0 && thrj+1 < ykolik ) + sArray[thrj+1][0] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + (thrj+1)*dimX + 0]; + else + sArray[thrj+1][0] = TypeInfo< Real >::getMaxValue(); + } + + if( thrj == 0 ) { + if( dimY > (blIdy+1) * 16 && thri+1 < xkolik ) + sArray[ykolik][thri+1] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + 17*dimX + thri+1 ]; + else + sArray[ykolik][thri+1] = TypeInfo< Real >::getMaxValue(); + } + + if( thrj == 1 ) + { + if( blIdy != 0 && thri+1 < xkolik ) + sArray[0][thri+1] = aux[ blIdy*16*dimX - dimX + blIdx*16 - 1 + 0*dimX + thri+1 ]; + else + sArray[0][thri+1] = TypeInfo< Real >::getMaxValue(); + } + + //filling BlockIterDevice + BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = false; + + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + { sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ]; - ptr.setsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy ); //fill edges of sArray __syncthreads(); - + } + + __shared__ int loopcounter; + if( thri == 0 && thrj == 0 ) + loopcounter= 0; + //if( blIdx == 5 && blIdy == 4 ) + while( SmallCycleBoolin ) + { + if( thri == 1 && thrj == 1 ) + SmallCycleBoolin = false; + + changed[ thrj * 16 + thri ] = false; + __syncthreads(); - if( ! interfaceMap[ j*mesh.getDimensions().x() + i ] ) + //calculation of update cell + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + { + if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] ) + { + changed[ thrj * 16 + thri ] = ptr.updateCell( sArray, thri+1, thrj+1, hx, hy ); + __syncthreads(); + } + } + + //pyramid reduction + if( blockDim.x*blockDim.y >= 256 ) + { + if( (thrj * 16 + thri) < 128 ) + { + changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 128 ]; + } + __syncthreads(); + } + if( blockDim.x*blockDim.y >= 128 ) { - for( int k = 0; k < 17; k++ ) + if( (thrj * 16 + thri) < 64 ) { - ptr.updateCell( sArray, thri+1, thrj+1, hx, hy ); - __syncthreads(); + changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 64 ]; } + __syncthreads(); + } + if( (thrj * 16 + thri) < 32 ) + { + changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 32 ]; + changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 16 ]; + changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 8 ]; + changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 4 ]; + changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 2 ]; + changed[ thrj * 16 + thri ] = changed[ thrj * 16 + thri ] || changed[ thrj * 16 + thri + 1 ]; + } + __syncthreads(); + + if( thrj == 1 && thri == 1 ) + { + loopcounter++; + if( loopcounter > 1000 ) + break; + SmallCycleBoolin = changed[ 0 ]; + //if( SmallCycleBoolin ) + BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = SmallCycleBoolin; } - /*for( int k = 0; k < mesh.getDimensions().x(); k++) - for( int l = 0; l < mesh.getDimensions().y(); l++) - aux[ k*mesh.getDimensions().x() + l ] = TypeInfo< Real >::getMaxValue();*/ - aux[j*mesh.getDimensions().x() + i] = sArray[thrj+1][thri+1]; __syncthreads(); } - //ptr.getsArray( aux, sArray, mesh.getDimensions().x(), mesh.getDimensions().y(), blIdx, blIdy ); + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; + } #endif -- GitLab