From 223c69a75ed82b7a8adabb8713f97ee716fcd3ae Mon Sep 17 00:00:00 2001
From: fencl <fenclmat@fjfi.cvut.cz>
Date: Thu, 23 Aug 2018 11:10:16 +0200
Subject: [PATCH] Implement parallel CUDA version of FSM.

---
 .../tnlDirectEikonalMethodsBase.h             |  14 +-
 .../tnlDirectEikonalMethodsBase_impl.h        |  74 ++++++
 .../tnlFastSweepingMethod2D_impl.h            |  73 ++++--
 .../tnlFastSweepingMethod3D_impl.h            | 237 ++++++++++++++++--
 4 files changed, 359 insertions(+), 39 deletions(-)

diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
index 74393f1fd6..a406a5b177 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h
@@ -101,6 +101,10 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >
       __cuda_callable__ void updateCell( MeshFunctionType& u,
                                          const MeshEntity& cell,
                                          const RealType velocity = 1.0 );
+
+      __cuda_callable__ bool updateCell( volatile Real sArray[10][10][10],
+                                         int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz,
+                                         const Real velocity = 1.0 );
 };
 
 template < typename T1, typename T2 >
@@ -125,10 +129,13 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid<
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
-                                      int *BlockIterDevice );
+                                      Real *aux,
+                                      int *BlockIterDevice );
 
 __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks );
 
+template < typename Real, typename Device, typename Index >
+__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );
+
 template < typename Real, typename Device, typename Index >
 __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input,
                                 Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output,
@@ -142,7 +149,8 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux );
+                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+                                      int *BlockIterDevice );
 #endif
 
 #include "tnlDirectEikonalMethodsBase_impl.h"
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
index cd4eb9e24b..31bc95df54 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h
@@ -994,4 +994,78 @@ updateCell( volatile Real sArray[18], int thri, const Real h, const Real v )
    else
       return false;
 }
+
+template< typename Real,
+          typename Device,
+          typename Index >
+__cuda_callable__
+bool
+tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >::
+updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk,
+            const Real hx, const Real hy, const Real hz, const Real v )
+{
+   const RealType value = sArray[thrk][thrj][thri];
+   //std::cout << value << std::endl;
+   RealType a, b, c, tmp = TypeInfo< RealType >::getMaxValue();
+
+   c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ],
+                       sArray[ thrk-1 ][ thrj ][ thri ] );
+
+   b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ],
+                       sArray[ thrk ][ thrj-1 ][ thri ] );
+
+   a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ],
+                       sArray[ thrk ][ thrj ][ thri-1 ] );
+
+   if( fabs( a ) == TypeInfo< RealType >::getMaxValue() &&
+       fabs( b ) == TypeInfo< RealType >::getMaxValue() &&
+       fabs( c ) == TypeInfo< RealType >::getMaxValue() )
+      return false;
+
+   RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz };
+
+   sortMinims( pom );
+
+   tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ];
+   if( fabs( tmp ) < fabs( pom[ 1 ] ) )
+   {
+      sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
+      tmp = value - sArray[ thrk ][ thrj ][ thri ];
+      if( fabs( tmp ) > 0.01*hx )
+         return true;
+      else
+         return false;
+   }
+   else
+   {
+      tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] +
+              TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) -
+              ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] );
+      if( fabs( tmp ) < fabs( pom[ 2 ] ) )
+      {
+         sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
+         tmp = value - sArray[ thrk ][ thrj ][ thri ];
+         if( fabs( tmp ) > 0.01*hx )
+            return true;
+         else
+            return false;
+      }
+      else
+      {
+         tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c +
+                 TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy )/( v * v ) -
+                 hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) -
+                 hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx * hx );
+         sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp );
+         tmp = value - sArray[ thrk ][ thrj ][ thri ];
+         if( fabs( tmp ) > 0.01*hx )
+            return true;
+         else
+            return false;
+      }
+   }
+
+   return false;
+}
 #endif
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index efdb13da83..7d5f8cbaa9 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -27,7 +27,7 @@ template< typename Real,
           typename Anisotropy >
 FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >::
 FastSweepingMethod()
-: maxIterations( 1 )
+: maxIterations( 100 )
 {
 }
 
@@ -236,6 +236,13 @@ solve( const MeshPointer& mesh,
    {
       // TODO: CUDA code
 #ifdef HAVE_CUDA
+
+      Real *dAux;
+      cudaMalloc(&dAux, ( mesh->getDimensions().x() * mesh->getDimensions().y() ) * sizeof( Real ) );
+
+
+
+
       const int cudaBlockSize( 16 );
       int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
       int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize );
@@ -244,7 +251,7 @@ solve( const MeshPointer& mesh,
       tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr;
 
-
+      aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux, 1 );
 
       //int BlockIter = 1;// = (bool*)malloc( ( numBlocksX * numBlocksY ) * sizeof( bool ) );
 
@@ -263,7 +270,7 @@ solve( const MeshPointer& mesh,
          CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
                                                           interfaceMapPtr.template getData< Device >(),
-                                                          auxPtr.template modifyData< Device>(),
+                                                          dAux,
                                                           BlockIterDevice );
 
          CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) );
@@ -275,14 +282,16 @@
            BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
       }
+      aux1<<< gridSize, blockSize >>>( auxPtr.template modifyData< Device>(), dAux, 0 );
+      cudaFree( dAux );
       cudaFree( BlockIterDevice );
       cudaFree( dBlock );
       cudaDeviceSynchronize();
 
       TNL_CHECK_CUDA_DEVICE;
-      aux = *auxPtr;
-      interfaceMap = *interfaceMapPtr;
+      //aux = *auxPtr;
+      //interfaceMap = *interfaceMapPtr;
 #endif
    }
    iteration++;
@@ -291,6 +300,23 @@
 }
 
 #ifdef HAVE_CUDA
+template < typename Real, typename Device, typename Index >
+__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a )
+{
+   int i = threadIdx.x + blockDim.x*blockIdx.x;
+   int j = blockDim.y*blockIdx.y + threadIdx.y;
+   const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.template getMesh< Devices::Cuda >();
+   if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 1 )
+   {
+      dAux[ j*mesh.getDimensions().x() + i ] = aux[ j*mesh.getDimensions().x() + i ];
+   }
+   if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && a == 0 )
+   {
+      aux[ j*mesh.getDimensions().x() + i ] = dAux[ j*mesh.getDimensions().x() + i ];
+   }
+
+}
+
 __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlocks )
 {
    int i = threadIdx.x;
@@ -341,16 +367,17 @@ __global__ void CudaParallelReduc( int *BlockIterDevice, int *dBlock, int nBlock
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux,
+                                      Real *aux,
                                       int *BlockIterDevice )
 {
    int thri = threadIdx.x; int thrj = threadIdx.y;
    int blIdx = blockIdx.x; int blIdy = blockIdx.y;
    int i = thri + blockDim.x*blIdx;
    int j = blockDim.y*blIdy + thrj;
-   int currentIndex = thrj * 16 + thri;
+   int currentIndex = thrj * blockDim.x + thri;
 
-   __shared__ volatile bool changed[256];
+   //__shared__ volatile bool changed[ blockDim.x*blockDim.y ];
+   __shared__ volatile bool changed[16*16];
    changed[ currentIndex ] = false;
 
    if( thrj == 0 && thri == 0 )
@@ -365,7 +392,8 @@
       hy = mesh.getSpaceSteps().y();
    }
 
-   __shared__ volatile Real sArray[ 18 ][ 18 ];
+   //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ];
+   __shared__ volatile Real sArray[18][18];
    sArray[thrj][thri] = TypeInfo< Real >::getMaxValue();
 
    //filling sArray edges
@@ -440,23 +468,28 @@
    {
       if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] )
       {
-         changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx, hy );
+         changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy);
       }
    }
    __syncthreads();
 
-   /*if( thri == 0 && thrj == 0 && blIdx == 1 && blIdy == 3 ){
-      for( int h = 15; h > -1; h-- ){
-         for( int g = 0; g < 16; g++ ){
-            printf( "%d\t", changed[h*16+g] );
-         }
-         printf("\n");
+   //pyramid reduction
+   if( blockDim.x*blockDim.y == 1024 )
+   {
+      if( currentIndex < 512 )
+      {
+         changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
       }
-      printf("\n");
    }
-   __syncthreads();*/
-
-   //pyramid reduction
+   __syncthreads();
+   if( blockDim.x*blockDim.y >= 512 )
+   {
+      if( currentIndex < 256 )
+      {
+         changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+      }
+   }
+   __syncthreads();
    if( blockDim.x*blockDim.y >= 256 )
    {
       if( currentIndex < 128 )
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index e9fa72e666..c20d9a2faa 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -255,13 +255,34 @@ solve( const MeshPointer& mesh,
             std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
       dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
       dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
-      Devices::Cuda::synchronizeDevice();
-
+
       tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
-      for( int k = 0; k < numBlocksX; k++)
-         CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
-                                                          interfaceMapPtr.template getData< Device >(),
-                                                          auxPtr.template modifyData< Device>() );
+
+      int *BlockIterDevice;
+      int BlockIterD = 1;
+
+      cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );
+      int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0);
+      int *dBlock;
+      cudaMalloc(&dBlock, nBlocks * sizeof( int ) );
+
+      while( BlockIterD )
+      {
+         CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr,
+                                                          interfaceMapPtr.template getData< Device >(),
+                                                          auxPtr.template modifyData< Device>(),
+                                                          BlockIterDevice );
+         CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) );
+         CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks );
+
+         cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost);
+
+         /*for( int i = 1; i < numBlocksX * numBlocksY; i++ )
+            BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/
+
+      }
+      cudaFree( BlockIterDevice );
+      cudaFree( dBlock );
       cudaDeviceSynchronize();
 
       TNL_CHECK_CUDA_DEVICE;
       aux = *auxPtr;
@@ -280,27 +301,211 @@
 template < typename Real, typename Device, typename Index >
 __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr,
                                       const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap,
-                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux )
+                                      Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux,
+                                      int *BlockIterDevice )
 {
+   int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z;
+   int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
    int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; int k = blockDim.z*blockIdx.z + threadIdx.z;
+   int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri;
+
+   __shared__ volatile bool changed[8*8*8];
+   changed[ currentIndex ] = false;
+
+   if( thrj == 0 && thri == 0 && thrk == 0 )
+      changed[ 0 ] = true;
 
    const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >();
+   __shared__ Real hx;
+   __shared__ Real hy;
+   __shared__ Real hz;
+   if( thrj == 1 && thri == 1 && thrk == 1 )
+   {
+      hx = mesh.getSpaceSteps().x();
+      hy = mesh.getSpaceSteps().y();
+      hz = mesh.getSpaceSteps().z();
+   }
+   __shared__ volatile Real sArray[10][10][10];
+   sArray[thrk][thrj][thri] = TypeInfo< Real >::getMaxValue();
+   if( thri == 0 )
+   {
+      sArray[8][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
+      sArray[9][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
+      sArray[thrk+1][thrj+1][8] = TypeInfo< Real >::getMaxValue();
+      sArray[thrk+1][thrj+1][9] = TypeInfo< Real >::getMaxValue();
+      sArray[thrj+1][8][thrk+1] = TypeInfo< Real >::getMaxValue();
+      sArray[thrj+1][9][thrk+1] = TypeInfo< Real >::getMaxValue();
+   }
+
+   //filling sArray edges
+   int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y();
+   int dimZ = mesh.getDimensions().z();
+   __shared__ volatile int numOfBlockx;
+   __shared__ volatile int numOfBlocky;
+   __shared__ volatile int numOfBlockz;
+   __shared__ int xkolik;
+   __shared__ int ykolik;
+   __shared__ int zkolik;
+   if( thri == 0 && thrj == 0 && thrk == 0 )
+   {
+      xkolik = blockDim.x + 1;
+      ykolik = blockDim.y + 1;
+      zkolik = blockDim.z + 1;
+      numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0);
+      numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0);
+      numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0);
+
+      if( numOfBlockx - 1 == blIdx )
+         xkolik = dimX - (blIdx)*blockDim.x+1;
+
+      if( numOfBlocky - 1 == blIdy )
+         ykolik = dimY - (blIdy)*blockDim.y+1;
+      if( numOfBlockz - 1 == blIdz )
+         zkolik = dimZ - (blIdz)*blockDim.z+1;
+
+      BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0;
+   }
+   __syncthreads();
+
+   if( thri == 0 )
+   {
+      if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik )
+         sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX - 1 + thrk*dimX*dimY ];
+      else
+         sArray[thrk+1][thrj+1][0] = TypeInfo< Real >::getMaxValue();
+   }
+
+   if( thri == 1 )
+   {
+      if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik )
+         sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ];
+      else
+         sArray[thrk+1][thrj+1][9] = TypeInfo< Real >::getMaxValue();
+   }
+
+   if( thri == 2 )
+   {
+      if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik )
+         sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ];
+      else
+         sArray[thrk+1][0][thrj+1] = TypeInfo< Real >::getMaxValue();
+   }
+
+   if( thri == 3 )
+   {
+      if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik )
+         sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ];
+      else
+         sArray[thrk+1][9][thrj+1] = TypeInfo< Real >::getMaxValue();
+   }
+
+   if( thri == 4 )
+   {
+      if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik )
+         sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ];
+      else
+         sArray[0][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
+   }
+
+   if( thri == 5 )
+   {
+      if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik )
+         sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ];
+      else
+         sArray[9][thrj+1][thrk+1] = TypeInfo< Real >::getMaxValue();
+   }
 
    if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() )
    {
-      typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell;
-      Cell cell( mesh );
-      cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k;
-      cell.refresh();
-      //tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr;
-      for( int l = 0; l < 10; l++ )
+      sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ];
+   }
+   __shared__ volatile int loopcounter;
+   loopcounter = 0;
+   __syncthreads();
+   while( changed[ 0 ] )
+   {
+      __syncthreads();
+
+      changed[ currentIndex ] = false;
+
+      //calculation of update cell
+      if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ )
       {
-         if( ! interfaceMap( cell ) )
+         if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] )
          {
-            ptr.updateCell( aux, cell );
+            changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz);
          }
       }
+      __syncthreads();
+
+      //pyramid reduction
+      if( blockDim.x*blockDim.y*blockDim.z == 1024 )
+      {
+         if( currentIndex < 512 )
+         {
+            changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ];
+         }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 512 )
+      {
+         if( currentIndex < 256 )
+         {
+            changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ];
+         }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 256 )
+      {
+         if( currentIndex < 128 )
+         {
+            changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ];
+         }
+      }
+      __syncthreads();
+      if( blockDim.x*blockDim.y*blockDim.z >= 128 )
+      {
+         if( currentIndex < 64 )
+         {
+            changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ];
+         }
+      }
+      __syncthreads();
+      if( currentIndex < 32 ) //only plain ifs here - threads within a single warp run synchronously
+      {
+         if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ];
+         if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ];
+         if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ];
+         if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ];
+         if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ];
+         if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ];
+      }
+      __syncthreads();
+
+      /*if(thri == 0 && thrj == 0 && thrk == 0 && blIdx == 0 && blIdy == 0 && blIdz == 0)
+      {
+         for(int m = 0; m < 8; m++){
+            for(int n = 0; n < 8; n++){
+               for(int b = 0; b < 8; b++)
+                  printf(" %i ", changed[m*64 + n*8 + b]);
+               printf("\n");
+            }
+            printf("\n \n");
+         }
+      }*/
+      if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 )
+      {
+         //loopcounter++;
+         BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1;
+      }
+      __syncthreads();
+      /*if(thri == 0 && thrj == 0 && thrk == 0)
+         printf("%i \n", loopcounter);
+      if(loopcounter == 500)
+         break;*/
    }
-}
+
+   if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ]) )
+      aux[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ];
+}
 #endif
-- 
GitLab
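
Note on the discretization behind the new updateCell() overload: fast sweeping solves the eikonal equation $|\nabla u| = 1/v$ with $u = 0$ on the interface, using a Godunov-type upwind update. With $a$, $b$, $c$ the per-axis upwind neighbour values picked by TNL::argAbsMin() and $h_x$, $h_y$, $h_z$ the space steps, the 3D branch is the closed-form root of

   $\left(\frac{u-a}{h_x}\right)^2 + \left(\frac{u-b}{h_y}\right)^2 + \left(\frac{u-c}{h_z}\right)^2 = \frac{1}{v^2}$,

which is exactly the long quotient in the patch. The 1D and 2D branches, tried first via the sorted pom array, drop the terms whose neighbour values are too large for the quadratic to have an admissible (causal) root. The sign( value ) factor keeps the update on the correct side of the interface for signed distance functions, and the fabs( tmp ) > 0.01*hx test is the per-cell convergence criterion: a cell counts as changed only when its value moved by more than 1% of the grid step.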
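
The host-side while( BlockIterD ) loop together with the CudaParallelReduc calls implements a device-wide convergence test: each block OR-reduces its threads' changed flags in shared memory, publishes one int per block into BlockIterDevice, and the host relaunches the sweep until the global OR drops to zero. A minimal standalone sketch of this pattern follows (hypothetical names, not TNL code; the real kernels additionally perform the eikonal updates):

   #include <cstdio>
   #include <cuda_runtime.h>

   // Each block OR-reduces its threads' "changed" flags and publishes one int;
   // the host loops until no block reports a change.
   __global__ void sweepKernel( int* blockChanged )
   {
      __shared__ bool changed[ 256 ];                // blockDim.x == 256 assumed
      int t = threadIdx.x;
      changed[ t ] = false;
      // ... a real solver would update its cells here and set
      //     changed[ t ] = true whenever a value improves ...
      __syncthreads();
      for( int s = blockDim.x / 2; s > 0; s >>= 1 )  // pyramid OR-reduction
      {
         if( t < s )
            changed[ t ] = changed[ t ] || changed[ t + s ];
         __syncthreads();
      }
      if( t == 0 )
         blockChanged[ blockIdx.x ] = changed[ 0 ] ? 1 : 0;
   }

   __global__ void orReduce( const int* in, int* out, int n )
   {
      __shared__ int s[ 256 ];                       // single 256-thread block
      int t = threadIdx.x, acc = 0;
      for( int k = t; k < n; k += blockDim.x )       // strided gather of block flags
         acc |= in[ k ];
      s[ t ] = acc;
      __syncthreads();
      for( int stride = blockDim.x / 2; stride > 0; stride >>= 1 )
      {
         if( t < stride )
            s[ t ] |= s[ t + stride ];
         __syncthreads();
      }
      if( t == 0 )
         *out = s[ 0 ];
   }

   int main()
   {
      const int nBlocks = 64;
      int *blockChanged, *anyChangedDev, anyChanged = 1, iterations = 0;
      cudaMalloc( &blockChanged, nBlocks * sizeof( int ) );
      cudaMalloc( &anyChangedDev, sizeof( int ) );
      while( anyChanged )
      {
         sweepKernel<<< nBlocks, 256 >>>( blockChanged );
         orReduce<<< 1, 256 >>>( blockChanged, anyChangedDev, nBlocks );
         cudaMemcpy( &anyChanged, anyChangedDev, sizeof( int ), cudaMemcpyDeviceToHost );
         iterations++;
      }
      printf( "converged after %d sweeps\n", iterations );
      cudaFree( blockChanged );
      cudaFree( anyChangedDev );
      return 0;
   }

Reading back a single int per iteration keeps host-device traffic negligible. The sketch replaces the patch's hand-unrolled currentIndex < 32 tail, which relies on implicit warp-synchronous execution, with a plain loop: warp-synchronous programming without __syncwarp() is deprecated on recent architectures.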
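
The sArray tiles in both kernels follow the standard shared-memory halo pattern: a 16x16 (2D) or 8x8x8 (3D) block caches its own cells plus one ghost layer, so updateCell() can read every neighbour without further global-memory traffic, with a few thread rows repurposed to fetch the ghost layers. A compilable 2D sketch of the idea, under the same assumptions (hypothetical names, simplified bounds handling; corner ghosts are skipped because the axis-aligned stencil never reads them):

   #include <cstdio>
   #include <cuda_runtime.h>

   // An 8x8 thread block caches its cells plus a one-cell halo in a 10x10 tile,
   // mirroring the sArray layout used by the kernels in this patch.
   __global__ void haloLoad2D( const float* u, float* out, int dimX, int dimY )
   {
      __shared__ float tile[ 10 ][ 10 ];
      const float UNKNOWN = 1.0e30f;              // stand-in for "no value yet"
      int tx = threadIdx.x, ty = threadIdx.y;
      int i = blockIdx.x * 8 + tx;
      int j = blockIdx.y * 8 + ty;

      // interior cell of this block
      tile[ ty + 1 ][ tx + 1 ] = ( i < dimX && j < dimY ) ? u[ j * dimX + i ] : UNKNOWN;

      // halo columns/rows, loaded by the edge threads of the block
      if( tx == 0 )
         tile[ ty + 1 ][ 0 ] = ( i > 0 && j < dimY ) ? u[ j * dimX + i - 1 ] : UNKNOWN;
      if( tx == 7 )
         tile[ ty + 1 ][ 9 ] = ( i + 1 < dimX && j < dimY ) ? u[ j * dimX + i + 1 ] : UNKNOWN;
      if( ty == 0 )
         tile[ 0 ][ tx + 1 ] = ( j > 0 && i < dimX ) ? u[ ( j - 1 ) * dimX + i ] : UNKNOWN;
      if( ty == 7 )
         tile[ 9 ][ tx + 1 ] = ( j + 1 < dimY && i < dimX ) ? u[ ( j + 1 ) * dimX + i ] : UNKNOWN;
      __syncthreads();

      // ... local sweeps over the tile would run here ...

      if( i < dimX && j < dimY )                  // write the interior back
         out[ j * dimX + i ] = tile[ ty + 1 ][ tx + 1 ];
   }

   int main()
   {
      const int dimX = 16, dimY = 16;
      float host[ dimX * dimY ], *u, *out;
      for( int p = 0; p < dimX * dimY; p++ ) host[ p ] = (float) p;
      cudaMalloc( &u, sizeof( host ) );
      cudaMalloc( &out, sizeof( host ) );
      cudaMemcpy( u, host, sizeof( host ), cudaMemcpyHostToDevice );
      haloLoad2D<<< dim3( 2, 2 ), dim3( 8, 8 ) >>>( u, out, dimX, dimY );
      cudaMemcpy( host, out, sizeof( host ), cudaMemcpyDeviceToHost );
      printf( "%f\n", host[ 17 ] );               // expect 17.0
      cudaFree( u );
      cudaFree( out );
      return 0;
   }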