diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h index 0f45be71c87b69c6b76c283a5e8fa970c45b816b..cbb1a1ff6ac31757e49d58a4bf8997a908f34fc9 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h @@ -74,12 +74,16 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > const MeshEntity& cell, const RealType velocity = 1.0 ); - __cuda_callable__ bool updateCell( volatile Real sArray[18][18], + template< int sizeSArray > + __cuda_callable__ bool updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy, const Real velocity = 1.0 ); + + template< int sizeSArray > void updateBlocks( InterfaceMapType interfaceMap, MeshFunctionType aux, - ArrayContainer BlockIterHost, int numThreadsPerBlock ); + MeshFunctionType helpFunc, + ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ); void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY ); }; @@ -119,9 +123,6 @@ T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double template < typename T1 > __cuda_callable__ void sortMinims( T1 pom[] ); -template < typename Index > -void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY ); - #ifdef HAVE_CUDA template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, @@ -134,15 +135,12 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& aux, bool *BlockIterDevice ); -template < typename Real, typename Device, typename Index > +template < int sizeSArray, typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, -<<<<<<< HEAD - TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 ); -======= - TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ); ->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf + const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, + TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock =0); template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h index 1f9fc5eeba264cd9e9c335a43419804ab415a31b..95971c9b8707f53167135d95a855d68a853b2bd0 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h @@ -1,4 +1,4 @@ - /* +/* * File: tnlDirectEikonalMethodsBase_impl.h * Author: oberhuber * @@ -13,233 +13,259 @@ #include "tnlFastSweepingMethod.h" template< typename Real, - typename Device, - typename Index > + typename Device, + typename Index > void tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: initInterface( const MeshFunctionPointer& _input, - MeshFunctionPointer& _output, - InterfaceMapPointer& _interfaceMap ) + MeshFunctionPointer& _output, + InterfaceMapPointer& _interfaceMap ) { - if( std::is_same< Device, Devices::Cuda >::value ) - { + if( std::is_same< Device, Devices::Cuda >::value ) + { #ifdef HAVE_CUDA - const MeshType& mesh = _input->getMesh(); - - const int cudaBlockSize( 16 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); - dim3 blockSize( cudaBlockSize ); - dim3 gridSize( numBlocksX ); - Devices::Cuda::synchronizeDevice(); - CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), - _output.template modifyData< Device >(), - _interfaceMap.template modifyData< Device >() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + const MeshType& mesh = _input->getMesh(); + + const int cudaBlockSize( 16 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + dim3 blockSize( cudaBlockSize ); + dim3 gridSize( numBlocksX ); + Devices::Cuda::synchronizeDevice(); + CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), + _output.template modifyData< Device >(), + _interfaceMap.template modifyData< Device >() ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; #endif - } - if( std::is_same< Device, Devices::Host >::value ) - { - const MeshType& mesh = _input->getMesh(); - typedef typename MeshType::Cell Cell; - const MeshFunctionType& input = _input.getData(); - MeshFunctionType& output = _output.modifyData(); - InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); - Cell cell( mesh ); - for( cell.getCoordinates().x() = 0; + } + if( std::is_same< Device, Devices::Host >::value ) + { + const MeshType& mesh = _input->getMesh(); + typedef typename MeshType::Cell Cell; + const MeshFunctionType& input = _input.getData(); + MeshFunctionType& output = _output.modifyData(); + InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); + Cell cell( mesh ); + for( cell.getCoordinates().x() = 0; cell.getCoordinates().x() < mesh.getDimensions().x(); cell.getCoordinates().x() ++ ) - { - cell.refresh(); - output[ cell.getIndex() ] = - input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : - -std::numeric_limits< RealType >::max(); - interfaceMap[ cell.getIndex() ] = false; - } - - - const RealType& h = mesh.getSpaceSteps().x(); - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x() - 1; - cell.getCoordinates().x() ++ ) + { + cell.refresh(); + output[ cell.getIndex() ] = + input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : + -std::numeric_limits< RealType >::max(); + interfaceMap[ cell.getIndex() ] = false; + } + + + const RealType& h = mesh.getSpaceSteps().x(); + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x() - 1; + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + const RealType& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + const auto& neighbors = cell.getNeighborEntities(); + Real pom = 0; + //const IndexType& c = cell.getIndex(); + const IndexType e = neighbors.template getEntityIndex< 1 >(); + if( c * input[ e ] <= 0 ) { - cell.refresh(); - const RealType& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - const auto& neighbors = cell.getNeighborEntities(); - Real pom = 0; - //const IndexType& c = cell.getIndex(); - const IndexType e = neighbors.template getEntityIndex< 1 >(); - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( h * c )/( c - input[ e ]); - if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) - output[ cell.getIndex() ] = pom; - - pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; - if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) - output[ e ] = pom; - - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ e ] = true; - } - } + pom = TNL::sign( c )*( h * c )/( c - input[ e ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + + pom = pom - TNL::sign( c )*h; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; + if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) + output[ e ] = pom; + + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ e ] = true; } + } } + } } template< typename Real, - typename Device, - typename Index > -void + typename Device, + typename Index > +template< int sizeSArray > +void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: updateBlocks( InterfaceMapType interfaceMap, - MeshFunctionType aux, - ArrayContainer BlockIterHost, int numThreadsPerBlock ) + MeshFunctionType aux, + MeshFunctionType helpFunc, + ArrayContainer BlockIterHost, int numThreadsPerBlock/*, Real **sArray*/ ) { +#pragma omp parallel for schedule( dynamic ) for( int i = 0; i < BlockIterHost.getSize(); i++ ) { if( BlockIterHost[ i ] ) { MeshType mesh = interfaceMap.template getMesh< Devices::Host >(); - + int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); - int numOfBlockx = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0); - int numOfBlocky = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0); + //std::cout << "dimX = " << dimX << " ,dimY = " << dimY << std::endl; + int numOfBlocky = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0); + int numOfBlockx = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0); + //std::cout << "numOfBlockx = " << numOfBlockx << " ,numOfBlocky = " << numOfBlocky << std::endl; int xkolik = numThreadsPerBlock + 1; int ykolik = numThreadsPerBlock + 1; int blIdx = i%numOfBlockx; - int blIdy = i/numOfBlocky; + int blIdy = i/numOfBlockx; + //std::cout << "blIdx = " << blIdx << " ,blIdy = " << blIdy << std::endl; if( numOfBlockx - 1 == blIdx ) xkolik = dimX - (blIdx)*numThreadsPerBlock+1; if( numOfBlocky -1 == blIdy ) ykolik = dimY - (blIdy)*numThreadsPerBlock+1; - - + //std::cout << "xkolik = " << xkolik << " ,ykolik = " << ykolik << std::endl; + + /*bool changed[numThreadsPerBlock*numThreadsPerBlock]; - changed[ 0 ] = 1;*/ + changed[ 0 ] = 1;*/ Real hx = mesh.getSpaceSteps().x(); Real hy = mesh.getSpaceSteps().y(); - Real changed1[ 16*16 ]; - /*Real changed2[ 16*16 ]; - Real changed3[ 16*16 ]; - Real changed4[ 16*16 ];*/ - Real sArray[18][18]; + bool changed = false; + + + RealType *sArray; + sArray = new Real[ sizeSArray * sizeSArray ]; + if( sArray == nullptr ) + std::cout << "Error while allocating memory for sArray." << std::endl; + + for( int thri = 0; thri < sizeSArray; thri++ ){ + for( int thrj = 0; thrj < sizeSArray; thrj++ ) + sArray/*[i]*/[ thri * sizeSArray + thrj ] = std::numeric_limits< Real >::max(); + } - for( int thri = 0; thri < numThreadsPerBlock + 2; thri++ ) - for( int thrj = 0; thrj < numThreadsPerBlock + 2; thrj++ ) - sArray[thrj][thri] = std::numeric_limits< Real >::max(); - BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0; - + for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ ) { if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik ) - sArray[thrj+1][xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ]; - else - sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max(); - - + sArray/*[i]*/[ ( thrj+1 )* sizeSArray +xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ]; + + if( blIdx != 0 && thrj+1 < ykolik ) - sArray[thrj+1][0] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ]; - else - sArray[thrj+1][0] = std::numeric_limits< Real >::max(); - + sArray/*[i]*/[(thrj+1)* sizeSArray] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ]; + if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik ) - sArray[ykolik][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ]; - else - sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max(); - + sArray/*[i]*/[ykolik * sizeSArray + thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ]; + if( blIdy != 0 && thrj+1 < xkolik ) - sArray[0][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ]; - else - sArray[0][thrj+1] = std::numeric_limits< Real >::max(); + sArray/*[i]*/[thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ]; } - - for( int k = 0; k < numThreadsPerBlock; k++ ) - for( int l = 0; l < numThreadsPerBlock; l++ ) - sArray[k+1][l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]; - - for( int k = 0; k < numThreadsPerBlock; k++ ) + + for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) + sArray/*[i]*/[(k+1) * sizeSArray + l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]; + } + bool pom = false; + for( int k = 0; k < numThreadsPerBlock; k++ ){ for( int l = 0; l < numThreadsPerBlock; l++ ){ - changed1[ k*numThreadsPerBlock + l ] = 0; - /*changed2[ k*numThreadsPerBlock + l ] = 0; - changed3[ k*numThreadsPerBlock + l ] = 0; - changed4[ k*numThreadsPerBlock + l ] = 0;*/ - if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) - { + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ){ + //std::cout << "proslo i = " << k * numThreadsPerBlock + l << std::endl; if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) { - changed1[ k*numThreadsPerBlock + l ] = this->updateCell( sArray, l+1, k+1, hx,hy); + pom = this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy); + changed = changed || pom; } } } - - for( int k = numThreadsPerBlock-1; k > -1; k-- ) - for( int l = 0; l < numThreadsPerBlock; l++ ) { - if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) + } + /*aux.save( "aux-1pruch.tnl" ); + for( int k = 0; k < sizeSArray; k++ ){ + for( int l = 0; l < sizeSArray; l++ ) { + std::cout << sArray[ k * sizeSArray + l] << " "; + } + std::cout << std::endl; + }*/ + + for( int k = 0; k < numThreadsPerBlock; k++ ) + for( int l = numThreadsPerBlock-1; l >-1; l-- ) { + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) { if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) { - /*changed2[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy); + this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy); } } } - - for( int k = 0; k < numThreadsPerBlock; k++ ) - for( int l = numThreadsPerBlock-1; l >-1; l-- ) { - if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) + /*aux.save( "aux-2pruch.tnl" ); + for( int k = 0; k < sizeSArray; k++ ){ + for( int l = 0; l < sizeSArray; l++ ) { + std::cout << sArray[ k * sizeSArray + l] << " "; + } + std::cout << std::endl; + }*/ + + for( int k = numThreadsPerBlock-1; k > -1; k-- ) + for( int l = 0; l < numThreadsPerBlock; l++ ) { + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) { if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) { - /*changed3[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy); + this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy); } } } - - for( int k = numThreadsPerBlock-1; k > -1; k-- ) + /*aux.save( "aux-3pruch.tnl" ); + for( int k = 0; k < sizeSArray; k++ ){ + for( int l = 0; l < sizeSArray; l++ ) { + std::cout << sArray[ k * sizeSArray + l] << " "; + } + std::cout << std::endl; + }*/ + + for( int k = numThreadsPerBlock-1; k > -1; k-- ){ for( int l = numThreadsPerBlock-1; l >-1; l-- ) { - if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) { if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) { - /*changed4[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy); + this->updateCell<sizeSArray>( sArray/*[i]*/, l+1, k+1, hx,hy); } } } - - for( int k = numThreadsPerBlock-1; k > -1; k-- ) - for( int l = numThreadsPerBlock-1; l >-1; l-- ){ - changed1[ 0 ] = changed1[ 0 ] || changed1[ k*numThreadsPerBlock + l ]; - /*changed2[ 0 ] = changed2[ 0 ] || changed2[ k*numThreadsPerBlock + l ]; - changed3[ 0 ] = changed3[ 0 ] || changed3[ k*numThreadsPerBlock + l ]; - changed4[ 0 ] = changed4[ 0 ] || changed4[ k*numThreadsPerBlock + l ];*/ + } + /*aux.save( "aux-4pruch.tnl" ); + for( int k = 0; k < sizeSArray; k++ ){ + for( int l = 0; l < sizeSArray; l++ ) { + std::cout << sArray[ k * sizeSArray + l] << " "; } + std::cout << std::endl; + }*/ + - if( changed1[ 0 ] /*|| changed2[ 0 ] ||changed3[ 0 ] ||changed4[ 0 ]*/ ) + if( changed ){ BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1; - + } + + for( int k = 0; k < numThreadsPerBlock; k++ ){ - for( int l = 0; l < numThreadsPerBlock; l++ ) { - if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY && - (!interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]) ) - aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray[ k + 1 ][ l + 1 ]; + for( int l = 0; l < numThreadsPerBlock; l++ ) { + if( blIdy * numThreadsPerBlock + k < dimY && blIdx * numThreadsPerBlock + l < dimX ) + helpFunc[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray/*[i]*/[ (k + 1)* sizeSArray + l + 1 ]; //std::cout<< sArray[k+1][l+1]; } //std::cout<<std::endl; } + //delete []sArray; } } } template< typename Real, - typename Device, - typename Index > + typename Device, + typename Index > void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY ) @@ -249,643 +275,643 @@ getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY ) for(int i = 0; i < numBlockX * numBlockY; i++) { - BlockIterPom[ i ] = 0; - if( BlockIterHost[ i ] ) - { - // i = k*numBlockY + m; - int m=0, k=0; - m = i%numBlockX; - k = i/numBlockX; - if( k > 0 ) - BlockIterPom[i - numBlockX] = 1; - if( k < numBlockY - 1 ) - BlockIterPom[i + numBlockX] = 1; - - if( m < numBlockX - 1 ) - BlockIterPom[ i+1 ] = 1; - if( m > 0 ) - BlockIterPom[ i-1 ] = 1; + BlockIterPom[ i ] = 0;//BlockIterPom[ i ] = 0; + int m=0, k=0; + m = i%numBlockX; + k = i/numBlockX; + if( m > 0 && BlockIterHost[ i - 1 ] ){ + BlockIterPom[ i ] = 1; + }else if( m < numBlockX -1 && BlockIterHost[ i + 1 ] ){ + BlockIterPom[ i ] = 1; + }else if( k > 0 && BlockIterHost[ i - numBlockX ] ){ + BlockIterPom[ i ] = 1; + }else if( k < numBlockY -1 && BlockIterHost[ i + numBlockX ] ){ + BlockIterPom[ i ] = 1; } + //BlockIterPom[ i ]; } - for(int i = 0; i < numBlockX * numBlockY; i++ ) - //if( !BlockIter[ i ] ) - BlockIterHost[ i ] = BlockIterPom[ i ]; - /*else - BlockIter[ i ] = 0;*/ - /*for( int i = numBlockX-1; i > -1; i-- ) + + for(int i = 0; i < numBlockX * numBlockY; i++) { - for( int j = 0; j< numBlockY; j++ ) - std::cout << BlockIterHost[ i*numBlockY + j ]; - std::cout << std::endl; + if( !BlockIterHost[ i ] ) + BlockIterHost[ i ] = BlockIterPom[ i ]; } - std::cout << std::endl;*/ + /*else + BlockIter[ i ] = 0;*/ + /*for( int i = numBlockX-1; i > -1; i-- ) + { + for( int j = 0; j< numBlockY; j++ ) + std::cout << BlockIterHost[ i*numBlockY + j ]; + std::cout << std::endl; + } + std::cout << std::endl;*/ delete[] BlockIterPom; } template< typename Real, - typename Device, - typename Index > - template< typename MeshEntity > + typename Device, + typename Index > +template< typename MeshEntity > void tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: updateCell( MeshFunctionType& u, - const MeshEntity& cell, - const RealType v ) + const MeshEntity& cell, + const RealType v ) { - const auto& neighborEntities = cell.template getNeighborEntities< 1 >(); - const MeshType& mesh = cell.getMesh(); - const RealType& h = mesh.getSpaceSteps().x(); - const RealType value = u( cell ); - RealType a, tmp = std::numeric_limits< RealType >::max(); - - if( cell.getCoordinates().x() == 0 ) - a = u[ neighborEntities.template getEntityIndex< 1 >() ]; - else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) - a = u[ neighborEntities.template getEntityIndex< -1 >() ]; - else - { - a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ], - u[ neighborEntities.template getEntityIndex< 1 >() ] ); - } - - if( fabs( a ) == std::numeric_limits< RealType >::max() ) - return; - - tmp = a + TNL::sign( value ) * h/v; - - u[ cell.getIndex() ] = argAbsMin( value, tmp ); + const auto& neighborEntities = cell.template getNeighborEntities< 1 >(); + const MeshType& mesh = cell.getMesh(); + const RealType& h = mesh.getSpaceSteps().x(); + const RealType value = u( cell ); + RealType a, tmp = std::numeric_limits< RealType >::max(); + + if( cell.getCoordinates().x() == 0 ) + a = u[ neighborEntities.template getEntityIndex< 1 >() ]; + else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) + a = u[ neighborEntities.template getEntityIndex< -1 >() ]; + else + { + a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1 >() ], + u[ neighborEntities.template getEntityIndex< 1 >() ] ); + } + + if( fabs( a ) == std::numeric_limits< RealType >::max() ) + return; + + tmp = a + TNL::sign( value ) * h/v; + + u[ cell.getIndex() ] = argAbsMin( value, tmp ); } template< typename Real, - typename Device, - typename Index > + typename Device, + typename Index > void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: initInterface( const MeshFunctionPointer& _input, - MeshFunctionPointer& _output, - InterfaceMapPointer& _interfaceMap ) + MeshFunctionPointer& _output, + InterfaceMapPointer& _interfaceMap ) { - - if( std::is_same< Device, Devices::Cuda >::value ) - { + + if( std::is_same< Device, Devices::Cuda >::value ) + { #ifdef HAVE_CUDA - const MeshType& mesh = _input->getMesh(); - - const int cudaBlockSize( 16 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); - dim3 blockSize( cudaBlockSize, cudaBlockSize ); - dim3 gridSize( numBlocksX, numBlocksY ); - Devices::Cuda::synchronizeDevice(); - CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), - _output.template modifyData< Device >(), - _interfaceMap.template modifyData< Device >() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + const MeshType& mesh = _input->getMesh(); + + const int cudaBlockSize( 16 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); + dim3 blockSize( cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY ); + Devices::Cuda::synchronizeDevice(); + CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), + _output.template modifyData< Device >(), + _interfaceMap.template modifyData< Device >() ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; #endif - } - if( std::is_same< Device, Devices::Host >::value ) - { - MeshFunctionType input = _input.getData(); - - /*double A[320][320]; - std::ifstream fileInit("/home/maty/Downloads/initData.txt"); - - for (int i = 0; i < 320; i++) - for (int j = 0; j < 320; j++) - fileInit >> A[i][j]; - fileInit.close(); - for (int i = 0; i < 320; i++) - for (int j = 0; j < 320; j++) - input[i*320 + j] = A[i][j];*/ - - - MeshFunctionType& output = _output.modifyData(); - InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); - const MeshType& mesh = input.getMesh(); - typedef typename MeshType::Cell Cell; - Cell cell( mesh ); - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh.getDimensions().y(); - cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - output[ cell.getIndex() ] = - input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : - - std::numeric_limits< RealType >::max(); - interfaceMap[ cell.getIndex() ] = false; - } - - const RealType& hx = mesh.getSpaceSteps().x(); - const RealType& hy = mesh.getSpaceSteps().y(); - for( cell.getCoordinates().y() = 0; + } + if( std::is_same< Device, Devices::Host >::value ) + { + MeshFunctionType input = _input.getData(); + + /*double A[320][320]; + std::ifstream fileInit("/home/maty/Downloads/initData.txt"); + + for (int i = 0; i < 320; i++) + for (int j = 0; j < 320; j++) + fileInit >> A[j]; + fileInit.close(); + for (int i = 0; i < 320; i++) + for (int j = 0; j < 320; j++) + input[i*320 + j] = A[j];*/ + + + MeshFunctionType& output = _output.modifyData(); + InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); + const MeshType& mesh = input.getMesh(); + typedef typename MeshType::Cell Cell; + Cell cell( mesh ); + for( cell.getCoordinates().y() = 0; cell.getCoordinates().y() < mesh.getDimensions().y(); cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); - cell.getCoordinates().x() ++ ) + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + output[ cell.getIndex() ] = + input( cell ) >= 0 ? std::numeric_limits< RealType >::max() : + - std::numeric_limits< RealType >::max(); + interfaceMap[ cell.getIndex() ] = false; + } + + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh.getDimensions().y(); + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + const RealType& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const IndexType e = neighbors.template getEntityIndex< 1, 0 >(); + const IndexType n = neighbors.template getEntityIndex< 0, 1 >(); + //Try init with exact data: + /*if( c * input[ n ] <= 0 ) + { + output[ cell.getIndex() ] = c; + output[ n ] = input[ n ]; + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ n ] = true; + } + if( c * input[ e ] <= 0 ) + { + output[ cell.getIndex() ] = c; + output[ e ] = input[ e ]; + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ e ] = true; + }*/ + if( c * input[ n ] <= 0 ) { - cell.refresh(); - const RealType& c = input( cell ); - if( ! cell.isBoundaryEntity() ) + /*if( c >= 0 ) + {*/ + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + pom = pom - TNL::sign( c )*hy; + if( TNL::abs( output[ n ] ) > TNL::abs( pom ) ) + output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy; + /*}else { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const IndexType e = neighbors.template getEntityIndex< 1, 0 >(); - const IndexType n = neighbors.template getEntityIndex< 0, 1 >(); - //Try init with exact data: - /*if( c * input[ n ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ n ] = input[ n ]; - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ n ] = true; - } - if( c * input[ e ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ e ] = input[ e ]; - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ e ] = true; - }*/ - if( c * input[ n ] <= 0 ) - { - /*if( c >= 0 ) - {*/ - pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); - if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) - output[ cell.getIndex() ] = pom; - pom = pom - TNL::sign( c )*hy; - if( TNL::abs( output[ n ] ) > TNL::abs( pom ) ) - output[ n ] = pom; //( hy * c )/( c - input[ n ]) - hy; - /*}else - { - pom = - ( hy * c )/( c - input[ n ]); - if( output[ cell.getIndex() ] < pom ) - output[ cell.getIndex() ] = pom; - if( output[ n ] > hy + pom ) - output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]); - }*/ - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ n ] = true; - } - if( c * input[ e ] <= 0 ) - { - /*if( c >= 0 ) - {*/ - pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); - if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) - output[ cell.getIndex() ] = pom; - - pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; - if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) - output[ e ] = pom; - /*}else - { - pom = - (hx * c)/( c - input[ e ]); - if( output[ cell.getIndex() ] < pom ) - output[ cell.getIndex() ] = pom; - - pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]); - if( output[ e ] > pom ) - output[ e ] = pom; - }*/ - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ e ] = true; - } - } + pom = - ( hy * c )/( c - input[ n ]); + if( output[ cell.getIndex() ] < pom ) + output[ cell.getIndex() ] = pom; + if( output[ n ] > hy + pom ) + output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]); + }*/ + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ n ] = true; } + if( c * input[ e ] <= 0 ) + { + /*if( c >= 0 ) + {*/ + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) + output[ cell.getIndex() ] = pom; + + pom = pom - TNL::sign( c )*hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; + if( TNL::abs( output[ e ] ) > TNL::abs( pom ) ) + output[ e ] = pom; + /*}else + { + pom = - (hx * c)/( c - input[ e ]); + if( output[ cell.getIndex() ] < pom ) + output[ cell.getIndex() ] = pom; + + pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]); + if( output[ e ] > pom ) + output[ e ] = pom; + }*/ + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ e ] = true; + } + } } + } } template< typename Real, - typename Device, - typename Index > - template< typename MeshEntity > + typename Device, + typename Index > +template< typename MeshEntity > __cuda_callable__ void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: updateCell( MeshFunctionType& u, - const MeshEntity& cell, - const RealType v) + const MeshEntity& cell, + const RealType v) { - const auto& neighborEntities = cell.template getNeighborEntities< 2 >(); - const MeshType& mesh = cell.getMesh(); - const RealType& hx = mesh.getSpaceSteps().x(); - const RealType& hy = mesh.getSpaceSteps().y(); - const RealType value = u( cell ); - RealType a, b, tmp = std::numeric_limits< RealType >::max(); - - if( cell.getCoordinates().x() == 0 ) - a = u[ neighborEntities.template getEntityIndex< 1, 0 >() ]; - else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) - a = u[ neighborEntities.template getEntityIndex< -1, 0 >() ]; - else - { - a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0 >() ], - u[ neighborEntities.template getEntityIndex< 1, 0 >() ] ); - } - - if( cell.getCoordinates().y() == 0 ) - b = u[ neighborEntities.template getEntityIndex< 0, 1 >()]; - else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) - b = u[ neighborEntities.template getEntityIndex< 0, -1 >() ]; - else - { - b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1 >() ], - u[ neighborEntities.template getEntityIndex< 0, 1 >() ] ); - } - if( fabs( a ) == std::numeric_limits< RealType >::max() && - fabs( b ) == std::numeric_limits< RealType >::max() ) - return; - /*if( fabs( a ) == TypeInfo< Real >::getMaxValue() || - fabs( b ) == TypeInfo< Real >::getMaxValue() || - fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) ) + const auto& neighborEntities = cell.template getNeighborEntities< 2 >(); + const MeshType& mesh = cell.getMesh(); + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + const RealType value = u( cell ); + RealType a, b, tmp = std::numeric_limits< RealType >::max(); + + if( cell.getCoordinates().x() == 0 ) + a = u[ neighborEntities.template getEntityIndex< 1, 0 >() ]; + else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) + a = u[ neighborEntities.template getEntityIndex< -1, 0 >() ]; + else + { + a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0 >() ], + u[ neighborEntities.template getEntityIndex< 1, 0 >() ] ); + } + + if( cell.getCoordinates().y() == 0 ) + b = u[ neighborEntities.template getEntityIndex< 0, 1 >()]; + else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) + b = u[ neighborEntities.template getEntityIndex< 0, -1 >() ]; + else + { + b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1 >() ], + u[ neighborEntities.template getEntityIndex< 0, 1 >() ] ); + } + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() ) + return; + /*if( fabs( a ) == TypeInfo< Real >::getMaxValue() || + fabs( b ) == TypeInfo< Real >::getMaxValue() || + fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) ) { - tmp = - fabs( a ) >= fabs( b ) ? b + TNL::sign( value ) * hy : - a + TNL::sign( value ) * hx; + tmp = + fabs( a ) >= fabs( b ) ? b + TNL::sign( value ) * hy : + a + TNL::sign( value ) * hx; }*/ - /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() && - fabs( b ) != TypeInfo< Real >::getMaxValue() && - fabs( a - b ) < TNL::sqrt( (hx * hx + hy * hy)/v ) ) + /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() && + fabs( b ) != TypeInfo< Real >::getMaxValue() && + fabs( a - b ) < TNL::sqrt( (hx * hx + hy * hy)/v ) ) { - tmp = ( hx * hx * b + hy * hy * a + - sign( value ) * hx * hy * TNL::sqrt( ( hx * hx + hy * hy )/v - - ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy ); - u[ cell.getIndex() ] = tmp; + tmp = ( hx * hx * b + hy * hy * a + + sign( value ) * hx * hy * TNL::sqrt( ( hx * hx + hy * hy )/v - + ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy ); + u[ cell.getIndex() ] = tmp; } else { - tmp = - fabs( a ) > fabs( b ) ? b + TNL::sign( value ) * hy/v : - a + TNL::sign( value ) * hx/v; - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - //tmp = TypeInfo< RealType >::getMaxValue(); + tmp = + fabs( a ) > fabs( b ) ? b + TNL::sign( value ) * hy/v : + a + TNL::sign( value ) * hx/v; + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + //tmp = TypeInfo< RealType >::getMaxValue(); }*/ - RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; - sortMinims( pom ); - tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; - - - if( fabs( tmp ) < fabs( pom[ 1 ] ) ) - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - else - { - tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + + RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; + sortMinims( pom ); + tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; + + + if( fabs( tmp ) < fabs( pom[ 1 ] ) ) + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + else + { + tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - } + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + } } template< typename Real, - typename Device, - typename Index > + typename Device, + typename Index > void tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: initInterface( const MeshFunctionPointer& _input, - MeshFunctionPointer& _output, - InterfaceMapPointer& _interfaceMap ) + MeshFunctionPointer& _output, + InterfaceMapPointer& _interfaceMap ) { - if( std::is_same< Device, Devices::Cuda >::value ) - { + if( std::is_same< Device, Devices::Cuda >::value ) + { #ifdef HAVE_CUDA - const MeshType& mesh = _input->getMesh(); - - const int cudaBlockSize( 8 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); - int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize ); - if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) - std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; - dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); - dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); - Devices::Cuda::synchronizeDevice(); - CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(), - _output.template modifyData< Device >(), - _interfaceMap.template modifyData< Device >() ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + const MeshType& mesh = _input->getMesh(); + + const int cudaBlockSize( 8 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); + int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize ); + if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) + std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; + dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); + Devices::Cuda::synchronizeDevice(); + CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(), + _output.template modifyData< Device >(), + _interfaceMap.template modifyData< Device >() ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; #endif - } - if( std::is_same< Device, Devices::Host >::value ) - { - const MeshFunctionType& input = _input.getData(); - MeshFunctionType& output = _output.modifyData(); - InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); - const MeshType& mesh = input.getMesh(); - typedef typename MeshType::Cell Cell; - Cell cell( mesh ); - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh.getDimensions().z(); - cell.getCoordinates().z() ++ ) - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh.getDimensions().y(); - cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); - cell.getCoordinates().x() ++ ) - { - cell.refresh(); - output[ cell.getIndex() ] = - input( cell ) > 0 ? std::numeric_limits< RealType >::max() : - - std::numeric_limits< RealType >::max(); - interfaceMap[ cell.getIndex() ] = false; - } - - const RealType& hx = mesh.getSpaceSteps().x(); - const RealType& hy = mesh.getSpaceSteps().y(); - const RealType& hz = mesh.getSpaceSteps().z(); - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh.getDimensions().z(); - cell.getCoordinates().z() ++ ) - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh.getDimensions().y(); - cell.getCoordinates().y() ++ ) - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh.getDimensions().x(); - cell.getCoordinates().x() ++ ) + } + if( std::is_same< Device, Devices::Host >::value ) + { + const MeshFunctionType& input = _input.getData(); + MeshFunctionType& output = _output.modifyData(); + InterfaceMapType& interfaceMap = _interfaceMap.modifyData(); + const MeshType& mesh = input.getMesh(); + typedef typename MeshType::Cell Cell; + Cell cell( mesh ); + for( cell.getCoordinates().z() = 0; + cell.getCoordinates().z() < mesh.getDimensions().z(); + cell.getCoordinates().z() ++ ) + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh.getDimensions().y(); + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + output[ cell.getIndex() ] = + input( cell ) > 0 ? std::numeric_limits< RealType >::max() : + - std::numeric_limits< RealType >::max(); + interfaceMap[ cell.getIndex() ] = false; + } + + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + const RealType& hz = mesh.getSpaceSteps().z(); + for( cell.getCoordinates().z() = 0; + cell.getCoordinates().z() < mesh.getDimensions().z(); + cell.getCoordinates().z() ++ ) + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh.getDimensions().y(); + cell.getCoordinates().y() ++ ) + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh.getDimensions().x(); + cell.getCoordinates().x() ++ ) + { + cell.refresh(); + const RealType& c = input( cell ); + if( ! cell.isBoundaryEntity() ) + { + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const IndexType e = neighbors.template getEntityIndex< 1, 0, 0 >(); + const IndexType n = neighbors.template getEntityIndex< 0, 1, 0 >(); + const IndexType t = neighbors.template getEntityIndex< 0, 0, 1 >(); + //Try exact initiation + /*const IndexType w = neighbors.template getEntityIndex< -1, 0, 0 >(); + const IndexType s = neighbors.template getEntityIndex< 0, -1, 0 >(); + const IndexType b = neighbors.template getEntityIndex< 0, 0, -1 >(); + if( c * input[ e ] <= 0 ) + { + output[ cell.getIndex() ] = c; + output[ e ] = input[ e ]; + interfaceMap[ e ] = true; + interfaceMap[ cell.getIndex() ] = true; + } + else if( c * input[ n ] <= 0 ) + { + output[ cell.getIndex() ] = c; + output[ n ] = input[ n ]; + interfaceMap[ n ] = true; + interfaceMap[ cell.getIndex() ] = true; + } + else if( c * input[ t ] <= 0 ) + { + output[ cell.getIndex() ] = c; + output[ t ] = input[ t ]; + interfaceMap[ t ] = true; + interfaceMap[ cell.getIndex() ] = true; + }*/ + if( c * input[ n ] <= 0 ) + { + if( c >= 0 ) { - cell.refresh(); - const RealType& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const IndexType e = neighbors.template getEntityIndex< 1, 0, 0 >(); - const IndexType n = neighbors.template getEntityIndex< 0, 1, 0 >(); - const IndexType t = neighbors.template getEntityIndex< 0, 0, 1 >(); - //Try exact initiation - /*const IndexType w = neighbors.template getEntityIndex< -1, 0, 0 >(); - const IndexType s = neighbors.template getEntityIndex< 0, -1, 0 >(); - const IndexType b = neighbors.template getEntityIndex< 0, 0, -1 >(); - if( c * input[ e ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ e ] = input[ e ]; - interfaceMap[ e ] = true; - interfaceMap[ cell.getIndex() ] = true; - } - else if( c * input[ n ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ n ] = input[ n ]; - interfaceMap[ n ] = true; - interfaceMap[ cell.getIndex() ] = true; - } - else if( c * input[ t ] <= 0 ) - { - output[ cell.getIndex() ] = c; - output[ t ] = input[ t ]; - interfaceMap[ t ] = true; - interfaceMap[ cell.getIndex() ] = true; - }*/ - if( c * input[ n ] <= 0 ) - { - if( c >= 0 ) - { - pom = ( hy * c )/( c - input[ n ]); - if( output[ cell.getIndex() ] > pom ) - output[ cell.getIndex() ] = pom; - - if ( output[ n ] < pom - hy) - output[ n ] = pom - hy; // ( hy * c )/( c - input[ n ]) - hy; - - }else - { - pom = - ( hy * c )/( c - input[ n ]); - if( output[ cell.getIndex() ] < pom ) - output[ cell.getIndex() ] = pom; - if( output[ n ] > hy + pom ) - output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]); - - } - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ n ] = true; - } - if( c * input[ e ] <= 0 ) - { - if( c >= 0 ) - { - pom = ( hx * c )/( c - input[ e ]); - if( output[ cell.getIndex() ] > pom ) - output[ cell.getIndex() ] = pom; - - pom = pom - hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; - if( output[ e ] < pom ) - output[ e ] = pom; - - }else - { - pom = - (hx * c)/( c - input[ e ]); - if( output[ cell.getIndex() ] < pom ) - output[ cell.getIndex() ] = pom; - - pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]); - if( output[ e ] > pom ) - output[ e ] = pom; - } - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ e ] = true; - } - if( c * input[ t ] <= 0 ) - { - if( c >= 0 ) - { - pom = ( hz * c )/( c - input[ t ]); - if( output[ cell.getIndex() ] > pom ) - output[ cell.getIndex() ] = pom; - - pom = pom - hz; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; - if( output[ t ] < pom ) - output[ t ] = pom; - - }else - { - pom = - (hz * c)/( c - input[ t ]); - if( output[ cell.getIndex() ] < pom ) - output[ cell.getIndex() ] = pom; - - pom = pom + hz; //output[ e ] = hx - (hx * c)/( c - input[ e ]); - if( output[ t ] > pom ) - output[ t ] = pom; - - } - interfaceMap[ cell.getIndex() ] = true; - interfaceMap[ t ] = true; - } - } - /*output[ cell.getIndex() ] = - c > 0 ? TypeInfo< RealType >::getMaxValue() : - -TypeInfo< RealType >::getMaxValue(); - interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245 + pom = ( hy * c )/( c - input[ n ]); + if( output[ cell.getIndex() ] > pom ) + output[ cell.getIndex() ] = pom; + + if ( output[ n ] < pom - hy) + output[ n ] = pom - hy; // ( hy * c )/( c - input[ n ]) - hy; + + }else + { + pom = - ( hy * c )/( c - input[ n ]); + if( output[ cell.getIndex() ] < pom ) + output[ cell.getIndex() ] = pom; + if( output[ n ] > hy + pom ) + output[ n ] = hy + pom; //hy - ( hy * c )/( c - input[ n ]); + } - } + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ n ] = true; + } + if( c * input[ e ] <= 0 ) + { + if( c >= 0 ) + { + pom = ( hx * c )/( c - input[ e ]); + if( output[ cell.getIndex() ] > pom ) + output[ cell.getIndex() ] = pom; + + pom = pom - hx; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; + if( output[ e ] < pom ) + output[ e ] = pom; + + }else + { + pom = - (hx * c)/( c - input[ e ]); + if( output[ cell.getIndex() ] < pom ) + output[ cell.getIndex() ] = pom; + + pom = pom + hx; //output[ e ] = hx - (hx * c)/( c - input[ e ]); + if( output[ e ] > pom ) + output[ e ] = pom; + } + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ e ] = true; + } + if( c * input[ t ] <= 0 ) + { + if( c >= 0 ) + { + pom = ( hz * c )/( c - input[ t ]); + if( output[ cell.getIndex() ] > pom ) + output[ cell.getIndex() ] = pom; + + pom = pom - hz; //output[ e ] = (hx * c)/( c - input[ e ]) - hx; + if( output[ t ] < pom ) + output[ t ] = pom; + + }else + { + pom = - (hz * c)/( c - input[ t ]); + if( output[ cell.getIndex() ] < pom ) + output[ cell.getIndex() ] = pom; + + pom = pom + hz; //output[ e ] = hx - (hx * c)/( c - input[ e ]); + if( output[ t ] > pom ) + output[ t ] = pom; + + } + interfaceMap[ cell.getIndex() ] = true; + interfaceMap[ t ] = true; + } + } + /*output[ cell.getIndex() ] = + c > 0 ? TypeInfo< RealType >::getMaxValue() : + -TypeInfo< RealType >::getMaxValue(); + interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245 + } + } } template< typename Real, - typename Device, - typename Index > - template< typename MeshEntity > + typename Device, + typename Index > +template< typename MeshEntity > __cuda_callable__ void tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: updateCell( MeshFunctionType& u, - const MeshEntity& cell, - const RealType v ) + const MeshEntity& cell, + const RealType v ) { - const auto& neighborEntities = cell.template getNeighborEntities< 3 >(); - const MeshType& mesh = cell.getMesh(); + const auto& neighborEntities = cell.template getNeighborEntities< 3 >(); + const MeshType& mesh = cell.getMesh(); - const RealType& hx = mesh.getSpaceSteps().x(); - const RealType& hy = mesh.getSpaceSteps().y(); - const RealType& hz = mesh.getSpaceSteps().z(); - const RealType value = u( cell ); - //std::cout << value << std::endl; - RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); - - - if( cell.getCoordinates().x() == 0 ) - a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ]; - else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) - a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ]; - else + const RealType& hx = mesh.getSpaceSteps().x(); + const RealType& hy = mesh.getSpaceSteps().y(); + const RealType& hz = mesh.getSpaceSteps().z(); + const RealType value = u( cell ); + //std::cout << value << std::endl; + RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); + + + if( cell.getCoordinates().x() == 0 ) + a = u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ]; + else if( cell.getCoordinates().x() == mesh.getDimensions().x() - 1 ) + a = u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ]; + else + { + a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ], + u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] ); + } + if( cell.getCoordinates().y() == 0 ) + b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ]; + else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) + b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ]; + else + { + b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ], + u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] ); + }if( cell.getCoordinates().z() == 0 ) + c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ]; + else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 ) + c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ]; + else + { + c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ], + u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] ); + } + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() && + fabs( c ) == std::numeric_limits< RealType >::max() ) + return; + + + /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() && + fabs( b ) != TypeInfo< Real >::getMaxValue() && + fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) ) { - a = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< -1, 0, 0 >() ], - u[ neighborEntities.template getEntityIndex< 1, 0, 0 >() ] ); + tmp = ( hx * hx * a + hy * hy * b + + sign( value ) * hx * hy * sqrt( ( hx * hx + hy * hy )/v - + ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy ); } - if( cell.getCoordinates().y() == 0 ) - b = u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ]; - else if( cell.getCoordinates().y() == mesh.getDimensions().y() - 1 ) - b = u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ]; - else - { - b = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, -1, 0 >() ], - u[ neighborEntities.template getEntityIndex< 0, 1, 0 >() ] ); - }if( cell.getCoordinates().z() == 0 ) - c = u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ]; - else if( cell.getCoordinates().z() == mesh.getDimensions().z() - 1 ) - c = u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ]; - else + if( fabs( a ) != TypeInfo< Real >::getMaxValue() && + fabs( c ) != TypeInfo< Real >::getMaxValue() && + fabs( a - c ) >= TNL::sqrt( (hx * hx + hz * hz)/v ) ) { - c = TNL::argAbsMin( u[ neighborEntities.template getEntityIndex< 0, 0, -1 >() ], - u[ neighborEntities.template getEntityIndex< 0, 0, 1 >() ] ); + tmp = ( hx * hx * a + hz * hz * c + + sign( value ) * hx * hz * sqrt( ( hx * hx + hz * hz )/v - + ( a - c ) * ( a - c ) ) )/( hx * hx + hz * hz ); } - if( fabs( a ) == std::numeric_limits< RealType >::max() && - fabs( b ) == std::numeric_limits< RealType >::max() && - fabs( c ) == std::numeric_limits< RealType >::max() ) - return; - - - /*if( fabs( a ) != TypeInfo< Real >::getMaxValue() && - fabs( b ) != TypeInfo< Real >::getMaxValue() && - fabs( a - b ) >= TNL::sqrt( (hx * hx + hy * hy)/v ) ) - { - tmp = ( hx * hx * a + hy * hy * b + - sign( value ) * hx * hy * sqrt( ( hx * hx + hy * hy )/v - - ( a - b ) * ( a - b ) ) )/( hx * hx + hy * hy ); - } - if( fabs( a ) != TypeInfo< Real >::getMaxValue() && - fabs( c ) != TypeInfo< Real >::getMaxValue() && - fabs( a - c ) >= TNL::sqrt( (hx * hx + hz * hz)/v ) ) - { - tmp = ( hx * hx * a + hz * hz * c + - sign( value ) * hx * hz * sqrt( ( hx * hx + hz * hz )/v - - ( a - c ) * ( a - c ) ) )/( hx * hx + hz * hz ); - } - if( fabs( b ) != TypeInfo< Real >::getMaxValue() && - fabs( c ) != TypeInfo< Real >::getMaxValue() && - fabs( b - c ) >= TNL::sqrt( (hy * hy + hz * hz)/v ) ) - { - tmp = ( hy * hy * b + hz * hz * c + - sign( value ) * hy * hz * sqrt( ( hy * hy + hz * hz )/v - - ( b - c ) * ( b - c ) ) )/( hy * hy + hz * hz ); - }*/ - RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; - sortMinims( pom ); - tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; - if( fabs( tmp ) < fabs( pom[ 1 ] ) ) + if( fabs( b ) != TypeInfo< Real >::getMaxValue() && + fabs( c ) != TypeInfo< Real >::getMaxValue() && + fabs( b - c ) >= TNL::sqrt( (hy * hy + hz * hz)/v ) ) + { + tmp = ( hy * hy * b + hz * hz * c + + sign( value ) * hy * hz * sqrt( ( hy * hy + hz * hz )/v - + ( b - c ) * ( b - c ) ) )/( hy * hy + hz * hz ); + }*/ + RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; + sortMinims( pom ); + tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; + if( fabs( tmp ) < fabs( pom[ 1 ] ) ) + { + u[ cell.getIndex() ] = argAbsMin( value, tmp ); + } + else + { + tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - + ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); + if( fabs( tmp ) < fabs( pom[ 2 ]) ) { - u[ cell.getIndex() ] = argAbsMin( value, tmp ); + u[ cell.getIndex() ] = argAbsMin( value, tmp ); } else { - tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + - TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); - if( fabs( tmp ) < fabs( pom[ 2 ]) ) - { - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - } - else - { - tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + - TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - - hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - - hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); - u[ cell.getIndex() ] = argAbsMin( value, tmp ); - } + tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + + TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - + hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - + hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); + u[ cell.getIndex() ] = argAbsMin( value, tmp ); } + } } template < typename T1, typename T2 > T1 meet2DCondition( T1 a, T1 b, const T2 ha, const T2 hb, const T1 value, double v) { - T1 tmp; - if( fabs( a ) != std::numeric_limits< T1 >::max && - fabs( b ) != std::numeric_limits< T1 >::max && - fabs( a - b ) < ha/v )//TNL::sqrt( (ha * ha + hb * hb)/2 )/v ) - { - tmp = ( ha * ha * b + hb * hb * a + + T1 tmp; + if( fabs( a ) != std::numeric_limits< T1 >::max && + fabs( b ) != std::numeric_limits< T1 >::max && + fabs( a - b ) < ha/v )//TNL::sqrt( (ha * ha + hb * hb)/2 )/v ) + { + tmp = ( ha * ha * b + hb * hb * a + TNL::sign( value ) * ha * hb * TNL::sqrt( ( ha * ha + hb * hb )/( v * v ) - ( a - b ) * ( a - b ) ) )/( ha * ha + hb * hb ); - } - else - { - tmp = std::numeric_limits< T1 >::max; - } - - return tmp; + } + else + { + tmp = std::numeric_limits< T1 >::max; + } + + return tmp; } template < typename T1 > __cuda_callable__ void sortMinims( T1 pom[] ) { - T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; - if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){ - tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2]; - tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5]; - - } - else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){ - tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1]; - tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4]; - } - else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){ - tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2]; - tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5]; - } - else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){ - tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0]; - tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3]; - } - else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){ - tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1]; - tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4]; - } - else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){ - tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0]; - tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3]; - } + T1 tmp[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; + if( fabs(pom[0]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[2])){ + tmp[0] = pom[0]; tmp[1] = pom[1]; tmp[2] = pom[2]; + tmp[3] = pom[3]; tmp[4] = pom[4]; tmp[5] = pom[5]; - for( int i = 0; i < 6; i++ ) - { - pom[ i ] = tmp[ i ]; - } + } + else if( fabs(pom[0]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[1]) ){ + tmp[0] = pom[0]; tmp[1] = pom[2]; tmp[2] = pom[1]; + tmp[3] = pom[3]; tmp[4] = pom[5]; tmp[5] = pom[4]; + } + else if( fabs(pom[1]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[2]) ){ + tmp[0] = pom[1]; tmp[1] = pom[0]; tmp[2] = pom[2]; + tmp[3] = pom[4]; tmp[4] = pom[3]; tmp[5] = pom[5]; + } + else if( fabs(pom[1]) <= fabs(pom[2]) && fabs(pom[2]) <= fabs(pom[0]) ){ + tmp[0] = pom[1]; tmp[1] = pom[2]; tmp[2] = pom[0]; + tmp[3] = pom[4]; tmp[4] = pom[5]; tmp[5] = pom[3]; + } + else if( fabs(pom[2]) <= fabs(pom[0]) && fabs(pom[0]) <= fabs(pom[1]) ){ + tmp[0] = pom[2]; tmp[1] = pom[0]; tmp[2] = pom[1]; + tmp[3] = pom[5]; tmp[4] = pom[3]; tmp[5] = pom[4]; + } + else if( fabs(pom[2]) <= fabs(pom[1]) && fabs(pom[1]) <= fabs(pom[0]) ){ + tmp[0] = pom[2]; tmp[1] = pom[1]; tmp[2] = pom[0]; + tmp[3] = pom[5]; tmp[4] = pom[4]; tmp[5] = pom[3]; + } + + for( int i = 0; i < 6; i++ ) + { + pom[ i ] = tmp[ i ]; + } } @@ -893,372 +919,373 @@ __cuda_callable__ void sortMinims( T1 pom[] ) #ifdef HAVE_CUDA template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& input, - Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap ) + Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 1, Real, Device, Index >, 1, bool >& interfaceMap ) { - int i = threadIdx.x + blockDim.x*blockIdx.x; - const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + int i = threadIdx.x + blockDim.x*blockIdx.x; + const Meshes::Grid< 1, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + + if( i < mesh.getDimensions().x() ) + { + typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell; + Cell cell( mesh ); + cell.getCoordinates().x() = i; + cell.refresh(); + const Index cind = cell.getIndex(); + - if( i < mesh.getDimensions().x() ) + output[ cind ] = + input( cell ) >= 0 ? std::numeric_limits< Real >::max() : + - std::numeric_limits< Real >::max(); + interfaceMap[ cind ] = false; + + const Real& h = mesh.getSpaceSteps().x(); + cell.refresh(); + const Real& c = input( cell ); + if( ! cell.isBoundaryEntity() ) { - typedef typename Meshes::Grid< 1, Real, Device, Index >::Cell Cell; - Cell cell( mesh ); - cell.getCoordinates().x() = i; - cell.refresh(); - const Index cind = cell.getIndex(); - - - output[ cind ] = - input( cell ) >= 0 ? std::numeric_limits< Real >::max() : - - std::numeric_limits< Real >::max(); - interfaceMap[ cind ] = false; - - const Real& h = mesh.getSpaceSteps().x(); - cell.refresh(); - const Real& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const Index e = neighbors.template getEntityIndex< 1 >(); - const Index w = neighbors.template getEntityIndex< -1 >(); - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( h * c )/( c - input[ e ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ w ] <= 0 ) - { - pom = TNL::sign( c )*( h * c )/( c - input[ w ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - } + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const Index e = neighbors.template getEntityIndex< 1 >(); + const Index w = neighbors.template getEntityIndex< -1 >(); + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( h * c )/( c - input[ e ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ w ] <= 0 ) + { + pom = TNL::sign( c )*( h * c )/( c - input[ w ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } } - + } + } template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ) + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ) { - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = blockDim.y*blockIdx.y + threadIdx.y; - const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + int i = threadIdx.x + blockDim.x*blockIdx.x; + int j = blockDim.y*blockIdx.y + threadIdx.y; + const Meshes::Grid< 2, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + { + typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell; + Cell cell( mesh ); + cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; + cell.refresh(); + const Index cind = cell.getIndex(); + - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + output[ cind ] = + input( cell ) >= 0 ? std::numeric_limits< Real >::max() : + - std::numeric_limits< Real >::max(); + interfaceMap[ cind ] = false; + + const Real& hx = mesh.getSpaceSteps().x(); + const Real& hy = mesh.getSpaceSteps().y(); + cell.refresh(); + const Real& c = input( cell ); + if( ! cell.isBoundaryEntity() ) { - typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell; - Cell cell( mesh ); - cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; - cell.refresh(); - const Index cind = cell.getIndex(); - - - output[ cind ] = - input( cell ) >= 0 ? std::numeric_limits< Real >::max() : - - std::numeric_limits< Real >::max(); - interfaceMap[ cind ] = false; - - const Real& hx = mesh.getSpaceSteps().x(); - const Real& hy = mesh.getSpaceSteps().y(); - cell.refresh(); - const Real& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const Index e = neighbors.template getEntityIndex< 1, 0 >(); - const Index w = neighbors.template getEntityIndex< -1, 0 >(); - const Index n = neighbors.template getEntityIndex< 0, 1 >(); - const Index s = neighbors.template getEntityIndex< 0, -1 >(); - - if( c * input[ n ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cell.getIndex() ] = true; - } - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ w ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ s ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - } + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const Index e = neighbors.template getEntityIndex< 1, 0 >(); + const Index w = neighbors.template getEntityIndex< -1, 0 >(); + const Index n = neighbors.template getEntityIndex< 0, 1 >(); + const Index s = neighbors.template getEntityIndex< 0, -1 >(); + + if( c * input[ n ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cell.getIndex() ] = true; + } + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ w ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ s ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } } + } } template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ) + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ) { - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = blockDim.y*blockIdx.y + threadIdx.y; - int k = blockDim.z*blockIdx.z + threadIdx.z; - const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + int i = threadIdx.x + blockDim.x*blockIdx.x; + int j = blockDim.y*blockIdx.y + threadIdx.y; + int k = blockDim.z*blockIdx.z + threadIdx.z; + const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) + { + typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell; + Cell cell( mesh ); + cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k; + cell.refresh(); + const Index cind = cell.getIndex(); + + + output[ cind ] = + input( cell ) >= 0 ? std::numeric_limits< Real >::max() : + - std::numeric_limits< Real >::max(); + interfaceMap[ cind ] = false; + cell.refresh(); - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) + const Real& hx = mesh.getSpaceSteps().x(); + const Real& hy = mesh.getSpaceSteps().y(); + const Real& hz = mesh.getSpaceSteps().z(); + const Real& c = input( cell ); + if( ! cell.isBoundaryEntity() ) { - typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell; - Cell cell( mesh ); - cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k; - cell.refresh(); - const Index cind = cell.getIndex(); - - - output[ cind ] = - input( cell ) >= 0 ? std::numeric_limits< Real >::max() : - - std::numeric_limits< Real >::max(); - interfaceMap[ cind ] = false; - cell.refresh(); - - const Real& hx = mesh.getSpaceSteps().x(); - const Real& hy = mesh.getSpaceSteps().y(); - const Real& hz = mesh.getSpaceSteps().z(); - const Real& c = input( cell ); - if( ! cell.isBoundaryEntity() ) - { - auto neighbors = cell.getNeighborEntities(); - Real pom = 0; - const Index e = neighbors.template getEntityIndex< 1, 0, 0 >(); - const Index w = neighbors.template getEntityIndex< -1, 0, 0 >(); - const Index n = neighbors.template getEntityIndex< 0, 1, 0 >(); - const Index s = neighbors.template getEntityIndex< 0, -1, 0 >(); - const Index t = neighbors.template getEntityIndex< 0, 0, 1 >(); - const Index b = neighbors.template getEntityIndex< 0, 0, -1 >(); - - if( c * input[ n ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ e ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ w ] <= 0 ) - { - pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ s ] <= 0 ) - { - pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ b ] <= 0 ) - { - pom = TNL::sign( c )*( hz * c )/( c - input[ b ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - if( c * input[ t ] <= 0 ) - { - pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); - if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) - output[ cind ] = pom; - - interfaceMap[ cind ] = true; - } - } + auto neighbors = cell.getNeighborEntities(); + Real pom = 0; + const Index e = neighbors.template getEntityIndex< 1, 0, 0 >(); + const Index w = neighbors.template getEntityIndex< -1, 0, 0 >(); + const Index n = neighbors.template getEntityIndex< 0, 1, 0 >(); + const Index s = neighbors.template getEntityIndex< 0, -1, 0 >(); + const Index t = neighbors.template getEntityIndex< 0, 0, 1 >(); + const Index b = neighbors.template getEntityIndex< 0, 0, -1 >(); + + if( c * input[ n ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ e ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ w ] <= 0 ) + { + pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ s ] <= 0 ) + { + pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ b ] <= 0 ) + { + pom = TNL::sign( c )*( hz * c )/( c - input[ b ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } + if( c * input[ t ] <= 0 ) + { + pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); + if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) + output[ cind ] = pom; + + interfaceMap[ cind ] = true; + } } + } } template< typename Real, - typename Device, - typename Index > + typename Device, + typename Index > +template< int sizeSArray > __cuda_callable__ bool tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: -updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy, - const Real v ) +updateCell( volatile Real *sArray, int thri, int thrj, const Real hx, const Real hy, + const Real v ) { - const RealType value = sArray[ thrj ][ thri ]; - RealType a, b, tmp = std::numeric_limits< RealType >::max(); - - b = TNL::argAbsMin( sArray[ thrj+1 ][ thri ], - sArray[ thrj-1 ][ thri ] ); - - a = TNL::argAbsMin( sArray[ thrj ][ thri+1 ], - sArray[ thrj ][ thri-1 ] ); - - if( fabs( a ) == std::numeric_limits< RealType >::max() && - fabs( b ) == std::numeric_limits< RealType >::max() ) - return false; - - RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; - sortMinims( pom ); - tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; - - - if( fabs( tmp ) < fabs( pom[ 1 ] ) ) - { - sArray[ thrj ][ thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrj ][ thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; - } + const RealType value = sArray[ thrj * sizeSArray + thri ]; + RealType a, b, tmp = std::numeric_limits< RealType >::max(); + + b = TNL::argAbsMin( sArray[ (thrj+1) * sizeSArray + thri ], + sArray[ (thrj-1) * sizeSArray + thri ] ); + + a = TNL::argAbsMin( sArray[ thrj * sizeSArray + thri+1 ], + sArray[ thrj * sizeSArray + thri-1 ] ); + + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() ) + return false; + + RealType pom[6] = { a, b, std::numeric_limits< RealType >::max(), (RealType)hx, (RealType)hy, 0.0 }; + sortMinims( pom ); + tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]/v; + + + if( fabs( tmp ) < fabs( pom[ 1 ] ) ) + { + sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrj * sizeSArray + thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; else - { - tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + + return false; + } + else + { + tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); - sArray[ thrj ][ thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrj ][ thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; - } - - return false; + sArray[ thrj * sizeSArray + thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrj * sizeSArray + thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; + } + + return false; } template< typename Real, - typename Device, - typename Index > + typename Device, + typename Index > __cuda_callable__ bool tnlDirectEikonalMethodsBase< Meshes::Grid< 1, Real, Device, Index > >:: updateCell( volatile Real sArray[18], int thri, const Real h, const Real v ) { - const RealType value = sArray[ thri ]; - RealType a, tmp = std::numeric_limits< RealType >::max(); - - a = TNL::argAbsMin( sArray[ thri+1 ], - sArray[ thri-1 ] ); - - if( fabs( a ) == std::numeric_limits< RealType >::max() ) - return false; - - tmp = a + TNL::sign( value ) * h/v; - - - sArray[ thri ] = argAbsMin( value, tmp ); - - tmp = value - sArray[ thri ]; - if ( fabs( tmp ) > 0.001*h ) - return true; - else - return false; + const RealType value = sArray[ thri ]; + RealType a, tmp = std::numeric_limits< RealType >::max(); + + a = TNL::argAbsMin( sArray[ thri+1 ], + sArray[ thri-1 ] ); + + if( fabs( a ) == std::numeric_limits< RealType >::max() ) + return false; + + tmp = a + TNL::sign( value ) * h/v; + + + sArray[ thri ] = argAbsMin( value, tmp ); + + tmp = value - sArray[ thri ]; + if ( fabs( tmp ) > 0.001*h ) + return true; + else + return false; } template< typename Real, - typename Device, - typename Index > + typename Device, + typename Index > __cuda_callable__ bool tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: updateCell( volatile Real sArray[10][10][10], int thri, int thrj, int thrk, const Real hx, const Real hy, const Real hz, const Real v ) { - const RealType value = sArray[thrk][thrj][thri]; - //std::cout << value << std::endl; - RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); - - c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ], - sArray[ thrk-1 ][ thrj ][ thri ] ); - - b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ], - sArray[ thrk ][ thrj-1 ][ thri ] ); - - a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ], - sArray[ thrk ][ thrj ][ thri-1 ] ); - - - if( fabs( a ) == std::numeric_limits< RealType >::max() && - fabs( b ) == std::numeric_limits< RealType >::max() && - fabs( c ) == std::numeric_limits< RealType >::max() ) + const RealType value = sArray[thrk][thrj][thri]; + //std::cout << value << std::endl; + RealType a, b, c, tmp = std::numeric_limits< RealType >::max(); + + c = TNL::argAbsMin( sArray[ thrk+1 ][ thrj ][ thri ], + sArray[ thrk-1 ][ thrj ][ thri ] ); + + b = TNL::argAbsMin( sArray[ thrk ][ thrj+1 ][ thri ], + sArray[ thrk ][ thrj-1 ][ thri ] ); + + a = TNL::argAbsMin( sArray[ thrk ][ thrj ][ thri+1 ], + sArray[ thrk ][ thrj ][ thri-1 ] ); + + + if( fabs( a ) == std::numeric_limits< RealType >::max() && + fabs( b ) == std::numeric_limits< RealType >::max() && + fabs( c ) == std::numeric_limits< RealType >::max() ) + return false; + + RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; + + sortMinims( pom ); + + tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; + if( fabs( tmp ) < fabs( pom[ 1 ] ) ) + { + sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrk ][ thrj ][ thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else return false; - - RealType pom[6] = { a, b, c, (RealType)hx, (RealType)hy, (RealType)hz}; - - sortMinims( pom ); - - tmp = pom[ 0 ] + TNL::sign( value ) * pom[ 3 ]; - if( fabs( tmp ) < fabs( pom[ 1 ] ) ) + } + else + { + tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + + TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - + ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); + if( fabs( tmp ) < fabs( pom[ 2 ]) ) { - sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrk ][ thrj ][ thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; + sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrk ][ thrj ][ thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; } else { - tmp = ( pom[ 3 ] * pom[ 3 ] * pom[ 1 ] + pom[ 4 ] * pom[ 4 ] * pom[ 0 ] + - TNL::sign( value ) * pom[ 3 ] * pom[ 4 ] * TNL::sqrt( ( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] )/( v * v ) - - ( pom[ 1 ] - pom[ 0 ] ) * ( pom[ 1 ] - pom[ 0 ] ) ) )/( pom[ 3 ] * pom[ 3 ] + pom[ 4 ] * pom[ 4 ] ); - if( fabs( tmp ) < fabs( pom[ 2 ]) ) - { - sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrk ][ thrj ][ thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; - } - else - { - tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + - TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - - hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - - hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); - sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); - tmp = value - sArray[ thrk ][ thrj ][ thri ]; - if ( fabs( tmp ) > 0.001*hx ) - return true; - else - return false; - } + tmp = ( hy * hy * hz * hz * a + hx * hx * hz * hz * b + hx * hx * hy * hy * c + + TNL::sign( value ) * hx * hy * hz * TNL::sqrt( ( hx * hx * hz * hz + hy * hy * hz * hz + hx * hx * hy * hy)/( v * v ) - + hz * hz * ( a - b ) * ( a - b ) - hy * hy * ( a - c ) * ( a - c ) - + hx * hx * ( b - c ) * ( b - c ) ) )/( hx * hx * hy * hy + hy * hy * hz * hz + hz * hz * hx *hx ); + sArray[ thrk ][ thrj ][ thri ] = argAbsMin( value, tmp ); + tmp = value - sArray[ thrk ][ thrj ][ thri ]; + if ( fabs( tmp ) > 0.001*hx ) + return true; + else + return false; } - - return false; + } + + return false; } #endif diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index 4520eab0a45dceeb5c8a530ffc5140dc8954292e..e29421bb13aea872e19516d6ca72b9ce5c89fc93 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -15,6 +15,7 @@ #include "tnlFastSweepingMethod.h" #include <TNL/Devices/Cuda.h> +#include <string.h> #include <iostream> @@ -80,116 +81,171 @@ solve( const MeshPointer& mesh, - while( iteration < this->maxIterations ) { if( std::is_same< DeviceType, Devices::Host >::value ) { - int numThreadsPerBlock = 16; + int numThreadsPerBlock = 1024; + int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0); + //std::cout << "numBlocksX = " << numBlocksX << std::endl; + + /*Real **sArray = new Real*[numBlocksX*numBlocksY]; + for( int i = 0; i < numBlocksX * numBlocksY; i++ ) + sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)];*/ - ArrayContainer BlockIterHost; BlockIterHost.setSize( numBlocksX * numBlocksY ); BlockIterHost.setValue( 1 ); + int IsCalculationDone = 1; + + MeshFunctionPointer helpFunc( mesh ); + MeshFunctionPointer helpFunc1( mesh ); + helpFunc1 = auxPtr; + auxPtr = helpFunc; + helpFunc = helpFunc1; + //std::cout<< "Size = " << BlockIterHost.getSize() << std::endl; /*for( int k = numBlocksX-1; k >-1; k-- ){ - for( int l = 0; l < numBlocksY; l++ ){ - std::cout<< BlockIterHost[ l*numBlocksX + k ]; + for( int l = 0; l < numBlocksY; l++ ){ + std::cout<< BlockIterHost[ l*numBlocksX + k ]; + } + std::cout<<std::endl; + } + std::cout<<std::endl;*/ + unsigned int numWhile = 0; + while( IsCalculationDone ) + { + IsCalculationDone = 0; + helpFunc1 = auxPtr; + auxPtr = helpFunc; + helpFunc = helpFunc1; + this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); + + for( int i = 0; i < BlockIterHost.getSize(); i++ ){ + if( IsCalculationDone == 0 ){ + IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; + //break; + } } - std::cout<<std::endl; - } - std::cout<<std::endl;*/ - - while( BlockIterHost[ 0 ] ) - { - this->updateBlocks( interfaceMap, aux, BlockIterHost, numThreadsPerBlock); + numWhile++; + + for( int j = numBlocksY-1; j>-1; j-- ){ + for( int i = 0; i < numBlocksX; i++ ) + std::cout << BlockIterHost[ j * numBlocksX + i ]; + std::cout << std::endl; + } + std::cout << std::endl; this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY ); - //Reduction - for( int k = numBlocksX-1; k >-1; k-- ){ - for( int l = 0; l < numBlocksY; l++ ){ - //std::cout<< BlockIterHost[ l*numBlocksX + k ]; - BlockIterHost[ 0 ] = BlockIterHost[ 0 ] || BlockIterHost[ l*numBlocksX + k ]; - } - //std::cout<<std::endl; - } + /*for( int j = numBlocksY-1; j>-1; j-- ){ + for( int i = 0; i < numBlocksX; i++ ) + std::cout << "BlockIterHost = "<< j*numBlocksX + i<< " ," << BlockIterHost[ j * numBlocksX + i ]; + std::cout << std::endl; + } + std::cout << std::endl;*/ + //Reduction + //std::cout<<std::endl; + string s( "aux-"+ std::to_string(numWhile) + ".tnl"); + aux.save( s ); } - /*for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - //aux.save( "aux-1.tnl" ); - - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "2 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } + if( numWhile == 1 ){ + auxPtr = helpFunc; } + /*for( int i = 0; i < numBlocksX * numBlocksY; i++ ) + delete []sArray[i];*/ - //aux.save( "aux-2.tnl" ); - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0 ; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - //std::cerr << "3 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - //aux.save( "aux-3.tnl" ); + /*for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh->getDimensions().y(); + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + + //aux.save( "aux-1.tnl" ); + + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh->getDimensions().y(); + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) + { + //std::cerr << "2 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + + //aux.save( "aux-2.tnl" ); + + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; + cell.getCoordinates().y() >= 0 ; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + //std::cerr << "3 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + + //aux.save( "aux-3.tnl" ); + + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; + cell.getCoordinates().y() >= 0; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) + { + //std::cerr << "4 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + + for( int j = 0; + j < mesh->getDimensions().y(); + j++ ) + { + for( int i = 0; + i < mesh->getDimensions().x(); + i++ ) + { + std::cout << aux[ i * mesh->getDimensions().y() + j ] << " "; + } + std::cout << std::endl; + }*/ - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "4 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - }*/ } if( std::is_same< DeviceType, Devices::Cuda >::value ) { // TODO: CUDA code #ifdef HAVE_CUDA -<<<<<<< HEAD TNL_CHECK_CUDA_DEVICE; + // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel. const int cudaBlockSize( 16 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); dim3 blockSize( cudaBlockSize, cudaBlockSize ); @@ -203,19 +259,14 @@ solve( const MeshPointer& mesh, BlockIterDevice.setSize( numBlocksX * numBlocksY ); BlockIterDevice.setValue( 1 ); TNL_CHECK_CUDA_DEVICE; - int ne = 0; - CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, - interfaceMapPtr.template getData< Device >(), - auxPtr.template modifyData< Device>(), - BlockIterDevice, ne); - TNL_CHECK_CUDA_DEVICE; + /*TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; - BlockIterPom.setSize( numBlocksX * numBlocksY ); - BlockIterPom.setValue( 0 );*/ + BlockIterPom.setSize( numBlocksX * numBlocksY ); + BlockIterPom.setValue( 0 );*/ /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1; - BlockIterPom1.setSize( numBlocksX * numBlocksY ); - BlockIterPom1.setValue( 0 );*/ + BlockIterPom1.setSize( numBlocksX * numBlocksY ); + BlockIterPom1.setValue( 0 );*/ /*int *BlockIterDevice; cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/ int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); @@ -224,139 +275,125 @@ solve( const MeshPointer& mesh, /*int *BlockIterPom; cudaMalloc((void**) &BlockIterPom, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/ - int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0); + int nBlocks = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); + TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; - dBlock.setSize( nBlocks ); + dBlock.setSize( nBlocks ); TNL_CHECK_CUDA_DEVICE; /*int *dBlock; cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/ - //int pocIter = 0; + + + MeshFunctionPointer helpFunc1; + helpFunc1->setMesh(mesh); + + MeshFunctionPointer helpFunc( mesh ); + + helpFunc1 = auxPtr; + auxPtr = helpFunc; + helpFunc = helpFunc1; + + int numIter = 0; + + //int oddEvenBlock = 0; while( BlockIterD ) { - /*BlockIterPom1 = BlockIterDevice; - for( int j = numBlocksY-1; j>-1; j-- ){ - for( int i = 0; i < numBlocksX; i++ ) - std::cout << BlockIterPom1[ j * numBlocksX + i ]; - std::cout << std::endl; - } - std::cout << std::endl;*/ + /** HERE IS CHESS METHOD **/ - CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, - interfaceMapPtr.template getData< Device >(), - auxPtr.template modifyData< Device>(), - BlockIterDevice, 1); + /*auxPtr = helpFunc; + + CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + auxPtr.template getData< Device>(), + helpFunc.template modifyData< Device>(), + BlockIterDevice, + oddEvenBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; + auxPtr = helpFunc; - /*int poc = 0; - for( int i = 0; i < numBlocksX * numBlocksY; i++ ) - if( BlockIterPom1[ i ] ) - poc = poc+1; - std::cout << "pocet bloku, ktere se pocitali = " << poc << std::endl;*/ + oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; - GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, /*BlockIterPom,*/ numBlocksX, numBlocksY ); + CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + auxPtr.template getData< Device>(), + helpFunc.template modifyData< Device>(), + BlockIterDevice, + oddEvenBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; + auxPtr = helpFunc; - CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); - TNL_CHECK_CUDA_DEVICE; + oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; - CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); + CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); + cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - - BlockIterD = dBlock.getElement( 0 ); - //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); + CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; + BlockIterD = dBlock.getElement( 0 );*/ + + /**------------------------------------------------------------------------------------------------*/ + + + /** HERE IS FIM **/ + + helpFunc1 = auxPtr; + auxPtr = helpFunc; + helpFunc = helpFunc1; + + //int pocBloku = 0; + Devices::Cuda::synchronizeDevice(); + CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + auxPtr.template modifyData< Device>(), + helpFunc.template modifyData< Device>(), + BlockIterDevice ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + //std::cout << "Pocet aktivnich bloku = " << pocBloku << std::endl; + + GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, numBlocksX, numBlocksY ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + //std::cout<< "Probehlo" << std::endl; + + //TNL::swap( auxPtr, helpFunc ); + + + CudaParallelReduc<<< nBlocks , 1024 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); + TNL_CHECK_CUDA_DEVICE; + + CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); + TNL_CHECK_CUDA_DEVICE; + + + BlockIterD = dBlock.getElement( 0 ); + //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + + /**-----------------------------------------------------------------------------------------------------------*/ /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ - //pocIter ++; -======= - const int cudaBlockSize( 16 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); - dim3 blockSize( cudaBlockSize, cudaBlockSize ); - dim3 gridSize( numBlocksX, numBlocksY ); - - tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; - - TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter; - BlockIter.setSize( numBlocksX * numBlocksY ); - BlockIter.setValue( 0 ); - /*int* BlockIter = (int*)malloc( ( numBlocksX * numBlocksY ) * sizeof( int ) ); - for( int i = 0; i < numBlocksX*numBlocksY +1; i++) - BlockIter[i] = 1;*/ - - int BlockIterD = 1; - - TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; - BlockIterDevice.setSize( numBlocksX * numBlocksY ); - BlockIterDevice.setValue( 1 ); - /*int *BlockIterDevice; - cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) ); - cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/ - - int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0); - - TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; - dBlock.setSize( nBlocks ); - dBlock.setValue( 0 ); - /*int *dBlock; - cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ - - while( BlockIterD ) - { - /*for( int i = 0; i < numBlocksX * numBlocksY; i++ ) - BlockIter[ i ] = false;*/ - - CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, - interfaceMapPtr.template getData< Device >(), - auxPtr.template modifyData< Device>(), - BlockIterDevice ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - - BlockIter = BlockIterDevice; - //cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost); - GetNeighbours( BlockIter, numBlocksX, numBlocksY ); - - BlockIterDevice = BlockIter; - //cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - - - CudaParallelReduc<<< nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - - CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - - cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); - - /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) - BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ - - } - /*cudaFree( BlockIterDevice ); - cudaFree( dBlock ); - delete BlockIter;*/ - cudaDeviceSynchronize(); - - TNL_CHECK_CUDA_DEVICE; - - aux = *auxPtr; - interfaceMap = *interfaceMapPtr; -#endif ->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf + numIter ++; } + if( numIter == 1 ){ + helpFunc1 = auxPtr; + auxPtr = helpFunc; + helpFunc = helpFunc1; + } + /*cudaFree( BlockIterDevice ); + cudaFree( dBlock ); + delete BlockIter;*/ cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - //std::cout<< pocIter << std::endl; + TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; interfaceMap = *interfaceMapPtr; @@ -366,12 +403,14 @@ solve( const MeshPointer& mesh, } aux.save("aux-final.tnl"); } -<<<<<<< HEAD + #ifdef HAVE_CUDA + + template < typename Index > __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, - /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY ) + /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY ) { int i = blockIdx.x * 1024 + threadIdx.x; @@ -381,103 +420,68 @@ __global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index int m=0, k=0; m = i%numBlockX; k = i/numBlockX; - if( m > 0 ) - if( BlockIterDevice[ i - 1 ] ) - pom = 1;//BlockIterPom[ i ] = 1; - if( m < numBlockX -1 && pom == 0 ) - if( BlockIterDevice[ i + 1 ] ) - pom = 1;//BlockIterPom[ i ] = 1; - if( k > 0 && pom == 0 ) - if( BlockIterDevice[ i - numBlockX ] ) - pom = 1;// BlockIterPom[ i ] = 1; - if( k < numBlockY -1 && pom == 0 ) - if( BlockIterDevice[ i + numBlockX ] ) - pom = 1;//BlockIterPom[ i ] = 1; + if( m > 0 && BlockIterDevice[ i - 1 ] ){ + pom = 1;//BlockIterPom[ i ] = 1; + }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ + pom = 1;//BlockIterPom[ i ] = 1; + }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ + pom = 1;// BlockIterPom[ i ] = 1; + }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ + pom = 1;//BlockIterPom[ i ] = 1; + } - - BlockIterDevice[ i ] = pom;//BlockIterPom[ i ]; } } -======= -template < typename Index > -void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY ) -{ - TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom; - BlockIterPom.setSize( numBlockX * numBlockY ); - BlockIterPom.setValue( 0 ); - /*int* BlockIterPom; - BlockIterPom = new int[numBlockX * numBlockY];*/ - /*for(int i = 0; i < numBlockX * numBlockY; i++) - BlockIterPom[ i ] = 0;*/ - for(int i = 0; i < numBlockX * numBlockY; i++) - { - - if( BlockIter[ i ] ) - { - // i = k*numBlockY + m; - int m=0, k=0; - m = i%numBlockY; - k = i/numBlockY; - if( k > 0 && numBlockY > 1 ) - BlockIterPom[i - numBlockX] = 1; - if( k < numBlockY-1 && numBlockY > 1 ) - BlockIterPom[i + numBlockX] = 1; - - if( m >= 0 && m < numBlockX - 1 && numBlockX > 1 ) - BlockIterPom[ i+1 ] = 1; - if( m <= numBlockX -1 && m > 0 && numBlockX > 1 ) - BlockIterPom[ i-1 ] = 1; - } - } - for(int i = 0; i < numBlockX * numBlockY; i++ ){ -/// if( !BlockIter[ i ] ) - BlockIter[ i ] = BlockIterPom[ i ]; -/// else -/// BlockIter[ i ] = 0; - } - /*for( int i = numBlockX-1; i > -1; i-- ) - { - for( int j = 0; j< numBlockY; j++ ) - std::cout << BlockIter[ i*numBlockY + j ]; - std::cout << std::endl; - } - std::cout << std::endl;*/ - //delete[] BlockIterPom; -} - -#ifdef HAVE_CUDA ->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, - TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks ) + TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks ) { -<<<<<<< HEAD int i = threadIdx.x; int blId = blockIdx.x; - __shared__ volatile int sArray[ 512 ]; + int blockSize = blockDim.x; + /*if ( i == 0 && blId == 0 ){ + printf( "nBlocks = %d \n", nBlocks ); + for( int j = nBlocks-1; j > -1 ; j--){ + printf( "cislo = %d \n", BlockIterDevice[ j ] ); + } + }*/ + __shared__ int sArray[ 1024 ]; sArray[ i ] = 0; - if(blId * 512 + i < nBlocks ) - sArray[ i ] = BlockIterDevice[ blId * 512 + i ]; + if( blId * 1024 + i < nBlocks ) + sArray[ i ] = BlockIterDevice[ blId * 1024 + i ]; __syncthreads(); - if (blockDim.x == 1024) { + /*extern __shared__ volatile int sArray[]; + unsigned int i = threadIdx.x; + unsigned int gid = blockIdx.x * blockSize * 2 + threadIdx.x; + unsigned int gridSize = blockSize * 2 * gridDim.x; + sArray[ i ] = 0; + while( gid < nBlocks ) + { + sArray[ i ] += BlockIterDevice[ gid ] + BlockIterDevice[ gid + blockSize ]; + gid += gridSize; + } + __syncthreads();*/ + + if ( blockSize == 1024) { if (i < 512) sArray[ i ] += sArray[ i + 512 ]; } __syncthreads(); - if (blockDim.x >= 512) { + if (blockSize >= 512) { if (i < 256) { sArray[ i ] += sArray[ i + 256 ]; } } - if (blockDim.x >= 256) { + __syncthreads(); + if (blockSize >= 256) { if (i < 128) { sArray[ i ] += sArray[ i + 128 ]; } } __syncthreads(); - if (blockDim.x >= 128) { + if (blockSize >= 128) { if (i < 64) { sArray[ i ] += sArray[ i + 64 ]; } @@ -485,183 +489,120 @@ __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, I __syncthreads(); if (i < 32 ) { - if( blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ]; - if( blockDim.x >= 32 ) sArray[ i ] += sArray[ i + 16 ]; - if( blockDim.x >= 16 ) sArray[ i ] += sArray[ i + 8 ]; - if( blockDim.x >= 8 ) sArray[ i ] += sArray[ i + 4 ]; - if( blockDim.x >= 4 ) sArray[ i ] += sArray[ i + 2 ]; - if( blockDim.x >= 2 ) sArray[ i ] += sArray[ i + 1 ]; + if( blockSize >= 64 ) sArray[ i ] += sArray[ i + 32 ]; + if( blockSize >= 32 ) sArray[ i ] += sArray[ i + 16 ]; + if( blockSize >= 16 ) sArray[ i ] += sArray[ i + 8 ]; + if( blockSize >= 8 ) sArray[ i ] += sArray[ i + 4 ]; + if( blockSize >= 4 ) sArray[ i ] += sArray[ i + 2 ]; + if( blockSize >= 2 ) sArray[ i ] += sArray[ i + 1 ]; } if( i == 0 ) dBlock[ blId ] = sArray[ 0 ]; -======= - int i = threadIdx.x; - int blId = blockIdx.x; - /*if ( i == 0 && blId == 0 ){ - printf( "nBlocks = %d \n", nBlocks ); - for( int j = nBlocks-1; j > -1 ; j--){ - printf( "cislo = %d \n", BlockIterDevice[ j ] ); - } - }*/ - __shared__ volatile int sArray[ 512 ]; - sArray[ i ] = 0; - if( blId * 512 + i < nBlocks ) - sArray[ i ] = BlockIterDevice[ blId * 512 + i ]; - __syncthreads(); - - if (blockDim.x == 1024) { - if (i < 512) - sArray[ i ] += sArray[ i + 512 ]; - } - __syncthreads(); - if (blockDim.x >= 512) { - if (i < 256) { - sArray[ i ] += sArray[ i + 256 ]; - } - } - __syncthreads(); - if (blockDim.x >= 256) { - if (i < 128) { - sArray[ i ] += sArray[ i + 128 ]; - } - } - __syncthreads(); - if (blockDim.x >= 128) { - if (i < 64) { - sArray[ i ] += sArray[ i + 64 ]; - } - } - __syncthreads(); - if (i < 32 ) - { - if( blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ]; - if( blockDim.x >= 32 ) sArray[ i ] += sArray[ i + 16 ]; - if( blockDim.x >= 16 ) sArray[ i ] += sArray[ i + 8 ]; - if( blockDim.x >= 8 ) sArray[ i ] += sArray[ i + 4 ]; - if( blockDim.x >= 4 ) sArray[ i ] += sArray[ i + 2 ]; - if( blockDim.x >= 2 ) sArray[ i ] += sArray[ i + 1 ]; - } - - if( i == 0 ) - dBlock[ blId ] = sArray[ 0 ]; ->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf } -template < typename Real, typename Device, typename Index > +template < int sizeSArray, typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, - const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, - Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, -<<<<<<< HEAD - TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne ) -======= - TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ) ->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf + const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, + const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, + Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& helpFunc, + TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int oddEvenBlock ) { int thri = threadIdx.x; int thrj = threadIdx.y; - int blIdx = blockIdx.x; int blIdy = blockIdx.y; - int grIdx = gridDim.x; - - if( BlockIterDevice[ blIdy * grIdx + blIdx] ) + int i = threadIdx.x + blockDim.x*blockIdx.x; + int j = blockDim.y*blockIdx.y + threadIdx.y; + /** FOR CHESS METHOD */ + if( (blockIdx.y%2 + blockIdx.x) % 2 == oddEvenBlock ) { - - const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); + /**-----------------------------------------*/ - int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); - __shared__ volatile int numOfBlockx; - __shared__ volatile int numOfBlocky; - __shared__ int xkolik; - __shared__ int ykolik; - __shared__ volatile int NE; - if( thri == 0 && thrj == 0 ) + + /** FOR FIM METHOD */ + /*if( BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x] ) + {*/ + /**-----------------------------------------*/ + const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); + __shared__ volatile int dimX; + __shared__ volatile int dimY; + __shared__ volatile Real hx; + __shared__ volatile Real hy; + if( thri==0 && thrj == 0) { - xkolik = blockDim.x + 1; - ykolik = blockDim.y + 1; - numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); - numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); - - if( numOfBlockx - 1 == blIdx ) - xkolik = dimX - (blIdx)*blockDim.x+1; - - if( numOfBlocky -1 == blIdy ) - ykolik = dimY - (blIdy)*blockDim.y+1; - BlockIterDevice[ blIdy * grIdx + blIdx ] = 0; - NE = ne; + dimX = mesh.getDimensions().x(); + dimY = mesh.getDimensions().y(); + hx = mesh.getSpaceSteps().x(); + hy = mesh.getSpaceSteps().y(); + BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 0; } __syncthreads(); - - int i = thri + blockDim.x*blIdx; - int j = blockDim.y*blIdy + thrj; + int numOfBlockx; + int numOfBlocky; + int xkolik; + int ykolik; + + xkolik = blockDim.x + 1; + ykolik = blockDim.y + 1; + numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); + numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); + + if( numOfBlockx - 1 == blockIdx.x ) + xkolik = dimX - (blockIdx.x)*blockDim.x+1; + + if( numOfBlocky -1 == blockIdx.y ) + ykolik = dimY - (blockIdx.y)*blockDim.y+1; + __syncthreads(); + int currentIndex = thrj * blockDim.x + thri; - if( BlockIterDevice[ blIdy * gridDim.x + blIdx] ) - { //__shared__ volatile bool changed[ blockDim.x*blockDim.y ]; - __shared__ volatile bool changed[16*16]; + __shared__ volatile bool changed[ (sizeSArray-2)*(sizeSArray-2)]; changed[ currentIndex ] = false; if( thrj == 0 && thri == 0 ) changed[ 0 ] = true; - __shared__ Real hx; - __shared__ Real hy; - if( thrj == 1 && thri == 1 ) - { - hx = mesh.getSpaceSteps().x(); - hy = mesh.getSpaceSteps().y(); - } //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ]; - __shared__ volatile Real sArray[18][18]; - sArray[thrj][thri] = std::numeric_limits< Real >::max(); + __shared__ volatile Real sArray[ sizeSArray * sizeSArray ]; + sArray[ thrj * sizeSArray + thri ] = std::numeric_limits< Real >::max(); //filling sArray edges if( thri == 0 ) - { - if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && NE == 1 ) - sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ]; + { + if( dimX > (blockIdx.x+1) * blockDim.x && thrj+1 < ykolik ) + sArray[(thrj+1)*sizeSArray + xkolik] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX + xkolik ]; else - sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max(); + sArray[(thrj+1)*sizeSArray + xkolik] = std::numeric_limits< Real >::max(); } if( thri == 1 ) { - if( blIdx != 0 && thrj+1 < ykolik && NE == 1 ) - sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ]; + if( blockIdx.x != 0 && thrj+1 < ykolik ) + sArray[(thrj+1)*sizeSArray + 0] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + (thrj+1)*dimX ]; else - sArray[thrj+1][0] = std::numeric_limits< Real >::max(); + sArray[(thrj+1)*sizeSArray + 0] = std::numeric_limits< Real >::max(); } -<<<<<<< HEAD if( thri == 2 ) { - if( dimY > (blIdy+1) * blockDim.y && thri+1 < xkolik && NE == 1 ) - sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ]; + if( dimY > (blockIdx.y+1) * blockDim.y && thrj+1 < xkolik ) + sArray[ ykolik*sizeSArray + thrj+1 ] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + ykolik*dimX + thrj+1 ]; else - sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max(); -======= - if( numOfBlockx - 1 == blIdx ) - xkolik = dimX - (blIdx)*blockDim.x+1; - - if( numOfBlocky -1 == blIdy ) - ykolik = dimY - (blIdy)*blockDim.y+1; - //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; ->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf + sArray[ykolik*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); + } -<<<<<<< HEAD if( thri == 3 ) { - if( blIdy != 0 && thrj+1 < xkolik && NE == 1 ) - sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ]; + if( blockIdx.y != 0 && thrj+1 < xkolik ) + sArray[0*sizeSArray + thrj+1] = aux[ blockIdx.y*blockDim.y*dimX - dimX + blockIdx.x*blockDim.x - 1 + thrj+1 ]; else - sArray[0][thrj+1] = std::numeric_limits< Real >::max(); + sArray[0*sizeSArray + thrj+1] = std::numeric_limits< Real >::max(); } - - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + if( i < dimX && j < dimY ) { - sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ]; + sArray[(thrj+1)*sizeSArray + thri+1] = aux[ j*dimX + i ]; } __syncthreads(); @@ -672,25 +613,11 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< changed[ currentIndex] = false; //calculation of update cell - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + if( i < dimX && j < dimY ) { - if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] ) -======= - if(thri == 0 && thrj == 0 ) - BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; - - if( thri == 0 ) - { - if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik ) - sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ]; - else - sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max(); - } - - if( thri == 1 ) ->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf + if( ! interfaceMap[ j * dimX + i ] ) { - changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy); + changed[ currentIndex ] = ptr.updateCell<sizeSArray>( sArray, thri+1, thrj+1, hx,hy); } } __syncthreads(); @@ -724,12 +651,11 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< { if( currentIndex < 64 ) { -<<<<<<< HEAD changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; } } __syncthreads(); - if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU + if( currentIndex < 32 ) { if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; @@ -738,82 +664,23 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; } - if( changed[ 0 ] && thri == 0 && thrj == 0 ) - BlockIterDevice[ blIdy * grIdx + blIdx ] = 1; + if( thri == 0 && thrj == 0 && changed[ 0 ] ){ + BlockIterDevice[ blockIdx.y * gridDim.x + blockIdx.x ] = 1; + } + /*if( thri==0 && thrj == 0 && blockIdx.x == 0 && blockIdx.y == 0 ) + { + for( int k = 15; k>-1; k-- ){ + for( int l = 0; l < 16; l++ ) + printf( "%f\t", sArray[k * 16 + l]); + printf( "\n"); + } + printf( "\n"); + }*/ __syncthreads(); } + if( i < dimX && j < dimY ) + helpFunc[ j * dimX + i ] = sArray[ ( thrj + 1 ) * sizeSArray + thri + 1 ]; - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) ) - aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; - } -======= - __syncthreads(); - - changed[ currentIndex] = false; - - //calculation of update cell - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) - { - if( ! interfaceMap[ j * mesh.getDimensions().x() + i ] ) - { - changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy); - } - } - __syncthreads(); - - //pyramid reduction - if( blockDim.x*blockDim.y == 1024 ) - { - if( currentIndex < 512 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y >= 512 ) - { - if( currentIndex < 256 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y >= 256 ) - { - if( currentIndex < 128 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y >= 128 ) - { - if( currentIndex < 64 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; - } - } - __syncthreads(); - if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU - { - if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; - if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; - if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; - if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; - if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; - if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; - } - if( changed[ 0 ] && thri == 0 && thrj == 0 ){ - BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1; - } - __syncthreads(); - } - - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) ) - aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; - } - /*if( thri == 0 && thrj == 0 ) - printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/ ->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf + } } #endif diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h index 8d71bfe06d7c1eb63e63e0527ca58d8d4e95ad9f..4daf9fc920eddd50fb25a4147865193c62149214 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h @@ -280,17 +280,12 @@ solve( const MeshPointer& mesh, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice ); -<<<<<<< HEAD cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; -======= - //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); - //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); ->>>>>>> da336fb8bd927bc927bde8bde5876b18f07a23cf CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); cudaDeviceSynchronize();