diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h index c92368deb0e7c8ff23a80afce3e76a7fd50b1cb6..08ed947ed751935bab8a4ef31df8573619364b3e 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h @@ -61,8 +61,9 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > typedef Index IndexType; typedef Functions::MeshFunction< MeshType > MeshFunctionType; typedef Functions::MeshFunction< MeshType, 2, bool > InterfaceMapType; + typedef TNL::Containers::Array< int, Device, IndexType > ArrayContainer; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunctionType >; - using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >; + using InterfaceMapPointer = Pointers::SharedPointer< InterfaceMapType >; void initInterface( const MeshFunctionPointer& input, MeshFunctionPointer& output, @@ -76,6 +77,11 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > __cuda_callable__ bool updateCell( volatile Real sArray[18][18], int thri, int thrj, const Real hx, const Real hy, const Real velocity = 1.0 ); + void updateBlocks( InterfaceMapType interfaceMap, + MeshFunctionType aux, + ArrayContainer BlockIterHost, int numThreadsPerBlock ); + + void getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY ); }; template< typename Real, @@ -132,14 +138,15 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, - TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ); + TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne = 1 ); template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks ); -/*template < typename Real, typename Device, typename Index > -__global__ void aux1( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, Real *dAux, int a );*/ +template < typename Index > +__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, + /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& input, @@ -155,7 +162,7 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, - int *BlockIterDevice ); + TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ); #endif #include "tnlDirectEikonalMethodsBase_impl.h" diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h 
b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h index cfea6aca0cd9f77423c6fffd9541798cc7bdb271..1f9fc5eeba264cd9e9c335a43419804ab415a31b 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h @@ -89,6 +89,199 @@ initInterface( const MeshFunctionPointer& _input, } } +template< typename Real, + typename Device, + typename Index > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +updateBlocks( InterfaceMapType interfaceMap, + MeshFunctionType aux, + ArrayContainer BlockIterHost, int numThreadsPerBlock ) +{ + for( int i = 0; i < BlockIterHost.getSize(); i++ ) + { + if( BlockIterHost[ i ] ) + { + MeshType mesh = interfaceMap.template getMesh< Devices::Host >(); + + int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); + int numOfBlockx = dimY/numThreadsPerBlock + ((dimY%numThreadsPerBlock != 0) ? 1:0); + int numOfBlocky = dimX/numThreadsPerBlock + ((dimX%numThreadsPerBlock != 0) ? 1:0); + int xkolik = numThreadsPerBlock + 1; + int ykolik = numThreadsPerBlock + 1; + + int blIdx = i%numOfBlockx; + int blIdy = i/numOfBlocky; + + if( numOfBlockx - 1 == blIdx ) + xkolik = dimX - (blIdx)*numThreadsPerBlock+1; + + if( numOfBlocky -1 == blIdy ) + ykolik = dimY - (blIdy)*numThreadsPerBlock+1; + + + /*bool changed[numThreadsPerBlock*numThreadsPerBlock]; + changed[ 0 ] = 1;*/ + Real hx = mesh.getSpaceSteps().x(); + Real hy = mesh.getSpaceSteps().y(); + + Real changed1[ 16*16 ]; + /*Real changed2[ 16*16 ]; + Real changed3[ 16*16 ]; + Real changed4[ 16*16 ];*/ + Real sArray[18][18]; + + for( int thri = 0; thri < numThreadsPerBlock + 2; thri++ ) + for( int thrj = 0; thrj < numThreadsPerBlock + 2; thrj++ ) + sArray[thrj][thri] = std::numeric_limits< Real >::max(); + + BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 0; + + for( int thrj = 0; thrj < numThreadsPerBlock + 1; thrj++ ) + { + if( dimX > (blIdx+1) * numThreadsPerBlock && thrj+1 < ykolik ) + sArray[thrj+1][xkolik] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX + xkolik ]; + else + sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max(); + + + if( blIdx != 0 && thrj+1 < ykolik ) + sArray[thrj+1][0] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + (thrj+1)*dimX ]; + else + sArray[thrj+1][0] = std::numeric_limits< Real >::max(); + + if( dimY > (blIdy+1) * numThreadsPerBlock && thrj+1 < xkolik ) + sArray[ykolik][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + ykolik*dimX + thrj+1 ]; + else + sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max(); + + if( blIdy != 0 && thrj+1 < xkolik ) + sArray[0][thrj+1] = aux[ blIdy*numThreadsPerBlock*dimX - dimX + blIdx*numThreadsPerBlock - 1 + thrj+1 ]; + else + sArray[0][thrj+1] = std::numeric_limits< Real >::max(); + } + + for( int k = 0; k < numThreadsPerBlock; k++ ) + for( int l = 0; l < numThreadsPerBlock; l++ ) + sArray[k+1][l+1] = aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]; + + for( int k = 0; k < numThreadsPerBlock; k++ ) + for( int l = 0; l < numThreadsPerBlock; l++ ){ + changed1[ k*numThreadsPerBlock + l ] = 0; + /*changed2[ k*numThreadsPerBlock + l ] = 0; + changed3[ k*numThreadsPerBlock + l ] = 0; + changed4[ k*numThreadsPerBlock + l ] = 0;*/ + if( blIdy * numThreadsPerBlock * dimX + 
numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) + { + if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) + { + changed1[ k*numThreadsPerBlock + l ] = this->updateCell( sArray, l+1, k+1, hx,hy); + } + } + } + + for( int k = numThreadsPerBlock-1; k > -1; k-- ) + for( int l = 0; l < numThreadsPerBlock; l++ ) { + if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) + { + if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) + { + /*changed2[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy); + } + } + } + + for( int k = 0; k < numThreadsPerBlock; k++ ) + for( int l = numThreadsPerBlock-1; l >-1; l-- ) { + if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) + { + if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) + { + /*changed3[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy); + } + } + } + + for( int k = numThreadsPerBlock-1; k > -1; k-- ) + for( int l = numThreadsPerBlock-1; l >-1; l-- ) { + if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY ) + { + if( ! interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] ) + { + /*changed4[ k*numThreadsPerBlock + l ] = */this->updateCell( sArray, l+1, k+1, hx,hy); + } + } + } + + for( int k = numThreadsPerBlock-1; k > -1; k-- ) + for( int l = numThreadsPerBlock-1; l >-1; l-- ){ + changed1[ 0 ] = changed1[ 0 ] || changed1[ k*numThreadsPerBlock + l ]; + /*changed2[ 0 ] = changed2[ 0 ] || changed2[ k*numThreadsPerBlock + l ]; + changed3[ 0 ] = changed3[ 0 ] || changed3[ k*numThreadsPerBlock + l ]; + changed4[ 0 ] = changed4[ 0 ] || changed4[ k*numThreadsPerBlock + l ];*/ + } + + if( changed1[ 0 ] /*|| changed2[ 0 ] ||changed3[ 0 ] ||changed4[ 0 ]*/ ) + BlockIterHost[ blIdy * numOfBlockx + blIdx ] = 1; + + for( int k = 0; k < numThreadsPerBlock; k++ ){ + for( int l = 0; l < numThreadsPerBlock; l++ ) { + if( blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l < dimX*dimY && + (!interfaceMap[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ]) ) + aux[ blIdy * numThreadsPerBlock * dimX + numThreadsPerBlock * blIdx + k*dimX + l ] = sArray[ k + 1 ][ l + 1 ]; + //std::cout<< sArray[k+1][l+1]; + } + //std::cout<<std::endl; + } + } + } +} + +template< typename Real, + typename Device, + typename Index > +void +tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: +getNeighbours( ArrayContainer BlockIterHost, int numBlockX, int numBlockY ) +{ + int* BlockIterPom; + BlockIterPom = new int [numBlockX * numBlockY]; + + for(int i = 0; i < numBlockX * numBlockY; i++) + { + BlockIterPom[ i ] = 0; + if( BlockIterHost[ i ] ) + { + // i = k*numBlockY + m; + int m=0, k=0; + m = i%numBlockX; + k = i/numBlockX; + if( k > 0 ) + BlockIterPom[i - numBlockX] = 1; + if( k < numBlockY - 1 ) + BlockIterPom[i + numBlockX] = 1; + + if( m < numBlockX - 1 ) + BlockIterPom[ i+1 ] = 1; + if( m > 0 ) + BlockIterPom[ i-1 ] = 1; + } + } + for(int i = 0; i < numBlockX * numBlockY; i++ ) + //if( !BlockIter[ i ] ) + BlockIterHost[ i ] = BlockIterPom[ i ]; + /*else + BlockIter[ i ] = 0;*/ + /*for( int i = numBlockX-1; i > -1; i-- ) + { + for( int j = 0; j< numBlockY; j++ ) + std::cout << BlockIterHost[ i*numBlockY + j ]; + std::cout << std::endl; + } + std::cout << 
std::endl;*/ + delete[] BlockIterPom; +} + template< typename Real, typename Device, typename Index > diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h index fa807742735f8beaa034b9a2555d3bc4a57f9f8e..60c690e0623c82f0a50d12823ad0bcee648e1d86 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h @@ -88,7 +88,8 @@ class FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy > using typename BaseType::InterfaceMapType; using typename BaseType::MeshFunctionType; using typename BaseType::InterfaceMapPointer; - using typename BaseType::MeshFunctionPointer; + using typename BaseType::MeshFunctionPointer; + using typename BaseType::ArrayContainer; FastSweepingMethod(); diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index 817811c84e4517c8da54ad0e8763b6c630f12edf..c6cc575d15bdf2fe3ef1f45b1b31551c8dd8751c 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -21,403 +21,348 @@ #include <fstream> template< typename Real, - typename Device, - typename Index, - typename Anisotropy > + typename Device, + typename Index, + typename Anisotropy > FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: FastSweepingMethod() : maxIterations( 1 ) { - + } template< typename Real, - typename Device, - typename Index, - typename Anisotropy > + typename Device, + typename Index, + typename Anisotropy > const Index& FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: getMaxIterations() const { - + } template< typename Real, - typename Device, - typename Index, - typename Anisotropy > + typename Device, + typename Index, + typename Anisotropy > void FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: setMaxIterations( const IndexType& maxIterations ) { - + } template< typename Real, - typename Device, - typename Index, - typename Anisotropy > + typename Device, + typename Index, + typename Anisotropy > void FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Anisotropy >:: solve( const MeshPointer& mesh, - const AnisotropyPointer& anisotropy, - MeshFunctionPointer& u ) -{ - /*MeshFunctionType v; - v.setMesh(mesh); - double A[320][320]; - for (int i = 0; i < 320; i++) - for (int j = 0; j < 320; j++) - A[i][j] = 0; - - std::ifstream file("/home/maty/Downloads/mapa2.txt"); - - for (int i = 0; i < 320; i++) - for (int j = 0; j < 320; j++) - file >> A[i][j]; - file.close(); - for (int i = 0; i < 320; i++) - for (int j = 0; j < 320; j++) - v[i*320 + j] = A[i][j]; - v.save("mapa.tnl");*/ - - - MeshFunctionPointer auxPtr; - InterfaceMapPointer interfaceMapPtr; - auxPtr->setMesh( mesh ); - interfaceMapPtr->setMesh( mesh ); - std::cout << "Initiating the interface cells ..." 
<< std::endl; - BaseType::initInterface( u, auxPtr, interfaceMapPtr ); + const AnisotropyPointer& anisotropy, + MeshFunctionPointer& u ) +{ + MeshFunctionPointer auxPtr; + InterfaceMapPointer interfaceMapPtr; + auxPtr->setMesh( mesh ); + interfaceMapPtr->setMesh( mesh ); + std::cout << "Initiating the interface cells ..." << std::endl; + BaseType::initInterface( u, auxPtr, interfaceMapPtr ); + + auxPtr->save( "aux-ini.tnl" ); + + typename MeshType::Cell cell( *mesh ); + + IndexType iteration( 0 ); + InterfaceMapType interfaceMap = *interfaceMapPtr; + MeshFunctionType aux = *auxPtr; + + + + + while( iteration < this->maxIterations ) + { + if( std::is_same< DeviceType, Devices::Host >::value ) + { + int numThreadsPerBlock = 16; + + int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); + int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0); + + + ArrayContainer BlockIterHost; + BlockIterHost.setSize( numBlocksX * numBlocksY ); + BlockIterHost.setValue( 1 ); + /*for( int k = numBlocksX-1; k >-1; k-- ){ + for( int l = 0; l < numBlocksY; l++ ){ + std::cout<< BlockIterHost[ l*numBlocksX + k ]; + } + std::cout<<std::endl; + } + std::cout<<std::endl;*/ + + while( BlockIterHost[ 0 ] ) + { + this->updateBlocks( interfaceMap, aux, BlockIterHost, numThreadsPerBlock); - auxPtr->save( "aux-ini.tnl" ); - - typename MeshType::Cell cell( *mesh ); - - IndexType iteration( 0 ); - InterfaceMapType interfaceMap = *interfaceMapPtr; - MeshFunctionType aux = *auxPtr; - while( iteration < this->maxIterations ) - { - if( std::is_same< DeviceType, Devices::Host >::value ) - { - for( cell.getCoordinates().y() = 0; + this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY ); + + //Reduction + for( int k = numBlocksX-1; k >-1; k-- ){ + for( int l = 0; l < numBlocksY; l++ ){ + //std::cout<< BlockIterHost[ l*numBlocksX + k ]; + BlockIterHost[ 0 ] = BlockIterHost[ 0 ] || BlockIterHost[ l*numBlocksX + k ]; + } + //std::cout<<std::endl; + } + //std::cout<<std::endl; + } + /*for( cell.getCoordinates().y() = 0; cell.getCoordinates().y() < mesh->getDimensions().y(); cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - //aux.save( "aux-1.tnl" ); - - for( cell.getCoordinates().y() = 0; + { + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + + //aux.save( "aux-1.tnl" ); + + for( cell.getCoordinates().y() = 0; cell.getCoordinates().y() < mesh->getDimensions().y(); cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "2 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - //aux.save( "aux-2.tnl" ); - - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) + { + //std::cerr << "2 -> "; + cell.refresh(); + if( ! 
interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + + //aux.save( "aux-2.tnl" ); + + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; cell.getCoordinates().y() >= 0 ; cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - //std::cerr << "3 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - //aux.save( "aux-3.tnl" ); - - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; + { + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + //std::cerr << "3 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + + //aux.save( "aux-3.tnl" ); + + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; cell.getCoordinates().y() >= 0; cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "4 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - //aux.save( "aux-4.tnl" ); - - /*for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().y(); - cell.getCoordinates().x()++ ) - { - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().x(); - cell.getCoordinates().y()++ ) - { - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - - - aux.save( "aux-5.tnl" ); - - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().y(); - cell.getCoordinates().x()++ ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().x() - 1; - cell.getCoordinates().y() >= 0 ; - cell.getCoordinates().y()-- ) - { - //std::cerr << "2 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - aux.save( "aux-6.tnl" ); - - for( cell.getCoordinates().x() = mesh->getDimensions().y() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().x(); - cell.getCoordinates().y()++ ) - { - //std::cerr << "3 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - aux.save( "aux-7.tnl" ); - - for( cell.getCoordinates().x() = mesh->getDimensions().y() - 1; - cell.getCoordinates().x() >= 0; - cell.getCoordinates().x()-- ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().x() - 1; - cell.getCoordinates().y() >= 0 ; - cell.getCoordinates().y()-- ) - { - //std::cerr << "4 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - }*/ - } - if( std::is_same< DeviceType, Devices::Cuda >::value ) { - // TODO: CUDA code + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) + { + //std::cerr << "4 -> "; + cell.refresh(); + if( ! 
interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + }*/ + } + if( std::is_same< DeviceType, Devices::Cuda >::value ) + { + // TODO: CUDA code #ifdef HAVE_CUDA - const int cudaBlockSize( 16 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); - dim3 blockSize( cudaBlockSize, cudaBlockSize ); - dim3 gridSize( numBlocksX, numBlocksY ); - - tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; - - TNL::Containers::Array< int, Devices::Host, IndexType > BlockIter; - BlockIter.setSize( numBlocksX * numBlocksY ); - BlockIter.setValue( 0 ); - /*int* BlockIter = (int*)malloc( ( numBlocksX * numBlocksY ) * sizeof( int ) ); - for( int i = 0; i < numBlocksX*numBlocksY +1; i++) - BlockIter[i] = 1;*/ - - int BlockIterD = 1; - - TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; - BlockIterDevice.setSize( numBlocksX * numBlocksY ); - BlockIterDevice.setValue( 1 ); - /*int *BlockIterDevice; - cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) ); - cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice);*/ - - int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0); - - TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; - dBlock.setSize( nBlocks ); - dBlock.setValue( 0 ); - /*int *dBlock; - cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ - - while( BlockIterD ) - { - /*for( int i = 0; i < numBlocksX * numBlocksY; i++ ) - BlockIter[ i ] = false;*/ - - CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, - interfaceMapPtr.template getData< Device >(), - auxPtr.template modifyData< Device>(), - BlockIterDevice ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - - BlockIter = BlockIterDevice; - //cudaMemcpy(BlockIter, BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyDeviceToHost); - GetNeighbours( BlockIter, numBlocksX, numBlocksY ); - - BlockIterDevice = BlockIter; - //cudaMemcpy(BlockIterDevice, BlockIter, ( numBlocksX * numBlocksY ) * sizeof( int ), cudaMemcpyHostToDevice); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - - - CudaParallelReduc<<< nBlocks, 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - - CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - - cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); - - /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) - BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ - - } - /*cudaFree( BlockIterDevice ); - cudaFree( dBlock ); - delete BlockIter;*/ - cudaDeviceSynchronize(); - - TNL_CHECK_CUDA_DEVICE; - - aux = *auxPtr; - interfaceMap = *interfaceMapPtr; -#endif + TNL_CHECK_CUDA_DEVICE; + const int cudaBlockSize( 16 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); + dim3 blockSize( cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY ); + + tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; + + int BlockIterD = 1; + + TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; + BlockIterDevice.setSize( numBlocksX * numBlocksY ); + 
BlockIterDevice.setValue( 1 ); + TNL_CHECK_CUDA_DEVICE; + int ne = 0; + CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + auxPtr.template modifyData< Device>(), + BlockIterDevice, ne); + TNL_CHECK_CUDA_DEVICE; + + /*TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; + BlockIterPom.setSize( numBlocksX * numBlocksY ); + BlockIterPom.setValue( 0 );*/ + /*TNL::Containers::Array< int, Devices::Host, IndexType > BlockIterPom1; + BlockIterPom1.setSize( numBlocksX * numBlocksY ); + BlockIterPom1.setValue( 0 );*/ + /*int *BlockIterDevice; + cudaMalloc((void**) &BlockIterDevice, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/ + int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); + //std::cout << "nBlocksNeigh = " << nBlocksNeigh << std::endl; + //free( BlockIter ); + /*int *BlockIterPom; + cudaMalloc((void**) &BlockIterPom, ( numBlocksX * numBlocksY ) * sizeof( int ) );*/ + + int nBlocks = ( numBlocksX * numBlocksY )/512 + ((( numBlocksX * numBlocksY )%512 != 0) ? 1:0); + TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; + dBlock.setSize( nBlocks ); + TNL_CHECK_CUDA_DEVICE; + /*int *dBlock; + cudaMalloc((void**) &dBlock, nBlocks * sizeof( int ) );*/ + //int pocIter = 0; + while( BlockIterD ) + { + /*BlockIterPom1 = BlockIterDevice; + for( int j = numBlocksY-1; j>-1; j-- ){ + for( int i = 0; i < numBlocksX; i++ ) + std::cout << BlockIterPom1[ j * numBlocksX + i ]; + std::cout << std::endl; + } + std::cout << std::endl;*/ + + CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + auxPtr.template modifyData< Device>(), + BlockIterDevice, 1); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + /*int poc = 0; + for( int i = 0; i < numBlocksX * numBlocksY; i++ ) + if( BlockIterPom1[ i ] ) + poc = poc+1; + std::cout << "pocet bloku, ktere se pocitali = " << poc << std::endl;*/ + + GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, /*BlockIterPom,*/ numBlocksX, numBlocksY ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY ) ); + TNL_CHECK_CUDA_DEVICE; + + CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); + TNL_CHECK_CUDA_DEVICE; + + BlockIterD = dBlock.getElement( 0 ); + //cudaMemcpy( &BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) + BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ + //pocIter ++; } - iteration++; - } - aux.save("aux-final.tnl"); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + //std::cout<< pocIter << std::endl; + + aux = *auxPtr; + interfaceMap = *interfaceMapPtr; +#endif + } + iteration++; + } + aux.save("aux-final.tnl"); } + +#ifdef HAVE_CUDA template < typename Index > -void GetNeighbours( TNL::Containers::Array< int, Devices::Host, Index > BlockIter, int numBlockX, int numBlockY ) +__global__ void GetNeighbours( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, + /*TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom,*/ int numBlockX, int numBlockY ) { - TNL::Containers::Array< int, Devices::Host, Index > BlockIterPom; - BlockIterPom.setSize( numBlockX * numBlockY ); - BlockIterPom.setValue( 0 ); - /*int* BlockIterPom; - BlockIterPom = new int[numBlockX * numBlockY];*/ - /*for(int i = 0; i < numBlockX * numBlockY; 
i++) - BlockIterPom[ i ] = 0;*/ - for(int i = 0; i < numBlockX * numBlockY; i++) + int i = blockIdx.x * 1024 + threadIdx.x; + + if( i < numBlockX * numBlockY ) { - - if( BlockIter[ i ] ) - { - // i = k*numBlockY + m; - int m=0, k=0; - m = i%numBlockY; - k = i/numBlockY; - if( k > 0 && numBlockY > 1 ) - BlockIterPom[i - numBlockX] = 1; - if( k < numBlockY-1 && numBlockY > 1 ) - BlockIterPom[i + numBlockX] = 1; + int pom = 0;//BlockIterPom[ i ] = 0; + int m=0, k=0; + m = i%numBlockX; + k = i/numBlockX; + if( m > 0 ) + if( BlockIterDevice[ i - 1 ] ) + pom = 1;//BlockIterPom[ i ] = 1; + if( m < numBlockX -1 && pom == 0 ) + if( BlockIterDevice[ i + 1 ] ) + pom = 1;//BlockIterPom[ i ] = 1; + if( k > 0 && pom == 0 ) + if( BlockIterDevice[ i - numBlockX ] ) + pom = 1;// BlockIterPom[ i ] = 1; + if( k < numBlockY -1 && pom == 0 ) + if( BlockIterDevice[ i + numBlockX ] ) + pom = 1;//BlockIterPom[ i ] = 1; + - if( m >= 0 && m < numBlockX - 1 && numBlockX > 1 ) - BlockIterPom[ i+1 ] = 1; - if( m <= numBlockX -1 && m > 0 && numBlockX > 1 ) - BlockIterPom[ i-1 ] = 1; - } - } - for(int i = 0; i < numBlockX * numBlockY; i++ ){ -/// if( !BlockIter[ i ] ) - BlockIter[ i ] = BlockIterPom[ i ]; -/// else -/// BlockIter[ i ] = 0; - } - /*for( int i = numBlockX-1; i > -1; i-- ) - { - for( int j = 0; j< numBlockY; j++ ) - std::cout << BlockIter[ i*numBlockY + j ]; - std::cout << std::endl; + + BlockIterDevice[ i ] = pom;//BlockIterPom[ i ]; } - std::cout << std::endl;*/ - //delete[] BlockIterPom; } -#ifdef HAVE_CUDA template < typename Index > __global__ void CudaParallelReduc( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, TNL::Containers::Array< int, Devices::Cuda, Index > dBlock, int nBlocks ) { - int i = threadIdx.x; - int blId = blockIdx.x; - /*if ( i == 0 && blId == 0 ){ - printf( "nBlocks = %d \n", nBlocks ); - for( int j = nBlocks-1; j > -1 ; j--){ - printf( "cislo = %d \n", BlockIterDevice[ j ] ); - } - }*/ - __shared__ volatile int sArray[ 512 ]; - sArray[ i ] = 0; - if( blId * 512 + i < nBlocks ) - sArray[ i ] = BlockIterDevice[ blId * 512 + i ]; - __syncthreads(); - - if (blockDim.x == 1024) { - if (i < 512) - sArray[ i ] += sArray[ i + 512 ]; - } - __syncthreads(); - if (blockDim.x >= 512) { - if (i < 256) { - sArray[ i ] += sArray[ i + 256 ]; - } - } - __syncthreads(); - if (blockDim.x >= 256) { - if (i < 128) { - sArray[ i ] += sArray[ i + 128 ]; - } + int i = threadIdx.x; + int blId = blockIdx.x; + __shared__ volatile int sArray[ 512 ]; + sArray[ i ] = 0; + if(blId * 512 + i < nBlocks ) + sArray[ i ] = BlockIterDevice[ blId * 512 + i ]; + __syncthreads(); + if (blockDim.x == 1024) { + if (i < 512) + sArray[ i ] += sArray[ i + 512 ]; + } + __syncthreads(); + if (blockDim.x >= 512) { + if (i < 256) { + sArray[ i ] += sArray[ i + 256 ]; } - __syncthreads(); - if (blockDim.x >= 128) { - if (i < 64) { - sArray[ i ] += sArray[ i + 64 ]; - } + } + if (blockDim.x >= 256) { + if (i < 128) { + sArray[ i ] += sArray[ i + 128 ]; } - __syncthreads(); - if (i < 32 ) - { - if( blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ]; - if( blockDim.x >= 32 ) sArray[ i ] += sArray[ i + 16 ]; - if( blockDim.x >= 16 ) sArray[ i ] += sArray[ i + 8 ]; - if( blockDim.x >= 8 ) sArray[ i ] += sArray[ i + 4 ]; - if( blockDim.x >= 4 ) sArray[ i ] += sArray[ i + 2 ]; - if( blockDim.x >= 2 ) sArray[ i ] += sArray[ i + 1 ]; + } + __syncthreads(); + if (blockDim.x >= 128) { + if (i < 64) { + sArray[ i ] += sArray[ i + 64 ]; } - - if( i == 0 ) - dBlock[ blId ] = sArray[ 0 ]; + } + __syncthreads(); + if (i < 32 
) + { + if( blockDim.x >= 64 ) sArray[ i ] += sArray[ i + 32 ]; + if( blockDim.x >= 32 ) sArray[ i ] += sArray[ i + 16 ]; + if( blockDim.x >= 16 ) sArray[ i ] += sArray[ i + 8 ]; + if( blockDim.x >= 8 ) sArray[ i ] += sArray[ i + 4 ]; + if( blockDim.x >= 4 ) sArray[ i ] += sArray[ i + 2 ]; + if( blockDim.x >= 2 ) sArray[ i ] += sArray[ i + 1 ]; + } + + if( i == 0 ) + dBlock[ blId ] = sArray[ 0 ]; } @@ -426,10 +371,40 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux, - TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ) + TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, int ne ) { - int thri = threadIdx.x; int thrj = threadIdx.y; - int blIdx = blockIdx.x; int blIdy = blockIdx.y; + int thri = threadIdx.x; int thrj = threadIdx.y; + int blIdx = blockIdx.x; int blIdy = blockIdx.y; + int grIdx = gridDim.x; + + if( BlockIterDevice[ blIdy * grIdx + blIdx] ) + { + + const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); + + int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); + __shared__ volatile int numOfBlockx; + __shared__ volatile int numOfBlocky; + __shared__ int xkolik; + __shared__ int ykolik; + __shared__ volatile int NE; + if( thri == 0 && thrj == 0 ) + { + xkolik = blockDim.x + 1; + ykolik = blockDim.y + 1; + numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); + numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); + + if( numOfBlockx - 1 == blIdx ) + xkolik = dimX - (blIdx)*blockDim.x+1; + + if( numOfBlocky -1 == blIdy ) + ykolik = dimY - (blIdy)*blockDim.y+1; + BlockIterDevice[ blIdy * grIdx + blIdx ] = 0; + NE = ne; + } + __syncthreads(); + int i = thri + blockDim.x*blIdx; int j = blockDim.y*blIdy + thrj; int currentIndex = thrj * blockDim.x + thri; @@ -438,17 +413,15 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< //__shared__ volatile bool changed[ blockDim.x*blockDim.y ]; __shared__ volatile bool changed[16*16]; changed[ currentIndex ] = false; - if( thrj == 0 && thri == 0 ) - changed[ 0 ] = true; + changed[ 0 ] = true; - const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); __shared__ Real hx; __shared__ Real hy; if( thrj == 1 && thri == 1 ) { - hx = mesh.getSpaceSteps().x(); - hy = mesh.getSpaceSteps().y(); + hx = mesh.getSpaceSteps().x(); + hy = mesh.getSpaceSteps().y(); } //__shared__ volatile Real sArray[ blockDim.y+2 ][ blockDim.x+2 ]; @@ -456,137 +429,89 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< sArray[thrj][thri] = std::numeric_limits< Real >::max(); //filling sArray edges - int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); - __shared__ volatile int numOfBlockx; - __shared__ volatile int numOfBlocky; - __shared__ int xkolik; - __shared__ int ykolik; - if( thri == 0 && thrj == 0 ) + if( thri == 0 ) + { + if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && NE == 1 ) + sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ]; + else + sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max(); + } + + if( thri == 1 ) { - xkolik = blockDim.x + 1; - ykolik = 
blockDim.y + 1; - numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); - numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); + if( blIdx != 0 && thrj+1 < ykolik && NE == 1 ) + sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ]; + else + sArray[thrj+1][0] = std::numeric_limits< Real >::max(); + } - if( numOfBlockx - 1 == blIdx ) - xkolik = dimX - (blIdx)*blockDim.x+1; - - if( numOfBlocky -1 == blIdy ) - ykolik = dimY - (blIdy)*blockDim.y+1; - //BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; + if( thri == 2 ) + { + if( dimY > (blIdy+1) * blockDim.y && thri+1 < xkolik && NE == 1 ) + sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ]; + else + sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max(); } - __syncthreads(); - if(thri == 0 && thrj == 0 ) - BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 0; - - if( thri == 0 ) - { - if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik ) - sArray[thrj+1][xkolik] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX + xkolik ]; - else - sArray[thrj+1][xkolik] = std::numeric_limits< Real >::max(); - } - - if( thri == 1 ) { - if( blIdx != 0 && thrj+1 < ykolik ) - sArray[thrj+1][0] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + (thrj+1)*dimX ]; - else - sArray[thrj+1][0] = std::numeric_limits< Real >::max(); + changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy); } - - if( thri == 2 ) + } + __syncthreads(); + + //pyramid reduction + if( blockDim.x*blockDim.y == 1024 ) + { + if( currentIndex < 512 ) { - if( dimY > (blIdy+1) * blockDim.y && thri+1 < xkolik ) - sArray[ykolik][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + ykolik*dimX + thrj+1 ]; - else - sArray[ykolik][thrj+1] = std::numeric_limits< Real >::max(); + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; } - - if( thri == 3 ) + } + __syncthreads(); + if( blockDim.x*blockDim.y >= 512 ) + { + if( currentIndex < 256 ) { - if( blIdy != 0 && thrj+1 < xkolik ) - sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ]; - else - sArray[0][thrj+1] = std::numeric_limits< Real >::max(); + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; } - - - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) - { - sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ]; - } - __syncthreads(); - - while( changed[ 0 ] ) + } + __syncthreads(); + if( blockDim.x*blockDim.y >= 256 ) + { + if( currentIndex < 128 ) { - __syncthreads(); - - changed[ currentIndex] = false; - - //calculation of update cell - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) - { - if( ! 
interfaceMap[ j * mesh.getDimensions().x() + i ] ) - { - changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, hx,hy); - } - } - __syncthreads(); - - //pyramid reduction - if( blockDim.x*blockDim.y == 1024 ) - { - if( currentIndex < 512 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y >= 512 ) - { - if( currentIndex < 256 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y >= 256 ) - { - if( currentIndex < 128 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y >= 128 ) - { - if( currentIndex < 64 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; - } - } - __syncthreads(); - if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU - { - if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; - if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; - if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; - if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; - if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; - if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; - } - if( changed[ 0 ] && thri == 0 && thrj == 0 ){ - BlockIterDevice[ blIdy * numOfBlockx + blIdx ] = 1; - } - __syncthreads(); + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; } - - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && (!interfaceMap[ j * mesh.getDimensions().x() + i ]) ) - aux[ j * mesh.getDimensions().x() + i ] = sArray[ thrj + 1 ][ thri + 1 ]; + } + __syncthreads(); + if( blockDim.x*blockDim.y >= 128 ) + { + if( currentIndex < 64 ) + { + if( thri == 3 ) + { + if( blIdy != 0 && thrj+1 < xkolik && NE == 1 ) + sArray[0][thrj+1] = aux[ blIdy*blockDim.y*dimX - dimX + blIdx*blockDim.x - 1 + thrj+1 ]; + else + sArray[0][thrj+1] = std::numeric_limits< Real >::max(); + } + + + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + { + sArray[thrj+1][thri+1] = aux[ j*mesh.getDimensions().x() + i ]; } - /*if( thri == 0 && thrj == 0 ) - printf( "Block ID = %d, value = %d \n", (blIdy * numOfBlockx + blIdx), BlockIterDevice[ blIdy * numOfBlockx + blIdx ] );*/ + __syncthreads(); + + while( changed[ 0 ] ) + { + __syncthreads(); + + changed[ currentIndex] = false; + + //calculation of update cell + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) + { + if( ! 
interfaceMap[ j * mesh.getDimensions().x() + i ] ) } #endif diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h index 8c85745cd4def444de3944af157da411423d67dc..4daf9fc920eddd50fb25a4147865193c62149214 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h @@ -258,13 +258,21 @@ solve( const MeshPointer& mesh, tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; - int *BlockIterDevice; + int BlockIterD = 1; - cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) ); + TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; + BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ ); + BlockIterDevice.setValue( 1 ); + /*int *BlockIterDevice; + cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/ int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0); - int *dBlock; - cudaMalloc(&dBlock, nBlocks * sizeof( int ) ); + + TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; + dBlock.setSize( nBlocks ); + dBlock.setValue( 0 ); + /*int *dBlock; + cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ while( BlockIterD ) { @@ -272,17 +280,24 @@ solve( const MeshPointer& mesh, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>(), BlockIterDevice ); - //CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); - //CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ } - cudaFree( BlockIterDevice ); - cudaFree( dBlock ); + //cudaFree( BlockIterDevice ); + //cudaFree( dBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; @@ -302,7 +317,7 @@ template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, - int *BlockIterDevice ) + TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ) { int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z;
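
Note (not part of the patch): for orientation, the host-side control flow that the new updateBlocks()/getNeighbours() methods implement boils down to the loop sketched below. This is a minimal sketch under stated assumptions: hypothetical names, plain standard containers in place of TNL::Containers::Array, and a stand-in updateBlock callback for the four in-block sweeps; the CUDA path mirrors the same scheme with the CudaUpdateCellCaller, GetNeighbours and CudaParallelReduc kernels, keeping one int flag per 16x16 block.

#include <vector>
#include <algorithm>
#include <functional>

// Simplified sketch of the block-iteration scheme introduced by this patch (host path).
// `updateBlock( b )` stands in for sweeping a single 16x16 block in all four directions
// and returning whether any of its cells was updated (hypothetical helper, not TNL API).
void sweepBlocksUntilConverged( int numBlocksX, int numBlocksY,
                                const std::function< bool( int ) >& updateBlock )
{
   std::vector< int > flags( numBlocksX * numBlocksY, 1 );   // all blocks active at start
   while( std::any_of( flags.begin(), flags.end(), []( int f ){ return f != 0; } ) )
   {
      std::vector< int > changed( flags.size(), 0 );
      for( int b = 0; b < (int) flags.size(); b++ )
         if( flags[ b ] )
            changed[ b ] = updateBlock( b ) ? 1 : 0;         // four in-block sweeps

      // getNeighbours(): a block is swept in the next pass if any 4-neighbour changed
      std::fill( flags.begin(), flags.end(), 0 );
      for( int b = 0; b < (int) changed.size(); b++ )
         if( changed[ b ] )
         {
            const int m = b % numBlocksX, k = b / numBlocksX;
            if( m > 0 )              flags[ b - 1 ] = 1;
            if( m < numBlocksX - 1 ) flags[ b + 1 ] = 1;
            if( k > 0 )              flags[ b - numBlocksX ] = 1;
            if( k < numBlocksY - 1 ) flags[ b + numBlocksX ] = 1;
         }
   }
}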