diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h index ccbae8abe8a4b3df48d6c98f17b9fbf6240ef0f0..7d990c1bb1a5ae471337f35d33401d2cc88743e5 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h @@ -160,11 +160,17 @@ __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3 Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ); -template < typename Real, typename Device, typename Index > +template < int sizeSArray, typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, + const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ); + +template < typename Index > +__global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, + TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, + int numBlockX, int numBlockY, int numBlockZ ); #endif #include "tnlDirectEikonalMethodsBase_impl.h" diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index 546cfe9aa57743f2cc6c36481f055c81ce7ebe3a..b823fec0376064bd1c6d50990c873e851e5318a6 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -85,7 +85,7 @@ solve( const MeshPointer& mesh, { if( std::is_same< DeviceType, Devices::Host >::value ) { - int numThreadsPerBlock = 1024; + int numThreadsPerBlock = 16; int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); @@ -115,13 +115,13 @@ solve( const MeshPointer& mesh, } std::cout<<std::endl;*/ unsigned int numWhile = 0; - while( IsCalculationDone ) + while( IsCalculationDone && numWhile < 1 ) { IsCalculationDone = 0; helpFunc1 = auxPtr; auxPtr = helpFunc; helpFunc = helpFunc1; - this->template updateBlocks< 1026 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); + this->template updateBlocks< 18 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock/*, sArray*/ ); //Reduction for( int i = 0; i < BlockIterHost.getSize(); i++ ){ @@ -394,9 +394,7 @@ solve( const MeshPointer& mesh, numIter ++; } if( numIter == 1 ){ - helpFunc1 = auxPtr; auxPtr = helpFunc; - helpFunc = helpFunc1; } /*cudaFree( BlockIterDevice ); cudaFree( dBlock ); @@ -535,10 +533,10 @@ __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< __syncthreads(); /**-----------------------------------------*/ const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); - __shared__ volatile int dimX; - __shared__ volatile int dimY; - __shared__ volatile Real hx; - __shared__ volatile Real hy; + __shared__ int dimX; + __shared__ int dimY; + __shared__ Real hx; + __shared__ Real hy; if( thri==0 && thrj == 0) { dimX = mesh.getDimensions().x(); diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h index 4daf9fc920eddd50fb25a4147865193c62149214..65aba5bf58daa953ee43b68efc47db3d263a90cc 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h @@ -16,511 +16,556 @@ #include "tnlFastSweepingMethod.h" template< typename Real, - typename Device, - typename Index, - typename Anisotropy > + typename Device, + typename Index, + typename Anisotropy > FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: FastSweepingMethod() : maxIterations( 1 ) { - + } template< typename Real, - typename Device, - typename Index, - typename Anisotropy > + typename Device, + typename Index, + typename Anisotropy > const Index& FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: getMaxIterations() const { - + } template< typename Real, - typename Device, - typename Index, - typename Anisotropy > + typename Device, + typename Index, + typename Anisotropy > void FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: setMaxIterations( const IndexType& maxIterations ) { - + } template< typename Real, - typename Device, - typename Index, - typename Anisotropy > + typename Device, + typename Index, + typename Anisotropy > void FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: solve( const MeshPointer& mesh, - const AnisotropyPointer& anisotropy, - MeshFunctionPointer& u ) + const AnisotropyPointer& anisotropy, + MeshFunctionPointer& u ) { - MeshFunctionPointer auxPtr; - InterfaceMapPointer interfaceMapPtr; - auxPtr->setMesh( mesh ); - interfaceMapPtr->setMesh( mesh ); - std::cout << "Initiating the interface cells ..." << std::endl; - BaseType::initInterface( u, auxPtr, interfaceMapPtr ); + MeshFunctionPointer auxPtr; + InterfaceMapPointer interfaceMapPtr; + auxPtr->setMesh( mesh ); + interfaceMapPtr->setMesh( mesh ); + std::cout << "Initiating the interface cells ..." << std::endl; + BaseType::initInterface( u, auxPtr, interfaceMapPtr ); #ifdef HAVE_CUDA - cudaDeviceSynchronize(); + cudaDeviceSynchronize(); #endif - auxPtr->save( "aux-ini.tnl" ); - - typename MeshType::Cell cell( *mesh ); - - IndexType iteration( 0 ); - MeshFunctionType aux = *auxPtr; - InterfaceMapType interfaceMap = * interfaceMapPtr; - while( iteration < this->maxIterations ) + auxPtr->save( "aux-ini.tnl" ); + + typename MeshType::Cell cell( *mesh ); + + IndexType iteration( 0 ); + MeshFunctionType aux = *auxPtr; + InterfaceMapType interfaceMap = * interfaceMapPtr; + while( iteration < this->maxIterations ) + { + if( std::is_same< DeviceType, Devices::Host >::value ) { - if( std::is_same< DeviceType, Devices::Host >::value ) + for( cell.getCoordinates().z() = 0; + cell.getCoordinates().z() < mesh->getDimensions().z(); + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh->getDimensions().y(); + cell.getCoordinates().y()++ ) { - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh->getDimensions().z(); - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-1.tnl" ); - - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh->getDimensions().z(); - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "2 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-2.tnl" ); - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh->getDimensions().z(); - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0 ; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - //std::cerr << "3 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-3.tnl" ); - - for( cell.getCoordinates().z() = 0; - cell.getCoordinates().z() < mesh->getDimensions().z(); - cell.getCoordinates().z()++ ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "4 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-4.tnl" ); - - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; - cell.getCoordinates().z() >= 0; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - //std::cerr << "5 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-5.tnl" ); - - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; - cell.getCoordinates().z() >= 0; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = 0; - cell.getCoordinates().y() < mesh->getDimensions().y(); - cell.getCoordinates().y()++ ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "6 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-6.tnl" ); - - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; - cell.getCoordinates().z() >= 0; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0 ; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = 0; - cell.getCoordinates().x() < mesh->getDimensions().x(); - cell.getCoordinates().x()++ ) - { - //std::cerr << "7 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } - //aux.save( "aux-7.tnl" ); - - for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; - cell.getCoordinates().z() >= 0; - cell.getCoordinates().z()-- ) - { - for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; - cell.getCoordinates().y() >= 0; - cell.getCoordinates().y()-- ) - { - for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; - cell.getCoordinates().x() >= 0 ; - cell.getCoordinates().x()-- ) - { - //std::cerr << "8 -> "; - cell.refresh(); - if( ! interfaceMap( cell ) ) - this->updateCell( aux, cell ); - } - } - } + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } } - if( std::is_same< DeviceType, Devices::Cuda >::value ) + //aux.save( "aux-1.tnl" ); + + for( cell.getCoordinates().z() = 0; + cell.getCoordinates().z() < mesh->getDimensions().z(); + cell.getCoordinates().z()++ ) { - // TODO: CUDA code -#ifdef HAVE_CUDA - const int cudaBlockSize( 8 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); - int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); - if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) - std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; - dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); - dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); - - tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; - - - int BlockIterD = 1; - - TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; - BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ ); - BlockIterDevice.setValue( 1 ); - /*int *BlockIterDevice; - cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/ - int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0); - - TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; - dBlock.setSize( nBlocks ); - dBlock.setValue( 0 ); - /*int *dBlock; - cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ - - while( BlockIterD ) + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh->getDimensions().y(); + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) { - CudaUpdateCellCaller<<< gridSize, blockSize >>>( ptr, - interfaceMapPtr.template getData< Device >(), - auxPtr.template modifyData< Device>(), - BlockIterDevice ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - - CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - - CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); - - /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) - BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ - + //std::cerr << "2 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); } - //cudaFree( BlockIterDevice ); - //cudaFree( dBlock ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - aux = *auxPtr; - interfaceMap = *interfaceMapPtr; -#endif + } } - - //aux.save( "aux-8.tnl" ); - iteration++; + //aux.save( "aux-2.tnl" ); + for( cell.getCoordinates().z() = 0; + cell.getCoordinates().z() < mesh->getDimensions().z(); + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; + cell.getCoordinates().y() >= 0 ; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + //std::cerr << "3 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } + //aux.save( "aux-3.tnl" ); + + for( cell.getCoordinates().z() = 0; + cell.getCoordinates().z() < mesh->getDimensions().z(); + cell.getCoordinates().z()++ ) + { + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; + cell.getCoordinates().y() >= 0; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) + { + //std::cerr << "4 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } + //aux.save( "aux-4.tnl" ); + + for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; + cell.getCoordinates().z() >= 0; + cell.getCoordinates().z()-- ) + { + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh->getDimensions().y(); + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + //std::cerr << "5 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } + //aux.save( "aux-5.tnl" ); + + for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; + cell.getCoordinates().z() >= 0; + cell.getCoordinates().z()-- ) + { + for( cell.getCoordinates().y() = 0; + cell.getCoordinates().y() < mesh->getDimensions().y(); + cell.getCoordinates().y()++ ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) + { + //std::cerr << "6 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } + //aux.save( "aux-6.tnl" ); + + for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; + cell.getCoordinates().z() >= 0; + cell.getCoordinates().z()-- ) + { + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; + cell.getCoordinates().y() >= 0 ; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = 0; + cell.getCoordinates().x() < mesh->getDimensions().x(); + cell.getCoordinates().x()++ ) + { + //std::cerr << "7 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } + //aux.save( "aux-7.tnl" ); + + for( cell.getCoordinates().z() = mesh->getDimensions().z() - 1; + cell.getCoordinates().z() >= 0; + cell.getCoordinates().z()-- ) + { + for( cell.getCoordinates().y() = mesh->getDimensions().y() - 1; + cell.getCoordinates().y() >= 0; + cell.getCoordinates().y()-- ) + { + for( cell.getCoordinates().x() = mesh->getDimensions().x() - 1; + cell.getCoordinates().x() >= 0 ; + cell.getCoordinates().x()-- ) + { + //std::cerr << "8 -> "; + cell.refresh(); + if( ! interfaceMap( cell ) ) + this->updateCell( aux, cell ); + } + } + } + } + if( std::is_same< DeviceType, Devices::Cuda >::value ) + { + // TODO: CUDA code +#ifdef HAVE_CUDA + const int cudaBlockSize( 8 ); + int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); + int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); + int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); + if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) + std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; + dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); + dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); + + tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; + + + int BlockIterD = 1; + + TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice; + BlockIterDevice.setSize( numBlocksX * numBlocksY * numBlocksZ ); + BlockIterDevice.setValue( 1 ); + TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom; + BlockIterPom.setSize( numBlocksX * numBlocksY * numBlocksZ ); + BlockIterPom.setValue( 0 ); + /*int *BlockIterDevice; + cudaMalloc(&BlockIterDevice, ( numBlocksX * numBlocksY * numBlocksZ ) * sizeof( int ) );*/ + int nBlocks = ( numBlocksX * numBlocksY * numBlocksZ )/512 + ((( numBlocksX * numBlocksY * numBlocksZ )%512 != 0) ? 1:0); - } - aux.save("aux-final.tnl"); + TNL::Containers::Array< int, Devices::Cuda, IndexType > dBlock; + dBlock.setSize( nBlocks ); + dBlock.setValue( 0 ); + + int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0); + /*int *dBlock; + cudaMalloc(&dBlock, nBlocks * sizeof( int ) );*/ + MeshFunctionPointer helpFunc1( mesh ); + MeshFunctionPointer helpFunc( mesh ); + + helpFunc1 = auxPtr; + auxPtr = helpFunc; + helpFunc = helpFunc1; + int numIter = 0; + + while( BlockIterD ) + { + helpFunc1 = auxPtr; + auxPtr = helpFunc; + helpFunc = helpFunc1; + TNL_CHECK_CUDA_DEVICE; + + CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr, + interfaceMapPtr.template getData< Device >(), + auxPtr.template getData< Device>(), + helpFunc.template modifyData< Device>(), + BlockIterDevice ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + GetNeighbours3D<<< nBlocksNeigh, 1024 >>>( BlockIterDevice, BlockIterPom, numBlocksX, numBlocksY, numBlocksZ ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + BlockIterDevice = BlockIterPom; + + CudaParallelReduc<<< nBlocks , 512 >>>( BlockIterDevice, dBlock, ( numBlocksX * numBlocksY * numBlocksZ ) ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + + CudaParallelReduc<<< 1, nBlocks >>>( dBlock, dBlock, nBlocks ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + cudaMemcpy(&BlockIterD, &dBlock[0], sizeof( int ), cudaMemcpyDeviceToHost); + numIter++; + /*for( int i = 1; i < numBlocksX * numBlocksY; i++ ) + BlockIter[ 0 ] = BlockIter[ 0 ] || BlockIter[ i ];*/ + + } + if( numIter == 1 ){ + auxPtr = helpFunc; + } + //cudaFree( BlockIterDevice ); + //cudaFree( dBlock ); + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + aux = *auxPtr; + interfaceMap = *interfaceMapPtr; +#endif + } + + //aux.save( "aux-8.tnl" ); + iteration++; + + } + aux.save("aux-final.tnl"); } #ifdef HAVE_CUDA -template < typename Real, typename Device, typename Index > +template < typename Index > +__global__ void GetNeighbours3D( TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice, + TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterPom, + int numBlockX, int numBlockY, int numBlockZ ) +{ + int i = blockIdx.x * 1024 + threadIdx.x; + + if( i < numBlockX * numBlockY * numBlockZ ) + { + int pom = 0;//BlockIterPom[ i ] = 0; + int m=0, l=0, k=0; + l = i/( numBlockX * numBlockY ); + k = (i-l*numBlockX * numBlockY )/(numBlockX ); + m = (i-l*numBlockX * numBlockY )%( numBlockX ); + if( m > 0 && BlockIterDevice[ i - 1 ] ){ + pom = 1;//BlockIterPom[ i ] = 1; + }else if( m < numBlockX -1 && BlockIterDevice[ i + 1 ] ){ + pom = 1;//BlockIterPom[ i ] = 1; + }else if( k > 0 && BlockIterDevice[ i - numBlockX ] ){ + pom = 1;// BlockIterPom[ i ] = 1; + }else if( k < numBlockY -1 && BlockIterDevice[ i + numBlockX ] ){ + pom = 1;//BlockIterPom[ i ] = 1; + }else if( l > 0 && BlockIterDevice[ i - numBlockX*numBlockY ] ){ + pom = 1; + }else if( l < numBlockZ-1 && BlockIterDevice[ i + numBlockX*numBlockY ] ){ + pom = 1; + } + + BlockIterPom[ i ] = pom;//BlockIterPom[ i ]; + } +} + +template < int sizeSArray, typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, - const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, - Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, - TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ) + const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, + const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux, + Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& helpFunc, + TNL::Containers::Array< int, Devices::Cuda, Index > BlockIterDevice ) { - int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; - int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z; - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = blockDim.y*blockIdx.y + threadIdx.y; - int k = blockDim.z*blockIdx.z + threadIdx.z; - int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri; + int thri = threadIdx.x; int thrj = threadIdx.y; int thrk = threadIdx.z; + int blIdx = blockIdx.x; int blIdy = blockIdx.y; int blIdz = blockIdx.z; + int i = threadIdx.x + blockDim.x*blockIdx.x; + int j = blockDim.y*blockIdx.y + threadIdx.y; + int k = blockDim.z*blockIdx.z + threadIdx.z; + int currentIndex = thrk * blockDim.x * blockDim.y + thrj * blockDim.x + thri; + + if( BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] ) + { + __syncthreads(); - __shared__ volatile bool changed[8*8*8]; - changed[ currentIndex ] = false; + __shared__ volatile bool changed[ (sizeSArray - 2)*(sizeSArray - 2)*(sizeSArray - 2)]; + changed[ currentIndex ] = false; if( thrj == 0 && thri == 0 && thrk == 0 ) - changed[ 0 ] = true; + changed[ 0 ] = true; const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); - __shared__ Real hx; - __shared__ Real hy; - __shared__ Real hz; + __shared__ Real hx; __shared__ int dimX; + __shared__ Real hy; __shared__ int dimY; + __shared__ Real hz; __shared__ int dimZ; + if( thrj == 1 && thri == 1 && thrk == 1 ) { - hx = mesh.getSpaceSteps().x(); - hy = mesh.getSpaceSteps().y(); - hz = mesh.getSpaceSteps().z(); + hx = mesh.getSpaceSteps().x(); + hy = mesh.getSpaceSteps().y(); + hz = mesh.getSpaceSteps().z(); + dimX = mesh.getDimensions().x(); + dimY = mesh.getDimensions().y(); + dimZ = mesh.getDimensions().z(); + BlockIterDevice[ blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x + blockIdx.x ] = 0; } - __shared__ volatile Real sArray[10][10][10]; - sArray[thrk][thrj][thri] = std::numeric_limits< Real >::max(); - if(thri == 0 ) - { - sArray[8][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); - sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); - sArray[thrk+1][thrj+1][8] = std::numeric_limits< Real >::max(); - sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max(); - sArray[thrj+1][8][thrk+1] = std::numeric_limits< Real >::max(); - sArray[thrj+1][9][thrk+1] = std::numeric_limits< Real >::max(); - } - + __shared__ volatile Real sArray[sizeSArray][sizeSArray][sizeSArray]; + sArray[thrk+1][thrj+1][thri+1] = std::numeric_limits< Real >::max(); + //filling sArray edges - int dimX = mesh.getDimensions().x(); int dimY = mesh.getDimensions().y(); - int dimZ = mesh.getDimensions().z(); - __shared__ volatile int numOfBlockx; - __shared__ volatile int numOfBlocky; - __shared__ volatile int numOfBlockz; - __shared__ int xkolik; - __shared__ int ykolik; - __shared__ int zkolik; - if( thri == 0 && thrj == 0 && thrk == 0 ) - { - xkolik = blockDim.x + 1; - ykolik = blockDim.y + 1; - zkolik = blockDim.z + 1; - numOfBlocky = dimY/blockDim.y + ((dimY%blockDim.y != 0) ? 1:0); - numOfBlockx = dimX/blockDim.x + ((dimX%blockDim.x != 0) ? 1:0); - numOfBlockz = dimZ/blockDim.z + ((dimZ%blockDim.z != 0) ? 1:0); - - if( numOfBlockx - 1 == blIdx ) - xkolik = dimX - (blIdx)*blockDim.x+1; - - if( numOfBlocky -1 == blIdy ) - ykolik = dimY - (blIdy)*blockDim.y+1; - if( numOfBlockz-1 == blIdz ) - zkolik = dimZ - (blIdz)*blockDim.z+1; - - BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 0; - } + int numOfBlockx; + int numOfBlocky; + int numOfBlockz; + int xkolik; + int ykolik; + int zkolik; + xkolik = blockDim.x + 1; + ykolik = blockDim.y + 1; + zkolik = blockDim.z + 1; + numOfBlockx = gridDim.x; + numOfBlocky = gridDim.y; + numOfBlockz = gridDim.z; + + if( numOfBlockx - 1 == blIdx ) + xkolik = dimX - (blIdx)*blockDim.x+1; + if( numOfBlocky -1 == blIdy ) + ykolik = dimY - (blIdy)*blockDim.y+1; + if( numOfBlockz-1 == blIdz ) + zkolik = dimZ - (blIdz)*blockDim.z+1; __syncthreads(); if( thri == 0 ) { - if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik ) - sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ]; - else - sArray[thrk+1][thrj+1][0] = std::numeric_limits< Real >::max(); + if( blIdx != 0 && thrj+1 < ykolik && thrk+1 < zkolik ) + sArray[thrk+1][thrj+1][0] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX -1 + thrk*dimX*dimY ]; + else + sArray[thrk+1][thrj+1][0] = std::numeric_limits< Real >::max(); } if( thri == 1 ) { - if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik ) - sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ]; - else - sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max(); + if( dimX > (blIdx+1) * blockDim.x && thrj+1 < ykolik && thrk+1 < zkolik ) + sArray[thrk+1][thrj+1][9] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy *blockDim.y*dimX+ blIdx*blockDim.x + blockDim.x + thrj * dimX + thrk*dimX*dimY ]; + else + sArray[thrk+1][thrj+1][9] = std::numeric_limits< Real >::max(); } if( thri == 2 ) { - if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik ) - sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ]; - else - sArray[thrk+1][0][thrj+1] = std::numeric_limits< Real >::max(); + if( blIdy != 0 && thrj+1 < xkolik && thrk+1 < zkolik ) + sArray[thrk+1][0][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX + thrj + thrk*dimX*dimY ]; + else + sArray[thrk+1][0][thrj+1] = std::numeric_limits< Real >::max(); } if( thri == 3 ) { - if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik ) - sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ]; - else - sArray[thrk+1][9][thrj+1] = std::numeric_limits< Real >::max(); + if( dimY > (blIdy+1) * blockDim.y && thrj+1 < xkolik && thrk+1 < zkolik ) + sArray[thrk+1][9][thrj+1] = aux[ blIdz*blockDim.z * dimX * dimY + (blIdy+1) * blockDim.y*dimX + blIdx*blockDim.x + thrj + thrk*dimX*dimY ]; + else + sArray[thrk+1][9][thrj+1] = std::numeric_limits< Real >::max(); } if( thri == 4 ) { - if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik ) - sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ]; - else - sArray[0][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); + if( blIdz != 0 && thrj+1 < ykolik && thrk+1 < xkolik ) + sArray[0][thrj+1][thrk+1] = aux[ blIdz*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x - dimX * dimY + thrj * dimX + thrk ]; + else + sArray[0][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); } if( thri == 5 ) { - if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik ) - sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ]; - else - sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); + if( dimZ > (blIdz+1) * blockDim.z && thrj+1 < ykolik && thrk+1 < xkolik ) + sArray[9][thrj+1][thrk+1] = aux[ (blIdz+1)*blockDim.z * dimX * dimY + blIdy * blockDim.y*dimX + blIdx*blockDim.x + thrj * dimX + thrk ]; + else + sArray[9][thrj+1][thrk+1] = std::numeric_limits< Real >::max(); } - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) + if( i < dimX && j < dimY && k < dimZ ) { - sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ]; + sArray[thrk+1][thrj+1][thri+1] = aux[ k*dimX*dimY + j*dimX + i ]; } - __shared__ volatile int loopcounter; - loopcounter = 0; __syncthreads(); while( changed[ 0 ] ) { - __syncthreads(); - - changed[ currentIndex ] = false; - - //calculation of update cell - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ ) - { - if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] ) - { - changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); - } - } - __syncthreads(); - - //pyramid reduction - if( blockDim.x*blockDim.y*blockDim.z == 1024 ) - { - if( currentIndex < 512 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; - } - } - __syncthreads(); - if( blockDim.x*blockDim.y*blockDim.z >= 512 ) + __syncthreads(); + + changed[ currentIndex ] = false; + + //calculation of update cell + if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ ) + { + if( ! interfaceMap[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] ) { - if( currentIndex < 256 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; - } + changed[ currentIndex ] = ptr.updateCell( sArray, thri+1, thrj+1, thrk+1, hx,hy,hz); } - __syncthreads(); - if( blockDim.x*blockDim.y*blockDim.z >= 256 ) + } + __syncthreads(); + + //pyramid reduction + if( blockDim.x*blockDim.y*blockDim.z == 1024 ) + { + if( currentIndex < 512 ) { - if( currentIndex < 128 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; - } + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 512 ]; } - __syncthreads(); - if( blockDim.x*blockDim.y*blockDim.z >= 128 ) + } + __syncthreads(); + if( blockDim.x*blockDim.y*blockDim.z >= 512 ) + { + if( currentIndex < 256 ) { - if( currentIndex < 64 ) - { - changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; - } + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 256 ]; } - __syncthreads(); - if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU + } + __syncthreads(); + if( blockDim.x*blockDim.y*blockDim.z >= 256 ) + { + if( currentIndex < 128 ) { - if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; - if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; - if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; - if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; - if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; - if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 128 ]; } - __syncthreads(); - - /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0) - { - for(int m = 0; m < 8; m++){ - for(int n = 0; n<8; n++){ - for(int b=0; b<8; b++) - printf(" %i ", changed[m*64 + n*8 + b]); - printf("\n"); - } - printf("\n \n"); - } - }*/ - if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 ) + } + __syncthreads(); + if( blockDim.x*blockDim.y*blockDim.z >= 128 ) + { + if( currentIndex < 64 ) { - //loopcounter++; - BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1; + changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 64 ]; } - __syncthreads(); - /*if(thri == 0 && thrj==0 && thrk==0) - printf("%i \n",loopcounter); - if(loopcounter == 500) - break;*/ + } + __syncthreads(); + if( currentIndex < 32 ) //POUZE IF JSOU SINCHRONNI NA JEDNOM WARPU + { + if( true ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 32 ]; + if( currentIndex < 16 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 16 ]; + if( currentIndex < 8 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 8 ]; + if( currentIndex < 4 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 4 ]; + if( currentIndex < 2 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 2 ]; + if( currentIndex < 1 ) changed[ currentIndex ] = changed[ currentIndex ] || changed[ currentIndex + 1 ]; + } + __syncthreads(); + + /*if(thri == 0 && thrj ==0 && thrk ==0 && blIdx == 0 && blIdy == 0 && blIdz == 0) + { + for(int m = 0; m < 8; m++){ + for(int n = 0; n<8; n++){ + for(int b=0; b<8; b++) + printf(" %i ", changed[m*64 + n*8 + b]); + printf("\n"); + } + printf("\n \n"); + } + }*/ + if( changed[ 0 ] && thri == 0 && thrj == 0 && thrk == 0 ) + { + BlockIterDevice[ blIdz * numOfBlockx * numOfBlocky + blIdy * numOfBlockx + blIdx ] = 1; + } + __syncthreads(); } - - if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < dimZ && (!interfaceMap[ k*dimX*dimY+j * mesh.getDimensions().x() + i ]) ) - aux[ k*dimX*dimY + j * mesh.getDimensions().x() + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ]; -} + + if( i < dimX && j < dimY && k < dimZ ) + helpFunc[ k*dimX*dimY + j * dimX + i ] = sArray[thrk+1][ thrj + 1 ][ thri + 1 ]; + } +} #endif