Loading src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +12 −11 Original line number Diff line number Diff line Loading @@ -68,8 +68,6 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > __cuda_callable__ void updateCell( MeshFunctionType& u, const MeshEntity& cell, const RealType velocity = 1.0 ); protected: }; template< typename Real, Loading @@ -78,7 +76,6 @@ template< typename Real, class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > { public: typedef Meshes::Grid< 3, Real, Device, Index > MeshType; typedef Real RealType; typedef Device DevcieType; Loading @@ -96,11 +93,6 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > __cuda_callable__ void updateCell( MeshFunctionType& u, const MeshEntity& cell, const RealType velocity = 1.0); /*Real sort( Real a, Real b, Real c, const RealType& ha, const RealType& hb, const RealType& hc ); */ }; template < typename T1, typename T2 > Loading @@ -112,7 +104,8 @@ __cuda_callable__ void sortMinims( T1 pom[] ); #ifdef HAVE_CUDA template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux ); template < typename Real, typename Device, typename Index > Loading @@ -120,7 +113,15 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ); //__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, double, TNL::Devices::Cuda, int > >& input ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ); template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux ); #endif #include "tnlDirectEikonalMethodsBase_impl.h" src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +280 −176 Original line number Diff line number Diff line Loading @@ -11,6 +11,7 @@ #include <iostream> #include "tnlFastSweepingMethod.h" template< typename Real, typename Device, typename Index > Loading Loading @@ -81,12 +82,6 @@ initInterface( const MeshFunctionPointer& _input, MeshFunctionPointer& _output, InterfaceMapPointer& _interfaceMap ) { /* doplnit přepočty pro cudu: * overit is_same device * na kazdy bod jedno cuda vlakno */ if( std::is_same< Device, Devices::Cuda >::value ) { Loading @@ -102,7 +97,6 @@ initInterface( const MeshFunctionPointer& _input, CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), _output.template modifyData< Device >(), _interfaceMap.template modifyData< Device >() ); //CudaInitCaller<<< gridSize, blockSize >>>( input ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; #endif Loading Loading @@ -215,6 +209,7 @@ template< typename Real, typename Device, typename Index > template< typename MeshEntity > __cuda_callable__ void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: updateCell( MeshFunctionType& u, Loading Loading @@ -300,6 +295,29 @@ tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: initInterface( const MeshFunctionPointer& _input, MeshFunctionPointer& _output, InterfaceMapPointer& _interfaceMap ) { if( std::is_same< Device, Devices::Cuda >::value ) { #ifdef HAVE_CUDA const MeshType& mesh = _input->getMesh(); const int cudaBlockSize( 8 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize ); if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); Devices::Cuda::synchronizeDevice(); CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(), _output.template modifyData< Device >(), _interfaceMap.template modifyData< Device >() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; #endif } if( std::is_same< Device, Devices::Host >::value ) { const MeshFunctionType& input = _input.getData(); MeshFunctionType& output = _output.modifyData(); Loading Loading @@ -452,11 +470,13 @@ initInterface( const MeshFunctionPointer& _input, interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245 } } } template< typename Real, typename Device, typename Index > template< typename MeshEntity > __cuda_callable__ void tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: updateCell( MeshFunctionType& u, Loading Loading @@ -628,12 +648,13 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Cell cell( mesh ); cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.refresh(); const Index cind = cell.getIndex(); output[ cell.getIndex() ] = output[ cind ] = input( cell ) >= 0 ? TypeInfo< Real >::getMaxValue() : - TypeInfo< Real >::getMaxValue(); interfaceMap[ cell.getIndex() ] = false; interfaceMap[ cind ] = false; const Real& hx = mesh.getSpaceSteps().x(); const Real& hy = mesh.getSpaceSteps().y(); Loading @@ -651,44 +672,127 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, if( c * input[ n ] <= 0 ) { pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) output[ cell.getIndex() ] = pom; if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cell.getIndex() ] = true; } if( c * input[ e ] <= 0 ) { pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) output[ cell.getIndex() ] = pom; if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cell.getIndex() ] = true; interfaceMap[ cind ] = true; } if( c * input[ w ] <= 0 ) { pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) output[ cell.getIndex() ] = pom; if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cell.getIndex() ] = true; interfaceMap[ cind ] = true; } if( c * input[ s ] <= 0 ) { pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) output[ cell.getIndex() ] = pom; if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cell.getIndex() ] = true; interfaceMap[ cind ] = true; } } } } /*__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, double, TNL::Devices::Cuda, int > >& input ) template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ) { int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; //const Meshes::Grid< 2, double, TNL::Devices::Cuda, int >& mesh = input.getMesh(); int k = blockDim.z*blockIdx.z + threadIdx.z; const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); }*/ if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) { typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell; Cell cell( mesh ); cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k; cell.refresh(); const Index cind = cell.getIndex(); output[ cind ] = input( cell ) >= 0 ? TypeInfo< Real >::getMaxValue() : - TypeInfo< Real >::getMaxValue(); interfaceMap[ cind ] = false; cell.refresh(); const Real& hx = mesh.getSpaceSteps().x(); const Real& hy = mesh.getSpaceSteps().y(); const Real& hz = mesh.getSpaceSteps().z(); const Real& c = input( cell ); if( ! cell.isBoundaryEntity() ) { auto neighbors = cell.getNeighborEntities(); Real pom = 0; const Index e = neighbors.template getEntityIndex< 1, 0, 0 >(); const Index w = neighbors.template getEntityIndex< -1, 0, 0 >(); const Index n = neighbors.template getEntityIndex< 0, 1, 0 >(); const Index s = neighbors.template getEntityIndex< 0, -1, 0 >(); const Index t = neighbors.template getEntityIndex< 0, 0, 1 >(); const Index b = neighbors.template getEntityIndex< 0, 0, -1 >(); if( c * input[ n ] <= 0 ) { pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cind ] = true; } if( c * input[ e ] <= 0 ) { pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cind ] = true; } if( c * input[ w ] <= 0 ) { pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cind ] = true; } if( c * input[ s ] <= 0 ) { pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cind ] = true; } if( c * input[ b ] <= 0 ) { pom = TNL::sign( c )*( hz * c )/( c - input[ b ]); if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cind ] = true; } if( c * input[ t ] <= 0 ) { pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cind ] = true; } } } } src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h +2 −0 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ #include <TNL/SharedPointer.h> #include "tnlDirectEikonalMethodsBase.h" template< typename Mesh, typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > > class FastSweepingMethod Loading Loading @@ -145,6 +146,7 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy > const IndexType maxIterations; }; #include "tnlFastSweepingMethod1D_impl.h" #include "tnlFastSweepingMethod2D_impl.h" #include "tnlFastSweepingMethod3D_impl.h" src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +30 −19 Original line number Diff line number Diff line Loading @@ -14,6 +14,9 @@ #pragma once #include "tnlFastSweepingMethod.h" #include <TNL/TypeInfo.h> #include <TNL/Devices/Cuda.h> template< typename Real, typename Device, Loading Loading @@ -212,15 +215,23 @@ solve( const MeshPointer& mesh, { // TODO: CUDA code #ifdef HAVE_CUDA /*int numBlocks = 2; int threadsPerBlock; if( mesh->getDimensions().x() >= mesh->getDimensions().y() ) threadsPerBlock = (int)( mesh->getDimensions().x() ); else threadsPerBlock = (int)( mesh->getDimensions().y() ); const int cudaBlockSize( 16 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); dim3 blockSize( cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY ); Devices::Cuda::synchronizeDevice(); int DIM = mesh->getDimensions().x(); CudaUpdateCellCaller< Real, Device, Index ><<< numBlocks, threadsPerBlock >>>( interfaceMap, aux ); cudaDeviceSynchronize(); //copak dela?*/ tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; for( int k = 0; k < numBlocksX; k++) CudaUpdateCellCaller< Real, Device, Index ><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; interfaceMap = *interfaceMapPtr; #endif } iteration++; Loading @@ -228,30 +239,30 @@ solve( const MeshPointer& mesh, aux.save("aux-final.tnl"); } //#ifdef HAVE_CUDA template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux ) { int i = threadIdx.x + blockDim.x*blockIdx.x; int j = threadIdx.y + blockDim.y*blockIdx.y; const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.getMesh(); int j = blockDim.y*blockIdx.y + threadIdx.y; const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) { //make cell of aux from index typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell; Cell cell( mesh ); cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.refresh(); //update cell value few times //for( int i = 0; i < mesh.getDimensions() ; i++ ) //{ cell.refresh(); //tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; for( int k = 0; k < 16; k++ ) { if( ! interfaceMap( cell ) ) { // tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::updateCell( aux, cell ); ptr.updateCell( aux, cell ); } } //} } } //#endif No newline at end of file src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +218 −162 Original line number Diff line number Diff line Loading @@ -21,7 +21,7 @@ template< typename Real, typename Anisotropy > FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: FastSweepingMethod() : maxIterations( 1 ) : maxIterations( 2 ) { } Loading Loading @@ -64,6 +64,7 @@ solve( const MeshPointer& mesh, interfaceMapPtr->setMesh( mesh ); std::cout << "Initiating the interface cells ..." << std::endl; BaseType::initInterface( u, auxPtr, interfaceMapPtr ); cudaDeviceSynchronize(); auxPtr->save( "aux-ini.tnl" ); typename MeshType::Cell cell( *mesh ); Loading @@ -72,6 +73,8 @@ solve( const MeshPointer& mesh, MeshFunctionType aux = *auxPtr; InterfaceMapType interfaceMap = * interfaceMapPtr; while( iteration < this->maxIterations ) { if( std::is_same< DeviceType, Devices::Host >::value ) { for( cell.getCoordinates().z() = 0; cell.getCoordinates().z() < mesh->getDimensions().z(); Loading Loading @@ -237,6 +240,33 @@ solve( const MeshPointer& mesh, } } } } if( std::is_same< DeviceType, Devices::Cuda >::value ) { // TODO: CUDA code #ifdef HAVE_CUDA const int cudaBlockSize( 8 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); Devices::Cuda::synchronizeDevice(); tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; for( int k = 0; k < numBlocksX; k++) CudaUpdateCellCaller< Real, Device, Index ><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; interfaceMap = *interfaceMapPtr; #endif } //aux.save( "aux-8.tnl" ); iteration++; Loading @@ -244,3 +274,29 @@ solve( const MeshPointer& mesh, aux.save("aux-final.tnl"); } template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux ) { int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; int k = blockDim.z*blockIdx.z + threadIdx.z; const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) { typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell; Cell cell( mesh ); cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k; cell.refresh(); //tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; for( int l = 0; l < 8; l++ ) { if( ! interfaceMap( cell ) ) { ptr.updateCell( aux, cell ); } } } } No newline at end of file Loading
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase.h +12 −11 Original line number Diff line number Diff line Loading @@ -68,8 +68,6 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > __cuda_callable__ void updateCell( MeshFunctionType& u, const MeshEntity& cell, const RealType velocity = 1.0 ); protected: }; template< typename Real, Loading @@ -78,7 +76,6 @@ template< typename Real, class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > { public: typedef Meshes::Grid< 3, Real, Device, Index > MeshType; typedef Real RealType; typedef Device DevcieType; Loading @@ -96,11 +93,6 @@ class tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > __cuda_callable__ void updateCell( MeshFunctionType& u, const MeshEntity& cell, const RealType velocity = 1.0); /*Real sort( Real a, Real b, Real c, const RealType& ha, const RealType& hb, const RealType& hc ); */ }; template < typename T1, typename T2 > Loading @@ -112,7 +104,8 @@ __cuda_callable__ void sortMinims( T1 pom[] ); #ifdef HAVE_CUDA template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux ); template < typename Real, typename Device, typename Index > Loading @@ -120,7 +113,15 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& output, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap ); //__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, double, TNL::Devices::Cuda, int > >& input ); template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ); template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux ); #endif #include "tnlDirectEikonalMethodsBase_impl.h"
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodsBase_impl.h +280 −176 Original line number Diff line number Diff line Loading @@ -11,6 +11,7 @@ #include <iostream> #include "tnlFastSweepingMethod.h" template< typename Real, typename Device, typename Index > Loading Loading @@ -81,12 +82,6 @@ initInterface( const MeshFunctionPointer& _input, MeshFunctionPointer& _output, InterfaceMapPointer& _interfaceMap ) { /* doplnit přepočty pro cudu: * overit is_same device * na kazdy bod jedno cuda vlakno */ if( std::is_same< Device, Devices::Cuda >::value ) { Loading @@ -102,7 +97,6 @@ initInterface( const MeshFunctionPointer& _input, CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(), _output.template modifyData< Device >(), _interfaceMap.template modifyData< Device >() ); //CudaInitCaller<<< gridSize, blockSize >>>( input ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; #endif Loading Loading @@ -215,6 +209,7 @@ template< typename Real, typename Device, typename Index > template< typename MeshEntity > __cuda_callable__ void tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >:: updateCell( MeshFunctionType& u, Loading Loading @@ -300,6 +295,29 @@ tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: initInterface( const MeshFunctionPointer& _input, MeshFunctionPointer& _output, InterfaceMapPointer& _interfaceMap ) { if( std::is_same< Device, Devices::Cuda >::value ) { #ifdef HAVE_CUDA const MeshType& mesh = _input->getMesh(); const int cudaBlockSize( 8 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize ); if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); Devices::Cuda::synchronizeDevice(); CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(), _output.template modifyData< Device >(), _interfaceMap.template modifyData< Device >() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; #endif } if( std::is_same< Device, Devices::Host >::value ) { const MeshFunctionType& input = _input.getData(); MeshFunctionType& output = _output.modifyData(); Loading Loading @@ -452,11 +470,13 @@ initInterface( const MeshFunctionPointer& _input, interfaceMap[ cell.getIndex() ] = false;*/ //is on line 245 } } } template< typename Real, typename Device, typename Index > template< typename MeshEntity > __cuda_callable__ void tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > >:: updateCell( MeshFunctionType& u, Loading Loading @@ -628,12 +648,13 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, Cell cell( mesh ); cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.refresh(); const Index cind = cell.getIndex(); output[ cell.getIndex() ] = output[ cind ] = input( cell ) >= 0 ? TypeInfo< Real >::getMaxValue() : - TypeInfo< Real >::getMaxValue(); interfaceMap[ cell.getIndex() ] = false; interfaceMap[ cind ] = false; const Real& hx = mesh.getSpaceSteps().x(); const Real& hy = mesh.getSpaceSteps().y(); Loading @@ -651,44 +672,127 @@ __global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, if( c * input[ n ] <= 0 ) { pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) output[ cell.getIndex() ] = pom; if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cell.getIndex() ] = true; } if( c * input[ e ] <= 0 ) { pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) output[ cell.getIndex() ] = pom; if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cell.getIndex() ] = true; interfaceMap[ cind ] = true; } if( c * input[ w ] <= 0 ) { pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) output[ cell.getIndex() ] = pom; if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cell.getIndex() ] = true; interfaceMap[ cind ] = true; } if( c * input[ s ] <= 0 ) { pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); if( TNL::abs( output[ cell.getIndex() ] ) > TNL::abs( pom ) ) output[ cell.getIndex() ] = pom; if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cell.getIndex() ] = true; interfaceMap[ cind ] = true; } } } } /*__global__ void CudaInitCaller( const Functions::MeshFunction< Meshes::Grid< 2, double, TNL::Devices::Cuda, int > >& input ) template < typename Real, typename Device, typename Index > __global__ void CudaInitCaller3d( const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& input, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& output, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap ) { int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; //const Meshes::Grid< 2, double, TNL::Devices::Cuda, int >& mesh = input.getMesh(); int k = blockDim.z*blockIdx.z + threadIdx.z; const Meshes::Grid< 3, Real, Device, Index >& mesh = input.template getMesh< Devices::Cuda >(); }*/ if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) { typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell; Cell cell( mesh ); cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k; cell.refresh(); const Index cind = cell.getIndex(); output[ cind ] = input( cell ) >= 0 ? TypeInfo< Real >::getMaxValue() : - TypeInfo< Real >::getMaxValue(); interfaceMap[ cind ] = false; cell.refresh(); const Real& hx = mesh.getSpaceSteps().x(); const Real& hy = mesh.getSpaceSteps().y(); const Real& hz = mesh.getSpaceSteps().z(); const Real& c = input( cell ); if( ! cell.isBoundaryEntity() ) { auto neighbors = cell.getNeighborEntities(); Real pom = 0; const Index e = neighbors.template getEntityIndex< 1, 0, 0 >(); const Index w = neighbors.template getEntityIndex< -1, 0, 0 >(); const Index n = neighbors.template getEntityIndex< 0, 1, 0 >(); const Index s = neighbors.template getEntityIndex< 0, -1, 0 >(); const Index t = neighbors.template getEntityIndex< 0, 0, 1 >(); const Index b = neighbors.template getEntityIndex< 0, 0, -1 >(); if( c * input[ n ] <= 0 ) { pom = TNL::sign( c )*( hy * c )/( c - input[ n ]); if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cind ] = true; } if( c * input[ e ] <= 0 ) { pom = TNL::sign( c )*( hx * c )/( c - input[ e ]); if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cind ] = true; } if( c * input[ w ] <= 0 ) { pom = TNL::sign( c )*( hx * c )/( c - input[ w ]); if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cind ] = true; } if( c * input[ s ] <= 0 ) { pom = TNL::sign( c )*( hy * c )/( c - input[ s ]); if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cind ] = true; } if( c * input[ b ] <= 0 ) { pom = TNL::sign( c )*( hz * c )/( c - input[ b ]); if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cind ] = true; } if( c * input[ t ] <= 0 ) { pom = TNL::sign( c )*( hz * c )/( c - input[ t ]); if( TNL::abs( output[ cind ] ) > TNL::abs( pom ) ) output[ cind ] = pom; interfaceMap[ cind ] = true; } } } }
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod.h +2 −0 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ #include <TNL/SharedPointer.h> #include "tnlDirectEikonalMethodsBase.h" template< typename Mesh, typename Anisotropy = Functions::Analytic::Constant< Mesh::getMeshDimension(), typename Mesh::RealType > > class FastSweepingMethod Loading Loading @@ -145,6 +146,7 @@ class FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy > const IndexType maxIterations; }; #include "tnlFastSweepingMethod1D_impl.h" #include "tnlFastSweepingMethod2D_impl.h" #include "tnlFastSweepingMethod3D_impl.h"
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +30 −19 Original line number Diff line number Diff line Loading @@ -14,6 +14,9 @@ #pragma once #include "tnlFastSweepingMethod.h" #include <TNL/TypeInfo.h> #include <TNL/Devices/Cuda.h> template< typename Real, typename Device, Loading Loading @@ -212,15 +215,23 @@ solve( const MeshPointer& mesh, { // TODO: CUDA code #ifdef HAVE_CUDA /*int numBlocks = 2; int threadsPerBlock; if( mesh->getDimensions().x() >= mesh->getDimensions().y() ) threadsPerBlock = (int)( mesh->getDimensions().x() ); else threadsPerBlock = (int)( mesh->getDimensions().y() ); const int cudaBlockSize( 16 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); dim3 blockSize( cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY ); Devices::Cuda::synchronizeDevice(); int DIM = mesh->getDimensions().x(); CudaUpdateCellCaller< Real, Device, Index ><<< numBlocks, threadsPerBlock >>>( interfaceMap, aux ); cudaDeviceSynchronize(); //copak dela?*/ tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; for( int k = 0; k < numBlocksX; k++) CudaUpdateCellCaller< Real, Device, Index ><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; interfaceMap = *interfaceMapPtr; #endif } iteration++; Loading @@ -228,30 +239,30 @@ solve( const MeshPointer& mesh, aux.save("aux-final.tnl"); } //#ifdef HAVE_CUDA template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index >, 2, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 2, Real, Device, Index > >& aux ) { int i = threadIdx.x + blockDim.x*blockIdx.x; int j = threadIdx.y + blockDim.y*blockIdx.y; const Meshes::Grid< 2, Real, Device, Index >& mesh = aux.getMesh(); int j = blockDim.y*blockIdx.y + threadIdx.y; const Meshes::Grid< 2, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() ) { //make cell of aux from index typedef typename Meshes::Grid< 2, Real, Device, Index >::Cell Cell; Cell cell( mesh ); cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.refresh(); //update cell value few times //for( int i = 0; i < mesh.getDimensions() ; i++ ) //{ cell.refresh(); //tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > > ptr; for( int k = 0; k < 16; k++ ) { if( ! interfaceMap( cell ) ) { // tnlDirectEikonalMethodsBase< Meshes::Grid< 2, Real, Device, Index > >::updateCell( aux, cell ); ptr.updateCell( aux, cell ); } } //} } } //#endif No newline at end of file
src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +218 −162 Original line number Diff line number Diff line Loading @@ -21,7 +21,7 @@ template< typename Real, typename Anisotropy > FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Anisotropy >:: FastSweepingMethod() : maxIterations( 1 ) : maxIterations( 2 ) { } Loading Loading @@ -64,6 +64,7 @@ solve( const MeshPointer& mesh, interfaceMapPtr->setMesh( mesh ); std::cout << "Initiating the interface cells ..." << std::endl; BaseType::initInterface( u, auxPtr, interfaceMapPtr ); cudaDeviceSynchronize(); auxPtr->save( "aux-ini.tnl" ); typename MeshType::Cell cell( *mesh ); Loading @@ -72,6 +73,8 @@ solve( const MeshPointer& mesh, MeshFunctionType aux = *auxPtr; InterfaceMapType interfaceMap = * interfaceMapPtr; while( iteration < this->maxIterations ) { if( std::is_same< DeviceType, Devices::Host >::value ) { for( cell.getCoordinates().z() = 0; cell.getCoordinates().z() < mesh->getDimensions().z(); Loading Loading @@ -237,6 +240,33 @@ solve( const MeshPointer& mesh, } } } } if( std::is_same< DeviceType, Devices::Cuda >::value ) { // TODO: CUDA code #ifdef HAVE_CUDA const int cudaBlockSize( 8 ); int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y(), cudaBlockSize ); int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z(), cudaBlockSize ); if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); Devices::Cuda::synchronizeDevice(); tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; for( int k = 0; k < numBlocksX; k++) CudaUpdateCellCaller< Real, Device, Index ><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), auxPtr.template modifyData< Device>() ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; aux = *auxPtr; interfaceMap = *interfaceMapPtr; #endif } //aux.save( "aux-8.tnl" ); iteration++; Loading @@ -244,3 +274,29 @@ solve( const MeshPointer& mesh, aux.save("aux-final.tnl"); } template < typename Real, typename Device, typename Index > __global__ void CudaUpdateCellCaller( tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr, const Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index >, 3, bool >& interfaceMap, Functions::MeshFunction< Meshes::Grid< 3, Real, Device, Index > >& aux ) { int i = threadIdx.x + blockDim.x*blockIdx.x; int j = blockDim.y*blockIdx.y + threadIdx.y; int k = blockDim.z*blockIdx.z + threadIdx.z; const Meshes::Grid< 3, Real, Device, Index >& mesh = interfaceMap.template getMesh< Devices::Cuda >(); if( i < mesh.getDimensions().x() && j < mesh.getDimensions().y() && k < mesh.getDimensions().z() ) { typedef typename Meshes::Grid< 3, Real, Device, Index >::Cell Cell; Cell cell( mesh ); cell.getCoordinates().x() = i; cell.getCoordinates().y() = j; cell.getCoordinates().z() = k; cell.refresh(); //tnlDirectEikonalMethodsBase< Meshes::Grid< 3, Real, Device, Index > > ptr; for( int l = 0; l < 8; l++ ) { if( ! interfaceMap( cell ) ) { ptr.updateCell( aux, cell ); } } } } No newline at end of file