Loading src/TNL/Solvers/PDE/ExplicitUpdater.h +3 −0 Original line number Diff line number Diff line Loading @@ -146,6 +146,9 @@ class ExplicitUpdater { TNL_ASSERT_TRUE( this->userData.boundaryConditions, "The boundary conditions are not correctly set-up. Use method setBoundaryCondtions() to do it." ); TNL_ASSERT_TRUE( &uPointer.template modifyData< DeviceType >(), "The function u is not correctly set-up. It was not bound probably with DOFs." ); this->userData.time = time; this->userData.u = &uPointer.template modifyData< DeviceType >(); Meshes::Traverser< MeshType, EntityType > meshTraverser; Loading tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h +3 −1 Original line number Diff line number Diff line Loading @@ -3,6 +3,7 @@ #include <TNL/Problems/PDEProblem.h> #include <TNL/Functions/MeshFunction.h> #include <TNL/Solvers/PDE/ExplicitUpdater.h> #include "Tuning/ExplicitUpdater.h" using namespace TNL; Loading Loading @@ -93,7 +94,8 @@ class HeatEquationBenchmarkProblem: RightHandSide* cudaRightHandSide; DifferentialOperator* cudaDifferentialOperator; TNL::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > explicitUpdater; TNL::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > tuningExplicitUpdater; TNL::Solvers::PDE::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > explicitUpdater; }; Loading tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h +58 −38 Original line number Diff line number Diff line Loading @@ -5,9 +5,11 @@ #include <TNL/Matrices/MatrixSetter.h> #include <TNL/Solvers/PDE/LinearSystemAssembler.h> #include <TNL/Solvers/PDE/BackwardTimeDiscretisation.h> #include <TNL/Solvers/PDE/ExplicitUpdater.h> #include "TestGridEntity.h" #include "Tuning/tunning.h" #include "Tuning/SimpleCell.h" #include "Tuning/GridTraverser.h" template< typename Mesh, typename BoundaryCondition, Loading Loading @@ -41,7 +43,14 @@ String HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >:: getPrologHeader() const { return String( "Heat Equation Benchmark" ); if( this->cudaKernelType == "pure-c" ) return String( "Heat Equation Benchmark PURE-C test" ); if( this->cudaKernelType == "templated" ) return String( "Heat Equation Benchmark TEMPLATED test" ); if( this->cudaKernelType == "templated-compact" ) return String( "Heat Equation Benchmark TEMPLATED COMPACT test" ); if( this->cudaKernelType == "tunning" ) return String( "Heat Equation Benchmark TUNNIG test" ); } template< typename Mesh, Loading Loading @@ -78,7 +87,9 @@ setup( const Config::ParameterContainer& parameters, this->cudaRightHandSide = Devices::Cuda::passToDevice( *this->rightHandSidePointer ); this->cudaDifferentialOperator = Devices::Cuda::passToDevice( *this->differentialOperatorPointer ); } this->explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer ); this->explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer ); this->explicitUpdater.setRightHandSide( this->rightHandSidePointer ); return true; } Loading @@ -105,6 +116,7 @@ void HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >:: bindDofs( DofVectorPointer& dofsPointer ) { this->u->bind( this->getMesh(), dofsPointer ); } template< typename Mesh, Loading Loading @@ -516,55 +528,66 @@ getExplicitUpdate( const RealType& time, this->u->bind( mesh, uDofs ); this->fu->bind( mesh, fuDofs ); //explicitUpdater.setGPUTransferTimer( this->gpuTransferTimer ); explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer ); explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer ); explicitUpdater.setRightHandSide( this->rightHandSidePointer ); //this->explicitUpdater.template update< typename Mesh::Cell >( time, tau, mesh, this->u, this->fu ); this->explicitUpdater.template update< typename Mesh::Cell >( time, tau, mesh, this->u, this->fu ); } if( this->cudaKernelType == "tunning" ) { if( std::is_same< DeviceType, Devices::Cuda >::value ) { /*using ExplicitUpdaterType = TNL::Solvers::PDE::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >; using Cell = typename MeshType::Cell; using MeshTraverserType = Meshes::Traverser< MeshType, Cell >; using UserData = TNL::Solvers::PDE::ExplicitUpdaterTraverserUserData< RealType, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >;*/ //using CellConfig = Meshes::GridEntityNoStencilStorage; using CellConfig = Meshes::GridEntityCrossStencilStorage< 1 >; using ExplicitUpdaterType = ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >; using Cell = SimpleCell< Mesh, CellConfig >; using MeshTraverserType = Traverser< MeshType, Cell >; using UserData = ExplicitUpdaterTraverserUserData< RealType, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >; using ExplicitUpdaterType = TNL::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >; using InteriorEntitiesProcessor = typename ExplicitUpdaterType::TraverserInteriorEntitiesProcessor; using BoundaryEntitiesProcessor = typename ExplicitUpdaterType::TraverserBoundaryEntitiesProcessor; const IndexType gridXSize = mesh->getDimensions().x(); const IndexType gridYSize = mesh->getDimensions().y(); /*const RealType& hx_inv = mesh->template getSpaceStepsProducts< -2, 0 >(); const RealType& hy_inv = mesh->template getSpaceStepsProducts< 0, -2 >();*/ dim3 cudaBlockSize( 16, 16 ); dim3 cudaGridSize( gridXSize / 16 + ( gridXSize % 16 != 0 ), gridYSize / 16 + ( gridYSize % 16 != 0 ) ); /*Pointers::SharedPointer< UserData, Devices::Cuda > userDataPtr; userDataPtr->time = time; userDataPtr->differentialOperator = &this->differentialOperatorPointer.template getData< Devices::Cuda >(); userDataPtr->boundaryConditions = &this->boundaryConditionPointer.template getData< Devices::Cuda >(); userDataPtr->rightHandSide = NULL; userDataPtr->u = uDofs->getData(); userDataPtr->fu = fuDofs->getData();*/ this->u->bind( mesh, uDofs ); this->fu->bind( mesh, fuDofs ); UserData userData; userData.time = time; userData.differentialOperator = &this->differentialOperatorPointer.template getData< Devices::Cuda >(); userData.boundaryConditions = &this->boundaryConditionPointer.template getData< Devices::Cuda >(); userData.rightHandSide = NULL; userData.rightHandSide = &this->rightHandSidePointer.template getData< Devices::Cuda >(); //userData.u = &this->u.template modifyData< Devices::Cuda >(); //uDofs->getData(); //userData.fu = &this->fu.template modifyData< Devices::Cuda >(); //fuDofs->getData(); userData.u = uDofs->getData(); userData.fu = fuDofs->getData(); const IndexType gridXSize = mesh->getDimensions().x(); const IndexType gridYSize = mesh->getDimensions().y(); dim3 cudaBlockSize( 16, 16 ); dim3 cudaGridSize( gridXSize / 16 + ( gridXSize % 16 != 0 ), gridYSize / 16 + ( gridYSize % 16 != 0 ) ); TNL::Devices::Cuda::synchronizeDevice(); int cudaErr; /*Meshes::Traverser< MeshType, Cell > meshTraverser; meshTraverser.template processBoundaryEntities< UserData, BoundaryEntitiesProcessor > ( mesh, userData ); */ _boundaryConditionsKernel< BoundaryEntitiesProcessor, UserData, MeshType, RealType, IndexType > <<< cudaGridSize, cudaBlockSize >>> ( &mesh.template getData< Devices::Cuda >(), Loading @@ -576,20 +599,17 @@ getExplicitUpdate( const RealType& time, return; } /**** * Laplace operator */ //cout << "Laplace operator ... " << endl; /*Meshes::Traverser< MeshType, SimpleCell > meshTraverser meshTraverser.template processInteriorEntities< UserData, /*meshTraverser.template processInteriorEntities< UserData, InteriorEntitiesProcessor > ( meshPointer, userDataPointer );*/ ( mesh, userData ); * */ _heatEquationKernel< InteriorEntitiesProcessor, UserData, MeshType, RealType, IndexType > <<< cudaGridSize, cudaBlockSize >>> ( &mesh.template getData< Devices::Cuda >(), userData ); //&userDataPtr.template modifyData< Devices::Cuda >() ); //&userDataPtr.template modifyData< Devices::Cuda >() );*/ if( cudaGetLastError() != cudaSuccess ) { std::cerr << "Laplace operator failed." << std::endl; Loading tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser.h +9 −7 Original line number Diff line number Diff line Loading @@ -19,7 +19,7 @@ namespace TNL { /**** * This is only a helper class for Traverser specializations for Grid. */ template< typename Grid > template< typename Grid, typename Cell > class GridTraverser { }; Loading @@ -28,8 +28,9 @@ class GridTraverser * 2D grid, Devices::Host */ template< typename Real, typename Index > class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > > typename Index, typename Cell > class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index >, Cell > { public: Loading @@ -53,7 +54,7 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > > const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, Pointers::SharedPointer< UserData, DeviceType >& userData, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) // const int& stream = 0, const int& stream, Loading @@ -66,8 +67,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > > * 2D grid, Devices::Cuda */ template< typename Real, typename Index > class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > > typename Index, typename Cell > class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index >, Cell > { public: Loading @@ -91,7 +93,7 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > > const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, Pointers::SharedPointer< UserData, DeviceType >& userData, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) // const int& stream = 0, const int& stream, Loading tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser_impl.h +72 −116 Original line number Diff line number Diff line Loading @@ -22,7 +22,8 @@ namespace TNL { * 2D traverser, host */ template< typename Real, typename Index > typename Index, typename Cell > template< typename GridEntity, typename EntitiesProcessor, Loading @@ -32,12 +33,12 @@ template< typename Real, int YOrthogonalBoundary, typename... GridEntityParameters > void GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >:: GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index >, Cell >:: processEntities( const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, Pointers::SharedPointer< UserData, DeviceType >& userDataPointer, UserData& userData, const int& stream, const GridEntityParameters&... gridEntityParameters ) { Loading @@ -52,10 +53,10 @@ processEntities( { entity.getCoordinates().y() = begin.y(); entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity ); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); entity.getCoordinates().y() = end.y(); entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity ); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } if( XOrthogonalBoundary ) for( entity.getCoordinates().y() = begin.y(); Loading @@ -64,10 +65,10 @@ processEntities( { entity.getCoordinates().x() = begin.x(); entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity ); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); entity.getCoordinates().x() = end.x(); entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity ); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } } else Loading Loading @@ -98,7 +99,7 @@ processEntities( entity.getCoordinates().x() = x; entity.getCoordinates().y() = y; entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity ); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } } } Loading Loading @@ -134,9 +135,9 @@ _GridTraverser2D( { //GridEntity entity( *grid, coordinates, gridEntityParameters... ); //entity.refresh(); if( ! processOnlyBoundaryEntities || /*if( ! processOnlyBoundaryEntities || ( coordinates.x() == 0 || coordinates.y() == 0 || coordinates.x() == grid->getDimensions().x() - 1 || coordinates.y() == grid->getDimensions().y() - 1 ) ) coordinates.x() == grid->getDimensions().x() - 1 || coordinates.y() == grid->getDimensions().y() - 1 ) )*/ //entity.isBoundaryEntity() ) { EntitiesProcessor::processEntity Loading @@ -149,6 +150,7 @@ _GridTraverser2D( } } template< typename Real, typename Index, typename GridEntity, Loading @@ -157,70 +159,68 @@ template< typename Real, bool processOnlyBoundaryEntities, typename... GridEntityParameters > __global__ void _GridTraverser2DBoundaryAlongX( _GridTraverser2DBoundary( const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, UserData* userData, UserData userData, const Index beginX, const Index endX, const Index fixedY, const Index beginY, const Index endY, const dim3 gridIdx, const GridEntityParameters... gridEntityParameters ) { typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >; using CoordinatesType = typename GridType::CoordinatesType; coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); coordinates.y() = fixedY; Index entitiesAlongX = endX - beginX + 1; Index entitiesAlongY = endY - beginY; if( coordinates.x() <= endX ) Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); if( threadId < entitiesAlongX ) { GridEntity entity( *grid, coordinates, gridEntityParameters... ); GridEntity entity( *grid, CoordinatesType( beginX + threadId, beginY ), gridEntityParameters... ); //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, // entity.getCoordinates().x(), entity.getCoordinates().y(), // grid->getDimensions().x(), grid->getDimensions().y() ); entity.refresh(); EntitiesProcessor::processEntity ( *grid, *userData, entity ); EntitiesProcessor::processEntity( *grid, userData, entity ); } else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 ) { GridEntity entity( *grid, CoordinatesType( beginX + threadId, endY ), gridEntityParameters... ); entity.refresh(); //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); EntitiesProcessor::processEntity( *grid, userData, entity ); } template< typename Real, typename Index, typename GridEntity, typename UserData, typename EntitiesProcessor, bool processOnlyBoundaryEntities, typename... GridEntityParameters > __global__ void _GridTraverser2DBoundaryAlongY( const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, UserData* userData, const Index beginY, const Index endY, const Index fixedX, const dim3 gridIdx, const GridEntityParameters... gridEntityParameters ) else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 ) { typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; coordinates.x() = fixedX; coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); if( coordinates.y() <= endY ) GridEntity entity( *grid, CoordinatesType( beginX, beginY + threadId + 1 ), gridEntityParameters... ); entity.refresh(); //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); EntitiesProcessor::processEntity( *grid, userData, entity ); } else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1 ) && threadId >= 0 ) { GridEntity entity( *grid, coordinates, gridEntityParameters... ); GridEntity entity( *grid, CoordinatesType( endX, beginY + threadId + 1 ), gridEntityParameters... ); entity.refresh(); EntitiesProcessor::processEntity ( *grid, *userData, entity ); //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); EntitiesProcessor::processEntity( *grid, userData, entity ); } } #endif template< typename Real, typename Index > typename Index, typename Cell > template< typename GridEntity, typename EntitiesProcessor, Loading @@ -230,12 +230,12 @@ template< typename Real, int YOrthogonalBoundary, typename... GridEntityParameters > void GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >:: GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index >, Cell >:: processEntities( const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, Pointers::SharedPointer< UserData, DeviceType >& userDataPointer, UserData& userData, const int& stream, const GridEntityParameters&... gridEntityParameters ) { Loading @@ -244,69 +244,25 @@ processEntities( ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) ) { dim3 cudaBlockSize( 256 ); dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX, cudaBlocksCountAlongY, cudaGridsCountAlongY; Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 ); Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 ); auto& pool = CudaStreamPool::getInstance(); Devices::Cuda::synchronizeDevice(); const cudaStream_t& s1 = pool.getStream( stream ); const cudaStream_t& s2 = pool.getStream( stream + 1 ); dim3 cudaBlocksCount, cudaGridsCount; IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 ); Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount ); dim3 gridIdx, cudaGridSize; for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ ) Devices::Cuda::synchronizeDevice(); for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ ) { Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize ); //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); TNL::_GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaGridSize, cudaBlockSize, 0, s1 >>> Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); _GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaGridSize, cudaBlockSize >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), userData, begin.x(), end.x(), begin.y(), gridIdx, gridEntityParameters... ); TNL::_GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaGridSize, cudaBlockSize, 0, s2 >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), begin.x(), end.x(), end.y(), gridIdx, gridEntityParameters... ); } const cudaStream_t& s3 = pool.getStream( stream + 2 ); const cudaStream_t& s4 = pool.getStream( stream + 3 ); for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ ) { Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize ); TNL::_GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaGridSize, cudaBlockSize, 0, s3 >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), begin.y() + 1, end.y() - 1, begin.x(), gridIdx, gridEntityParameters... ); TNL::_GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaGridSize, cudaBlockSize, 0, s4 >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), begin.y() + 1, end.y() - 1, end.x(), gridIdx, gridEntityParameters... ); } cudaStreamSynchronize( s1 ); cudaStreamSynchronize( s2 ); cudaStreamSynchronize( s3 ); cudaStreamSynchronize( s4 ); TNL_CHECK_CUDA_DEVICE; } else { Loading @@ -329,7 +285,7 @@ processEntities( TNL::_GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaGridSize, cudaBlockSize, 0, s >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), &userData, begin, end, gridIdx, Loading Loading
src/TNL/Solvers/PDE/ExplicitUpdater.h +3 −0 Original line number Diff line number Diff line Loading @@ -146,6 +146,9 @@ class ExplicitUpdater { TNL_ASSERT_TRUE( this->userData.boundaryConditions, "The boundary conditions are not correctly set-up. Use method setBoundaryCondtions() to do it." ); TNL_ASSERT_TRUE( &uPointer.template modifyData< DeviceType >(), "The function u is not correctly set-up. It was not bound probably with DOFs." ); this->userData.time = time; this->userData.u = &uPointer.template modifyData< DeviceType >(); Meshes::Traverser< MeshType, EntityType > meshTraverser; Loading
tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h +3 −1 Original line number Diff line number Diff line Loading @@ -3,6 +3,7 @@ #include <TNL/Problems/PDEProblem.h> #include <TNL/Functions/MeshFunction.h> #include <TNL/Solvers/PDE/ExplicitUpdater.h> #include "Tuning/ExplicitUpdater.h" using namespace TNL; Loading Loading @@ -93,7 +94,8 @@ class HeatEquationBenchmarkProblem: RightHandSide* cudaRightHandSide; DifferentialOperator* cudaDifferentialOperator; TNL::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > explicitUpdater; TNL::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > tuningExplicitUpdater; TNL::Solvers::PDE::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > explicitUpdater; }; Loading
tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h +58 −38 Original line number Diff line number Diff line Loading @@ -5,9 +5,11 @@ #include <TNL/Matrices/MatrixSetter.h> #include <TNL/Solvers/PDE/LinearSystemAssembler.h> #include <TNL/Solvers/PDE/BackwardTimeDiscretisation.h> #include <TNL/Solvers/PDE/ExplicitUpdater.h> #include "TestGridEntity.h" #include "Tuning/tunning.h" #include "Tuning/SimpleCell.h" #include "Tuning/GridTraverser.h" template< typename Mesh, typename BoundaryCondition, Loading Loading @@ -41,7 +43,14 @@ String HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >:: getPrologHeader() const { return String( "Heat Equation Benchmark" ); if( this->cudaKernelType == "pure-c" ) return String( "Heat Equation Benchmark PURE-C test" ); if( this->cudaKernelType == "templated" ) return String( "Heat Equation Benchmark TEMPLATED test" ); if( this->cudaKernelType == "templated-compact" ) return String( "Heat Equation Benchmark TEMPLATED COMPACT test" ); if( this->cudaKernelType == "tunning" ) return String( "Heat Equation Benchmark TUNNIG test" ); } template< typename Mesh, Loading Loading @@ -78,7 +87,9 @@ setup( const Config::ParameterContainer& parameters, this->cudaRightHandSide = Devices::Cuda::passToDevice( *this->rightHandSidePointer ); this->cudaDifferentialOperator = Devices::Cuda::passToDevice( *this->differentialOperatorPointer ); } this->explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer ); this->explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer ); this->explicitUpdater.setRightHandSide( this->rightHandSidePointer ); return true; } Loading @@ -105,6 +116,7 @@ void HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >:: bindDofs( DofVectorPointer& dofsPointer ) { this->u->bind( this->getMesh(), dofsPointer ); } template< typename Mesh, Loading Loading @@ -516,55 +528,66 @@ getExplicitUpdate( const RealType& time, this->u->bind( mesh, uDofs ); this->fu->bind( mesh, fuDofs ); //explicitUpdater.setGPUTransferTimer( this->gpuTransferTimer ); explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer ); explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer ); explicitUpdater.setRightHandSide( this->rightHandSidePointer ); //this->explicitUpdater.template update< typename Mesh::Cell >( time, tau, mesh, this->u, this->fu ); this->explicitUpdater.template update< typename Mesh::Cell >( time, tau, mesh, this->u, this->fu ); } if( this->cudaKernelType == "tunning" ) { if( std::is_same< DeviceType, Devices::Cuda >::value ) { /*using ExplicitUpdaterType = TNL::Solvers::PDE::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >; using Cell = typename MeshType::Cell; using MeshTraverserType = Meshes::Traverser< MeshType, Cell >; using UserData = TNL::Solvers::PDE::ExplicitUpdaterTraverserUserData< RealType, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >;*/ //using CellConfig = Meshes::GridEntityNoStencilStorage; using CellConfig = Meshes::GridEntityCrossStencilStorage< 1 >; using ExplicitUpdaterType = ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >; using Cell = SimpleCell< Mesh, CellConfig >; using MeshTraverserType = Traverser< MeshType, Cell >; using UserData = ExplicitUpdaterTraverserUserData< RealType, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >; using ExplicitUpdaterType = TNL::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide >; using InteriorEntitiesProcessor = typename ExplicitUpdaterType::TraverserInteriorEntitiesProcessor; using BoundaryEntitiesProcessor = typename ExplicitUpdaterType::TraverserBoundaryEntitiesProcessor; const IndexType gridXSize = mesh->getDimensions().x(); const IndexType gridYSize = mesh->getDimensions().y(); /*const RealType& hx_inv = mesh->template getSpaceStepsProducts< -2, 0 >(); const RealType& hy_inv = mesh->template getSpaceStepsProducts< 0, -2 >();*/ dim3 cudaBlockSize( 16, 16 ); dim3 cudaGridSize( gridXSize / 16 + ( gridXSize % 16 != 0 ), gridYSize / 16 + ( gridYSize % 16 != 0 ) ); /*Pointers::SharedPointer< UserData, Devices::Cuda > userDataPtr; userDataPtr->time = time; userDataPtr->differentialOperator = &this->differentialOperatorPointer.template getData< Devices::Cuda >(); userDataPtr->boundaryConditions = &this->boundaryConditionPointer.template getData< Devices::Cuda >(); userDataPtr->rightHandSide = NULL; userDataPtr->u = uDofs->getData(); userDataPtr->fu = fuDofs->getData();*/ this->u->bind( mesh, uDofs ); this->fu->bind( mesh, fuDofs ); UserData userData; userData.time = time; userData.differentialOperator = &this->differentialOperatorPointer.template getData< Devices::Cuda >(); userData.boundaryConditions = &this->boundaryConditionPointer.template getData< Devices::Cuda >(); userData.rightHandSide = NULL; userData.rightHandSide = &this->rightHandSidePointer.template getData< Devices::Cuda >(); //userData.u = &this->u.template modifyData< Devices::Cuda >(); //uDofs->getData(); //userData.fu = &this->fu.template modifyData< Devices::Cuda >(); //fuDofs->getData(); userData.u = uDofs->getData(); userData.fu = fuDofs->getData(); const IndexType gridXSize = mesh->getDimensions().x(); const IndexType gridYSize = mesh->getDimensions().y(); dim3 cudaBlockSize( 16, 16 ); dim3 cudaGridSize( gridXSize / 16 + ( gridXSize % 16 != 0 ), gridYSize / 16 + ( gridYSize % 16 != 0 ) ); TNL::Devices::Cuda::synchronizeDevice(); int cudaErr; /*Meshes::Traverser< MeshType, Cell > meshTraverser; meshTraverser.template processBoundaryEntities< UserData, BoundaryEntitiesProcessor > ( mesh, userData ); */ _boundaryConditionsKernel< BoundaryEntitiesProcessor, UserData, MeshType, RealType, IndexType > <<< cudaGridSize, cudaBlockSize >>> ( &mesh.template getData< Devices::Cuda >(), Loading @@ -576,20 +599,17 @@ getExplicitUpdate( const RealType& time, return; } /**** * Laplace operator */ //cout << "Laplace operator ... " << endl; /*Meshes::Traverser< MeshType, SimpleCell > meshTraverser meshTraverser.template processInteriorEntities< UserData, /*meshTraverser.template processInteriorEntities< UserData, InteriorEntitiesProcessor > ( meshPointer, userDataPointer );*/ ( mesh, userData ); * */ _heatEquationKernel< InteriorEntitiesProcessor, UserData, MeshType, RealType, IndexType > <<< cudaGridSize, cudaBlockSize >>> ( &mesh.template getData< Devices::Cuda >(), userData ); //&userDataPtr.template modifyData< Devices::Cuda >() ); //&userDataPtr.template modifyData< Devices::Cuda >() );*/ if( cudaGetLastError() != cudaSuccess ) { std::cerr << "Laplace operator failed." << std::endl; Loading
tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser.h +9 −7 Original line number Diff line number Diff line Loading @@ -19,7 +19,7 @@ namespace TNL { /**** * This is only a helper class for Traverser specializations for Grid. */ template< typename Grid > template< typename Grid, typename Cell > class GridTraverser { }; Loading @@ -28,8 +28,9 @@ class GridTraverser * 2D grid, Devices::Host */ template< typename Real, typename Index > class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > > typename Index, typename Cell > class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index >, Cell > { public: Loading @@ -53,7 +54,7 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > > const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, Pointers::SharedPointer< UserData, DeviceType >& userData, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) // const int& stream = 0, const int& stream, Loading @@ -66,8 +67,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > > * 2D grid, Devices::Cuda */ template< typename Real, typename Index > class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > > typename Index, typename Cell > class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index >, Cell > { public: Loading @@ -91,7 +93,7 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > > const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, Pointers::SharedPointer< UserData, DeviceType >& userData, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) // const int& stream = 0, const int& stream, Loading
tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser_impl.h +72 −116 Original line number Diff line number Diff line Loading @@ -22,7 +22,8 @@ namespace TNL { * 2D traverser, host */ template< typename Real, typename Index > typename Index, typename Cell > template< typename GridEntity, typename EntitiesProcessor, Loading @@ -32,12 +33,12 @@ template< typename Real, int YOrthogonalBoundary, typename... GridEntityParameters > void GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >:: GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index >, Cell >:: processEntities( const GridPointer& gridPointer, const CoordinatesType begin, const CoordinatesType end, Pointers::SharedPointer< UserData, DeviceType >& userDataPointer, UserData& userData, const int& stream, const GridEntityParameters&... gridEntityParameters ) { Loading @@ -52,10 +53,10 @@ processEntities( { entity.getCoordinates().y() = begin.y(); entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity ); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); entity.getCoordinates().y() = end.y(); entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity ); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } if( XOrthogonalBoundary ) for( entity.getCoordinates().y() = begin.y(); Loading @@ -64,10 +65,10 @@ processEntities( { entity.getCoordinates().x() = begin.x(); entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity ); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); entity.getCoordinates().x() = end.x(); entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity ); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } } else Loading Loading @@ -98,7 +99,7 @@ processEntities( entity.getCoordinates().x() = x; entity.getCoordinates().y() = y; entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), *userDataPointer, entity ); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } } } Loading Loading @@ -134,9 +135,9 @@ _GridTraverser2D( { //GridEntity entity( *grid, coordinates, gridEntityParameters... ); //entity.refresh(); if( ! processOnlyBoundaryEntities || /*if( ! processOnlyBoundaryEntities || ( coordinates.x() == 0 || coordinates.y() == 0 || coordinates.x() == grid->getDimensions().x() - 1 || coordinates.y() == grid->getDimensions().y() - 1 ) ) coordinates.x() == grid->getDimensions().x() - 1 || coordinates.y() == grid->getDimensions().y() - 1 ) )*/ //entity.isBoundaryEntity() ) { EntitiesProcessor::processEntity Loading @@ -149,6 +150,7 @@ _GridTraverser2D( } } template< typename Real, typename Index, typename GridEntity, Loading @@ -157,70 +159,68 @@ template< typename Real, bool processOnlyBoundaryEntities, typename... GridEntityParameters > __global__ void _GridTraverser2DBoundaryAlongX( _GridTraverser2DBoundary( const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, UserData* userData, UserData userData, const Index beginX, const Index endX, const Index fixedY, const Index beginY, const Index endY, const dim3 gridIdx, const GridEntityParameters... gridEntityParameters ) { typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >; using CoordinatesType = typename GridType::CoordinatesType; coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); coordinates.y() = fixedY; Index entitiesAlongX = endX - beginX + 1; Index entitiesAlongY = endY - beginY; if( coordinates.x() <= endX ) Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); if( threadId < entitiesAlongX ) { GridEntity entity( *grid, coordinates, gridEntityParameters... ); GridEntity entity( *grid, CoordinatesType( beginX + threadId, beginY ), gridEntityParameters... ); //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, // entity.getCoordinates().x(), entity.getCoordinates().y(), // grid->getDimensions().x(), grid->getDimensions().y() ); entity.refresh(); EntitiesProcessor::processEntity ( *grid, *userData, entity ); EntitiesProcessor::processEntity( *grid, userData, entity ); } else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 ) { GridEntity entity( *grid, CoordinatesType( beginX + threadId, endY ), gridEntityParameters... ); entity.refresh(); //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); EntitiesProcessor::processEntity( *grid, userData, entity ); } template< typename Real, typename Index, typename GridEntity, typename UserData, typename EntitiesProcessor, bool processOnlyBoundaryEntities, typename... GridEntityParameters > __global__ void _GridTraverser2DBoundaryAlongY( const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, UserData* userData, const Index beginY, const Index endY, const Index fixedX, const dim3 gridIdx, const GridEntityParameters... gridEntityParameters ) else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 ) { typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; coordinates.x() = fixedX; coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); if( coordinates.y() <= endY ) GridEntity entity( *grid, CoordinatesType( beginX, beginY + threadId + 1 ), gridEntityParameters... ); entity.refresh(); //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); EntitiesProcessor::processEntity( *grid, userData, entity ); } else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1 ) && threadId >= 0 ) { GridEntity entity( *grid, coordinates, gridEntityParameters... ); GridEntity entity( *grid, CoordinatesType( endX, beginY + threadId + 1 ), gridEntityParameters... ); entity.refresh(); EntitiesProcessor::processEntity ( *grid, *userData, entity ); //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); EntitiesProcessor::processEntity( *grid, userData, entity ); } } #endif template< typename Real, typename Index > typename Index, typename Cell > template< typename GridEntity, typename EntitiesProcessor, Loading @@ -230,12 +230,12 @@ template< typename Real, int YOrthogonalBoundary, typename... GridEntityParameters > void GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >:: GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index >, Cell >:: processEntities( const GridPointer& gridPointer, const CoordinatesType& begin, const CoordinatesType& end, Pointers::SharedPointer< UserData, DeviceType >& userDataPointer, UserData& userData, const int& stream, const GridEntityParameters&... gridEntityParameters ) { Loading @@ -244,69 +244,25 @@ processEntities( ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) ) { dim3 cudaBlockSize( 256 ); dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX, cudaBlocksCountAlongY, cudaGridsCountAlongY; Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 ); Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 ); auto& pool = CudaStreamPool::getInstance(); Devices::Cuda::synchronizeDevice(); const cudaStream_t& s1 = pool.getStream( stream ); const cudaStream_t& s2 = pool.getStream( stream + 1 ); dim3 cudaBlocksCount, cudaGridsCount; IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 ); Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount ); dim3 gridIdx, cudaGridSize; for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ ) Devices::Cuda::synchronizeDevice(); for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ ) { Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize ); //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); TNL::_GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaGridSize, cudaBlockSize, 0, s1 >>> Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); _GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaGridSize, cudaBlockSize >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), userData, begin.x(), end.x(), begin.y(), gridIdx, gridEntityParameters... ); TNL::_GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaGridSize, cudaBlockSize, 0, s2 >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), begin.x(), end.x(), end.y(), gridIdx, gridEntityParameters... ); } const cudaStream_t& s3 = pool.getStream( stream + 2 ); const cudaStream_t& s4 = pool.getStream( stream + 3 ); for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ ) { Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize ); TNL::_GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaGridSize, cudaBlockSize, 0, s3 >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), begin.y() + 1, end.y() - 1, begin.x(), gridIdx, gridEntityParameters... ); TNL::_GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaGridSize, cudaBlockSize, 0, s4 >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), begin.y() + 1, end.y() - 1, end.x(), gridIdx, gridEntityParameters... ); } cudaStreamSynchronize( s1 ); cudaStreamSynchronize( s2 ); cudaStreamSynchronize( s3 ); cudaStreamSynchronize( s4 ); TNL_CHECK_CUDA_DEVICE; } else { Loading @@ -329,7 +285,7 @@ processEntities( TNL::_GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaGridSize, cudaBlockSize, 0, s >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), &userData, begin, end, gridIdx, Loading