From 5f87aba32866310ee33168817129403338d829e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com> Date: Wed, 26 Jun 2019 21:28:34 +0200 Subject: [PATCH] Implemented addAndReduceAbs, refactoring of the Euler solver. --- .../Expressions/ExpressionTemplates.h | 52 ++++++++ .../Expressions/StaticExpressionTemplates.h | 46 +++++++ src/TNL/Problems/HeatEquationProblem.h | 51 ++++---- src/TNL/Problems/HeatEquationProblem_impl.h | 32 +++-- src/TNL/Solvers/ODE/Euler.h | 40 +++--- src/TNL/Solvers/ODE/Euler.hpp | 119 +----------------- src/UnitTests/Containers/VectorTest-8.h | 31 ++++- 7 files changed, 193 insertions(+), 178 deletions(-) diff --git a/src/TNL/Containers/Expressions/ExpressionTemplates.h b/src/TNL/Containers/Expressions/ExpressionTemplates.h index 8a7863e561..dab52995a8 100644 --- a/src/TNL/Containers/Expressions/ExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h @@ -2252,6 +2252,58 @@ Result addAndReduce( Vector& lhs, return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); } +//// +// Addition and reduction +template< typename Vector, + typename T1, + typename T2, + template< typename, typename > class Operation, + typename Reduction, + typename VolatileReduction, + typename Result > +Result addAndReduceAbs( Vector& lhs, + const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, + Reduction& reduction, + VolatileReduction& volatileReduction, + const Result& zero ) +{ + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + using DeviceType = typename Vector::DeviceType; + + RealType* lhs_data = lhs.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { + const RealType aux = expression[ i ]; + lhs_data[ i ] += aux; + return TNL::abs( aux ); + }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); +} + +template< typename Vector, + typename T1, + template< typename > class Operation, + typename Reduction, + typename VolatileReduction, + typename Result > +Result addAndReduceAbs( Vector& lhs, + const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, + Reduction& reduction, + VolatileReduction& volatileReduction, + const Result& zero ) +{ + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + using DeviceType = typename Vector::DeviceType; + + RealType* lhs_data = lhs.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { + const RealType aux = expression[ i ]; + lhs_data[ i ] += aux; + return TNL::abs( aux ); + }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); +} //// // Output stream diff --git a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h index cec0d3ce22..967de900dd 100644 --- a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h @@ -2476,5 +2476,51 @@ Result addAndReduce( Vector& lhs, return result; } +//// +// Addition with reduction of abs +template< typename Vector, + typename T1, + typename T2, + template< typename, typename > class Operation, + typename Reduction, + typename VolatileReduction, + typename Result > +__cuda_callable__ +Result addAndReduceAbs( Vector& lhs, + const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, + Reduction& reduction, + VolatileReduction& volatileReduction, + const Result& zero ) +{ + Result result( zero ); + for( int i = 0; i < Vector::getSize(); i++ ) { + const Result aux = expression[ i ]; + lhs[ i ] += aux; + reduction( result, TNL::abs( aux ) ); + } + return result; +} + +template< typename Vector, + typename T1, + template< typename > class Operation, + typename Reduction, + typename VolatileReduction, + typename Result > +__cuda_callable__ +Result addAndReduceAbs( Vector& lhs, + const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, + Reduction& reduction, + VolatileReduction& volatileReduction, + const Result& zero ) +{ + Result result( zero ); + for( int i = 0; i < Vector::getSize(); i++ ) { + const Result aux = expression[ i ]; + lhs[ i ] += aux; + reduction( result, TNL::abs( aux ) ); + } + return result; +} } // namespace TNL diff --git a/src/TNL/Problems/HeatEquationProblem.h b/src/TNL/Problems/HeatEquationProblem.h index a04e6ea2a9..cddd70746a 100644 --- a/src/TNL/Problems/HeatEquationProblem.h +++ b/src/TNL/Problems/HeatEquationProblem.h @@ -95,7 +95,7 @@ class HeatEquationProblem : public PDEProblem< Mesh, DofVectorPointer& _fu ); void applyBoundaryConditions( const RealType& time, - DofVectorPointer& dofs ); + DofVectorPointer& dofs ); template< typename MatrixPointer > void assemblyLinearSystem( const RealType& time, @@ -104,31 +104,32 @@ class HeatEquationProblem : public PDEProblem< Mesh, MatrixPointer& matrixPointer, DofVectorPointer& rightHandSidePointer ); - protected: - - MeshFunctionPointer uPointer; - MeshFunctionPointer fuPointer; - - DifferentialOperatorPointer differentialOperatorPointer; - - BoundaryConditionPointer boundaryConditionPointer; - - RightHandSidePointer rightHandSidePointer; - - Timer gpuTransferTimer; - - Solvers::PDE::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > explicitUpdater; - - Solvers::PDE::LinearSystemAssembler< Mesh, - MeshFunctionType, - DifferentialOperator, - BoundaryCondition, - RightHandSide, - Solvers::PDE::BackwardTimeDiscretisation, - DofVectorType > systemAssembler; - - Meshes::DistributedMeshes::DistrGridIOTypes distributedIOType; + protected: + + MeshFunctionPointer uPointer; + MeshFunctionPointer fuPointer; + + DifferentialOperatorPointer differentialOperatorPointer; + + BoundaryConditionPointer boundaryConditionPointer; + + RightHandSidePointer rightHandSidePointer; + + Timer gpuTransferTimer; + + Solvers::PDE::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > explicitUpdater; + + Solvers::PDE::LinearSystemAssembler< Mesh, + MeshFunctionType, + DifferentialOperator, + BoundaryCondition, + RightHandSide, + Solvers::PDE::BackwardTimeDiscretisation, + DofVectorType > systemAssembler; + + Meshes::DistributedMeshes::DistrGridIOTypes distributedIOType; + bool catchExceptions = true; }; } // namespace Problems diff --git a/src/TNL/Problems/HeatEquationProblem_impl.h b/src/TNL/Problems/HeatEquationProblem_impl.h index 0c0cd7418e..64b4a2ca91 100644 --- a/src/TNL/Problems/HeatEquationProblem_impl.h +++ b/src/TNL/Problems/HeatEquationProblem_impl.h @@ -95,11 +95,11 @@ setup( const Config::ParameterContainer& parameters, return false; } - String param=parameters.getParameter< String >( "distributed-grid-io-type" ); - if(param=="MpiIO") - distributedIOType=Meshes::DistributedMeshes::MpiIO; - if(param=="LocalCopy") - distributedIOType=Meshes::DistributedMeshes::LocalCopy; + String param = parameters.getParameter< String >( "distributed-grid-io-type" ); + if( param == "MpiIO" ) + distributedIOType = Meshes::DistributedMeshes::MpiIO; + if( param == "LocalCopy" ) + distributedIOType = Meshes::DistributedMeshes::LocalCopy; this->explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer ); this->explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer ); @@ -107,6 +107,8 @@ setup( const Config::ParameterContainer& parameters, this->systemAssembler.setDifferentialOperator( this->differentialOperatorPointer ); this->systemAssembler.setBoundaryConditions( this->boundaryConditionPointer ); this->systemAssembler.setRightHandSide( this->rightHandSidePointer ); + + this->catchExceptions = parameters.getParameter< bool >( "catch-exceptions" ); return true; } @@ -150,7 +152,6 @@ setInitialCondition( const Config::ParameterContainer& parameters, { this->bindDofs( dofs ); const String& initialConditionFile = parameters.getParameter< String >( "initial-condition" ); - std::cout<<"setInitialCondition" <<std::endl; if(CommunicatorType::isDistributed()) { std::cout<<"Nodes Distribution: " << uPointer->getMesh().getDistributedMesh()->printProcessDistr() << std::endl; @@ -162,15 +163,20 @@ setInitialCondition( const Config::ParameterContainer& parameters, } else { - try - { - this->uPointer->boundLoad( initialConditionFile ); - } - catch(...) + if( this->catchExceptions ) { - std::cerr << "I am not able to load the initial condition from the file " << initialConditionFile << "." << std::endl; - return false; + try + { + this->uPointer->boundLoad( initialConditionFile ); + } + catch( std::ios_base::failure& e ) + { + std::cerr << "I am not able to load the initial condition from the file " << initialConditionFile << "." << std::endl; + std::cerr << e.what() << std::endl; + return false; + } } + else this->uPointer->boundLoad( initialConditionFile ); } return true; } diff --git a/src/TNL/Solvers/ODE/Euler.h b/src/TNL/Solvers/ODE/Euler.h index c782b08cd9..57c4e10b94 100644 --- a/src/TNL/Solvers/ODE/Euler.h +++ b/src/TNL/Solvers/ODE/Euler.h @@ -25,38 +25,34 @@ class Euler : public ExplicitSolver< Problem > { public: - using ProblemType = Problem; - using DofVectorType = typename ProblemType::DofVectorType; - using RealType = typename ProblemType::RealType; - using DeviceType = typename ProblemType::DeviceType; - using IndexType = typename ProblemType::IndexType; - using DofVectorView = typename ViewTypeGetter< DofVectorType >::Type; - using DofVectorPointer = Pointers::SharedPointer< DofVectorType, DeviceType >; + using ProblemType = Problem; + using DofVectorType = typename ProblemType::DofVectorType; + using RealType = typename ProblemType::RealType; + using DeviceType = typename ProblemType::DeviceType; + using IndexType = typename ProblemType::IndexType; + using DofVectorView = typename ViewTypeGetter< DofVectorType >::Type; + using DofVectorPointer = Pointers::SharedPointer< DofVectorType, DeviceType >; - Euler(); + Euler(); - static String getType(); + static String getType(); - static void configSetup( Config::ConfigDescription& config, - const String& prefix = "" ); + static void configSetup( Config::ConfigDescription& config, + const String& prefix = "" ); - bool setup( const Config::ParameterContainer& parameters, - const String& prefix = "" ); + bool setup( const Config::ParameterContainer& parameters, + const String& prefix = "" ); - void setCFLCondition( const RealType& cfl ); + void setCFLCondition( const RealType& cfl ); - const RealType& getCFLCondition() const; + const RealType& getCFLCondition() const; - bool solve( DofVectorPointer& u ); + bool solve( DofVectorPointer& u ); protected: - void computeNewTimeLevel( DofVectorPointer& u, - RealType tau, - RealType& currentResidue ); + DofVectorPointer _k1; - DofVectorPointer _k1; - - RealType cflCondition; + RealType cflCondition; }; } // namespace ODE diff --git a/src/TNL/Solvers/ODE/Euler.hpp b/src/TNL/Solvers/ODE/Euler.hpp index 5e2c4979e7..c107df8a29 100644 --- a/src/TNL/Solvers/ODE/Euler.hpp +++ b/src/TNL/Solvers/ODE/Euler.hpp @@ -119,17 +119,9 @@ bool Euler< Problem > :: solve( DofVectorPointer& _u ) continue; } } - RealType newResidue( 0.0 ); - //computeNewTimeLevel( u, currentTau, newResidue ); - /* auto fetch = [ currentTau, k1, u ] __cuda_callable__ ( IndexType i ) -> RealType { - const RealType add = currentTau * k1[ i ]; - u[ i ] += add; - return TNL::abs( add ); }; - auto reduction = [=] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( u.getSize(), reduction, volatileReduction, fetch, 0.0 );*/ - - this->setResidue( newResidue ); + auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; }; + auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; }; + this->setResidue( addAndReduceAbs( u, currentTau * k1, reduction, volatileReduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); /**** * When time is close to stopTime the new residue @@ -169,111 +161,6 @@ bool Euler< Problem > :: solve( DofVectorPointer& _u ) } }; -/*template< typename Problem > -void Euler< Problem > :: computeNewTimeLevel( DofVectorPointer& u, - RealType tau, - RealType& currentResidue ) -{ - RealType localResidue = RealType( 0.0 ); - const IndexType size = k1->getSize(); - RealType* _u = u->getData(); - RealType* _k1 = k1->getData(); - - if( std::is_same< DeviceType, Devices::Host >::value ) - { -#ifdef HAVE_OPENMP -#pragma omp parallel for reduction(+:localResidue) firstprivate( _u, _k1, tau ) if( Devices::Host::isOMPEnabled() ) -#endif - for( IndexType i = 0; i < size; i ++ ) - { - const RealType add = tau * _k1[ i ]; - _u[ i ] += add; - localResidue += std::fabs( add ); - } - } - if( std::is_same< DeviceType, Devices::Cuda >::value ) - { -#ifdef HAVE_CUDA - dim3 cudaBlockSize( 512 ); - const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x ); - const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks ); - this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) ); - const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x; - - localResidue = 0.0; - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ ) - { - const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType ); - const IndexType gridOffset = gridIdx * threadsPerGrid; - const IndexType currentSize = min( size - gridOffset, threadsPerGrid ); - const IndexType currentGridSize = Devices::Cuda::getNumberOfBlocks( currentSize, cudaBlockSize.x ); - - updateUEuler<<< currentGridSize, cudaBlockSize, sharedMemory >>>( currentSize, - tau, - &_k1[ gridOffset ], - &_u[ gridOffset ], - this->cudaBlockResidue.getData() ); - localResidue += this->cudaBlockResidue.sum(); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - } -#endif - } - - //MIC - if( std::is_same< DeviceType, Devices::MIC >::value ) - { - -#ifdef HAVE_MIC - Devices::MICHider<RealType> mu; - mu.pointer=_u; - Devices::MICHider<RealType> mk1; - mk1.pointer=_k1; - #pragma offload target(mic) in(mu,mk1,size) inout(localResidue) - { - #pragma omp parallel for reduction(+:localResidue) firstprivate( mu, mk1 ) - for( IndexType i = 0; i < size; i ++ ) - { - const RealType add = tau * mk1.pointer[ i ]; - mu.pointer[ i ] += add; - localResidue += std::fabs( add ); - } - } -#endif - } - localResidue /= tau * ( RealType ) size; - Problem::CommunicatorType::Allreduce( &localResidue, ¤tResidue, 1, MPI_SUM, Problem::CommunicatorType::AllGroup ); - //std::cerr << "Local residue = " << localResidue << " - globalResidue = " << currentResidue << std::endl; -} - -#ifdef HAVE_CUDA -template< typename RealType, typename Index > -__global__ void updateUEuler( const Index size, - const RealType tau, - const RealType* k1, - RealType* u, - RealType* cudaBlockResidue ) -{ - extern __shared__ RealType du[]; - const Index blockOffset = blockIdx. x * blockDim.x; - const Index i = blockOffset + threadIdx. x; - if( i < size ) - u[ i ] += du[ threadIdx.x ] = tau * k1[ i ]; - else - du[ threadIdx.x ] = 0.0; - du[ threadIdx.x ] = abs( du[ threadIdx.x ] ); - __syncthreads(); - - const Index rest = size - blockOffset; - Index n = rest < blockDim.x ? rest : blockDim.x; - - computeBlockResidue( du, - cudaBlockResidue, - n ); -} -#endif -*/ - } // namespace ODE } // namespace Solvers } // namespace TNL diff --git a/src/UnitTests/Containers/VectorTest-8.h b/src/UnitTests/Containers/VectorTest-8.h index 7b3e908df3..c70408f404 100644 --- a/src/UnitTests/Containers/VectorTest-8.h +++ b/src/UnitTests/Containers/VectorTest-8.h @@ -72,7 +72,7 @@ TYPED_TEST( VectorTest, evaluateAndReduce ) // error #3049-D: The enclosing parent function ("TestBody") for an extended __host__ __device__ lambda cannot have private or protected access within its class template< typename VectorView > typename VectorView::RealType -performAddAndReduce( VectorView& u, VectorView& v, VectorView& w ) +performAddAndReduce1( VectorView& u, VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; @@ -81,6 +81,18 @@ performAddAndReduce( VectorView& u, VectorView& v, VectorView& w ) return addAndReduce( w, u * v, reduction, volatileReduction, ( RealType ) 0.0 ); } +template< typename VectorView > +typename VectorView::RealType +performAddAndReduce2( VectorView& v, VectorView& w ) +{ + using RealType = typename VectorView::RealType; + + auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; + return addAndReduce( w, 5.0 * v, reduction, volatileReduction, ( RealType ) 0.0 ); +} + + TYPED_TEST( VectorTest, addAndReduce ) { using VectorType = typename TestFixture::VectorType; @@ -101,9 +113,24 @@ TYPED_TEST( VectorTest, addAndReduce ) w.setElement( i, x ); aux += x * y; } - auto r = performAddAndReduce( u, v, w ); + auto r = performAddAndReduce1( u, v, w ); EXPECT_TRUE( w == u * v + u ); EXPECT_NEAR( aux, r, 1.0e-5 ); + + aux = 0.0; + for( int i = 0; i < size; i++ ) + { + const RealType x = i; + const RealType y = size / 2 - i; + u.setElement( i, x ); + v.setElement( i, y ); + w.setElement( i, x ); + aux += 5.0 * y; + } + r = performAddAndReduce2( v, w ); + EXPECT_TRUE( w == 5.0 * v + u ); + EXPECT_NEAR( aux, r, 1.0e-5 ); + } #endif // HAVE_GTEST -- GitLab