From 7398085f0c5aae3ce3e76ce931ba0ac3562650c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com> Date: Thu, 27 Jun 2019 17:29:06 +0200 Subject: [PATCH] Rewritting the Merson solver using expression templates. --- src/TNL/Solvers/ODE/Merson.h | 2 +- src/TNL/Solvers/ODE/Merson_impl.h | 328 +++++++++++++----------------- 2 files changed, 148 insertions(+), 182 deletions(-) diff --git a/src/TNL/Solvers/ODE/Merson.h b/src/TNL/Solvers/ODE/Merson.h index d39cf169fa..d9a69bceb0 100644 --- a/src/TNL/Solvers/ODE/Merson.h +++ b/src/TNL/Solvers/ODE/Merson.h @@ -65,7 +65,7 @@ class Merson : public ExplicitSolver< Problem > void writeGrids( const DofVectorPointer& u ); - DofVectorPointer k1, k2, k3, k4, k5, kAux; + DofVectorPointer _k1, _k2, _k3, _k4, _k5, _kAux; /**** * This controls the accuracy of the solver diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h index 0b73f234d5..c3aab56b63 100644 --- a/src/TNL/Solvers/ODE/Merson_impl.h +++ b/src/TNL/Solvers/ODE/Merson_impl.h @@ -128,7 +128,7 @@ void Merson< Problem > :: setAdaptivity( const RealType& a ) }; template< typename Problem > -bool Merson< Problem > :: solve( DofVectorPointer& u ) +bool Merson< Problem >::solve( DofVectorPointer& _u ) { if( ! this->problem ) { @@ -143,19 +143,25 @@ bool Merson< Problem > :: solve( DofVectorPointer& u ) /**** * First setup the supporting meshes k1...k5 and kAux. */ - k1->setLike( *u ); - k2->setLike( *u ); - k3->setLike( *u ); - k4->setLike( *u ); - k5->setLike( *u ); - kAux->setLike( *u ); - k1->setValue( 0.0 ); - k2->setValue( 0.0 ); - k3->setValue( 0.0 ); - k4->setValue( 0.0 ); - k5->setValue( 0.0 ); - kAux->setValue( 0.0 ); - + _k1->setLike( *_u ); + _k2->setLike( *_u ); + _k3->setLike( *_u ); + _k4->setLike( *_u ); + _k5->setLike( *_u ); + _kAux->setLike( *_u ); + auto k1 = _k1->getView(); + auto k2 = _k2->getView(); + auto k3 = _k3->getView(); + auto k4 = _k4->getView(); + auto k5 = _k5->getView(); + auto kAux = _kAux->getView(); + auto u = _u->getView(); + k1 = 0.0; + k2 = 0.0; + k3 = 0.0; + k4 = 0.0; + k5 = 0.0; + kAux = 0.0; /**** * Set necessary parameters @@ -170,37 +176,69 @@ bool Merson< Problem > :: solve( DofVectorPointer& u ) this->refreshSolverMonitor(); - /**** - * Start the main loop - */ + ///// + // Start the main loop while( this->checkNextIteration() ) { - /**** - * Compute Runge-Kutta coefficients - */ - computeKFunctions( u, time, currentTau ); + ///// + // Compute Runge-Kutta coefficients + RealType tau_3 = currentTau / 3.0; + + ///// + // k1 + this->problem->getExplicitUpdate( time, currentTau, _u, _k1 ); + + ///// + // k2 + kAux = u + currentTau * ( 1.0 / 3.0 * k1 ); + this->problem->applyBoundaryConditions( time + tau_3, _kAux ); + this->problem->getExplicitUpdate( time + tau_3, currentTau, _kAux, _k2 ); + + ///// + // k3 + kAux = u + currentTau * 1.0 / 6.0 * ( k1 + k2 ); + this->problem->applyBoundaryConditions( time + tau_3, _kAux ); + this->problem->getExplicitUpdate( time + tau_3, currentTau, _kAux, _k3 ); + + ///// + // k4 + kAux = u + currentTau * ( 0.125 * k1 + 0.375 * k3 ); + this->problem->applyBoundaryConditions( time + 0.5 * currentTau, _kAux ); + this->problem->getExplicitUpdate( time + 0.5 * currentTau, currentTau, _kAux, _k4 ); + + ///// + // k5 + kAux = u + currentTau * ( 0.5 * k1 - 1.5 * k3 + 2.0 * k4 ); + this->problem->applyBoundaryConditions( time + currentTau, _kAux ); + this->problem->getExplicitUpdate( time + currentTau, currentTau, _kAux, _k5 ); + if( this->testingMode ) - writeGrids( u ); + writeGrids( _u ); - /**** - * Compute an error of the approximation. - */ - RealType eps( 0.0 ); + ///// + // Compute an error of the approximation. + RealType error( 0.0 ); if( adaptivity != 0.0 ) - eps = computeError( currentTau ); + { + const RealType localError = + max( currentTau / 3.0 * abs( 0.2 * k1 -0.9 * k3 + 0.8 * k4 -0.1 * k5 ) ); + Problem::CommunicatorType::Allreduce( &localError, &error, 1, MPI_MAX, Problem::CommunicatorType::AllGroup ); + } - if( adaptivity == 0.0 || eps < adaptivity ) + if( adaptivity == 0.0 || error < adaptivity ) { RealType lastResidue = this->getResidue(); RealType newResidue( 0.0 ); time += currentTau; - computeNewTimeLevel( time, currentTau, u, newResidue ); - this->setResidue( newResidue ); - - /**** - * When time is close to stopTime the new residue - * may be inaccurate significantly. - */ + + auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; }; + auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; }; + this->setResidue( addAndReduceAbs( u, currentTau / 6.0 * ( k1 + 4.0 * k4 + k5 ), + reduction, volatileReduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); + + ///// + // When time is close to stopTime the new residue + // may be inaccurate significantly. if( abs( time - this->stopTime ) < 1.0e-7 ) this->setResidue( lastResidue ); @@ -209,12 +247,11 @@ bool Merson< Problem > :: solve( DofVectorPointer& u ) } this->refreshSolverMonitor(); - /**** - * Compute the new time step. - */ - if( adaptivity != 0.0 && eps != 0.0 ) + ///// + // Compute the new time step. + if( adaptivity != 0.0 && error != 0.0 ) { - currentTau *= 0.8 * ::pow( adaptivity / eps, 0.2 ); + currentTau *= 0.8 * ::pow( adaptivity / error, 0.2 ); currentTau = min( currentTau, this->getMaxTau() ); #ifdef USE_MPI TNLMPI::Bcast( currentTau, 1, 0 ); @@ -225,11 +262,8 @@ bool Merson< Problem > :: solve( DofVectorPointer& u ) else this->tau = currentTau; - /**** - * Check stop conditions. - */ - //cerr << "residue = " << residue << std::endl; - //cerr << "this->getConvergenceResidue() = " << this->getConvergenceResidue() << std::endl; + ///// + // Check stop conditions. if( time >= this->getStopTime() || ( this->getConvergenceResidue() != 0.0 && this->getResidue() < this->getConvergenceResidue() ) ) { @@ -243,126 +277,57 @@ bool Merson< Problem > :: solve( DofVectorPointer& u ) }; template< typename Problem > -void Merson< Problem >::computeKFunctions( DofVectorPointer& u, +void Merson< Problem >::computeKFunctions( DofVectorPointer& _u, const RealType& time, RealType tau ) { - IndexType size = u->getSize(); - - RealType* _k1 = k1->getData(); - RealType* _k2 = k2->getData(); - RealType* _k3 = k3->getData(); - RealType* _k4 = k4->getData(); - RealType* _k5 = k5->getData(); - RealType* _kAux = kAux->getData(); - RealType* _u = u->getData(); + auto k1 = _k1->getView(); + auto k2 = _k2->getView(); + auto k3 = _k3->getView(); + auto k4 = _k4->getView(); + auto k5 = _k5->getView(); + auto kAux = _kAux->getView(); + auto u = _u->getView(); RealType tau_3 = tau / 3.0; - - if( std::is_same< DeviceType, Devices::Host >::value ) - { - this->problem->getExplicitUpdate( time, tau, u, k1 ); - - #ifdef HAVE_OPENMP - #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, tau, tau_3 ) if( Devices::Host::isOMPEnabled() ) - #endif - for( IndexType i = 0; i < size; i ++ ) - _kAux[ i ] = _u[ i ] + tau * ( 1.0 / 3.0 * _k1[ i ] ); - this->problem->applyBoundaryConditions( time + tau_3, kAux ); - this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k2 ); - - #ifdef HAVE_OPENMP - #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k2, tau, tau_3 ) if( Devices::Host::isOMPEnabled() ) - #endif - for( IndexType i = 0; i < size; i ++ ) - _kAux[ i ] = _u[ i ] + tau * 1.0 / 6.0 * ( _k1[ i ] + _k2[ i ] ); - this->problem->applyBoundaryConditions( time + tau_3, kAux ); - this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k3 ); - - #ifdef HAVE_OPENMP - #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k3, tau, tau_3 ) if( Devices::Host::isOMPEnabled() ) - #endif - for( IndexType i = 0; i < size; i ++ ) - _kAux[ i ] = _u[ i ] + tau * ( 0.125 * _k1[ i ] + 0.375 * _k3[ i ] ); - this->problem->applyBoundaryConditions( time + 0.5 * tau, kAux ); - this->problem->getExplicitUpdate( time + 0.5 * tau, tau, kAux, k4 ); - - #ifdef HAVE_OPENMP - #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k3, _k4, tau, tau_3 ) if( Devices::Host::isOMPEnabled() ) - #endif - for( IndexType i = 0; i < size; i ++ ) - _kAux[ i ] = _u[ i ] + tau * ( 0.5 * _k1[ i ] - 1.5 * _k3[ i ] + 2.0 * _k4[ i ] ); - this->problem->applyBoundaryConditions( time + tau, kAux ); - this->problem->getExplicitUpdate( time + tau, tau, kAux, k5 ); - } - if( std::is_same< DeviceType, Devices::Cuda >::value ) - { -#ifdef HAVE_CUDA - dim3 cudaBlockSize( 512 ); - const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x ); - const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks ); - this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) ); - const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x; - - this->problem->getExplicitUpdate( time, tau, u, k1 ); - cudaDeviceSynchronize(); - - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ ) - { - const IndexType gridOffset = gridIdx * threadsPerGrid; - const IndexType currentSize = min( size - gridOffset, threadsPerGrid ); - computeK2Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_kAux[ gridOffset ] ); - } - cudaDeviceSynchronize(); - this->problem->applyBoundaryConditions( time + tau_3, kAux ); - this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k2 ); - cudaDeviceSynchronize(); - - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ ) - { - const IndexType gridOffset = gridIdx * threadsPerGrid; - const IndexType currentSize = min( size - gridOffset, threadsPerGrid ); - computeK3Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k2[ gridOffset ], &_kAux[ gridOffset ] ); - } - cudaDeviceSynchronize(); - this->problem->applyBoundaryConditions( time + tau_3, kAux ); - this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k3 ); - cudaDeviceSynchronize(); - - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ ) - { - const IndexType gridOffset = gridIdx * threadsPerGrid; - const IndexType currentSize = min( size - gridOffset, threadsPerGrid ); - computeK4Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_kAux[ gridOffset ] ); - } - cudaDeviceSynchronize(); - this->problem->applyBoundaryConditions( time + 0.5 * tau, kAux ); - this->problem->getExplicitUpdate( time + 0.5 * tau, tau, kAux, k4 ); - cudaDeviceSynchronize(); - - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ ) - { - const IndexType gridOffset = gridIdx * threadsPerGrid; - const IndexType currentSize = min( size - gridOffset, threadsPerGrid ); - computeK5Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_k4[ gridOffset ], &_kAux[ gridOffset ] ); - } - cudaDeviceSynchronize(); - this->problem->applyBoundaryConditions( time + tau, kAux ); - this->problem->getExplicitUpdate( time + tau, tau, kAux, k5 ); - cudaDeviceSynchronize(); -#endif - } + ///// + // k1 + this->problem->getExplicitUpdate( time, tau, _u, _k1 ); + + ///// + // k2 + kAux = u + tau * ( 1.0 / 3.0 * k1 ); + this->problem->applyBoundaryConditions( time + tau_3, _kAux ); + this->problem->getExplicitUpdate( time + tau_3, tau, _kAux, _k2 ); + + ///// + // k3 + kAux = u + tau * 1.0 / 6.0 * ( k1 + k2 ); + this->problem->applyBoundaryConditions( time + tau_3, _kAux ); + this->problem->getExplicitUpdate( time + tau_3, tau, _kAux, _k3 ); + + ///// + // k4 + kAux = u + tau * ( 0.125 * k1 + 0.375 * k3 ); + this->problem->applyBoundaryConditions( time + 0.5 * tau, _kAux ); + this->problem->getExplicitUpdate( time + 0.5 * tau, tau, _kAux, _k4 ); + + ///// + // k5 + kAux = u + tau * ( 0.5 * k1 - 1.5 * k3 + 2.0 * k4 ); + this->problem->applyBoundaryConditions( time + tau, _kAux ); + this->problem->getExplicitUpdate( time + tau, tau, _kAux, _k5 ); } -template< typename Problem > +/*template< typename Problem > typename Problem :: RealType Merson< Problem > :: computeError( const RealType tau ) { - const IndexType size = k1->getSize(); - const RealType* _k1 = k1->getData(); - const RealType* _k3 = k3->getData(); - const RealType* _k4 = k4->getData(); - const RealType* _k5 = k5->getData(); - RealType* _kAux = kAux->getData(); + const IndexType size = _k1->getSize(); + const RealType* k1 = _k1->getData(); + const RealType* k3 = _k3->getData(); + const RealType* k4 = _k4->getData(); + const RealType* k5 = _k5->getData(); + RealType* kAux = _kAux->getData(); RealType eps( 0.0 ), maxEps( 0.0 ); if( std::is_same< DeviceType, Devices::Host >::value ) @@ -379,10 +344,10 @@ typename Problem :: RealType Merson< Problem > :: computeError( const RealType t for( IndexType i = 0; i < size; i ++ ) { RealType err = ( RealType ) ( tau / 3.0 * - abs( 0.2 * _k1[ i ] + - -0.9 * _k3[ i ] + - 0.8 * _k4[ i ] + - -0.1 * _k5[ i ] ) ); + abs( 0.2 * k1[ i ] + + -0.9 * k3[ i ] + + 0.8 * k4[ i ] + + -0.1 * k5[ i ] ) ); localEps = max( localEps, err ); } this->openMPErrorEstimateBuffer[ Devices::Host::getThreadIdx() ] = localEps; @@ -404,13 +369,13 @@ typename Problem :: RealType Merson< Problem > :: computeError( const RealType t const IndexType currentSize = min( size - gridOffset, threadsPerGrid ); computeErrorKernel<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, - &_k1[ gridOffset ], - &_k3[ gridOffset ], - &_k4[ gridOffset ], - &_k5[ gridOffset ], - &_kAux[ gridOffset ] ); + &k1[ gridOffset ], + &k3[ gridOffset ], + &k4[ gridOffset ], + &k5[ gridOffset ], + &kAux[ gridOffset ] ); cudaDeviceSynchronize(); - eps = std::max( eps, kAux->max() ); + eps = std::max( eps, _kAux->max() ); } #endif } @@ -425,20 +390,20 @@ void Merson< Problem >::computeNewTimeLevel( const RealType time, RealType& currentResidue ) { RealType localResidue = RealType( 0.0 ); - IndexType size = k1->getSize(); + IndexType size = _k1->getSize(); RealType* _u = u->getData(); - RealType* _k1 = k1->getData(); - RealType* _k4 = k4->getData(); - RealType* _k5 = k5->getData(); + RealType* k1 = _k1->getData(); + RealType* k4 = _k4->getData(); + RealType* k5 = _k5->getData(); if( std::is_same< DeviceType, Devices::Host >::value ) { #ifdef HAVE_OPENMP -#pragma omp parallel for reduction(+:localResidue) firstprivate( size, _u, _k1, _k4, _k5, tau ) if( Devices::Host::isOMPEnabled() ) +#pragma omp parallel for reduction(+:localResidue) firstprivate( size, _u, k1, k4, k5, tau ) if( Devices::Host::isOMPEnabled() ) #endif for( IndexType i = 0; i < size; i ++ ) { - const RealType add = tau / 6.0 * ( _k1[ i ] + 4.0 * _k4[ i ] + _k5[ i ] ); + const RealType add = tau / 6.0 * ( k1[ i ] + 4.0 * k4[ i ] + k5[ i ] ); _u[ i ] += add; localResidue += abs( ( RealType ) add ); } @@ -462,9 +427,9 @@ void Merson< Problem >::computeNewTimeLevel( const RealType time, updateUMerson<<< cudaBlocks, cudaBlockSize, sharedMemory >>>( currentSize, tau, - &_k1[ gridOffset ], - &_k4[ gridOffset ], - &_k5[ gridOffset ], + &k1[ gridOffset ], + &k4[ gridOffset ], + &k5[ gridOffset ], &_u[ gridOffset ], this->cudaBlockResidue.getData() ); localResidue += this->cudaBlockResidue.sum(); @@ -490,17 +455,17 @@ void Merson< Problem >::writeGrids( const DofVectorPointer& u ) { std::cout << "Writing Merson solver grids ..."; File( "Merson-u.tnl", std::ios_base::out ) << *u; - File( "Merson-k1.tnl", std::ios_base::out ) << *k1; - File( "Merson-k2.tnl", std::ios_base::out ) << *k2; - File( "Merson-k3.tnl", std::ios_base::out ) << *k3; - File( "Merson-k4.tnl", std::ios_base::out ) << *k4; - File( "Merson-k5.tnl", std::ios_base::out ) << *k5; + File( "Merson-k1.tnl", std::ios_base::out ) << *_k1; + File( "Merson-k2.tnl", std::ios_base::out ) << *_k2; + File( "Merson-k3.tnl", std::ios_base::out ) << *_k3; + File( "Merson-k4.tnl", std::ios_base::out ) << *_k4; + File( "Merson-k5.tnl", std::ios_base::out ) << *_k5; std::cout << " done. PRESS A KEY." << std::endl; getchar(); } #ifdef HAVE_CUDA - +/* template< typename RealType, typename Index > __global__ void computeK2Arg( const Index size, const RealType tau, @@ -552,6 +517,7 @@ __global__ void computeK5Arg( const Index size, if( i < size ) k5_arg[ i ] = u[ i ] + tau * ( 0.5 * k1[ i ] - 1.5 * k3[ i ] + 2.0 * k4[ i ] ); } +*/ template< typename RealType, typename Index > __global__ void computeErrorKernel( const Index size, -- GitLab