Skip to content
Snippets Groups Projects
Commit 7398085f authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Rewritting the Merson solver using expression templates.

parent 5f87aba3
No related branches found
No related tags found
1 merge request!34Runge kutta
...@@ -65,7 +65,7 @@ class Merson : public ExplicitSolver< Problem > ...@@ -65,7 +65,7 @@ class Merson : public ExplicitSolver< Problem >
void writeGrids( const DofVectorPointer& u ); void writeGrids( const DofVectorPointer& u );
DofVectorPointer k1, k2, k3, k4, k5, kAux; DofVectorPointer _k1, _k2, _k3, _k4, _k5, _kAux;
/**** /****
* This controls the accuracy of the solver * This controls the accuracy of the solver
......
...@@ -128,7 +128,7 @@ void Merson< Problem > :: setAdaptivity( const RealType& a ) ...@@ -128,7 +128,7 @@ void Merson< Problem > :: setAdaptivity( const RealType& a )
}; };
template< typename Problem > template< typename Problem >
bool Merson< Problem > :: solve( DofVectorPointer& u ) bool Merson< Problem >::solve( DofVectorPointer& _u )
{ {
if( ! this->problem ) if( ! this->problem )
{ {
...@@ -143,19 +143,25 @@ bool Merson< Problem > :: solve( DofVectorPointer& u ) ...@@ -143,19 +143,25 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
/**** /****
* First setup the supporting meshes k1...k5 and kAux. * First setup the supporting meshes k1...k5 and kAux.
*/ */
k1->setLike( *u ); _k1->setLike( *_u );
k2->setLike( *u ); _k2->setLike( *_u );
k3->setLike( *u ); _k3->setLike( *_u );
k4->setLike( *u ); _k4->setLike( *_u );
k5->setLike( *u ); _k5->setLike( *_u );
kAux->setLike( *u ); _kAux->setLike( *_u );
k1->setValue( 0.0 ); auto k1 = _k1->getView();
k2->setValue( 0.0 ); auto k2 = _k2->getView();
k3->setValue( 0.0 ); auto k3 = _k3->getView();
k4->setValue( 0.0 ); auto k4 = _k4->getView();
k5->setValue( 0.0 ); auto k5 = _k5->getView();
kAux->setValue( 0.0 ); auto kAux = _kAux->getView();
auto u = _u->getView();
k1 = 0.0;
k2 = 0.0;
k3 = 0.0;
k4 = 0.0;
k5 = 0.0;
kAux = 0.0;
/**** /****
* Set necessary parameters * Set necessary parameters
...@@ -170,37 +176,69 @@ bool Merson< Problem > :: solve( DofVectorPointer& u ) ...@@ -170,37 +176,69 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
this->refreshSolverMonitor(); this->refreshSolverMonitor();
/**** /////
* Start the main loop // Start the main loop
*/
while( this->checkNextIteration() ) while( this->checkNextIteration() )
{ {
/**** /////
* Compute Runge-Kutta coefficients // Compute Runge-Kutta coefficients
*/ RealType tau_3 = currentTau / 3.0;
computeKFunctions( u, time, currentTau );
/////
// k1
this->problem->getExplicitUpdate( time, currentTau, _u, _k1 );
/////
// k2
kAux = u + currentTau * ( 1.0 / 3.0 * k1 );
this->problem->applyBoundaryConditions( time + tau_3, _kAux );
this->problem->getExplicitUpdate( time + tau_3, currentTau, _kAux, _k2 );
/////
// k3
kAux = u + currentTau * 1.0 / 6.0 * ( k1 + k2 );
this->problem->applyBoundaryConditions( time + tau_3, _kAux );
this->problem->getExplicitUpdate( time + tau_3, currentTau, _kAux, _k3 );
/////
// k4
kAux = u + currentTau * ( 0.125 * k1 + 0.375 * k3 );
this->problem->applyBoundaryConditions( time + 0.5 * currentTau, _kAux );
this->problem->getExplicitUpdate( time + 0.5 * currentTau, currentTau, _kAux, _k4 );
/////
// k5
kAux = u + currentTau * ( 0.5 * k1 - 1.5 * k3 + 2.0 * k4 );
this->problem->applyBoundaryConditions( time + currentTau, _kAux );
this->problem->getExplicitUpdate( time + currentTau, currentTau, _kAux, _k5 );
if( this->testingMode ) if( this->testingMode )
writeGrids( u ); writeGrids( _u );
/**** /////
* Compute an error of the approximation. // Compute an error of the approximation.
*/ RealType error( 0.0 );
RealType eps( 0.0 );
if( adaptivity != 0.0 ) if( adaptivity != 0.0 )
eps = computeError( currentTau ); {
const RealType localError =
max( currentTau / 3.0 * abs( 0.2 * k1 -0.9 * k3 + 0.8 * k4 -0.1 * k5 ) );
Problem::CommunicatorType::Allreduce( &localError, &error, 1, MPI_MAX, Problem::CommunicatorType::AllGroup );
}
if( adaptivity == 0.0 || eps < adaptivity ) if( adaptivity == 0.0 || error < adaptivity )
{ {
RealType lastResidue = this->getResidue(); RealType lastResidue = this->getResidue();
RealType newResidue( 0.0 ); RealType newResidue( 0.0 );
time += currentTau; time += currentTau;
computeNewTimeLevel( time, currentTau, u, newResidue );
this->setResidue( newResidue ); auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; };
auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; };
/**** this->setResidue( addAndReduceAbs( u, currentTau / 6.0 * ( k1 + 4.0 * k4 + k5 ),
* When time is close to stopTime the new residue reduction, volatileReduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) );
* may be inaccurate significantly.
*/ /////
// When time is close to stopTime the new residue
// may be inaccurate significantly.
if( abs( time - this->stopTime ) < 1.0e-7 ) this->setResidue( lastResidue ); if( abs( time - this->stopTime ) < 1.0e-7 ) this->setResidue( lastResidue );
...@@ -209,12 +247,11 @@ bool Merson< Problem > :: solve( DofVectorPointer& u ) ...@@ -209,12 +247,11 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
} }
this->refreshSolverMonitor(); this->refreshSolverMonitor();
/**** /////
* Compute the new time step. // Compute the new time step.
*/ if( adaptivity != 0.0 && error != 0.0 )
if( adaptivity != 0.0 && eps != 0.0 )
{ {
currentTau *= 0.8 * ::pow( adaptivity / eps, 0.2 ); currentTau *= 0.8 * ::pow( adaptivity / error, 0.2 );
currentTau = min( currentTau, this->getMaxTau() ); currentTau = min( currentTau, this->getMaxTau() );
#ifdef USE_MPI #ifdef USE_MPI
TNLMPI::Bcast( currentTau, 1, 0 ); TNLMPI::Bcast( currentTau, 1, 0 );
...@@ -225,11 +262,8 @@ bool Merson< Problem > :: solve( DofVectorPointer& u ) ...@@ -225,11 +262,8 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
else this->tau = currentTau; else this->tau = currentTau;
/**** /////
* Check stop conditions. // Check stop conditions.
*/
//cerr << "residue = " << residue << std::endl;
//cerr << "this->getConvergenceResidue() = " << this->getConvergenceResidue() << std::endl;
if( time >= this->getStopTime() || if( time >= this->getStopTime() ||
( this->getConvergenceResidue() != 0.0 && this->getResidue() < this->getConvergenceResidue() ) ) ( this->getConvergenceResidue() != 0.0 && this->getResidue() < this->getConvergenceResidue() ) )
{ {
...@@ -243,126 +277,57 @@ bool Merson< Problem > :: solve( DofVectorPointer& u ) ...@@ -243,126 +277,57 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
}; };
template< typename Problem > template< typename Problem >
void Merson< Problem >::computeKFunctions( DofVectorPointer& u, void Merson< Problem >::computeKFunctions( DofVectorPointer& _u,
const RealType& time, const RealType& time,
RealType tau ) RealType tau )
{ {
IndexType size = u->getSize(); auto k1 = _k1->getView();
auto k2 = _k2->getView();
RealType* _k1 = k1->getData(); auto k3 = _k3->getView();
RealType* _k2 = k2->getData(); auto k4 = _k4->getView();
RealType* _k3 = k3->getData(); auto k5 = _k5->getView();
RealType* _k4 = k4->getData(); auto kAux = _kAux->getView();
RealType* _k5 = k5->getData(); auto u = _u->getView();
RealType* _kAux = kAux->getData();
RealType* _u = u->getData();
RealType tau_3 = tau / 3.0; RealType tau_3 = tau / 3.0;
/////
if( std::is_same< DeviceType, Devices::Host >::value ) // k1
{ this->problem->getExplicitUpdate( time, tau, _u, _k1 );
this->problem->getExplicitUpdate( time, tau, u, k1 );
/////
#ifdef HAVE_OPENMP // k2
#pragma omp parallel for firstprivate( size, _kAux, _u, _k1, tau, tau_3 ) if( Devices::Host::isOMPEnabled() ) kAux = u + tau * ( 1.0 / 3.0 * k1 );
#endif this->problem->applyBoundaryConditions( time + tau_3, _kAux );
for( IndexType i = 0; i < size; i ++ ) this->problem->getExplicitUpdate( time + tau_3, tau, _kAux, _k2 );
_kAux[ i ] = _u[ i ] + tau * ( 1.0 / 3.0 * _k1[ i ] );
this->problem->applyBoundaryConditions( time + tau_3, kAux ); /////
this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k2 ); // k3
kAux = u + tau * 1.0 / 6.0 * ( k1 + k2 );
#ifdef HAVE_OPENMP this->problem->applyBoundaryConditions( time + tau_3, _kAux );
#pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k2, tau, tau_3 ) if( Devices::Host::isOMPEnabled() ) this->problem->getExplicitUpdate( time + tau_3, tau, _kAux, _k3 );
#endif
for( IndexType i = 0; i < size; i ++ ) /////
_kAux[ i ] = _u[ i ] + tau * 1.0 / 6.0 * ( _k1[ i ] + _k2[ i ] ); // k4
this->problem->applyBoundaryConditions( time + tau_3, kAux ); kAux = u + tau * ( 0.125 * k1 + 0.375 * k3 );
this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k3 ); this->problem->applyBoundaryConditions( time + 0.5 * tau, _kAux );
this->problem->getExplicitUpdate( time + 0.5 * tau, tau, _kAux, _k4 );
#ifdef HAVE_OPENMP
#pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k3, tau, tau_3 ) if( Devices::Host::isOMPEnabled() ) /////
#endif // k5
for( IndexType i = 0; i < size; i ++ ) kAux = u + tau * ( 0.5 * k1 - 1.5 * k3 + 2.0 * k4 );
_kAux[ i ] = _u[ i ] + tau * ( 0.125 * _k1[ i ] + 0.375 * _k3[ i ] ); this->problem->applyBoundaryConditions( time + tau, _kAux );
this->problem->applyBoundaryConditions( time + 0.5 * tau, kAux ); this->problem->getExplicitUpdate( time + tau, tau, _kAux, _k5 );
this->problem->getExplicitUpdate( time + 0.5 * tau, tau, kAux, k4 );
#ifdef HAVE_OPENMP
#pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k3, _k4, tau, tau_3 ) if( Devices::Host::isOMPEnabled() )
#endif
for( IndexType i = 0; i < size; i ++ )
_kAux[ i ] = _u[ i ] + tau * ( 0.5 * _k1[ i ] - 1.5 * _k3[ i ] + 2.0 * _k4[ i ] );
this->problem->applyBoundaryConditions( time + tau, kAux );
this->problem->getExplicitUpdate( time + tau, tau, kAux, k5 );
}
if( std::is_same< DeviceType, Devices::Cuda >::value )
{
#ifdef HAVE_CUDA
dim3 cudaBlockSize( 512 );
const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks );
this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) );
const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
this->problem->getExplicitUpdate( time, tau, u, k1 );
cudaDeviceSynchronize();
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
{
const IndexType gridOffset = gridIdx * threadsPerGrid;
const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
computeK2Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_kAux[ gridOffset ] );
}
cudaDeviceSynchronize();
this->problem->applyBoundaryConditions( time + tau_3, kAux );
this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k2 );
cudaDeviceSynchronize();
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
{
const IndexType gridOffset = gridIdx * threadsPerGrid;
const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
computeK3Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k2[ gridOffset ], &_kAux[ gridOffset ] );
}
cudaDeviceSynchronize();
this->problem->applyBoundaryConditions( time + tau_3, kAux );
this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k3 );
cudaDeviceSynchronize();
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
{
const IndexType gridOffset = gridIdx * threadsPerGrid;
const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
computeK4Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_kAux[ gridOffset ] );
}
cudaDeviceSynchronize();
this->problem->applyBoundaryConditions( time + 0.5 * tau, kAux );
this->problem->getExplicitUpdate( time + 0.5 * tau, tau, kAux, k4 );
cudaDeviceSynchronize();
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
{
const IndexType gridOffset = gridIdx * threadsPerGrid;
const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
computeK5Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_k4[ gridOffset ], &_kAux[ gridOffset ] );
}
cudaDeviceSynchronize();
this->problem->applyBoundaryConditions( time + tau, kAux );
this->problem->getExplicitUpdate( time + tau, tau, kAux, k5 );
cudaDeviceSynchronize();
#endif
}
} }
template< typename Problem > /*template< typename Problem >
typename Problem :: RealType Merson< Problem > :: computeError( const RealType tau ) typename Problem :: RealType Merson< Problem > :: computeError( const RealType tau )
{ {
const IndexType size = k1->getSize(); const IndexType size = _k1->getSize();
const RealType* _k1 = k1->getData(); const RealType* k1 = _k1->getData();
const RealType* _k3 = k3->getData(); const RealType* k3 = _k3->getData();
const RealType* _k4 = k4->getData(); const RealType* k4 = _k4->getData();
const RealType* _k5 = k5->getData(); const RealType* k5 = _k5->getData();
RealType* _kAux = kAux->getData(); RealType* kAux = _kAux->getData();
RealType eps( 0.0 ), maxEps( 0.0 ); RealType eps( 0.0 ), maxEps( 0.0 );
if( std::is_same< DeviceType, Devices::Host >::value ) if( std::is_same< DeviceType, Devices::Host >::value )
...@@ -379,10 +344,10 @@ typename Problem :: RealType Merson< Problem > :: computeError( const RealType t ...@@ -379,10 +344,10 @@ typename Problem :: RealType Merson< Problem > :: computeError( const RealType t
for( IndexType i = 0; i < size; i ++ ) for( IndexType i = 0; i < size; i ++ )
{ {
RealType err = ( RealType ) ( tau / 3.0 * RealType err = ( RealType ) ( tau / 3.0 *
abs( 0.2 * _k1[ i ] + abs( 0.2 * k1[ i ] +
-0.9 * _k3[ i ] + -0.9 * k3[ i ] +
0.8 * _k4[ i ] + 0.8 * k4[ i ] +
-0.1 * _k5[ i ] ) ); -0.1 * k5[ i ] ) );
localEps = max( localEps, err ); localEps = max( localEps, err );
} }
this->openMPErrorEstimateBuffer[ Devices::Host::getThreadIdx() ] = localEps; this->openMPErrorEstimateBuffer[ Devices::Host::getThreadIdx() ] = localEps;
...@@ -404,13 +369,13 @@ typename Problem :: RealType Merson< Problem > :: computeError( const RealType t ...@@ -404,13 +369,13 @@ typename Problem :: RealType Merson< Problem > :: computeError( const RealType t
const IndexType currentSize = min( size - gridOffset, threadsPerGrid ); const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
computeErrorKernel<<< cudaBlocks, cudaBlockSize >>>( currentSize, computeErrorKernel<<< cudaBlocks, cudaBlockSize >>>( currentSize,
tau, tau,
&_k1[ gridOffset ], &k1[ gridOffset ],
&_k3[ gridOffset ], &k3[ gridOffset ],
&_k4[ gridOffset ], &k4[ gridOffset ],
&_k5[ gridOffset ], &k5[ gridOffset ],
&_kAux[ gridOffset ] ); &kAux[ gridOffset ] );
cudaDeviceSynchronize(); cudaDeviceSynchronize();
eps = std::max( eps, kAux->max() ); eps = std::max( eps, _kAux->max() );
} }
#endif #endif
} }
...@@ -425,20 +390,20 @@ void Merson< Problem >::computeNewTimeLevel( const RealType time, ...@@ -425,20 +390,20 @@ void Merson< Problem >::computeNewTimeLevel( const RealType time,
RealType& currentResidue ) RealType& currentResidue )
{ {
RealType localResidue = RealType( 0.0 ); RealType localResidue = RealType( 0.0 );
IndexType size = k1->getSize(); IndexType size = _k1->getSize();
RealType* _u = u->getData(); RealType* _u = u->getData();
RealType* _k1 = k1->getData(); RealType* k1 = _k1->getData();
RealType* _k4 = k4->getData(); RealType* k4 = _k4->getData();
RealType* _k5 = k5->getData(); RealType* k5 = _k5->getData();
if( std::is_same< DeviceType, Devices::Host >::value ) if( std::is_same< DeviceType, Devices::Host >::value )
{ {
#ifdef HAVE_OPENMP #ifdef HAVE_OPENMP
#pragma omp parallel for reduction(+:localResidue) firstprivate( size, _u, _k1, _k4, _k5, tau ) if( Devices::Host::isOMPEnabled() ) #pragma omp parallel for reduction(+:localResidue) firstprivate( size, _u, k1, k4, k5, tau ) if( Devices::Host::isOMPEnabled() )
#endif #endif
for( IndexType i = 0; i < size; i ++ ) for( IndexType i = 0; i < size; i ++ )
{ {
const RealType add = tau / 6.0 * ( _k1[ i ] + 4.0 * _k4[ i ] + _k5[ i ] ); const RealType add = tau / 6.0 * ( k1[ i ] + 4.0 * k4[ i ] + k5[ i ] );
_u[ i ] += add; _u[ i ] += add;
localResidue += abs( ( RealType ) add ); localResidue += abs( ( RealType ) add );
} }
...@@ -462,9 +427,9 @@ void Merson< Problem >::computeNewTimeLevel( const RealType time, ...@@ -462,9 +427,9 @@ void Merson< Problem >::computeNewTimeLevel( const RealType time,
updateUMerson<<< cudaBlocks, cudaBlockSize, sharedMemory >>>( currentSize, updateUMerson<<< cudaBlocks, cudaBlockSize, sharedMemory >>>( currentSize,
tau, tau,
&_k1[ gridOffset ], &k1[ gridOffset ],
&_k4[ gridOffset ], &k4[ gridOffset ],
&_k5[ gridOffset ], &k5[ gridOffset ],
&_u[ gridOffset ], &_u[ gridOffset ],
this->cudaBlockResidue.getData() ); this->cudaBlockResidue.getData() );
localResidue += this->cudaBlockResidue.sum(); localResidue += this->cudaBlockResidue.sum();
...@@ -490,17 +455,17 @@ void Merson< Problem >::writeGrids( const DofVectorPointer& u ) ...@@ -490,17 +455,17 @@ void Merson< Problem >::writeGrids( const DofVectorPointer& u )
{ {
std::cout << "Writing Merson solver grids ..."; std::cout << "Writing Merson solver grids ...";
File( "Merson-u.tnl", std::ios_base::out ) << *u; File( "Merson-u.tnl", std::ios_base::out ) << *u;
File( "Merson-k1.tnl", std::ios_base::out ) << *k1; File( "Merson-k1.tnl", std::ios_base::out ) << *_k1;
File( "Merson-k2.tnl", std::ios_base::out ) << *k2; File( "Merson-k2.tnl", std::ios_base::out ) << *_k2;
File( "Merson-k3.tnl", std::ios_base::out ) << *k3; File( "Merson-k3.tnl", std::ios_base::out ) << *_k3;
File( "Merson-k4.tnl", std::ios_base::out ) << *k4; File( "Merson-k4.tnl", std::ios_base::out ) << *_k4;
File( "Merson-k5.tnl", std::ios_base::out ) << *k5; File( "Merson-k5.tnl", std::ios_base::out ) << *_k5;
std::cout << " done. PRESS A KEY." << std::endl; std::cout << " done. PRESS A KEY." << std::endl;
getchar(); getchar();
} }
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
/*
template< typename RealType, typename Index > template< typename RealType, typename Index >
__global__ void computeK2Arg( const Index size, __global__ void computeK2Arg( const Index size,
const RealType tau, const RealType tau,
...@@ -552,6 +517,7 @@ __global__ void computeK5Arg( const Index size, ...@@ -552,6 +517,7 @@ __global__ void computeK5Arg( const Index size,
if( i < size ) if( i < size )
k5_arg[ i ] = u[ i ] + tau * ( 0.5 * k1[ i ] - 1.5 * k3[ i ] + 2.0 * k4[ i ] ); k5_arg[ i ] = u[ i ] + tau * ( 0.5 * k1[ i ] - 1.5 * k3[ i ] + 2.0 * k4[ i ] );
} }
*/
template< typename RealType, typename Index > template< typename RealType, typename Index >
__global__ void computeErrorKernel( const Index size, __global__ void computeErrorKernel( const Index size,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment