From 7398085f0c5aae3ce3e76ce931ba0ac3562650c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 27 Jun 2019 17:29:06 +0200
Subject: [PATCH] Rewritting the Merson solver using expression templates.

---
 src/TNL/Solvers/ODE/Merson.h      |   2 +-
 src/TNL/Solvers/ODE/Merson_impl.h | 328 +++++++++++++-----------------
 2 files changed, 148 insertions(+), 182 deletions(-)

diff --git a/src/TNL/Solvers/ODE/Merson.h b/src/TNL/Solvers/ODE/Merson.h
index d39cf169fa..d9a69bceb0 100644
--- a/src/TNL/Solvers/ODE/Merson.h
+++ b/src/TNL/Solvers/ODE/Merson.h
@@ -65,7 +65,7 @@ class Merson : public ExplicitSolver< Problem >
 
    void writeGrids( const DofVectorPointer& u );
 
-   DofVectorPointer k1, k2, k3, k4, k5, kAux;
+   DofVectorPointer _k1, _k2, _k3, _k4, _k5, _kAux;
 
    /****
     * This controls the accuracy of the solver
diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h
index 0b73f234d5..c3aab56b63 100644
--- a/src/TNL/Solvers/ODE/Merson_impl.h
+++ b/src/TNL/Solvers/ODE/Merson_impl.h
@@ -128,7 +128,7 @@ void Merson< Problem > :: setAdaptivity( const RealType& a )
 };
 
 template< typename Problem >
-bool Merson< Problem > :: solve( DofVectorPointer& u )
+bool Merson< Problem >::solve( DofVectorPointer& _u )
 {
    if( ! this->problem )
    {
@@ -143,19 +143,25 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
    /****
     * First setup the supporting meshes k1...k5 and kAux.
     */
-   k1->setLike( *u );
-   k2->setLike( *u );
-   k3->setLike( *u );
-   k4->setLike( *u );
-   k5->setLike( *u );
-   kAux->setLike( *u );
-   k1->setValue( 0.0 );
-   k2->setValue( 0.0 );
-   k3->setValue( 0.0 );
-   k4->setValue( 0.0 );
-   k5->setValue( 0.0 );
-   kAux->setValue( 0.0 );
-
+   _k1->setLike( *_u );
+   _k2->setLike( *_u );
+   _k3->setLike( *_u );
+   _k4->setLike( *_u );
+   _k5->setLike( *_u );
+   _kAux->setLike( *_u );
+   auto k1 = _k1->getView();
+   auto k2 = _k2->getView();
+   auto k3 = _k3->getView();
+   auto k4 = _k4->getView();
+   auto k5 = _k5->getView();
+   auto kAux = _kAux->getView();
+   auto u = _u->getView();
+   k1 = 0.0;
+   k2 = 0.0;
+   k3 = 0.0;
+   k4 = 0.0;
+   k5 = 0.0;
+   kAux = 0.0;
 
    /****
     * Set necessary parameters
@@ -170,37 +176,69 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
 
    this->refreshSolverMonitor();
 
-   /****
-    * Start the main loop
-    */
+   /////
+   // Start the main loop
    while( this->checkNextIteration() )
    {
-      /****
-       * Compute Runge-Kutta coefficients
-       */
-      computeKFunctions( u, time, currentTau );
+      /////
+      // Compute Runge-Kutta coefficients
+      RealType tau_3 = currentTau / 3.0;
+
+      /////
+      // k1
+      this->problem->getExplicitUpdate( time, currentTau, _u, _k1 );
+
+      /////
+      // k2
+      kAux = u + currentTau * ( 1.0 / 3.0 * k1 );
+      this->problem->applyBoundaryConditions( time + tau_3, _kAux );
+      this->problem->getExplicitUpdate( time + tau_3, currentTau, _kAux, _k2 );
+
+      /////
+      // k3
+      kAux = u + currentTau * 1.0 / 6.0 * ( k1 + k2 );
+      this->problem->applyBoundaryConditions( time + tau_3, _kAux );
+      this->problem->getExplicitUpdate( time + tau_3, currentTau, _kAux, _k3 );
+
+      /////
+      // k4
+      kAux = u + currentTau * ( 0.125 * k1 + 0.375 * k3 );
+      this->problem->applyBoundaryConditions( time + 0.5 * currentTau, _kAux );
+      this->problem->getExplicitUpdate( time + 0.5 * currentTau, currentTau, _kAux, _k4 );
+
+      /////
+      // k5
+      kAux = u + currentTau * ( 0.5 * k1 - 1.5 * k3 + 2.0 * k4 );
+      this->problem->applyBoundaryConditions( time + currentTau, _kAux );
+      this->problem->getExplicitUpdate( time + currentTau, currentTau, _kAux, _k5 );
+
       if( this->testingMode )
-         writeGrids( u );
+         writeGrids( _u );
 
-      /****
-       * Compute an error of the approximation.
-       */
-      RealType eps( 0.0 );
+      /////
+      // Compute an error of the approximation.
+      RealType error( 0.0 );
       if( adaptivity != 0.0 )
-         eps = computeError( currentTau );
+      {
+         const RealType localError = 
+            max( currentTau / 3.0 * abs( 0.2 * k1 -0.9 * k3 + 0.8 * k4 -0.1 * k5 ) );
+            Problem::CommunicatorType::Allreduce( &localError, &error, 1, MPI_MAX, Problem::CommunicatorType::AllGroup );
+      }
 
-      if( adaptivity == 0.0 || eps < adaptivity )
+      if( adaptivity == 0.0 || error < adaptivity )
       {
          RealType lastResidue = this->getResidue();
          RealType newResidue( 0.0 );
          time += currentTau;
-         computeNewTimeLevel( time, currentTau, u, newResidue );
-         this->setResidue( newResidue );
- 
-         /****
-          * When time is close to stopTime the new residue
-          * may be inaccurate significantly.
-          */
+
+         auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; };
+         auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; };
+         this->setResidue( addAndReduceAbs( u, currentTau / 6.0 * ( k1 + 4.0 * k4 + k5 ),
+            reduction, volatileReduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) );
+
+         /////
+         // When time is close to stopTime the new residue
+         // may be inaccurate significantly.
          if( abs( time - this->stopTime ) < 1.0e-7 ) this->setResidue( lastResidue );
          
 
@@ -209,12 +247,11 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
       }
       this->refreshSolverMonitor();
 
-      /****
-       * Compute the new time step.
-       */
-      if( adaptivity != 0.0 && eps != 0.0 )
+      /////
+      // Compute the new time step.
+      if( adaptivity != 0.0 && error != 0.0 )
       {
-         currentTau *= 0.8 * ::pow( adaptivity / eps, 0.2 );
+         currentTau *= 0.8 * ::pow( adaptivity / error, 0.2 );
          currentTau = min( currentTau, this->getMaxTau() );
 #ifdef USE_MPI
          TNLMPI::Bcast( currentTau, 1, 0 );
@@ -225,11 +262,8 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
       else this->tau = currentTau;
 
 
-      /****
-       * Check stop conditions.
-       */
-      //cerr << "residue = " << residue << std::endl;
-      //cerr << "this->getConvergenceResidue() = " << this->getConvergenceResidue() << std::endl;
+      /////
+      // Check stop conditions.
       if( time >= this->getStopTime() ||
           ( this->getConvergenceResidue() != 0.0 && this->getResidue() < this->getConvergenceResidue() ) )
       {
@@ -243,126 +277,57 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
 };
 
 template< typename Problem >
-void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
+void Merson< Problem >::computeKFunctions( DofVectorPointer& _u,
                                              const RealType& time,
                                              RealType tau )
 {
-   IndexType size = u->getSize();
-
-   RealType* _k1 = k1->getData();
-   RealType* _k2 = k2->getData();
-   RealType* _k3 = k3->getData();
-   RealType* _k4 = k4->getData();
-   RealType* _k5 = k5->getData();
-   RealType* _kAux = kAux->getData();
-   RealType* _u = u->getData();
+   auto k1 = _k1->getView();
+   auto k2 = _k2->getView();
+   auto k3 = _k3->getView();
+   auto k4 = _k4->getView();
+   auto k5 = _k5->getView();
+   auto kAux = _kAux->getView();
+   auto u = _u->getView();
 
    RealType tau_3 = tau / 3.0;
-
-   if( std::is_same< DeviceType, Devices::Host >::value )
-   {
-      this->problem->getExplicitUpdate( time, tau, u, k1 );
-
-   #ifdef HAVE_OPENMP
-   #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, tau, tau_3 ) if( Devices::Host::isOMPEnabled() )
-   #endif
-      for( IndexType i = 0; i < size; i ++ )
-         _kAux[ i ] = _u[ i ] + tau * ( 1.0 / 3.0 * _k1[ i ] );
-      this->problem->applyBoundaryConditions( time + tau_3, kAux );
-      this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k2 );
-
-   #ifdef HAVE_OPENMP
-   #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k2, tau, tau_3 ) if( Devices::Host::isOMPEnabled() )
-   #endif
-      for( IndexType i = 0; i < size; i ++ )
-         _kAux[ i ] = _u[ i ] + tau * 1.0 / 6.0 * ( _k1[ i ] + _k2[ i ] );
-      this->problem->applyBoundaryConditions( time + tau_3, kAux );
-      this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k3 );
-
-   #ifdef HAVE_OPENMP
-   #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k3, tau, tau_3 ) if( Devices::Host::isOMPEnabled() )
-   #endif
-      for( IndexType i = 0; i < size; i ++ )
-         _kAux[ i ] = _u[ i ] + tau * ( 0.125 * _k1[ i ] + 0.375 * _k3[ i ] );
-      this->problem->applyBoundaryConditions( time + 0.5 * tau, kAux );
-      this->problem->getExplicitUpdate( time + 0.5 * tau, tau, kAux, k4 );
-
-   #ifdef HAVE_OPENMP
-   #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k3, _k4, tau, tau_3 ) if( Devices::Host::isOMPEnabled() )
-   #endif
-      for( IndexType i = 0; i < size; i ++ )
-         _kAux[ i ] = _u[ i ] + tau * ( 0.5 * _k1[ i ] - 1.5 * _k3[ i ] + 2.0 * _k4[ i ] );
-      this->problem->applyBoundaryConditions( time + tau, kAux );
-      this->problem->getExplicitUpdate( time + tau, tau, kAux, k5 );
-   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      dim3 cudaBlockSize( 512 );
-      const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
-      const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks );
-      this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) );
-      const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
-
-      this->problem->getExplicitUpdate( time, tau, u, k1 );
-      cudaDeviceSynchronize();
-
-      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
-      {
-         const IndexType gridOffset = gridIdx * threadsPerGrid;
-         const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
-         computeK2Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_kAux[ gridOffset ] );
-      }
-      cudaDeviceSynchronize();
-      this->problem->applyBoundaryConditions( time + tau_3, kAux );
-      this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k2 );
-      cudaDeviceSynchronize();
-
-      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
-      {
-         const IndexType gridOffset = gridIdx * threadsPerGrid;
-         const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
-         computeK3Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k2[ gridOffset ], &_kAux[ gridOffset ] );
-      }
-      cudaDeviceSynchronize();
-      this->problem->applyBoundaryConditions( time + tau_3, kAux );
-      this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k3 );
-      cudaDeviceSynchronize();
-
-      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
-      {
-         const IndexType gridOffset = gridIdx * threadsPerGrid;
-         const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
-         computeK4Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_kAux[ gridOffset ] );
-      }
-      cudaDeviceSynchronize();
-      this->problem->applyBoundaryConditions( time + 0.5 * tau, kAux );
-      this->problem->getExplicitUpdate( time + 0.5 * tau, tau, kAux, k4 );
-      cudaDeviceSynchronize();
-
-      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
-      {
-         const IndexType gridOffset = gridIdx * threadsPerGrid;
-         const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
-         computeK5Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_k4[ gridOffset ], &_kAux[ gridOffset ] );
-      }
-      cudaDeviceSynchronize();
-      this->problem->applyBoundaryConditions( time + tau, kAux );
-      this->problem->getExplicitUpdate( time + tau, tau, kAux, k5 );
-      cudaDeviceSynchronize();
-#endif
-   }
+   /////
+   // k1
+   this->problem->getExplicitUpdate( time, tau, _u, _k1 );
+
+   /////
+   // k2
+   kAux = u + tau * ( 1.0 / 3.0 * k1 );
+   this->problem->applyBoundaryConditions( time + tau_3, _kAux );
+   this->problem->getExplicitUpdate( time + tau_3, tau, _kAux, _k2 );
+
+   /////
+   // k3
+   kAux = u + tau * 1.0 / 6.0 * ( k1 + k2 );
+   this->problem->applyBoundaryConditions( time + tau_3, _kAux );
+   this->problem->getExplicitUpdate( time + tau_3, tau, _kAux, _k3 );
+
+   /////
+   // k4
+   kAux = u + tau * ( 0.125 * k1 + 0.375 * k3 );
+   this->problem->applyBoundaryConditions( time + 0.5 * tau, _kAux );
+   this->problem->getExplicitUpdate( time + 0.5 * tau, tau, _kAux, _k4 );
+
+   /////
+   // k5
+   kAux = u + tau * ( 0.5 * k1 - 1.5 * k3 + 2.0 * k4 );
+   this->problem->applyBoundaryConditions( time + tau, _kAux );
+   this->problem->getExplicitUpdate( time + tau, tau, _kAux, _k5 );
 }
 
-template< typename Problem >
+/*template< typename Problem >
 typename Problem :: RealType Merson< Problem > :: computeError( const RealType tau )
 {
-   const IndexType size = k1->getSize();
-   const RealType* _k1 = k1->getData();
-   const RealType* _k3 = k3->getData();
-   const RealType* _k4 = k4->getData();
-   const RealType* _k5 = k5->getData();
-   RealType* _kAux = kAux->getData();
+   const IndexType size = _k1->getSize();
+   const RealType* k1 = _k1->getData();
+   const RealType* k3 = _k3->getData();
+   const RealType* k4 = _k4->getData();
+   const RealType* k5 = _k5->getData();
+   RealType* kAux = _kAux->getData();
 
    RealType eps( 0.0 ), maxEps( 0.0 );
    if( std::is_same< DeviceType, Devices::Host >::value )
@@ -379,10 +344,10 @@ typename Problem :: RealType Merson< Problem > :: computeError( const RealType t
          for( IndexType i = 0; i < size; i ++  )
          {
             RealType err = ( RealType ) ( tau / 3.0 *
-                                 abs( 0.2 * _k1[ i ] +
-                                     -0.9 * _k3[ i ] +
-                                      0.8 * _k4[ i ] +
-                                     -0.1 * _k5[ i ] ) );
+                                 abs( 0.2 * k1[ i ] +
+                                     -0.9 * k3[ i ] +
+                                      0.8 * k4[ i ] +
+                                     -0.1 * k5[ i ] ) );
             localEps = max( localEps, err );
          }
          this->openMPErrorEstimateBuffer[ Devices::Host::getThreadIdx() ] = localEps;
@@ -404,13 +369,13 @@ typename Problem :: RealType Merson< Problem > :: computeError( const RealType t
          const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
          computeErrorKernel<<< cudaBlocks, cudaBlockSize >>>( currentSize,
                                                               tau,
-                                                              &_k1[ gridOffset ],
-                                                              &_k3[ gridOffset ],
-                                                              &_k4[ gridOffset ],
-                                                              &_k5[ gridOffset ],
-                                                              &_kAux[ gridOffset ] );
+                                                              &k1[ gridOffset ],
+                                                              &k3[ gridOffset ],
+                                                              &k4[ gridOffset ],
+                                                              &k5[ gridOffset ],
+                                                              &kAux[ gridOffset ] );
          cudaDeviceSynchronize();
-         eps = std::max( eps, kAux->max() );
+         eps = std::max( eps, _kAux->max() );
       }
 #endif
    }
@@ -425,20 +390,20 @@ void Merson< Problem >::computeNewTimeLevel( const RealType time,
                                              RealType& currentResidue )
 {
    RealType localResidue = RealType( 0.0 );
-   IndexType size = k1->getSize();
+   IndexType size = _k1->getSize();
    RealType* _u = u->getData();
-   RealType* _k1 = k1->getData();
-   RealType* _k4 = k4->getData();
-   RealType* _k5 = k5->getData();
+   RealType* k1 = _k1->getData();
+   RealType* k4 = _k4->getData();
+   RealType* k5 = _k5->getData();
 
    if( std::is_same< DeviceType, Devices::Host >::value )
    {
 #ifdef HAVE_OPENMP
-#pragma omp parallel for reduction(+:localResidue) firstprivate( size, _u, _k1, _k4, _k5, tau ) if( Devices::Host::isOMPEnabled() )
+#pragma omp parallel for reduction(+:localResidue) firstprivate( size, _u, k1, k4, k5, tau ) if( Devices::Host::isOMPEnabled() )
 #endif
       for( IndexType i = 0; i < size; i ++ )
       {
-         const RealType add = tau / 6.0 * ( _k1[ i ] + 4.0 * _k4[ i ] + _k5[ i ] );
+         const RealType add = tau / 6.0 * ( k1[ i ] + 4.0 * k4[ i ] + k5[ i ] );
          _u[ i ] += add;
          localResidue += abs( ( RealType ) add );
       }
@@ -462,9 +427,9 @@ void Merson< Problem >::computeNewTimeLevel( const RealType time,
 
          updateUMerson<<< cudaBlocks, cudaBlockSize, sharedMemory >>>( currentSize,
                                                                        tau,
-                                                                       &_k1[ gridOffset ],
-                                                                       &_k4[ gridOffset ],
-                                                                       &_k5[ gridOffset ],
+                                                                       &k1[ gridOffset ],
+                                                                       &k4[ gridOffset ],
+                                                                       &k5[ gridOffset ],
                                                                        &_u[ gridOffset ],
                                                                        this->cudaBlockResidue.getData() );
          localResidue += this->cudaBlockResidue.sum();
@@ -490,17 +455,17 @@ void Merson< Problem >::writeGrids( const DofVectorPointer& u )
 {
    std::cout << "Writing Merson solver grids ...";
    File( "Merson-u.tnl", std::ios_base::out ) << *u;
-   File( "Merson-k1.tnl", std::ios_base::out ) << *k1;
-   File( "Merson-k2.tnl", std::ios_base::out ) << *k2;
-   File( "Merson-k3.tnl", std::ios_base::out ) << *k3;
-   File( "Merson-k4.tnl", std::ios_base::out ) << *k4;
-   File( "Merson-k5.tnl", std::ios_base::out ) << *k5;
+   File( "Merson-k1.tnl", std::ios_base::out ) << *_k1;
+   File( "Merson-k2.tnl", std::ios_base::out ) << *_k2;
+   File( "Merson-k3.tnl", std::ios_base::out ) << *_k3;
+   File( "Merson-k4.tnl", std::ios_base::out ) << *_k4;
+   File( "Merson-k5.tnl", std::ios_base::out ) << *_k5;
    std::cout << " done. PRESS A KEY." << std::endl;
    getchar();
 }
 
 #ifdef HAVE_CUDA
-
+/*
 template< typename RealType, typename Index >
 __global__ void computeK2Arg( const Index size,
                               const RealType tau,
@@ -552,6 +517,7 @@ __global__ void computeK5Arg( const Index size,
    if( i < size )
       k5_arg[ i ] = u[ i ] + tau * ( 0.5 * k1[ i ] - 1.5 * k3[ i ] + 2.0 * k4[ i ] );
 }
+*/
 
 template< typename RealType, typename Index >
 __global__ void computeErrorKernel( const Index size,
-- 
GitLab