Rewritting the Merson solver using expression templates.

7398085f · Tomáš Oberhuber · 5f87aba3 · 7398085f · 7398085f
Commit 7398085f authored 5 years ago by Tomáš Oberhuber
--- a/src/TNL/Solvers/ODE/Merson.h
+++ b/src/TNL/Solvers/ODE/Merson.h
@@ -65,7 +65,7 @@ class Merson : public ExplicitSolver< Problem >
   void writeGrids( const DofVectorPointer& u );
-   DofVectorPointer k1, k2, k3, k4, k5, kAux;
+   DofVectorPointer _k1, _k2, _k3, _k4, _k5, _kAux;
   /****
    * This controls the accuracy of the solver

--- a/src/TNL/Solvers/ODE/Merson_impl.h
+++ b/src/TNL/Solvers/ODE/Merson_impl.h
@@ -128,7 +128,7 @@ void Merson< Problem > :: setAdaptivity( const RealType& a )
 };
 template< typename Problem >
-bool Merson< Problem > :: solve( DofVectorPointer& u )
+bool Merson< Problem >::solve( DofVectorPointer& _u )
 {
   if( ! this->problem )
   {
@@ -143,19 +143,25 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
   /****
    * First setup the supporting meshes k1...k5 and kAux.
    */
-   k1->setLike( *u );
+   _k1->setLike( *_u );
-   k2->setLike( *u );
+   _k2->setLike( *_u );
-   k3->setLike( *u );
+   _k3->setLike( *_u );
-   k4->setLike( *u );
+   _k4->setLike( *_u );
-   k5->setLike( *u );
+   _k5->setLike( *_u );
-   kAux->setLike( *u );
+   _kAux->setLike( *_u );
-   k1->setValue( 0.0 );
+   auto k1 = _k1->getView();
-   k2->setValue( 0.0 );
+   auto k2 = _k2->getView();
-   k3->setValue( 0.0 );
+   auto k3 = _k3->getView();
-   k4->setValue( 0.0 );
+   auto k4 = _k4->getView();
-   k5->setValue( 0.0 );
+   auto k5 = _k5->getView();
-   kAux->setValue( 0.0 );
+   auto kAux = _kAux->getView();
+   auto u = _u->getView();
+   k1 = 0.0;
+   k2 = 0.0;
+   k3 = 0.0;
+   k4 = 0.0;
+   k5 = 0.0;
+   kAux = 0.0;
   /****
    * Set necessary parameters
@@ -170,37 +176,69 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
   this->refreshSolverMonitor();
-   /****
+   /////
-    * Start the main loop
+   // Start the main loop
-    */
   while( this->checkNextIteration() )
   {
-      /****
+      /////
-       * Compute Runge-Kutta coefficients
+      // Compute Runge-Kutta coefficients
-       */
+      RealType tau_3 = currentTau / 3.0;
-      computeKFunctions( u, time, currentTau );
+      /////
+      // k1
+      this->problem->getExplicitUpdate( time, currentTau, _u, _k1 );
+      /////
+      // k2
+      kAux = u + currentTau * ( 1.0 / 3.0 * k1 );
+      this->problem->applyBoundaryConditions( time + tau_3, _kAux );
+      this->problem->getExplicitUpdate( time + tau_3, currentTau, _kAux, _k2 );
+      /////
+      // k3
+      kAux = u + currentTau * 1.0 / 6.0 * ( k1 + k2 );
+      this->problem->applyBoundaryConditions( time + tau_3, _kAux );
+      this->problem->getExplicitUpdate( time + tau_3, currentTau, _kAux, _k3 );
+      /////
+      // k4
+      kAux = u + currentTau * ( 0.125 * k1 + 0.375 * k3 );
+      this->problem->applyBoundaryConditions( time + 0.5 * currentTau, _kAux );
+      this->problem->getExplicitUpdate( time + 0.5 * currentTau, currentTau, _kAux, _k4 );
+      /////
+      // k5
+      kAux = u + currentTau * ( 0.5 * k1 - 1.5 * k3 + 2.0 * k4 );
+      this->problem->applyBoundaryConditions( time + currentTau, _kAux );
+      this->problem->getExplicitUpdate( time + currentTau, currentTau, _kAux, _k5 );
      if( this->testingMode )
-         writeGrids( u );
+         writeGrids( _u );
-      /****
+      /////
-       * Compute an error of the approximation.
+      // Compute an error of the approximation.
-       */
+      RealType error( 0.0 );
-      RealType eps( 0.0 );
      if( adaptivity != 0.0 )
-         eps = computeError( currentTau );
+      {
+         const RealType localError = 
+            max( currentTau / 3.0 * abs( 0.2 * k1 -0.9 * k3 + 0.8 * k4 -0.1 * k5 ) );
+            Problem::CommunicatorType::Allreduce( &localError, &error, 1, MPI_MAX, Problem::CommunicatorType::AllGroup );
+      }
-      if( adaptivity == 0.0 || eps < adaptivity )
+      if( adaptivity == 0.0 || error < adaptivity )
      {
         RealType lastResidue = this->getResidue();
         RealType newResidue( 0.0 );
         time += currentTau;
-         computeNewTimeLevel( time, currentTau, u, newResidue );
-         this->setResidue( newResidue );
+         auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; };
+         auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; };
-         /****
+         this->setResidue( addAndReduceAbs( u, currentTau / 6.0 * ( k1 + 4.0 * k4 + k5 ),
-          * When time is close to stopTime the new residue
+            reduction, volatileReduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) );
-          * may be inaccurate significantly.
-          */
+         /////
+         // When time is close to stopTime the new residue
+         // may be inaccurate significantly.
         if( abs( time - this->stopTime ) < 1.0e-7 ) this->setResidue( lastResidue );
@@ -209,12 +247,11 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
      }
      this->refreshSolverMonitor();
-      /****
+      /////
-       * Compute the new time step.
+      // Compute the new time step.
-       */
+      if( adaptivity != 0.0 && error != 0.0 )
-      if( adaptivity != 0.0 && eps != 0.0 )
      {
-         currentTau *= 0.8 * ::pow( adaptivity / eps, 0.2 );
+         currentTau *= 0.8 * ::pow( adaptivity / error, 0.2 );
         currentTau = min( currentTau, this->getMaxTau() );
 #ifdef USE_MPI
         TNLMPI::Bcast( currentTau, 1, 0 );
@@ -225,11 +262,8 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
      else this->tau = currentTau;
-      /****
+      /////
-       * Check stop conditions.
+      // Check stop conditions.
-       */
-      //cerr << "residue = " << residue << std::endl;
-      //cerr << "this->getConvergenceResidue() = " << this->getConvergenceResidue() << std::endl;
      if( time >= this->getStopTime() ||
          ( this->getConvergenceResidue() != 0.0 && this->getResidue() < this->getConvergenceResidue() ) )
      {
@@ -243,126 +277,57 @@ bool Merson< Problem > :: solve( DofVectorPointer& u )
 };
 template< typename Problem >
-void Merson< Problem >::computeKFunctions( DofVectorPointer& u,
+void Merson< Problem >::computeKFunctions( DofVectorPointer& _u,
                                             const RealType& time,
                                             RealType tau )
 {
-   IndexType size = u->getSize();
+   auto k1 = _k1->getView();
+   auto k2 = _k2->getView();
-   RealType* _k1 = k1->getData();
+   auto k3 = _k3->getView();
-   RealType* _k2 = k2->getData();
+   auto k4 = _k4->getView();
-   RealType* _k3 = k3->getData();
+   auto k5 = _k5->getView();
-   RealType* _k4 = k4->getData();
+   auto kAux = _kAux->getView();
-   RealType* _k5 = k5->getData();
+   auto u = _u->getView();
-   RealType* _kAux = kAux->getData();
-   RealType* _u = u->getData();
   RealType tau_3 = tau / 3.0;
+   /////
-   if( std::is_same< DeviceType, Devices::Host >::value )
+   // k1
-   {
+   this->problem->getExplicitUpdate( time, tau, _u, _k1 );
-      this->problem->getExplicitUpdate( time, tau, u, k1 );
+   /////
-   #ifdef HAVE_OPENMP
+   // k2
-   #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, tau, tau_3 ) if( Devices::Host::isOMPEnabled() )
+   kAux = u + tau * ( 1.0 / 3.0 * k1 );
-   #endif
+   this->problem->applyBoundaryConditions( time + tau_3, _kAux );
-      for( IndexType i = 0; i < size; i ++ )
+   this->problem->getExplicitUpdate( time + tau_3, tau, _kAux, _k2 );
-         _kAux[ i ] = _u[ i ] + tau * ( 1.0 / 3.0 * _k1[ i ] );
-      this->problem->applyBoundaryConditions( time + tau_3, kAux );
+   /////
-      this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k2 );
+   // k3
+   kAux = u + tau * 1.0 / 6.0 * ( k1 + k2 );
-   #ifdef HAVE_OPENMP
+   this->problem->applyBoundaryConditions( time + tau_3, _kAux );
-   #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k2, tau, tau_3 ) if( Devices::Host::isOMPEnabled() )
+   this->problem->getExplicitUpdate( time + tau_3, tau, _kAux, _k3 );
-   #endif
-      for( IndexType i = 0; i < size; i ++ )
+   /////
-         _kAux[ i ] = _u[ i ] + tau * 1.0 / 6.0 * ( _k1[ i ] + _k2[ i ] );
+   // k4
-      this->problem->applyBoundaryConditions( time + tau_3, kAux );
+   kAux = u + tau * ( 0.125 * k1 + 0.375 * k3 );
-      this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k3 );
+   this->problem->applyBoundaryConditions( time + 0.5 * tau, _kAux );
+   this->problem->getExplicitUpdate( time + 0.5 * tau, tau, _kAux, _k4 );
-   #ifdef HAVE_OPENMP
-   #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k3, tau, tau_3 ) if( Devices::Host::isOMPEnabled() )
+   /////
-   #endif
+   // k5
-      for( IndexType i = 0; i < size; i ++ )
+   kAux = u + tau * ( 0.5 * k1 - 1.5 * k3 + 2.0 * k4 );
-         _kAux[ i ] = _u[ i ] + tau * ( 0.125 * _k1[ i ] + 0.375 * _k3[ i ] );
+   this->problem->applyBoundaryConditions( time + tau, _kAux );
-      this->problem->applyBoundaryConditions( time + 0.5 * tau, kAux );
+   this->problem->getExplicitUpdate( time + tau, tau, _kAux, _k5 );
-      this->problem->getExplicitUpdate( time + 0.5 * tau, tau, kAux, k4 );
-   #ifdef HAVE_OPENMP
-   #pragma omp parallel for firstprivate( size, _kAux, _u, _k1, _k3, _k4, tau, tau_3 ) if( Devices::Host::isOMPEnabled() )
-   #endif
-      for( IndexType i = 0; i < size; i ++ )
-         _kAux[ i ] = _u[ i ] + tau * ( 0.5 * _k1[ i ] - 1.5 * _k3[ i ] + 2.0 * _k4[ i ] );
-      this->problem->applyBoundaryConditions( time + tau, kAux );
-      this->problem->getExplicitUpdate( time + tau, tau, kAux, k5 );
-   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      dim3 cudaBlockSize( 512 );
-      const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
-      const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks );
-      this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) );
-      const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
-      this->problem->getExplicitUpdate( time, tau, u, k1 );
-      cudaDeviceSynchronize();
-      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
-      {
-         const IndexType gridOffset = gridIdx * threadsPerGrid;
-         const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
-         computeK2Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_kAux[ gridOffset ] );
-      }
-      cudaDeviceSynchronize();
-      this->problem->applyBoundaryConditions( time + tau_3, kAux );
-      this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k2 );
-      cudaDeviceSynchronize();
-      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
-      {
-         const IndexType gridOffset = gridIdx * threadsPerGrid;
-         const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
-         computeK3Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k2[ gridOffset ], &_kAux[ gridOffset ] );
-      }
-      cudaDeviceSynchronize();
-      this->problem->applyBoundaryConditions( time + tau_3, kAux );
-      this->problem->getExplicitUpdate( time + tau_3, tau, kAux, k3 );
-      cudaDeviceSynchronize();
-      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
-      {
-         const IndexType gridOffset = gridIdx * threadsPerGrid;
-         const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
-         computeK4Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_kAux[ gridOffset ] );
-      }
-      cudaDeviceSynchronize();
-      this->problem->applyBoundaryConditions( time + 0.5 * tau, kAux );
-      this->problem->getExplicitUpdate( time + 0.5 * tau, tau, kAux, k4 );
-      cudaDeviceSynchronize();
-      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
-      {
-         const IndexType gridOffset = gridIdx * threadsPerGrid;
-         const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
-         computeK5Arg<<< cudaBlocks, cudaBlockSize >>>( currentSize, tau, &_u[ gridOffset ], &_k1[ gridOffset ], &_k3[ gridOffset ], &_k4[ gridOffset ], &_kAux[ gridOffset ] );
-      }
-      cudaDeviceSynchronize();
-      this->problem->applyBoundaryConditions( time + tau, kAux );
-      this->problem->getExplicitUpdate( time + tau, tau, kAux, k5 );
-      cudaDeviceSynchronize();
-#endif
-   }
 }
-template< typename Problem >
+/*template< typename Problem >
 typename Problem :: RealType Merson< Problem > :: computeError( const RealType tau )
 {
-   const IndexType size = k1->getSize();
+   const IndexType size = _k1->getSize();
-   const RealType* _k1 = k1->getData();
+   const RealType* k1 = _k1->getData();
-   const RealType* _k3 = k3->getData();
+   const RealType* k3 = _k3->getData();
-   const RealType* _k4 = k4->getData();
+   const RealType* k4 = _k4->getData();
-   const RealType* _k5 = k5->getData();
+   const RealType* k5 = _k5->getData();
-   RealType* _kAux = kAux->getData();
+   RealType* kAux = _kAux->getData();
   RealType eps( 0.0 ), maxEps( 0.0 );
   if( std::is_same< DeviceType, Devices::Host >::value )
@@ -379,10 +344,10 @@ typename Problem :: RealType Merson< Problem > :: computeError( const RealType t
         for( IndexType i = 0; i < size; i ++  )
         {
            RealType err = ( RealType ) ( tau / 3.0 *
-                                 abs( 0.2 * _k1[ i ] +
+                                 abs( 0.2 * k1[ i ] +
-                                     -0.9 * _k3[ i ] +
+                                     -0.9 * k3[ i ] +
-                                      0.8 * _k4[ i ] +
+                                      0.8 * k4[ i ] +
-                                     -0.1 * _k5[ i ] ) );
+                                     -0.1 * k5[ i ] ) );
            localEps = max( localEps, err );
         }
         this->openMPErrorEstimateBuffer[ Devices::Host::getThreadIdx() ] = localEps;
@@ -404,13 +369,13 @@ typename Problem :: RealType Merson< Problem > :: computeError( const RealType t
         const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
         computeErrorKernel<<< cudaBlocks, cudaBlockSize >>>( currentSize,
                                                              tau,
-                                                              &_k1[ gridOffset ],
+                                                              &k1[ gridOffset ],
-                                                              &_k3[ gridOffset ],
+                                                              &k3[ gridOffset ],
-                                                              &_k4[ gridOffset ],
+                                                              &k4[ gridOffset ],
-                                                              &_k5[ gridOffset ],
+                                                              &k5[ gridOffset ],
-                                                              &_kAux[ gridOffset ] );
+                                                              &kAux[ gridOffset ] );
         cudaDeviceSynchronize();
-         eps = std::max( eps, kAux->max() );
+         eps = std::max( eps, _kAux->max() );
      }
 #endif
   }
@@ -425,20 +390,20 @@ void Merson< Problem >::computeNewTimeLevel( const RealType time,
                                             RealType& currentResidue )
 {
   RealType localResidue = RealType( 0.0 );
-   IndexType size = k1->getSize();
+   IndexType size = _k1->getSize();
   RealType* _u = u->getData();
-   RealType* _k1 = k1->getData();
+   RealType* k1 = _k1->getData();
-   RealType* _k4 = k4->getData();
+   RealType* k4 = _k4->getData();
-   RealType* _k5 = k5->getData();
+   RealType* k5 = _k5->getData();
   if( std::is_same< DeviceType, Devices::Host >::value )
   {
 #ifdef HAVE_OPENMP
-#pragma omp parallel for reduction(+:localResidue) firstprivate( size, _u, _k1, _k4, _k5, tau ) if( Devices::Host::isOMPEnabled() )
+#pragma omp parallel for reduction(+:localResidue) firstprivate( size, _u, k1, k4, k5, tau ) if( Devices::Host::isOMPEnabled() )
 #endif
      for( IndexType i = 0; i < size; i ++ )
      {
-         const RealType add = tau / 6.0 * ( _k1[ i ] + 4.0 * _k4[ i ] + _k5[ i ] );
+         const RealType add = tau / 6.0 * ( k1[ i ] + 4.0 * k4[ i ] + k5[ i ] );
         _u[ i ] += add;
         localResidue += abs( ( RealType ) add );
      }
@@ -462,9 +427,9 @@ void Merson< Problem >::computeNewTimeLevel( const RealType time,
         updateUMerson<<< cudaBlocks, cudaBlockSize, sharedMemory >>>( currentSize,
                                                                       tau,
-                                                                       &_k1[ gridOffset ],
+                                                                       &k1[ gridOffset ],
-                                                                       &_k4[ gridOffset ],
+                                                                       &k4[ gridOffset ],
-                                                                       &_k5[ gridOffset ],
+                                                                       &k5[ gridOffset ],
                                                                       &_u[ gridOffset ],
                                                                       this->cudaBlockResidue.getData() );
         localResidue += this->cudaBlockResidue.sum();
@@ -490,17 +455,17 @@ void Merson< Problem >::writeGrids( const DofVectorPointer& u )
 {
   std::cout << "Writing Merson solver grids ...";
   File( "Merson-u.tnl", std::ios_base::out ) << *u;
-   File( "Merson-k1.tnl", std::ios_base::out ) << *k1;
+   File( "Merson-k1.tnl", std::ios_base::out ) << *_k1;
-   File( "Merson-k2.tnl", std::ios_base::out ) << *k2;
+   File( "Merson-k2.tnl", std::ios_base::out ) << *_k2;
-   File( "Merson-k3.tnl", std::ios_base::out ) << *k3;
+   File( "Merson-k3.tnl", std::ios_base::out ) << *_k3;
-   File( "Merson-k4.tnl", std::ios_base::out ) << *k4;
+   File( "Merson-k4.tnl", std::ios_base::out ) << *_k4;
-   File( "Merson-k5.tnl", std::ios_base::out ) << *k5;
+   File( "Merson-k5.tnl", std::ios_base::out ) << *_k5;
   std::cout << " done. PRESS A KEY." << std::endl;
   getchar();
 }
 #ifdef HAVE_CUDA
+/*
 template< typename RealType, typename Index >
 __global__ void computeK2Arg( const Index size,
                              const RealType tau,
@@ -552,6 +517,7 @@ __global__ void computeK5Arg( const Index size,
   if( i < size )
      k5_arg[ i ] = u[ i ] + tau * ( 0.5 * k1[ i ] - 1.5 * k3[ i ] + 2.0 * k4[ i ] );
 }
+*/
 template< typename RealType, typename Index >
 __global__ void computeErrorKernel( const Index size,