From 5f87aba32866310ee33168817129403338d829e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 26 Jun 2019 21:28:34 +0200
Subject: [PATCH] Implemented addAndReduceAbs, refactoring of the Euler solver.

---
 .../Expressions/ExpressionTemplates.h         |  52 ++++++++
 .../Expressions/StaticExpressionTemplates.h   |  46 +++++++
 src/TNL/Problems/HeatEquationProblem.h        |  51 ++++----
 src/TNL/Problems/HeatEquationProblem_impl.h   |  32 +++--
 src/TNL/Solvers/ODE/Euler.h                   |  40 +++---
 src/TNL/Solvers/ODE/Euler.hpp                 | 119 +-----------------
 src/UnitTests/Containers/VectorTest-8.h       |  31 ++++-
 7 files changed, 193 insertions(+), 178 deletions(-)

diff --git a/src/TNL/Containers/Expressions/ExpressionTemplates.h b/src/TNL/Containers/Expressions/ExpressionTemplates.h
index 8a7863e561..dab52995a8 100644
--- a/src/TNL/Containers/Expressions/ExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h
@@ -2252,6 +2252,58 @@ Result addAndReduce( Vector& lhs,
    return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero );
 }
 
+////
+// Addition and reduction
+template< typename Vector,
+   typename T1,
+   typename T2,
+   template< typename, typename > class Operation,
+   typename Reduction,
+   typename VolatileReduction,
+   typename Result >
+Result addAndReduceAbs( Vector& lhs,
+   const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression,
+   Reduction& reduction,
+   VolatileReduction& volatileReduction,
+   const Result& zero )
+{
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+   using DeviceType = typename Vector::DeviceType;
+
+   RealType* lhs_data = lhs.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType {
+      const RealType aux = expression[ i ];
+      lhs_data[ i ] += aux;
+      return TNL::abs( aux );
+   };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero );
+}
+
+template< typename Vector,
+   typename T1,
+   template< typename > class Operation,
+   typename Reduction,
+   typename VolatileReduction,
+   typename Result >
+Result addAndReduceAbs( Vector& lhs,
+   const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression,
+   Reduction& reduction,
+   VolatileReduction& volatileReduction,
+   const Result& zero )
+{
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+   using DeviceType = typename Vector::DeviceType;
+
+   RealType* lhs_data = lhs.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType {
+      const RealType aux = expression[ i ];
+      lhs_data[ i ] += aux;
+      return TNL::abs( aux );
+   };
+   return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero );
+}
 
 ////
 // Output stream
diff --git a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
index cec0d3ce22..967de900dd 100644
--- a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
@@ -2476,5 +2476,51 @@ Result addAndReduce( Vector& lhs,
    return result;
 }
 
+////
+// Addition with reduction of abs
+template< typename Vector,
+   typename T1,
+   typename T2,
+   template< typename, typename > class Operation,
+   typename Reduction,
+   typename VolatileReduction,
+   typename Result >
+__cuda_callable__
+Result addAndReduceAbs( Vector& lhs,
+   const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression,
+   Reduction& reduction,
+   VolatileReduction& volatileReduction,
+   const Result& zero )
+{
+   Result result( zero );
+   for( int i = 0; i < Vector::getSize(); i++ ) {
+      const Result aux = expression[ i ];
+      lhs[ i ] += aux;
+      reduction( result, TNL::abs( aux ) );
+   }
+   return result;
+}
+
+template< typename Vector,
+   typename T1,
+   template< typename > class Operation,
+   typename Reduction,
+   typename VolatileReduction,
+   typename Result >
+__cuda_callable__
+Result addAndReduceAbs( Vector& lhs,
+   const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression,
+   Reduction& reduction,
+   VolatileReduction& volatileReduction,
+   const Result& zero )
+{
+   Result result( zero );
+   for( int i = 0; i < Vector::getSize(); i++ ) {
+      const Result aux = expression[ i ];
+      lhs[ i ] += aux;
+      reduction( result, TNL::abs( aux ) );
+   }
+   return result;
+}
 
 } // namespace TNL
diff --git a/src/TNL/Problems/HeatEquationProblem.h b/src/TNL/Problems/HeatEquationProblem.h
index a04e6ea2a9..cddd70746a 100644
--- a/src/TNL/Problems/HeatEquationProblem.h
+++ b/src/TNL/Problems/HeatEquationProblem.h
@@ -95,7 +95,7 @@ class HeatEquationProblem : public PDEProblem< Mesh,
                               DofVectorPointer& _fu );
       
       void applyBoundaryConditions( const RealType& time,
-                                    DofVectorPointer& dofs );      
+                                    DofVectorPointer& dofs );
 
       template< typename MatrixPointer >
       void assemblyLinearSystem( const RealType& time,
@@ -104,31 +104,32 @@ class HeatEquationProblem : public PDEProblem< Mesh,
                                  MatrixPointer& matrixPointer,
                                  DofVectorPointer& rightHandSidePointer );
 
-      protected:
-         
-         MeshFunctionPointer uPointer;
-         MeshFunctionPointer fuPointer;
-      
-         DifferentialOperatorPointer differentialOperatorPointer;
-
-         BoundaryConditionPointer boundaryConditionPointer;
-
-         RightHandSidePointer rightHandSidePointer;
-         
-         Timer gpuTransferTimer;
-         
-         Solvers::PDE::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > explicitUpdater;
-         
-         Solvers::PDE::LinearSystemAssembler< Mesh, 
-                                              MeshFunctionType,
-                                              DifferentialOperator,
-                                              BoundaryCondition,
-                                              RightHandSide,
-                                              Solvers::PDE::BackwardTimeDiscretisation,
-                                              DofVectorType > systemAssembler;
-
-        Meshes::DistributedMeshes::DistrGridIOTypes distributedIOType;
+   protected:
+
+      MeshFunctionPointer uPointer;
+      MeshFunctionPointer fuPointer;
+
+      DifferentialOperatorPointer differentialOperatorPointer;
+
+      BoundaryConditionPointer boundaryConditionPointer;
+
+      RightHandSidePointer rightHandSidePointer;
+
+      Timer gpuTransferTimer;
+
+      Solvers::PDE::ExplicitUpdater< Mesh, MeshFunctionType, DifferentialOperator, BoundaryCondition, RightHandSide > explicitUpdater;
+
+      Solvers::PDE::LinearSystemAssembler< Mesh,
+                                           MeshFunctionType,
+                                           DifferentialOperator,
+                                           BoundaryCondition,
+                                           RightHandSide,
+                                           Solvers::PDE::BackwardTimeDiscretisation,
+                                           DofVectorType > systemAssembler;
+
+     Meshes::DistributedMeshes::DistrGridIOTypes distributedIOType;
 
+     bool catchExceptions = true;
 };
 
 } // namespace Problems
diff --git a/src/TNL/Problems/HeatEquationProblem_impl.h b/src/TNL/Problems/HeatEquationProblem_impl.h
index 0c0cd7418e..64b4a2ca91 100644
--- a/src/TNL/Problems/HeatEquationProblem_impl.h
+++ b/src/TNL/Problems/HeatEquationProblem_impl.h
@@ -95,11 +95,11 @@ setup( const Config::ParameterContainer& parameters,
       return false;
    }
 
-   String param=parameters.getParameter< String >( "distributed-grid-io-type" );
-   if(param=="MpiIO")
-        distributedIOType=Meshes::DistributedMeshes::MpiIO;
-   if(param=="LocalCopy")
-        distributedIOType=Meshes::DistributedMeshes::LocalCopy;
+   String param = parameters.getParameter< String >( "distributed-grid-io-type" );
+   if( param == "MpiIO" )
+        distributedIOType = Meshes::DistributedMeshes::MpiIO;
+   if( param == "LocalCopy" )
+        distributedIOType = Meshes::DistributedMeshes::LocalCopy;
 
    this->explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer );
    this->explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer );
@@ -107,6 +107,8 @@ setup( const Config::ParameterContainer& parameters,
    this->systemAssembler.setDifferentialOperator( this->differentialOperatorPointer );
    this->systemAssembler.setBoundaryConditions( this->boundaryConditionPointer );
    this->systemAssembler.setRightHandSide( this->rightHandSidePointer );
+
+   this->catchExceptions = parameters.getParameter< bool >( "catch-exceptions" );
    return true;
 }
 
@@ -150,7 +152,6 @@ setInitialCondition( const Config::ParameterContainer& parameters,
 {
    this->bindDofs( dofs );
    const String& initialConditionFile = parameters.getParameter< String >( "initial-condition" );
-   std::cout<<"setInitialCondition" <<std::endl; 
    if(CommunicatorType::isDistributed())
     {
         std::cout<<"Nodes Distribution: " << uPointer->getMesh().getDistributedMesh()->printProcessDistr() << std::endl;
@@ -162,15 +163,20 @@ setInitialCondition( const Config::ParameterContainer& parameters,
     }
     else
     {
-      try
-      {
-         this->uPointer->boundLoad( initialConditionFile );
-      }
-      catch(...)
+      if( this->catchExceptions )
       {
-         std::cerr << "I am not able to load the initial condition from the file " << initialConditionFile << "." << std::endl;
-         return false;
+         try
+         {
+            this->uPointer->boundLoad( initialConditionFile );
+         }
+         catch( std::ios_base::failure& e )
+         {
+            std::cerr << "I am not able to load the initial condition from the file " << initialConditionFile << "." << std::endl;
+            std::cerr << e.what() << std::endl;
+            return false;
+         }
       }
+      else this->uPointer->boundLoad( initialConditionFile );
    }
    return true;
 }
diff --git a/src/TNL/Solvers/ODE/Euler.h b/src/TNL/Solvers/ODE/Euler.h
index c782b08cd9..57c4e10b94 100644
--- a/src/TNL/Solvers/ODE/Euler.h
+++ b/src/TNL/Solvers/ODE/Euler.h
@@ -25,38 +25,34 @@ class Euler : public ExplicitSolver< Problem >
 {
    public:
 
-   using ProblemType = Problem;
-   using DofVectorType = typename ProblemType::DofVectorType;
-   using RealType = typename ProblemType::RealType;
-   using DeviceType = typename ProblemType::DeviceType;
-   using IndexType  = typename ProblemType::IndexType;
-   using DofVectorView = typename ViewTypeGetter< DofVectorType >::Type;
-   using DofVectorPointer = Pointers::SharedPointer<  DofVectorType, DeviceType >;
+      using ProblemType = Problem;
+      using DofVectorType = typename ProblemType::DofVectorType;
+      using RealType = typename ProblemType::RealType;
+      using DeviceType = typename ProblemType::DeviceType;
+      using IndexType  = typename ProblemType::IndexType;
+      using DofVectorView = typename ViewTypeGetter< DofVectorType >::Type;
+      using DofVectorPointer = Pointers::SharedPointer<  DofVectorType, DeviceType >;
 
-   Euler();
+      Euler();
 
-   static String getType();
+      static String getType();
 
-   static void configSetup( Config::ConfigDescription& config,
-                            const String& prefix = "" );
+      static void configSetup( Config::ConfigDescription& config,
+                               const String& prefix = "" );
 
-   bool setup( const Config::ParameterContainer& parameters,
-              const String& prefix = "" );
+      bool setup( const Config::ParameterContainer& parameters,
+                 const String& prefix = "" );
 
-   void setCFLCondition( const RealType& cfl );
+      void setCFLCondition( const RealType& cfl );
 
-   const RealType& getCFLCondition() const;
+      const RealType& getCFLCondition() const;
 
-   bool solve( DofVectorPointer& u );
+      bool solve( DofVectorPointer& u );
 
    protected:
-   void computeNewTimeLevel( DofVectorPointer& u,
-                             RealType tau,
-                             RealType& currentResidue );
+      DofVectorPointer _k1;
 
-   DofVectorPointer _k1;
-
-   RealType cflCondition;
+      RealType cflCondition;
 };
 
 } // namespace ODE
diff --git a/src/TNL/Solvers/ODE/Euler.hpp b/src/TNL/Solvers/ODE/Euler.hpp
index 5e2c4979e7..c107df8a29 100644
--- a/src/TNL/Solvers/ODE/Euler.hpp
+++ b/src/TNL/Solvers/ODE/Euler.hpp
@@ -119,17 +119,9 @@ bool Euler< Problem > :: solve( DofVectorPointer& _u )
             continue;
          }
       }
-      RealType newResidue( 0.0 );
-      //computeNewTimeLevel( u, currentTau, newResidue );
-     /* auto fetch = [ currentTau, k1, u ] __cuda_callable__ ( IndexType i ) -> RealType {
-         const RealType add = currentTau * k1[ i ];
-         u[ i ] += add;
-         return TNL::abs( add ); };
-      auto reduction = [=] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; };
-      auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; };
-      return Containers::Algorithms::Reduction< DeviceType >::reduce( u.getSize(), reduction, volatileReduction, fetch, 0.0 );*/
-      
-      this->setResidue( newResidue );
+      auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; };
+      auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; };
+      this->setResidue( addAndReduceAbs( u, currentTau * k1, reduction, volatileReduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) );
 
       /****
        * When time is close to stopTime the new residue
@@ -169,111 +161,6 @@ bool Euler< Problem > :: solve( DofVectorPointer& _u )
    }
 };
 
-/*template< typename Problem >
-void Euler< Problem > :: computeNewTimeLevel( DofVectorPointer& u,
-                                              RealType tau,
-                                              RealType& currentResidue )
-{
-   RealType localResidue = RealType( 0.0 );
-   const IndexType size = k1->getSize();
-   RealType* _u = u->getData();
-   RealType* _k1 = k1->getData();
-
-   if( std::is_same< DeviceType, Devices::Host >::value )
-   {
-#ifdef HAVE_OPENMP
-#pragma omp parallel for reduction(+:localResidue) firstprivate( _u, _k1, tau ) if( Devices::Host::isOMPEnabled() )
-#endif
-      for( IndexType i = 0; i < size; i ++ )
-      {
-         const RealType add = tau * _k1[ i ];
-         _u[ i ] += add;
-         localResidue += std::fabs( add );
-      }
-   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      dim3 cudaBlockSize( 512 );
-      const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
-      const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks );
-      this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) );
-      const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
-
-      localResidue = 0.0;
-      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
-      {
-         const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType );
-         const IndexType gridOffset = gridIdx * threadsPerGrid;
-         const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
-         const IndexType currentGridSize = Devices::Cuda::getNumberOfBlocks( currentSize, cudaBlockSize.x );
-
-         updateUEuler<<< currentGridSize, cudaBlockSize, sharedMemory >>>( currentSize,
-                                                                      tau,
-                                                                      &_k1[ gridOffset ],
-                                                                      &_u[ gridOffset ],
-                                                                      this->cudaBlockResidue.getData() );
-         localResidue += this->cudaBlockResidue.sum();
-         cudaDeviceSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
-      }
-#endif
-   }
-   
-   //MIC
-   if( std::is_same< DeviceType, Devices::MIC >::value )
-   {
-
-#ifdef HAVE_MIC
-      Devices::MICHider<RealType> mu;
-      mu.pointer=_u;
-      Devices::MICHider<RealType> mk1;
-      mk1.pointer=_k1;
-    #pragma offload target(mic) in(mu,mk1,size) inout(localResidue)
-    {
-      #pragma omp parallel for reduction(+:localResidue) firstprivate( mu, mk1 )  
-      for( IndexType i = 0; i < size; i ++ )
-      {
-         const RealType add = tau * mk1.pointer[ i ];
-         mu.pointer[ i ] += add;
-         localResidue += std::fabs( add );
-      }
-    }
-#endif
-   }
-   localResidue /= tau * ( RealType ) size;
-   Problem::CommunicatorType::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, Problem::CommunicatorType::AllGroup );
-   //std::cerr << "Local residue = " << localResidue << " - globalResidue = " << currentResidue << std::endl;
-}
-
-#ifdef HAVE_CUDA
-template< typename RealType, typename Index >
-__global__ void updateUEuler( const Index size,
-                              const RealType tau,
-                              const RealType* k1,
-                              RealType* u,
-                              RealType* cudaBlockResidue )
-{
-   extern __shared__ RealType du[];
-   const Index blockOffset = blockIdx. x * blockDim.x;
-   const Index i = blockOffset  + threadIdx. x;
-   if( i < size )
-      u[ i ] += du[ threadIdx.x ] = tau * k1[ i ];
-   else
-      du[ threadIdx.x ] = 0.0;
-   du[ threadIdx.x ] = abs( du[ threadIdx.x ] );
-   __syncthreads();
-
-   const Index rest = size - blockOffset;
-   Index n =  rest < blockDim.x ? rest : blockDim.x;
-
-   computeBlockResidue( du,
-                        cudaBlockResidue,
-                        n );
-}
-#endif
-*/
-
 } // namespace ODE
 } // namespace Solvers
 } // namespace TNL
diff --git a/src/UnitTests/Containers/VectorTest-8.h b/src/UnitTests/Containers/VectorTest-8.h
index 7b3e908df3..c70408f404 100644
--- a/src/UnitTests/Containers/VectorTest-8.h
+++ b/src/UnitTests/Containers/VectorTest-8.h
@@ -72,7 +72,7 @@ TYPED_TEST( VectorTest, evaluateAndReduce )
 // error #3049-D: The enclosing parent function ("TestBody") for an extended __host__ __device__ lambda cannot have private or protected access within its class
 template< typename VectorView >
 typename VectorView::RealType
-performAddAndReduce( VectorView& u, VectorView& v, VectorView& w )
+performAddAndReduce1( VectorView& u, VectorView& v, VectorView& w )
 {
    using RealType = typename VectorView::RealType;
 
@@ -81,6 +81,18 @@ performAddAndReduce( VectorView& u, VectorView& v, VectorView& w )
    return addAndReduce( w, u * v, reduction, volatileReduction, ( RealType ) 0.0 );
 }
 
+template< typename VectorView >
+typename VectorView::RealType
+performAddAndReduce2( VectorView& v, VectorView& w )
+{
+   using RealType = typename VectorView::RealType;
+
+   auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; };
+   auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; };
+   return addAndReduce( w, 5.0 * v, reduction, volatileReduction, ( RealType ) 0.0 );
+}
+
+
 TYPED_TEST( VectorTest, addAndReduce )
 {
    using VectorType = typename TestFixture::VectorType;
@@ -101,9 +113,24 @@ TYPED_TEST( VectorTest, addAndReduce )
       w.setElement( i, x );
       aux += x * y;
    }
-   auto r = performAddAndReduce( u, v, w );
+   auto r = performAddAndReduce1( u, v, w );
    EXPECT_TRUE( w == u * v + u );
    EXPECT_NEAR( aux, r, 1.0e-5 );
+
+   aux = 0.0;
+   for( int i = 0; i < size; i++ )
+   {
+      const RealType x = i;
+      const RealType y = size / 2 - i;
+      u.setElement( i, x );
+      v.setElement( i, y );
+      w.setElement( i, x );
+      aux += 5.0 * y;
+   }
+   r = performAddAndReduce2( v, w );
+   EXPECT_TRUE( w == 5.0 * v + u );
+   EXPECT_NEAR( aux, r, 1.0e-5 );
+
 }
 
 #endif // HAVE_GTEST
-- 
GitLab