From 0a561076b3563198e8fbeed118e33369596ef19f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz> Date: Wed, 26 Sep 2018 10:39:21 +0200 Subject: [PATCH] Refactoring GMRES variants --- src/TNL/Solvers/Linear/CMakeLists.txt | 2 - src/TNL/Solvers/Linear/CWYGMRES.h | 115 ---- src/TNL/Solvers/Linear/CWYGMRES_impl.h | 633 ------------------ src/TNL/Solvers/Linear/GMRES.h | 86 ++- src/TNL/Solvers/Linear/GMRES_impl.h | 742 +++++++++++++++------ src/TNL/Solvers/LinearSolverTypeResolver.h | 5 +- src/TNL/Solvers/SolverConfig_impl.h | 5 - 7 files changed, 594 insertions(+), 994 deletions(-) delete mode 100644 src/TNL/Solvers/Linear/CWYGMRES.h delete mode 100644 src/TNL/Solvers/Linear/CWYGMRES_impl.h diff --git a/src/TNL/Solvers/Linear/CMakeLists.txt b/src/TNL/Solvers/Linear/CMakeLists.txt index f44f0a095f..2e33af0b8e 100644 --- a/src/TNL/Solvers/Linear/CMakeLists.txt +++ b/src/TNL/Solvers/Linear/CMakeLists.txt @@ -6,8 +6,6 @@ SET( headers BICGStab.h BICGStabL_impl.h CG.h CG_impl.h - CWYGMRES.h - CWYGMRES_impl.h GMRES.h GMRES_impl.h Jacobi.h diff --git a/src/TNL/Solvers/Linear/CWYGMRES.h b/src/TNL/Solvers/Linear/CWYGMRES.h deleted file mode 100644 index 1cccc132dd..0000000000 --- a/src/TNL/Solvers/Linear/CWYGMRES.h +++ /dev/null @@ -1,115 +0,0 @@ -/*************************************************************************** - CWYGMRES.h - description - ------------------- - begin : May 13, 2016 - copyright : (C) 2016 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include "LinearSolver.h" - -#include <TNL/Containers/Vector.h> - -namespace TNL { -namespace Solvers { -namespace Linear { - -template< typename Matrix > -class CWYGMRES -: public LinearSolver< Matrix > -{ - using Base = LinearSolver< Matrix >; -public: - using RealType = typename Base::RealType; - using DeviceType = typename Base::DeviceType; - using IndexType = typename Base::IndexType; - using VectorViewType = typename Base::VectorViewType; - using ConstVectorViewType = typename Base::ConstVectorViewType; - - String getType() const; - - static void configSetup( Config::ConfigDescription& config, - const String& prefix = "" ); - - bool setup( const Config::ParameterContainer& parameters, - const String& prefix = "" ) override; - - void setRestarting( IndexType rest ); - - bool solve( ConstVectorViewType b, VectorViewType x ) override; - -protected: - using DeviceVector = Containers::Vector< RealType, DeviceType, IndexType >; - using HostVector = Containers::Vector< RealType, Devices::Host, IndexType >; - - void hauseholder_generate( DeviceVector& Y, - HostVector& T, - const int& i, - DeviceVector& w ); - - void hauseholder_apply_trunc( HostVector& out, - DeviceVector& Y, - HostVector& T, - const int& i, - DeviceVector& w ); - - void hauseholder_cwy( DeviceVector& w, - DeviceVector& Y, - HostVector& T, - const int& i ); - - void hauseholder_cwy_transposed( DeviceVector& w, - DeviceVector& Y, - HostVector& T, - const int& i, - DeviceVector& z ); - - template< typename Vector > - void update( IndexType k, - IndexType m, - const HostVector& H, - const HostVector& s, - DeviceVector& v, - Vector& x ); - - void generatePlaneRotation( RealType& dx, - RealType& dy, - RealType& cs, - RealType& sn ); - - void applyPlaneRotation( RealType& dx, - RealType& dy, - RealType& cs, - RealType& sn ); - - - void setSize( 
IndexType _size, IndexType m ); - - // single vectors - DeviceVector r, z, w, _M_tmp; - // matrices (in column-major format) - DeviceVector V, Y; - // duplicate of the upper (m+1)x(m+1) submatrix of Y (it is lower triangular) for fast access - HostVector YL, T; - // host-only storage for Givens rotations and the least squares problem - HostVector cs, sn, H, s; - - IndexType size = 0; - IndexType ldSize = 0; - IndexType restarting_min = 10; - IndexType restarting_max = 10; - IndexType restarting_step_min = 3; - IndexType restarting_step_max = 3; -}; - -} // namespace Linear -} // namespace Solvers -} // namespace TNL - -#include "CWYGMRES_impl.h" diff --git a/src/TNL/Solvers/Linear/CWYGMRES_impl.h b/src/TNL/Solvers/Linear/CWYGMRES_impl.h deleted file mode 100644 index 4989f50207..0000000000 --- a/src/TNL/Solvers/Linear/CWYGMRES_impl.h +++ /dev/null @@ -1,633 +0,0 @@ -/*************************************************************************** - CWYGMRES.h - description - ------------------- - begin : May 13, 2016 - copyright : (C) 2016 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include <type_traits> -#include <cmath> - -#include <TNL/Exceptions/CudaSupportMissing.h> -#include <TNL/Containers/Algorithms/Multireduction.h> -#include <TNL/Matrices/MatrixOperations.h> - -#include "CWYGMRES.h" - -namespace TNL { -namespace Solvers { -namespace Linear { - -template< typename Matrix > -String -CWYGMRES< Matrix >:: -getType() const -{ - return String( "CWYGMRES< " ) + - this->matrix -> getType() + ", " + - this->preconditioner -> getType() + " >"; -} - -template< typename Matrix > -void -CWYGMRES< Matrix >:: -configSetup( Config::ConfigDescription& config, - const String& prefix ) -{ - config.addEntry< int >( prefix + "gmres-restarting-min", "Minimal number of iterations after which the GMRES restarts.", 10 ); - config.addEntry< int >( prefix + "gmres-restarting-max", "Maximal number of iterations after which the GMRES restarts.", 10 ); - config.addEntry< int >( prefix + "gmres-restarting-step-min", "Minimal adjusting step for the adaptivity of the GMRES restarting parameter.", 3 ); - config.addEntry< int >( prefix + "gmres-restarting-step-max", "Maximal adjusting step for the adaptivity of the GMRES restarting parameter.", 3 ); -} - -template< typename Matrix > -bool -CWYGMRES< Matrix >:: -setup( const Config::ParameterContainer& parameters, - const String& prefix ) -{ - restarting_min = parameters.getParameter< int >( "gmres-restarting-min" ); - this->setRestarting( parameters.getParameter< int >( "gmres-restarting-max" ) ); - restarting_step_min = parameters.getParameter< int >( "gmres-restarting-step-min" ); - restarting_step_max = parameters.getParameter< int >( "gmres-restarting-step-max" ); - return LinearSolver< Matrix >::setup( parameters, prefix ); -} - -template< typename Matrix > -void -CWYGMRES< Matrix >:: -setRestarting( IndexType rest ) -{ - if( size != 0 ) - setSize( size, rest ); - restarting_max = rest; -} - -template< typename Matrix > -bool -CWYGMRES< Matrix >:: -solve( ConstVectorViewType b, VectorViewType x ) -{ - TNL_ASSERT_TRUE( this->matrix, "No matrix was set in CWYGMRES. Call setMatrix() before solve()." 
); - if( restarting_min <= 0 || restarting_max <= 0 || restarting_min > restarting_max ) - { - std::cerr << "Wrong value for the GMRES restarting parameters: r_min = " << restarting_min - << ", r_max = " << restarting_max << std::endl; - return false; - } - if( restarting_step_min < 0 || restarting_step_max < 0 || restarting_step_min > restarting_step_max ) - { - std::cerr << "Wrong value for the GMRES restarting adjustment parameters: d_min = " << restarting_step_min - << ", d_max = " << restarting_step_max << std::endl; - return false; - } - setSize( this->matrix->getRows(), restarting_max ); - - RealType normb( 0.0 ), beta( 0.0 ); - /**** - * 1. Solve r from M r = b - A x_0 - */ - if( this->preconditioner ) - { - this->preconditioner->solve( b, _M_tmp ); - normb = _M_tmp.lpNorm( ( RealType ) 2.0 ); - - this->matrix->vectorProduct( x, _M_tmp ); - _M_tmp.addVector( b, ( RealType ) 1.0, -1.0 ); - - this->preconditioner->solve( _M_tmp, r ); - } - else - { - this->matrix->vectorProduct( x, r ); - normb = b.lpNorm( ( RealType ) 2.0 ); - r.addVector( b, ( RealType ) 1.0, -1.0 ); - } - beta = r.lpNorm( ( RealType ) 2.0 ); - - //cout << "norm b = " << normb <<std::endl; - //cout << " beta = " << beta <<std::endl; - - - if( normb == 0.0 ) - normb = 1.0; - - this->resetIterations(); - this->setResidue( beta / normb ); - - // parameters for the adaptivity of the restarting parameter - RealType beta_ratio = 1; // = beta / beta_ratio (small value indicates good convergence rate) - const RealType max_beta_ratio = 0.99; // = cos(8°) \approx 0.99 - const RealType min_beta_ratio = 0.175; // = cos(80°) \approx 0.175 - int restart_cycles = 0; // counter of restart cycles - int m = restarting_max; // current restarting parameter - - DeviceVector vi, vk; - while( this->checkNextIteration() ) - { - // adaptivity of the restarting parameter - // reference: A.H. Baker, E.R. Jessup, Tz.V. Kolev - A simple strategy for varying the restart parameter in GMRES(m) - // http://www.sciencedirect.com/science/article/pii/S0377042709000132 - if( restart_cycles > 0 ) { - if( beta_ratio > max_beta_ratio ) - // near stagnation -> set maximum - m = restarting_max; - else if( beta_ratio >= min_beta_ratio ) { - // the step size is determined based on current m using linear interpolation - // between restarting_step_min and restarting_step_max - const int step = restarting_step_min + (float) ( restarting_step_max - restarting_step_min ) / - ( restarting_max - restarting_min ) * - ( m - restarting_min ); - if( m - step >= restarting_min ) - m -= step; - else - // set restarting_max when we hit restarting_min (see Baker et al. (2009)) - m = restarting_max; - } -// std::cerr << "restarting: cycle = " << restart_cycles << ", beta_ratio = " << beta_ratio << ", m = " << m << " " << std::endl; - } - - /*** - * z = r / | r | = 1.0 / beta * r - */ -// z.addVector( r, ( RealType ) 1.0 / beta, ( RealType ) 0.0 ); - z = r; - - H.setValue( 0.0 ); - s.setValue( 0.0 ); - T.setValue( 0.0 ); - - // NOTE: this is unstable, s[0] is set later -// s[ 0 ] = beta; - - /**** - * Starting m-loop - */ - for( IndexType i = 0; i <= m && this->nextIteration(); i++ ) - { -// std::cout << "==== i = " << i << " ====" <<std::endl; - - /**** - * Generate new Hauseholder transformation from vector z. 
- */ - hauseholder_generate( Y, T, i, z ); - - if( i == 0 ) { - /**** - * s = e_1^T * P_i * z - */ - hauseholder_apply_trunc( s, Y, T, i, z ); - } - else { - /*** - * H_{i-1} = P_i * z - */ - HostVector h; - h.bind( &H.getData()[ (i - 1) * (m + 1) ], m + 1 ); - hauseholder_apply_trunc( h, Y, T, i, z ); - } - - /*** - * Generate new basis vector v_i, using the compact WY representation: - * v_i = (I - Y_i * T_i Y_i^T) * e_i - */ - vi.bind( &V.getData()[ i * ldSize ], size ); - hauseholder_cwy( vi, Y, T, i ); - - if( i < m ) { - /**** - * Solve w from M w = A v_i - */ - if( this->preconditioner ) - { - this->matrix->vectorProduct( vi, _M_tmp ); - this->preconditioner->solve( _M_tmp, w ); - } - else - this->matrix->vectorProduct( vi, w ); - - /**** - * Apply all previous Hauseholder transformations, using the compact WY representation: - * z = (I - Y_i * T_i^T * Y_i^T) * w - */ - hauseholder_cwy_transposed( z, Y, T, i, w ); - } - -// std::cout << "vi.norm = " << vi.lpNorm( 2.0 ) <<std::endl; -// std::cout << "H (before rotations) = " <<std::endl; -// for( int i = 0; i <= m; i++ ) { -// for( int j = 0; j < m; j++ ) -// std::cout << H[ i + j * (m+1) ] << "\t"; -// std::cout <<std::endl; -// } -// std::cout << "s = " << s <<std::endl; - - /**** - * Applying the Givens rotations - */ - if( i > 0 ) { - for( IndexType k = 0; k < i - 1; k++ ) - applyPlaneRotation( H[ k + (i - 1) * (m + 1) ], - H[ k + 1 + (i - 1) * (m + 1) ], - cs[ k ], - sn[ k ] ); - - if( H[ i + (i - 1) * (m + 1) ] != 0.0 ) { - generatePlaneRotation( H[ i - 1 + (i - 1) * (m + 1) ], - H[ i + (i - 1) * (m + 1) ], - cs[ i - 1 ], - sn[ i - 1 ]); - applyPlaneRotation( H[ i - 1 + (i - 1) * (m + 1) ], - H[ i + (i - 1) * (m + 1) ], - cs[ i - 1 ], - sn[ i - 1 ]); - applyPlaneRotation( s[ i - 1 ], - s[ i ], - cs[ i - 1 ], - sn[ i - 1 ] ); - } - } - -// std::cout << "H (after rotations) = " <<std::endl; -// for( int i = 0; i <= m; i++ ) { -// for( int j = 0; j < m; j++ ) -// std::cout << H[ i + j * (m+1) ] << "\t"; -// std::cout <<std::endl; -// } -// std::cout << "s (after rotations) = " << s <<std::endl; -// std::cout << "residue / normb = " << std::fabs( s[ i ] ) / normb <<std::endl; - -// for( int k = 0; k <= i; k++ ) { -// vk.bind( &V.getData()[ k * ldSize ], size ); -// std::cout << "(v" << k << ",v" << i << ") = " << vk.scalarProduct( vi ) <<std::endl; -// } - - this->setResidue( std::fabs( s[ i ] ) / normb ); - if( ! 
this->checkNextIteration() ) { - update( i - 1, m, H, s, V, x ); - this->refreshSolverMonitor( true ); - return this->checkConvergence(); - } - else { - this->refreshSolverMonitor(); - } - } - update( m - 1, m, H, s, V, x ); - - /**** - * r = M.solve(b - A * x); - */ - const RealType beta_old = beta; - if( this->preconditioner ) - { - this->matrix->vectorProduct( x, _M_tmp ); - _M_tmp.addVector( b, ( RealType ) 1.0, -1.0 ); - this->preconditioner->solve( _M_tmp, r ); - } - else - { - this->matrix->vectorProduct( x, r ); - r.addVector( b, ( RealType ) 1.0, -1.0 ); - } - beta = r.lpNorm( ( RealType ) 2.0 ); - this->setResidue( beta / normb ); - -// std::cout << " x = " << x <<std::endl; -// std::cout << " beta = " << beta <<std::endl; -// std::cout << "residue = " << beta / normb <<std::endl; - - // update parameters for the adaptivity of the restarting parameter - ++restart_cycles; - beta_ratio = beta / beta_old; - } - this->refreshSolverMonitor( true ); - return this->checkConvergence(); -} - -#ifdef HAVE_CUDA -template< typename DestinationElement, - typename SourceElement, - typename Index > -__global__ void -copyTruncatedVectorKernel( DestinationElement* destination, - const SourceElement* source, - const Index from, - const Index size ) -{ - Index elementIdx = blockIdx.x * blockDim.x + threadIdx.x; - const Index gridSize = blockDim.x * gridDim.x; - - while( elementIdx < from ) - { - destination[ elementIdx ] = (DestinationElement) 0.0; - elementIdx += gridSize; - } - while( elementIdx < size ) - { - destination[ elementIdx ] = source[ elementIdx ]; - elementIdx += gridSize; - } -} -#endif - -template< typename Matrix > -void -CWYGMRES< Matrix >:: -hauseholder_generate( DeviceVector& Y, - HostVector& T, - const int& i, - DeviceVector& z ) -{ - DeviceVector y_i; - y_i.bind( &Y.getData()[ i * ldSize ], size ); - - // XXX: the upper-right triangle of Y will be full of zeros, which can be exploited for optimization - if( std::is_same< DeviceType, Devices::Host >::value ) { - for( IndexType j = 0; j < size; j++ ) { - if( j < i ) - y_i[ j ] = 0.0; - else - y_i[ j ] = z[ j ]; - } - } - if( std::is_same< DeviceType, Devices::Cuda >::value ) { -#ifdef HAVE_CUDA - dim3 blockSize( 256 ); - dim3 gridSize; - gridSize.x = min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( size, blockSize.x ) ); - - copyTruncatedVectorKernel<<< gridSize, blockSize >>>( y_i.getData(), - z.getData(), - i, - size ); - TNL_CHECK_CUDA_DEVICE; -#else - throw Exceptions::CudaSupportMissing(); -#endif - } - - // norm of the TRUNCATED vector z - const RealType normz = y_i.lpNorm( 2.0 ); - const RealType y_ii = y_i.getElement( i ); - if( y_ii > 0.0 ) { - y_i.setElement( i, y_ii + normz ); - } - else { - y_i.setElement( i, y_ii - normz ); - } - - // compute the norm of the y_i vector; equivalent to this calculation by definition: - // const RealType norm_yi = y_i.lpNorm( 2.0 ); - const RealType norm_yi = std::sqrt( 2 * (normz * normz + std::fabs( y_ii ) * normz) ); - - // XXX: normalization is slower, but more stable -// y_i *= 1.0 / norm_yi; -// const RealType t_i = 2.0; - // assuming it's stable enough... 
- const RealType t_i = 2.0 / (norm_yi * norm_yi); - - T[ i + i * (restarting_max + 1) ] = t_i; - if( i > 0 ) { - // aux = Y_{i-1}^T * y_i - RealType aux[ i ]; - Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; - Containers::Algorithms::Multireduction< DeviceType >::reduce - ( scalarProduct, - i, - size, - Y.getData(), - ldSize, - y_i.getData(), - aux ); - - // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux - for( int k = 0; k < i; k++ ) { - T[ k + i * (restarting_max + 1) ] = 0.0; - for( int j = k; j < i; j++ ) - T[ k + i * (restarting_max + 1) ] -= T[ k + j * (restarting_max + 1) ] * (t_i * aux[ j ]); - } - } -} - -template< typename Matrix > -void -CWYGMRES< Matrix >:: -hauseholder_apply_trunc( HostVector& out, - DeviceVector& Y, - HostVector& T, - const int& i, - DeviceVector& z ) -{ - DeviceVector y_i; - y_i.bind( &Y.getData()[ i * ldSize ], size ); - - const RealType aux = T[ i + i * (restarting_max + 1) ] * y_i.scalarProduct( z ); - if( std::is_same< DeviceType, Devices::Host >::value ) { - for( int k = 0; k <= i; k++ ) - out[ k ] = z[ k ] - y_i[ k ] * aux; - } - if( std::is_same< DeviceType, Devices::Cuda >::value ) { - // copy part of y_i to buffer on host - // here we duplicate the upper (m+1)x(m+1) submatrix of Y on host for fast access - RealType* host_yi = &YL[ i * (restarting_max + 1) ]; - RealType host_z[ i + 1 ]; - Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_yi, y_i.getData(), restarting_max + 1 ); - Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_z, z.getData(), i + 1 ); - for( int k = 0; k <= i; k++ ) - out[ k ] = host_z[ k ] - host_yi[ k ] * aux; - } -} - -template< typename Matrix > -void -CWYGMRES< Matrix >:: -hauseholder_cwy( DeviceVector& v, - DeviceVector& Y, - HostVector& T, - const int& i ) -{ - // aux = Y_i^T * e_i - RealType aux[ i + 1 ]; - if( std::is_same< DeviceType, Devices::Host >::value ) { - for( int k = 0; k <= i; k++ ) - aux[ k ] = Y[ i + k * ldSize ]; - } - if( std::is_same< DeviceType, Devices::Cuda >::value ) { - // the upper (m+1)x(m+1) submatrix of Y is duplicated on host for fast access - for( int k = 0; k <= i; k++ ) - aux[ k ] = YL[ i + k * (restarting_max + 1) ]; - } - - // aux = T_i * aux - // Note that T_i is upper triangular, so we can overwrite the aux vector with the result in place - for( int k = 0; k <= i; k++ ) { - RealType aux2 = 0.0; - for( int j = k; j <= i; j++ ) - aux2 += T[ k + j * (restarting_max + 1) ] * aux[ j ]; - aux[ k ] = aux2; - } - - // v = e_i - Y_i * aux - MatrixOperations< DeviceType >::gemv( size, i + 1, - -1.0, Y.getData(), ldSize, aux, - 0.0, v.getData() ); - v.setElement( i, 1.0 + v.getElement( i ) ); -} - -template< typename Matrix > -void -CWYGMRES< Matrix >:: -hauseholder_cwy_transposed( DeviceVector& z, - DeviceVector& Y, - HostVector& T, - const int& i, - DeviceVector& w ) -{ - // aux = Y_i^T * w - RealType aux[ i + 1 ]; - Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; - Containers::Algorithms::Multireduction< DeviceType >::reduce - ( scalarProduct, - i + 1, - size, - Y.getData(), - ldSize, - w.getData(), - aux ); - - // aux = T_i^T * aux - // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place - for( int k = i; k >= 0; k-- ) { - RealType aux2 = 0.0; - for( int j = 0; j <= k; j++ ) - aux2 += T[ j + k * (restarting_max + 1) ] * aux[ j ]; 
- aux[ k ] = aux2; - } - - // z = w - Y_i * aux - z = w; - MatrixOperations< DeviceType >::gemv( size, i + 1, - -1.0, Y.getData(), ldSize, aux, - 1.0, z.getData() ); -} - -template< typename Matrix > - template< typename Vector > -void -CWYGMRES< Matrix >:: -update( IndexType k, - IndexType m, - const HostVector& H, - const HostVector& s, - DeviceVector& v, - Vector& x ) -{ -// Containers::Vector< RealType, Devices::Host, IndexType > y; -// y.setSize( m + 1 ); - RealType y[ m + 1 ]; - - IndexType i, j; - for( i = 0; i <= m ; i ++ ) - y[ i ] = s[ i ]; - - // Backsolve: - for( i = k; i >= 0; i--) { - if( H[ i + i * ( m + 1 ) ] == 0 ) { -// for( int _i = 0; _i <= i; _i++ ) { -// for( int _j = 0; _j < i; _j++ ) -// std::cout << H[ _i + _j * (m+1) ] << " "; -// std::cout <<std::endl; -// } - std::cerr << "H.norm = " << H.lpNorm( 2.0 ) << std::endl; - std::cerr << "s = " << s << std::endl; - std::cerr << "k = " << k << ", m = " << m << std::endl; - throw 1; - } - y[ i ] /= H[ i + i * ( m + 1 ) ]; - for( j = i - 1; j >= 0; j--) - y[ j ] -= H[ j + i * ( m + 1 ) ] * y[ i ]; - } - - // x = V * y + x - MatrixOperations< DeviceType >::gemv( size, k + 1, - 1.0, v.getData(), ldSize, y, - 1.0, x.getData() ); -} - -template< typename Matrix > -void -CWYGMRES< Matrix >:: -generatePlaneRotation( RealType& dx, - RealType& dy, - RealType& cs, - RealType& sn ) -{ - if( dy == 0.0 ) - { - cs = 1.0; - sn = 0.0; - } - else - if( std::fabs( dy ) > std::fabs( dx ) ) - { - RealType temp = dx / dy; - sn = 1.0 / std::sqrt( 1.0 + temp * temp ); - cs = temp * sn; - } - else - { - RealType temp = dy / dx; - cs = 1.0 / std::sqrt( 1.0 + temp * temp ); - sn = temp * cs; - } -} - -template< typename Matrix > -void CWYGMRES< Matrix > :: -applyPlaneRotation( RealType& dx, - RealType& dy, - RealType& cs, - RealType& sn ) -{ - RealType temp = cs * dx + sn * dy; - dy = cs * dy - sn * dx; - dx = temp; -} - -template< typename Matrix > -void CWYGMRES< Matrix > :: setSize( IndexType _size, IndexType m ) -{ - if( size == _size && restarting_max == m ) - return; - size = _size; - if( std::is_same< DeviceType, Devices::Cuda >::value ) - // align each column to 256 bytes - optimal for CUDA - ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ); - else - // on the host, we add 1 to disrupt the cache false-sharing pattern - ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ) + 1; - restarting_max = m; - r.setSize( size ); - z.setSize( size ); - w.setSize( size ); - V.setSize( ldSize * ( m + 1 ) ); - Y.setSize( ldSize * ( m + 1 ) ); - T.setSize( (m + 1) * (m + 1) ); - YL.setSize( (m + 1) * (m + 1) ); - cs.setSize( m + 1 ); - sn.setSize( m + 1 ); - H.setSize( ( m + 1 ) * m ); - s.setSize( m + 1 ); - _M_tmp.setSize( size ); -} - -} // namespace Linear -} // namespace Solvers -} // namespace TNL diff --git a/src/TNL/Solvers/Linear/GMRES.h b/src/TNL/Solvers/Linear/GMRES.h index 05c171f622..280a7ad119 100644 --- a/src/TNL/Solvers/Linear/GMRES.h +++ b/src/TNL/Solvers/Linear/GMRES.h @@ -1,13 +1,15 @@ /*************************************************************************** GMRES.h - description ------------------- - begin : 2007/07/31 - copyright : (C) 2007 by Tomas Oberhuber + begin : May 13, 2016 + copyright : (C) 2016 by Tomas Oberhuber et al. 
email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ +// Implemented by: Jakub Klinkovsky + #pragma once #include "LinearSolver.h" @@ -38,18 +40,54 @@ public: bool setup( const Config::ParameterContainer& parameters, const String& prefix = "" ) override; - void setRestarting( IndexType rest ); - bool solve( ConstVectorViewType b, VectorViewType x ) override; protected: - template< typename VectorT > - void update( IndexType k, - IndexType m, - const Containers::Vector< RealType, Devices::Host, IndexType >& H, - const Containers::Vector< RealType, Devices::Host, IndexType >& s, - Containers::Vector< RealType, DeviceType, IndexType >& v, - VectorT& x ); + using ConstDeviceView = Containers::VectorView< const RealType, DeviceType, IndexType >; + using DeviceView = Containers::VectorView< RealType, DeviceType, IndexType >; + using HostView = Containers::VectorView< RealType, Devices::Host, IndexType >; + using DeviceVector = Containers::Vector< RealType, DeviceType, IndexType >; + using HostVector = Containers::Vector< RealType, Devices::Host, IndexType >; + + enum class Variant { MGS, MGSR, CWY }; + + int orthogonalize_MGS( const int m, const RealType normb, const RealType beta ); + + int orthogonalize_CWY( const int m, const RealType normb, const RealType beta ); + + void compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b ); + + void preconditioned_matvec( DeviceVector& w, ConstDeviceView v ); + + void hauseholder_generate( DeviceVector& Y, + HostVector& T, + const int i, + DeviceVector& w ); + + void hauseholder_apply_trunc( HostView out, + DeviceVector& Y, + HostVector& T, + const int i, + DeviceVector& w ); + + void hauseholder_cwy( DeviceVector& w, + DeviceVector& Y, + HostVector& T, + const int i ); + + void hauseholder_cwy_transposed( DeviceVector& w, + DeviceVector& Y, + HostVector& T, + const int i, + DeviceVector& z ); + + template< typename Vector > + void update( const int k, + const int m, + const HostVector& H, + const HostVector& s, + DeviceVector& v, + Vector& x ); void generatePlaneRotation( RealType& dx, RealType& dy, @@ -61,17 +99,29 @@ protected: RealType& cs, RealType& sn ); + void apply_givens_rotations( const int i, const int m ); + + + void setSize( const IndexType size ); - void setSize( IndexType _size, IndexType m ); + // selected GMRES variant + Variant variant = Variant::CWY; - Containers::Vector< RealType, DeviceType, IndexType > _r, w, _v, _M_tmp; - Containers::Vector< RealType, Devices::Host, IndexType > _s, _cs, _sn, _H; + // single vectors + DeviceVector r, w, z, _M_tmp; + // matrices (in column-major format) + DeviceVector V, Y; + // (CWY only) duplicate of the upper (m+1)x(m+1) submatrix of Y (it is lower triangular) for fast access + HostVector YL, T; + // host-only storage for Givens rotations and the least squares problem + HostVector cs, sn, H, s; IndexType size = 0; - IndexType restarting_min = 10; - IndexType restarting_max = 10; - IndexType restarting_step_min = 3; - IndexType restarting_step_max = 3; + IndexType ldSize = 0; + int restarting_min = 10; + int restarting_max = 10; + int restarting_step_min = 3; + int restarting_step_max = 3; }; } // namespace Linear diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h index 220cecf23e..4e5f0360f6 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -1,17 +1,24 @@ 
/***************************************************************************
                          GMRES_impl.h  -  description
                             -------------------
-    begin                : Nov 25, 2012
-    copyright            : (C) 2012 by Tomas Oberhuber
+    begin                : May 13, 2016
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/
 
 /* See Copyright Notice in tnl/Copyright */
 
+// Implemented by: Jakub Klinkovsky
+
 #pragma once
 
+#include <type_traits>
 #include <cmath>
 
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Containers/Algorithms/Multireduction.h>
+#include <TNL/Matrices/MatrixOperations.h>
+
 #include "GMRES.h"
 
 namespace TNL {
 namespace Solvers {
 namespace Linear {
@@ -34,6 +41,10 @@ GMRES< Matrix >::
 configSetup( Config::ConfigDescription& config,
              const String& prefix )
 {
+   config.addEntry< String >( prefix + "gmres-variant", "Variant of the GMRES solver: MGS (modified Gram-Schmidt), MGSR (MGS with reorthogonalization) or CWY (compact WY form of the Householder transformations).", "CWY" );
+   config.addEntryEnum( "MGS" );
+   config.addEntryEnum( "MGSR" );
+   config.addEntryEnum( "CWY" );
    config.addEntry< int >( prefix + "gmres-restarting-min", "Minimal number of iterations after which the GMRES restarts.", 10 );
    config.addEntry< int >( prefix + "gmres-restarting-max", "Maximal number of iterations after which the GMRES restarts.", 10 );
    config.addEntry< int >( prefix + "gmres-restarting-step-min", "Minimal adjusting step for the adaptivity of the GMRES restarting parameter.", 3 );
@@ -46,21 +57,22 @@ GMRES< Matrix >::
 setup( const Config::ParameterContainer& parameters,
        const String& prefix )
 {
+   const String var = parameters.getParameter< String >( "gmres-variant" );
+   if( var == "MGS" )
+      variant = Variant::MGS;
+   else if( var == "MGSR" )
+      variant = Variant::MGSR;
+   else if( var == "CWY" )
+      variant = Variant::CWY;
+   else
+      return false;
+
    restarting_min = parameters.getParameter< int >( "gmres-restarting-min" );
-   this->setRestarting( parameters.getParameter< int >( "gmres-restarting-max" ) );
+   restarting_max = parameters.getParameter< int >( "gmres-restarting-max" );
    restarting_step_min = parameters.getParameter< int >( "gmres-restarting-step-min" );
    restarting_step_max = parameters.getParameter< int >( "gmres-restarting-step-max" );
-   return LinearSolver< Matrix >::setup( parameters, prefix );
-}
-template< typename Matrix >
-void
-GMRES< Matrix >::
-setRestarting( IndexType rest )
-{
-   if( size != 0 )
-      setSize( size, rest );
-   restarting_max = rest;
+   return LinearSolver< Matrix >::setup( parameters, prefix );
 }
 
 template< typename Matrix >
@@ -69,58 +81,34 @@ GMRES< Matrix >::
 solve( ConstVectorViewType b, VectorViewType x )
 {
    TNL_ASSERT_TRUE( this->matrix, "No matrix was set in GMRES. Call setMatrix() before solve()."
); - if( restarting_min <= 0 || restarting_max <= 0 || restarting_min > restarting_max ) - { + if( restarting_min <= 0 || restarting_max <= 0 || restarting_min > restarting_max ) { std::cerr << "Wrong value for the GMRES restarting parameters: r_min = " << restarting_min << ", r_max = " << restarting_max << std::endl; return false; } - if( restarting_step_min < 0 || restarting_step_max < 0 || restarting_step_min > restarting_step_max ) - { + if( restarting_step_min < 0 || restarting_step_max < 0 || restarting_step_min > restarting_step_max ) { std::cerr << "Wrong value for the GMRES restarting adjustment parameters: d_min = " << restarting_step_min << ", d_max = " << restarting_step_max << std::endl; return false; } - setSize( this->matrix->getRows(), restarting_max ); - - IndexType _size = size; - - //RealType *w = _w.getData(); - RealType *s = _s.getData(); - RealType *cs = _cs.getData(); - RealType *sn = _sn.getData(); - RealType *v = _v.getData(); - RealType *H = _H.getData(); - RealType *M_tmp = _M_tmp.getData(); - - RealType normb( 0.0 ), beta( 0.0 ); - /**** - * 1. Solve r from M r = b - A x_0 - */ - if( this->preconditioner ) - { - this->preconditioner->solve( b, _M_tmp ); - normb = _M_tmp.lpNorm( ( RealType ) 2.0 ); + setSize( this->matrix->getRows() ); - this->matrix->vectorProduct( x, _M_tmp ); - _M_tmp.addVector( b, ( RealType ) 1.0, -1.0 ); - - this->preconditioner->solve( _M_tmp, _r ); + // initialize the norm of the preconditioned right-hand-side + RealType normb; + if( this->preconditioner ) { + this->preconditioner->solve( b, _M_tmp ); + normb = _M_tmp.lpNorm( 2.0 ); } else - { - this->matrix->vectorProduct( x, _r ); - normb = b.lpNorm( ( RealType ) 2.0 ); - _r.addVector( b, ( RealType ) 1.0, -1.0 ); - } - beta = _r.lpNorm( ( RealType ) 2.0 ); - - //cout << "norm b = " << normb << std::endl; - //cout << " beta = " << beta << std::endl; - + normb = b.lpNorm( 2.0 ); + if( normb == 0.0 ) + normb = 1.0; - if( normb == 0.0 ) normb = 1.0; + // r = M.solve(b - A * x); + compute_residue( r, x, b ); + RealType beta = r.lpNorm( 2.0 ); + // initialize stopping criterion this->resetIterations(); this->setResidue( beta / normb ); @@ -131,9 +119,7 @@ solve( ConstVectorViewType b, VectorViewType x ) int restart_cycles = 0; // counter of restart cycles int m = restarting_max; // current restarting parameter - Containers::Vector< RealType, DeviceType, IndexType > vi, vk; - while( this->checkNextIteration() ) - { + while( this->checkNextIteration() ) { // adaptivity of the restarting parameter // reference: A.H. Baker, E.R. Jessup, Tz.V. Kolev - A simple strategy for varying the restart parameter in GMRES(m) // http://www.sciencedirect.com/science/article/pii/S0377042709000132 @@ -153,185 +139,471 @@ solve( ConstVectorViewType b, VectorViewType x ) // set restarting_max when we hit restarting_min (see Baker et al. 
(2009)) m = restarting_max; } -// std::cerr << "restarting: cycle = " << restart_cycles << ", beta_ratio = " << beta_ratio << ", m = " << m << " " << std::endl; } - for( IndexType i = 0; i < m + 1; i ++ ) - H[ i ] = s[ i ] = cs[ i ] = sn[ i ] = 0.0; + // orthogonalization + int o_steps = 0; + switch( variant ) { + case Variant::MGS: + case Variant::MGSR: + o_steps = orthogonalize_MGS( m, normb, beta ); + break; + case Variant::CWY: + o_steps = orthogonalize_CWY( m, normb, beta ); + break; + } + + if( o_steps < m ) { + // exact solution has been reached early + update( o_steps, m, H, s, V, x ); + this->refreshSolverMonitor( true ); + return this->checkConvergence(); + } + + // update the solution approximation + update( m - 1, m, H, s, V, x ); + + // compute the new residual vector + compute_residue( r, x, b ); + const RealType beta_old = beta; + beta = r.lpNorm( 2.0 ); + this->setResidue( beta / normb ); + + // update parameters for the adaptivity of the restarting parameter + ++restart_cycles; + beta_ratio = beta / beta_old; + } + this->refreshSolverMonitor( true ); + return this->checkConvergence(); +} + +template< typename Matrix > +int +GMRES< Matrix >:: +orthogonalize_MGS( const int m, const RealType normb, const RealType beta ) +{ + DeviceView vi, vk; + + /*** + * v_0 = r / | r | = 1.0 / beta * r + */ + vi.bind( V.getData(), ldSize ); + vi.addVector( r, 1.0 / beta, 0.0 ); + + H.setValue( 0.0 ); + s.setValue( 0.0 ); + s[ 0 ] = beta; + + /**** + * Starting m-loop + */ + for( int i = 0; i < m && this->nextIteration(); i++ ) { + vi.bind( &( V.getData()[ i * ldSize ] ), size ); /**** - * v = 0 + * Solve w from M w = A v_i */ - _v.setValue( ( RealType ) 0.0 ); + preconditioned_matvec( w, vi ); + + for( int k = 0; k <= i; k++ ) + H[ k + i * ( m + 1 ) ] = 0.0; + const int reorthogonalize = (variant == Variant::MGSR) ? 2 : 1; + for( int l = 0; l < reorthogonalize; l++ ) + for( int k = 0; k <= i; k++ ) { + vk.bind( &( V.getData()[ k * ldSize ] ), size ); + /*** + * H_{k,i} = ( w, v_k ) + */ + RealType H_k_i = w.scalarProduct( vk ); + H[ k + i * ( m + 1 ) ] += H_k_i; + + /**** + * w = w - H_{k,i} v_k + */ + w.addVector( vk, -H_k_i ); + } + /*** + * H_{i+1,i} = |w| + */ + RealType normw = w.lpNorm( ( RealType ) 2.0 ); + H[ i + 1 + i * ( m + 1 ) ] = normw; /*** - * v_0 = r / | r | = 1.0 / beta * r + * v_{i+1} = w / |w| + */ + vi.bind( &( V.getData()[ ( i + 1 ) * ldSize ] ), size ); + vi.addVector( w, 1.0 / normw, 0.0 ); + + /**** + * Applying the Givens rotations G_0, ..., G_i */ - vi.bind( _v.getData(), size ); - vi.addVector( _r, ( RealType ) 1.0 / beta ); + apply_givens_rotations( i, m ); - _s.setValue( ( RealType ) 0.0 ); - _s[ 0 ] = beta; + this->setResidue( std::fabs( s[ i + 1 ] ) / normb ); + if( ! this->checkNextIteration() ) + return i; + else + this->refreshSolverMonitor(); + } + return m; +} +template< typename Matrix > +int +GMRES< Matrix >:: +orthogonalize_CWY( const int m, const RealType normb, const RealType beta ) +{ + DeviceVector vi, vk; + /*** + * z = r / | r | = 1.0 / beta * r + */ + // TODO: investigate normalization by beta and normb +// z.addVector( r, 1.0 / beta, 0.0 ); +// z.addVector( r, 1.0 / normb, 0.0 ); + z = r; + + H.setValue( 0.0 ); + s.setValue( 0.0 ); + T.setValue( 0.0 ); + + // NOTE: this is unstable, s[0] is set later in hauseholder_apply_trunc +// s[ 0 ] = beta; + + /**** + * Starting m-loop + */ + for( int i = 0; i <= m && this->nextIteration(); i++ ) { /**** - * Starting m-loop + * Generate new Hauseholder transformation from vector z. 
*/ - for( IndexType i = 0; i < m && this->nextIteration(); i++ ) - { - vi.bind( &( _v.getData()[ i * size ] ), size ); + hauseholder_generate( Y, T, i, z ); + + if( i == 0 ) { /**** - * Solve w from M w = A v_i + * s = e_1^T * P_i * z */ - if( this->preconditioner ) - { - this->matrix->vectorProduct( vi, _M_tmp ); - this->preconditioner->solve( _M_tmp, w ); - } - else - this->matrix->vectorProduct( vi, w ); - - //cout << " i = " << i << " vi = " << vi << std::endl; - - for( IndexType k = 0; k <= i; k++ ) - H[ k + i * ( m + 1 ) ] = 0.0; - for( IndexType l = 0; l < 2; l++ ) - for( IndexType k = 0; k <= i; k++ ) - { - vk.bind( &( _v.getData()[ k * _size ] ), _size ); - /*** - * H_{k,i} = ( w, v_k ) - */ - RealType H_k_i = w.scalarProduct( vk ); - H[ k + i * ( m + 1 ) ] += H_k_i; - - /**** - * w = w - H_{k,i} v_k - */ - w.addVector( vk, -H_k_i ); - - //cout << "H_ki = " << H_k_i << std::endl; - //cout << "w = " << w << std::endl; - } + hauseholder_apply_trunc( s, Y, T, i, z ); + } + else { /*** - * H_{i+1,i} = |w| + * H_{i-1} = P_i * z */ - RealType normw = w.lpNorm( ( RealType ) 2.0 ); - H[ i + 1 + i * ( m + 1 ) ] = normw; + HostView h( &H.getData()[ (i - 1) * (m + 1) ], m + 1 ); + hauseholder_apply_trunc( h, Y, T, i, z ); + } - //cout << "normw = " << normw << std::endl; - - /*** - * v_{i+1} = w / |w| - */ - vi.bind( &( _v.getData()[ ( i + 1 ) * size ] ), size ); - vi.addVector( w, ( RealType ) 1.0 / normw ); - - //cout << "vi = " << vi << std::endl; - + /*** + * Generate new basis vector v_i, using the compact WY representation: + * v_i = (I - Y_i * T_i Y_i^T) * e_i + */ + vi.bind( &V.getData()[ i * ldSize ], size ); + hauseholder_cwy( vi, Y, T, i ); + + if( i < m ) { /**** - * Applying the Givens rotations + * Solve w from M w = A v_i */ - for( IndexType k = 0; k < i; k++ ) - applyPlaneRotation( H[ k + i * ( m + 1 )], - H[ k + 1 + i * ( m + 1 ) ], - cs[ k ], - sn[ k ] ); - - generatePlaneRotation( H[ i + i * ( m + 1 ) ], - H[ i + 1 + i * ( m + 1 ) ], - cs[ i ], - sn[ i ]); - applyPlaneRotation( H[ i + i * ( m + 1 ) ], - H[ i + 1 + i * ( m + 1 ) ], - cs[ i ], - sn[ i ]); - applyPlaneRotation( s[ i ], - s[ i + 1 ], - cs[ i ], - sn[ i ] ); + preconditioned_matvec( w, vi ); - this->setResidue( std::fabs( s[ i + 1 ] ) / normb ); - if( ! this->checkNextIteration() ) { - update( i, m, _H, _s, _v, x ); - this->refreshSolverMonitor( true ); - return this->checkConvergence(); - } - else - { - this->refreshSolverMonitor(); - } + /**** + * Apply all previous Hauseholder transformations, using the compact WY representation: + * z = (I - Y_i * T_i^T * Y_i^T) * w + */ + hauseholder_cwy_transposed( z, Y, T, i, w ); } - //cout << "x = " << x << std::endl; - update( m - 1, m, _H, _s, _v, x ); - //cout << "x = " << x << std::endl; /**** - * r = M.solve(b - A * x); + * Applying the Givens rotations G_0, ..., G_{i-1} */ - const RealType beta_old = beta; - beta = 0.0; - if( this->preconditioner ) - { - this->matrix->vectorProduct( x, _M_tmp ); - _M_tmp.addVector( b, ( RealType ) 1.0, -1.0 ); - this->preconditioner->solve( _M_tmp, _r ); - beta = _r.lpNorm( ( RealType ) 2.0 ); - } + if( i > 0 ) + apply_givens_rotations( i - 1, m ); + + this->setResidue( std::fabs( s[ i ] ) / normb ); + if( ! 
this->checkNextIteration() ) + return i - 1; else - { - this->matrix->vectorProduct( x, _r ); - _r.addVector( b, ( RealType ) 1.0, -1.0 ); - beta = _r.lpNorm( ( RealType ) 2.0 ); + this->refreshSolverMonitor(); + } + + return m; +} + +template< typename Matrix > +void +GMRES< Matrix >:: +compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b ) +{ + /**** + * r = M.solve(b - A * x); + */ + if( this->preconditioner ) { + this->matrix->vectorProduct( x, _M_tmp ); + _M_tmp.addVector( b, 1.0, -1.0 ); + this->preconditioner->solve( _M_tmp, r ); + } + else { + this->matrix->vectorProduct( x, r ); + r.addVector( b, 1.0, -1.0 ); + } +} + +template< typename Matrix > +void +GMRES< Matrix >:: +preconditioned_matvec( DeviceVector& w, ConstDeviceView v ) +{ + /**** + * w = M.solve(A * v_i); + */ + if( this->preconditioner ) { + this->matrix->vectorProduct( v, _M_tmp ); + this->preconditioner->solve( _M_tmp, w ); + } + else + this->matrix->vectorProduct( v, w ); +} + +#ifdef HAVE_CUDA +template< typename DestinationElement, + typename SourceElement, + typename Index > +__global__ void +copyTruncatedVectorKernel( DestinationElement* destination, + const SourceElement* source, + const Index from, + const Index size ) +{ + Index elementIdx = blockIdx.x * blockDim.x + threadIdx.x; + const Index gridSize = blockDim.x * gridDim.x; + + while( elementIdx < from ) { + destination[ elementIdx ] = (DestinationElement) 0.0; + elementIdx += gridSize; + } + while( elementIdx < size ) { + destination[ elementIdx ] = source[ elementIdx ]; + elementIdx += gridSize; + } +} +#endif + +template< typename Matrix > +void +GMRES< Matrix >:: +hauseholder_generate( DeviceVector& Y, + HostVector& T, + const int i, + DeviceVector& z ) +{ + DeviceView y_i( &Y.getData()[ i * ldSize ], size ); + + // XXX: the upper-right triangle of Y will be full of zeros, which can be exploited for optimization + if( std::is_same< DeviceType, Devices::Host >::value ) { + for( IndexType j = 0; j < size; j++ ) { + if( j < i ) + y_i[ j ] = 0.0; + else + y_i[ j ] = z[ j ]; } - this->setResidue( beta / normb ); + } + if( std::is_same< DeviceType, Devices::Cuda >::value ) { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ); + dim3 gridSize; + gridSize.x = min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( size, blockSize.x ) ); + + copyTruncatedVectorKernel<<< gridSize, blockSize >>>( y_i.getData(), + z.getData(), + i, + size ); + TNL_CHECK_CUDA_DEVICE; +#else + throw Exceptions::CudaSupportMissing(); +#endif + } - //cout << " x = " << x << std::endl; - //cout << " beta = " << beta << std::endl; - //cout << "residue = " << beta / normb << std::endl; + // norm of the TRUNCATED vector z + const RealType normz = y_i.lpNorm( 2.0 ); + const RealType y_ii = y_i.getElement( i ); + if( y_ii > 0.0 ) + y_i.setElement( i, y_ii + normz ); + else + y_i.setElement( i, y_ii - normz ); + + // compute the norm of the y_i vector; equivalent to this calculation by definition: + // const RealType norm_yi = y_i.lpNorm( 2.0 ); + const RealType norm_yi = std::sqrt( 2 * (normz * normz + std::fabs( y_ii ) * normz) ); + + // XXX: normalization is slower, but more stable +// y_i *= 1.0 / norm_yi; +// const RealType t_i = 2.0; + // assuming it's stable enough... 
+ const RealType t_i = 2.0 / (norm_yi * norm_yi); + + T[ i + i * (restarting_max + 1) ] = t_i; + if( i > 0 ) { + // aux = Y_{i-1}^T * y_i + RealType aux[ i ]; + Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; + Containers::Algorithms::Multireduction< DeviceType >::reduce + ( scalarProduct, + i, + size, + Y.getData(), + ldSize, + y_i.getData(), + aux ); + + // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux + for( int k = 0; k < i; k++ ) { + T[ k + i * (restarting_max + 1) ] = 0.0; + for( int j = k; j < i; j++ ) + T[ k + i * (restarting_max + 1) ] -= T[ k + j * (restarting_max + 1) ] * (t_i * aux[ j ]); + } + } +} - // update parameters for the adaptivity of the restarting parameter - ++restart_cycles; - beta_ratio = beta / beta_old; +template< typename Matrix > +void +GMRES< Matrix >:: +hauseholder_apply_trunc( HostView out, + DeviceVector& Y, + HostVector& T, + const int i, + DeviceVector& z ) +{ + DeviceView y_i( &Y.getData()[ i * ldSize ], size ); + + const RealType aux = T[ i + i * (restarting_max + 1) ] * y_i.scalarProduct( z ); + if( std::is_same< DeviceType, Devices::Host >::value ) { + for( int k = 0; k <= i; k++ ) + out[ k ] = z[ k ] - y_i[ k ] * aux; + } + if( std::is_same< DeviceType, Devices::Cuda >::value ) { + // copy part of y_i to buffer on host + // here we duplicate the upper (m+1)x(m+1) submatrix of Y on host for fast access + RealType* host_yi = &YL[ i * (restarting_max + 1) ]; + RealType host_z[ i + 1 ]; + Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_yi, y_i.getData(), restarting_max + 1 ); + Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_z, z.getData(), i + 1 ); + for( int k = 0; k <= i; k++ ) + out[ k ] = host_z[ k ] - host_yi[ k ] * aux; } - this->refreshSolverMonitor( true ); - return this->checkConvergence(); } template< typename Matrix > - template< typename VectorT > void GMRES< Matrix >:: -update( IndexType k, - IndexType m, - const Containers::Vector< RealType, Devices::Host, IndexType >& H, - const Containers::Vector< RealType, Devices::Host, IndexType >& s, - Containers::Vector< RealType, DeviceType, IndexType >& v, - VectorT& x ) +hauseholder_cwy( DeviceVector& v, + DeviceVector& Y, + HostVector& T, + const int i ) { - Containers::Vector< RealType, Devices::Host, IndexType > y; - y.setSize( m + 1 ); + // aux = Y_i^T * e_i + RealType aux[ i + 1 ]; + if( std::is_same< DeviceType, Devices::Host >::value ) { + for( int k = 0; k <= i; k++ ) + aux[ k ] = Y[ i + k * ldSize ]; + } + if( std::is_same< DeviceType, Devices::Cuda >::value ) { + // the upper (m+1)x(m+1) submatrix of Y is duplicated on host for fast access + for( int k = 0; k <= i; k++ ) + aux[ k ] = YL[ i + k * (restarting_max + 1) ]; + } + + // aux = T_i * aux + // Note that T_i is upper triangular, so we can overwrite the aux vector with the result in place + for( int k = 0; k <= i; k++ ) { + RealType aux2 = 0.0; + for( int j = k; j <= i; j++ ) + aux2 += T[ k + j * (restarting_max + 1) ] * aux[ j ]; + aux[ k ] = aux2; + } - IndexType i, j; - for( i = 0; i <= m ; i ++ ) + // v = e_i - Y_i * aux + MatrixOperations< DeviceType >::gemv( size, i + 1, + -1.0, Y.getData(), ldSize, aux, + 0.0, v.getData() ); + v.setElement( i, 1.0 + v.getElement( i ) ); +} + +template< typename Matrix > +void +GMRES< Matrix >:: +hauseholder_cwy_transposed( DeviceVector& z, + DeviceVector& Y, + HostVector& T, + const int i, + DeviceVector& w 
) +{ + // aux = Y_i^T * w + RealType aux[ i + 1 ]; + Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; + Containers::Algorithms::Multireduction< DeviceType >::reduce + ( scalarProduct, + i + 1, + size, + Y.getData(), + ldSize, + w.getData(), + aux ); + + // aux = T_i^T * aux + // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place + for( int k = i; k >= 0; k-- ) { + RealType aux2 = 0.0; + for( int j = 0; j <= k; j++ ) + aux2 += T[ j + k * (restarting_max + 1) ] * aux[ j ]; + aux[ k ] = aux2; + } + + // z = w - Y_i * aux + z = w; + MatrixOperations< DeviceType >::gemv( size, i + 1, + -1.0, Y.getData(), ldSize, aux, + 1.0, z.getData() ); +} + +template< typename Matrix > + template< typename Vector > +void +GMRES< Matrix >:: +update( const int k, + const int m, + const HostVector& H, + const HostVector& s, + DeviceVector& v, + Vector& x ) +{ + RealType y[ m + 1 ]; + + for( int i = 0; i <= m ; i ++ ) y[ i ] = s[ i ]; // Backsolve: - for( i = k; i >= 0; i--) - { - //cout << " y = " << y << std::endl; + for( int i = k; i >= 0; i--) { + if( H[ i + i * ( m + 1 ) ] == 0 ) { +// for( int _i = 0; _i <= i; _i++ ) { +// for( int _j = 0; _j < i; _j++ ) +// std::cout << H[ _i + _j * (m+1) ] << " "; +// std::cout << std::endl; +// } + std::cerr << "H.norm = " << H.lpNorm( 2.0 ) << std::endl; + std::cerr << "s = " << s << std::endl; + std::cerr << "k = " << k << ", m = " << m << std::endl; + throw 1; + } y[ i ] /= H[ i + i * ( m + 1 ) ]; - for( j = i - 1; j >= 0; j--) + for( int j = i - 1; j >= 0; j--) y[ j ] -= H[ j + i * ( m + 1 ) ] * y[ i ]; } - Containers::Vector< RealType, DeviceType, IndexType > vi; - for( i = 0; i <= k; i++) - { - vi.bind( &( v.getData()[ i * this->size ] ), x.getSize() ); - x.addVector( vi, y[ i ] ); - } + // x = V * y + x + MatrixOperations< DeviceType >::gemv( size, k + 1, + 1.0, v.getData(), ldSize, y, + 1.0, x.getData() ); } template< typename Matrix > @@ -342,24 +614,20 @@ generatePlaneRotation( RealType& dx, RealType& cs, RealType& sn ) { - if( dy == 0.0 ) - { + if( dy == 0.0 ) { cs = 1.0; sn = 0.0; } - else - if( std::fabs( dy ) > std::fabs( dx ) ) - { - RealType temp = dx / dy; - sn = 1.0 / std::sqrt( 1.0 + temp * temp ); - cs = temp * sn; - } - else - { - RealType temp = dy / dx; - cs = 1.0 / std::sqrt( 1.0 + temp * temp ); - sn = temp * cs; - } + else if( std::fabs( dy ) > std::fabs( dx ) ) { + const RealType temp = dx / dy; + sn = 1.0 / std::sqrt( 1.0 + temp * temp ); + cs = temp * sn; + } + else { + const RealType temp = dy / dx; + cs = 1.0 / std::sqrt( 1.0 + temp * temp ); + sn = temp * cs; + } } template< typename Matrix > @@ -370,28 +638,68 @@ applyPlaneRotation( RealType& dx, RealType& cs, RealType& sn ) { - RealType temp = cs * dx + sn * dy; - dy = cs * dy - sn * dx; + const RealType temp = cs * dx + sn * dy; + dy = cs * dy - sn * dx; dx = temp; } template< typename Matrix > void GMRES< Matrix >:: -setSize( IndexType _size, IndexType m ) +apply_givens_rotations( int i, int m ) +{ + for( int k = 0; k < i; k++ ) + applyPlaneRotation( H[ k + i * (m + 1) ], + H[ k + 1 + i * (m + 1) ], + cs[ k ], + sn[ k ] ); + + if( H[ i + 1 + i * (m + 1) ] != 0.0 ) { + generatePlaneRotation( H[ i + i * (m + 1) ], + H[ i + 1 + i * (m + 1) ], + cs[ i ], + sn[ i ]); + applyPlaneRotation( H[ i + i * (m + 1) ], + H[ i + 1 + i * (m + 1) ], + cs[ i ], + sn[ i ]); + applyPlaneRotation( s[ i ], + s[ i + 1 ], + cs[ i ], + sn[ i ] ); + } +} + +template< typename Matrix > +void +GMRES< Matrix >:: +setSize( 
const IndexType size ) { - if( size == _size && restarting_max == m ) - return; - size = _size; - restarting_max = m; - _r.setSize( size ); + this->size = size; + if( std::is_same< DeviceType, Devices::Cuda >::value ) + // align each column to 256 bytes - optimal for CUDA + ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ); + else + // on the host, we add 1 to disrupt the cache false-sharing pattern + ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ) + 1; + + const int m = restarting_max; + r.setSize( size ); w.setSize( size ); - _s.setSize( m + 1 ); - _cs.setSize( m + 1 ); - _sn.setSize( m + 1 ); - _v.setSize( size * ( m + 1 ) ); - _H.setSize( ( m + 1 ) * m ); + V.setSize( ldSize * ( m + 1 ) ); + cs.setSize( m + 1 ); + sn.setSize( m + 1 ); + H.setSize( ( m + 1 ) * m ); + s.setSize( m + 1 ); _M_tmp.setSize( size ); + + // CWY-specific storage + if( variant == Variant::CWY ) { + z.setSize( size ); + Y.setSize( ldSize * ( m + 1 ) ); + T.setSize( (m + 1) * (m + 1) ); + YL.setSize( (m + 1) * (m + 1) ); + } } } // namespace Linear diff --git a/src/TNL/Solvers/LinearSolverTypeResolver.h b/src/TNL/Solvers/LinearSolverTypeResolver.h index 8842038009..61a141f504 100644 --- a/src/TNL/Solvers/LinearSolverTypeResolver.h +++ b/src/TNL/Solvers/LinearSolverTypeResolver.h @@ -19,7 +19,6 @@ #include <TNL/Solvers/Linear/BICGStab.h> #include <TNL/Solvers/Linear/BICGStabL.h> #include <TNL/Solvers/Linear/GMRES.h> -#include <TNL/Solvers/Linear/CWYGMRES.h> #include <TNL/Solvers/Linear/TFQMR.h> #include <TNL/Solvers/Linear/UmfpackWrapper.h> #include <TNL/Solvers/Linear/Preconditioners/Diagonal.h> @@ -45,8 +44,6 @@ getLinearSolver( const Config::ParameterContainer& parameters ) return std::make_shared< Linear::BICGStabL< MatrixType > >(); if( discreteSolver == "gmres" ) return std::make_shared< Linear::GMRES< MatrixType > >(); - if( discreteSolver == "cwygmres" ) - return std::make_shared< Linear::CWYGMRES< MatrixType > >(); if( discreteSolver == "tfqmr" ) return std::make_shared< Linear::TFQMR< MatrixType > >(); #ifdef HAVE_UMFPACK @@ -54,7 +51,7 @@ getLinearSolver( const Config::ParameterContainer& parameters ) return std::make_shared< Linear::UmfpackWrapper< MatrixType > >(); #endif - std::cerr << "Unknown semi-implicit discrete solver " << discreteSolver << ". It can be only: sor, cg, bicgstab, bicgstabl, gmres, cwygmres, tfqmr"; + std::cerr << "Unknown semi-implicit discrete solver " << discreteSolver << ". 
It can be only: sor, cg, bicgstab, bicgstabl, gmres, tfqmr"; #ifdef HAVE_UMFPACK std::cerr << ", umfpack" #endif diff --git a/src/TNL/Solvers/SolverConfig_impl.h b/src/TNL/Solvers/SolverConfig_impl.h index bc86767e16..598cee285a 100644 --- a/src/TNL/Solvers/SolverConfig_impl.h +++ b/src/TNL/Solvers/SolverConfig_impl.h @@ -21,7 +21,6 @@ #include <TNL/Solvers/Linear/BICGStab.h> #include <TNL/Solvers/Linear/BICGStabL.h> #include <TNL/Solvers/Linear/GMRES.h> -#include <TNL/Solvers/Linear/CWYGMRES.h> #include <TNL/Solvers/Linear/TFQMR.h> #include <TNL/Solvers/Linear/UmfpackWrapper.h> #include <TNL/Solvers/Linear/Preconditioners/Diagonal.h> @@ -135,7 +134,6 @@ bool SolverConfig< ConfigTag, ProblemConfig >::configSetup( Config::ConfigDescri config.addEntryEnum( "cg" ); config.addEntryEnum( "bicgstab" ); config.addEntryEnum( "bicgstabl" ); - config.addEntryEnum( "cwygmres" ); config.addEntryEnum( "gmres" ); config.addEntryEnum( "tfqmr" ); config.addEntryEnum( "sor" ); @@ -174,10 +172,7 @@ bool SolverConfig< ConfigTag, ProblemConfig >::configSetup( Config::ConfigDescri Linear::CG< MatrixType >::configSetup( config ); Linear::BICGStab< MatrixType >::configSetup( config ); Linear::BICGStabL< MatrixType >::configSetup( config ); - - // GMRES and CWYGMRES have the same options Linear::GMRES< MatrixType >::configSetup( config ); - Linear::TFQMR< MatrixType >::configSetup( config ); Linear::SOR< MatrixType >::configSetup( config ); -- GitLab
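
Note for reviewers (illustration only, not part of the patch): the sketch below shows how the unified solver might be driven after this change. It relies only on the interface visible in the diff, namely configSetup(), setup(), solve() and the new "gmres-variant" config entry. The concrete matrix type, the includes, the way the ParameterContainer gets filled, and the exact setMatrix() signature are assumptions made for the example.

// Hypothetical usage sketch -- not part of the patch. Lines marked
// "assumption" are illustrative guesses; only configSetup(), setup(),
// solve() and the gmres-* config entries are taken from the diff above.
#include <memory>
#include <TNL/Solvers/Linear/GMRES.h>
#include <TNL/Matrices/CSR.h>   // assumption: any TNL matrix type would do here

using MatrixType = TNL::Matrices::CSR< double, TNL::Devices::Host, int >;   // assumption
using SolverType = TNL::Solvers::Linear::GMRES< MatrixType >;

bool
runGMRES( const std::shared_ptr< MatrixType >& matrix,   // assumption: pointer type accepted by setMatrix()
          SolverType::ConstVectorViewType b,
          SolverType::VectorViewType x,
          const TNL::Config::ParameterContainer& parameters )
{
   // Register the solver options, including the new "gmres-variant" entry
   // ("MGS", "MGSR" or "CWY"; the patch defaults to "CWY").
   TNL::Config::ConfigDescription config;
   SolverType::configSetup( config );

   // setup() now also selects the orthogonalization variant and returns
   // false when "gmres-variant" holds an unknown value.
   SolverType solver;
   if( ! solver.setup( parameters ) )
      return false;

   solver.setMatrix( matrix );   // must precede solve(), per the assert in solve()
   return solver.solve( b, x );
}

With TNL's usual option handling, the registered entries would presumably surface on the command line as --gmres-variant, --gmres-restarting-min and so on, so the variant can be switched without recompiling; exercising all three variants (MGS, MGSR, CWY) against the old CWYGMRES results seems worth a test before merging.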