From 0a561076b3563198e8fbeed118e33369596ef19f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz> Date: Wed, 26 Sep 2018 10:39:21 +0200 Subject: [PATCH] Refactoring GMRES variants --- src/TNL/Solvers/Linear/CMakeLists.txt | 2 - src/TNL/Solvers/Linear/CWYGMRES.h | 115 ---- src/TNL/Solvers/Linear/CWYGMRES_impl.h | 633 ------------------ src/TNL/Solvers/Linear/GMRES.h | 86 ++- src/TNL/Solvers/Linear/GMRES_impl.h | 742 +++++++++++++++------ src/TNL/Solvers/LinearSolverTypeResolver.h | 5 +- src/TNL/Solvers/SolverConfig_impl.h | 5 - 7 files changed, 594 insertions(+), 994 deletions(-) delete mode 100644 src/TNL/Solvers/Linear/CWYGMRES.h delete mode 100644 src/TNL/Solvers/Linear/CWYGMRES_impl.h diff --git a/src/TNL/Solvers/Linear/CMakeLists.txt b/src/TNL/Solvers/Linear/CMakeLists.txt index f44f0a095f..2e33af0b8e 100644 --- a/src/TNL/Solvers/Linear/CMakeLists.txt +++ b/src/TNL/Solvers/Linear/CMakeLists.txt @@ -6,8 +6,6 @@ SET( headers BICGStab.h BICGStabL_impl.h CG.h CG_impl.h - CWYGMRES.h - CWYGMRES_impl.h GMRES.h GMRES_impl.h Jacobi.h diff --git a/src/TNL/Solvers/Linear/CWYGMRES.h b/src/TNL/Solvers/Linear/CWYGMRES.h deleted file mode 100644 index 1cccc132dd..0000000000 --- a/src/TNL/Solvers/Linear/CWYGMRES.h +++ /dev/null @@ -1,115 +0,0 @@ -/*************************************************************************** - CWYGMRES.h - description - ------------------- - begin : May 13, 2016 - copyright : (C) 2016 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include "LinearSolver.h" - -#include <TNL/Containers/Vector.h> - -namespace TNL { -namespace Solvers { -namespace Linear { - -template< typename Matrix > -class CWYGMRES -: public LinearSolver< Matrix > -{ - using Base = LinearSolver< Matrix >; -public: - using RealType = typename Base::RealType; - using DeviceType = typename Base::DeviceType; - using IndexType = typename Base::IndexType; - using VectorViewType = typename Base::VectorViewType; - using ConstVectorViewType = typename Base::ConstVectorViewType; - - String getType() const; - - static void configSetup( Config::ConfigDescription& config, - const String& prefix = "" ); - - bool setup( const Config::ParameterContainer& parameters, - const String& prefix = "" ) override; - - void setRestarting( IndexType rest ); - - bool solve( ConstVectorViewType b, VectorViewType x ) override; - -protected: - using DeviceVector = Containers::Vector< RealType, DeviceType, IndexType >; - using HostVector = Containers::Vector< RealType, Devices::Host, IndexType >; - - void hauseholder_generate( DeviceVector& Y, - HostVector& T, - const int& i, - DeviceVector& w ); - - void hauseholder_apply_trunc( HostVector& out, - DeviceVector& Y, - HostVector& T, - const int& i, - DeviceVector& w ); - - void hauseholder_cwy( DeviceVector& w, - DeviceVector& Y, - HostVector& T, - const int& i ); - - void hauseholder_cwy_transposed( DeviceVector& w, - DeviceVector& Y, - HostVector& T, - const int& i, - DeviceVector& z ); - - template< typename Vector > - void update( IndexType k, - IndexType m, - const HostVector& H, - const HostVector& s, - DeviceVector& v, - Vector& x ); - - void generatePlaneRotation( RealType& dx, - RealType& dy, - RealType& cs, - RealType& sn ); - - void applyPlaneRotation( RealType& dx, - RealType& dy, - RealType& cs, - RealType& sn ); - - - void setSize( 
IndexType _size, IndexType m ); - - // single vectors - DeviceVector r, z, w, _M_tmp; - // matrices (in column-major format) - DeviceVector V, Y; - // duplicate of the upper (m+1)x(m+1) submatrix of Y (it is lower triangular) for fast access - HostVector YL, T; - // host-only storage for Givens rotations and the least squares problem - HostVector cs, sn, H, s; - - IndexType size = 0; - IndexType ldSize = 0; - IndexType restarting_min = 10; - IndexType restarting_max = 10; - IndexType restarting_step_min = 3; - IndexType restarting_step_max = 3; -}; - -} // namespace Linear -} // namespace Solvers -} // namespace TNL - -#include "CWYGMRES_impl.h" diff --git a/src/TNL/Solvers/Linear/CWYGMRES_impl.h b/src/TNL/Solvers/Linear/CWYGMRES_impl.h deleted file mode 100644 index 4989f50207..0000000000 --- a/src/TNL/Solvers/Linear/CWYGMRES_impl.h +++ /dev/null @@ -1,633 +0,0 @@ -/*************************************************************************** - CWYGMRES.h - description - ------------------- - begin : May 13, 2016 - copyright : (C) 2016 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include <type_traits> -#include <cmath> - -#include <TNL/Exceptions/CudaSupportMissing.h> -#include <TNL/Containers/Algorithms/Multireduction.h> -#include <TNL/Matrices/MatrixOperations.h> - -#include "CWYGMRES.h" - -namespace TNL { -namespace Solvers { -namespace Linear { - -template< typename Matrix > -String -CWYGMRES< Matrix >:: -getType() const -{ - return String( "CWYGMRES< " ) + - this->matrix -> getType() + ", " + - this->preconditioner -> getType() + " >"; -} - -template< typename Matrix > -void -CWYGMRES< Matrix >:: -configSetup( Config::ConfigDescription& config, - const String& prefix ) -{ - config.addEntry< int >( prefix + "gmres-restarting-min", "Minimal number of iterations after which the GMRES restarts.", 10 ); - config.addEntry< int >( prefix + "gmres-restarting-max", "Maximal number of iterations after which the GMRES restarts.", 10 ); - config.addEntry< int >( prefix + "gmres-restarting-step-min", "Minimal adjusting step for the adaptivity of the GMRES restarting parameter.", 3 ); - config.addEntry< int >( prefix + "gmres-restarting-step-max", "Maximal adjusting step for the adaptivity of the GMRES restarting parameter.", 3 ); -} - -template< typename Matrix > -bool -CWYGMRES< Matrix >:: -setup( const Config::ParameterContainer& parameters, - const String& prefix ) -{ - restarting_min = parameters.getParameter< int >( "gmres-restarting-min" ); - this->setRestarting( parameters.getParameter< int >( "gmres-restarting-max" ) ); - restarting_step_min = parameters.getParameter< int >( "gmres-restarting-step-min" ); - restarting_step_max = parameters.getParameter< int >( "gmres-restarting-step-max" ); - return LinearSolver< Matrix >::setup( parameters, prefix ); -} - -template< typename Matrix > -void -CWYGMRES< Matrix >:: -setRestarting( IndexType rest ) -{ - if( size != 0 ) - setSize( size, rest ); - restarting_max = rest; -} - -template< typename Matrix > -bool -CWYGMRES< Matrix >:: -solve( ConstVectorViewType b, VectorViewType x ) -{ - TNL_ASSERT_TRUE( this->matrix, "No matrix was set in CWYGMRES. Call setMatrix() before solve()." 
); - if( restarting_min <= 0 || restarting_max <= 0 || restarting_min > restarting_max ) - { - std::cerr << "Wrong value for the GMRES restarting parameters: r_min = " << restarting_min - << ", r_max = " << restarting_max << std::endl; - return false; - } - if( restarting_step_min < 0 || restarting_step_max < 0 || restarting_step_min > restarting_step_max ) - { - std::cerr << "Wrong value for the GMRES restarting adjustment parameters: d_min = " << restarting_step_min - << ", d_max = " << restarting_step_max << std::endl; - return false; - } - setSize( this->matrix->getRows(), restarting_max ); - - RealType normb( 0.0 ), beta( 0.0 ); - /**** - * 1. Solve r from M r = b - A x_0 - */ - if( this->preconditioner ) - { - this->preconditioner->solve( b, _M_tmp ); - normb = _M_tmp.lpNorm( ( RealType ) 2.0 ); - - this->matrix->vectorProduct( x, _M_tmp ); - _M_tmp.addVector( b, ( RealType ) 1.0, -1.0 ); - - this->preconditioner->solve( _M_tmp, r ); - } - else - { - this->matrix->vectorProduct( x, r ); - normb = b.lpNorm( ( RealType ) 2.0 ); - r.addVector( b, ( RealType ) 1.0, -1.0 ); - } - beta = r.lpNorm( ( RealType ) 2.0 ); - - //cout << "norm b = " << normb <<std::endl; - //cout << " beta = " << beta <<std::endl; - - - if( normb == 0.0 ) - normb = 1.0; - - this->resetIterations(); - this->setResidue( beta / normb ); - - // parameters for the adaptivity of the restarting parameter - RealType beta_ratio = 1; // = beta / beta_ratio (small value indicates good convergence rate) - const RealType max_beta_ratio = 0.99; // = cos(8°) \approx 0.99 - const RealType min_beta_ratio = 0.175; // = cos(80°) \approx 0.175 - int restart_cycles = 0; // counter of restart cycles - int m = restarting_max; // current restarting parameter - - DeviceVector vi, vk; - while( this->checkNextIteration() ) - { - // adaptivity of the restarting parameter - // reference: A.H. Baker, E.R. Jessup, Tz.V. Kolev - A simple strategy for varying the restart parameter in GMRES(m) - // http://www.sciencedirect.com/science/article/pii/S0377042709000132 - if( restart_cycles > 0 ) { - if( beta_ratio > max_beta_ratio ) - // near stagnation -> set maximum - m = restarting_max; - else if( beta_ratio >= min_beta_ratio ) { - // the step size is determined based on current m using linear interpolation - // between restarting_step_min and restarting_step_max - const int step = restarting_step_min + (float) ( restarting_step_max - restarting_step_min ) / - ( restarting_max - restarting_min ) * - ( m - restarting_min ); - if( m - step >= restarting_min ) - m -= step; - else - // set restarting_max when we hit restarting_min (see Baker et al. (2009)) - m = restarting_max; - } -// std::cerr << "restarting: cycle = " << restart_cycles << ", beta_ratio = " << beta_ratio << ", m = " << m << " " << std::endl; - } - - /*** - * z = r / | r | = 1.0 / beta * r - */ -// z.addVector( r, ( RealType ) 1.0 / beta, ( RealType ) 0.0 ); - z = r; - - H.setValue( 0.0 ); - s.setValue( 0.0 ); - T.setValue( 0.0 ); - - // NOTE: this is unstable, s[0] is set later -// s[ 0 ] = beta; - - /**** - * Starting m-loop - */ - for( IndexType i = 0; i <= m && this->nextIteration(); i++ ) - { -// std::cout << "==== i = " << i << " ====" <<std::endl; - - /**** - * Generate new Hauseholder transformation from vector z. 
- */ - hauseholder_generate( Y, T, i, z ); - - if( i == 0 ) { - /**** - * s = e_1^T * P_i * z - */ - hauseholder_apply_trunc( s, Y, T, i, z ); - } - else { - /*** - * H_{i-1} = P_i * z - */ - HostVector h; - h.bind( &H.getData()[ (i - 1) * (m + 1) ], m + 1 ); - hauseholder_apply_trunc( h, Y, T, i, z ); - } - - /*** - * Generate new basis vector v_i, using the compact WY representation: - * v_i = (I - Y_i * T_i Y_i^T) * e_i - */ - vi.bind( &V.getData()[ i * ldSize ], size ); - hauseholder_cwy( vi, Y, T, i ); - - if( i < m ) { - /**** - * Solve w from M w = A v_i - */ - if( this->preconditioner ) - { - this->matrix->vectorProduct( vi, _M_tmp ); - this->preconditioner->solve( _M_tmp, w ); - } - else - this->matrix->vectorProduct( vi, w ); - - /**** - * Apply all previous Hauseholder transformations, using the compact WY representation: - * z = (I - Y_i * T_i^T * Y_i^T) * w - */ - hauseholder_cwy_transposed( z, Y, T, i, w ); - } - -// std::cout << "vi.norm = " << vi.lpNorm( 2.0 ) <<std::endl; -// std::cout << "H (before rotations) = " <<std::endl; -// for( int i = 0; i <= m; i++ ) { -// for( int j = 0; j < m; j++ ) -// std::cout << H[ i + j * (m+1) ] << "\t"; -// std::cout <<std::endl; -// } -// std::cout << "s = " << s <<std::endl; - - /**** - * Applying the Givens rotations - */ - if( i > 0 ) { - for( IndexType k = 0; k < i - 1; k++ ) - applyPlaneRotation( H[ k + (i - 1) * (m + 1) ], - H[ k + 1 + (i - 1) * (m + 1) ], - cs[ k ], - sn[ k ] ); - - if( H[ i + (i - 1) * (m + 1) ] != 0.0 ) { - generatePlaneRotation( H[ i - 1 + (i - 1) * (m + 1) ], - H[ i + (i - 1) * (m + 1) ], - cs[ i - 1 ], - sn[ i - 1 ]); - applyPlaneRotation( H[ i - 1 + (i - 1) * (m + 1) ], - H[ i + (i - 1) * (m + 1) ], - cs[ i - 1 ], - sn[ i - 1 ]); - applyPlaneRotation( s[ i - 1 ], - s[ i ], - cs[ i - 1 ], - sn[ i - 1 ] ); - } - } - -// std::cout << "H (after rotations) = " <<std::endl; -// for( int i = 0; i <= m; i++ ) { -// for( int j = 0; j < m; j++ ) -// std::cout << H[ i + j * (m+1) ] << "\t"; -// std::cout <<std::endl; -// } -// std::cout << "s (after rotations) = " << s <<std::endl; -// std::cout << "residue / normb = " << std::fabs( s[ i ] ) / normb <<std::endl; - -// for( int k = 0; k <= i; k++ ) { -// vk.bind( &V.getData()[ k * ldSize ], size ); -// std::cout << "(v" << k << ",v" << i << ") = " << vk.scalarProduct( vi ) <<std::endl; -// } - - this->setResidue( std::fabs( s[ i ] ) / normb ); - if( ! 
this->checkNextIteration() ) { - update( i - 1, m, H, s, V, x ); - this->refreshSolverMonitor( true ); - return this->checkConvergence(); - } - else { - this->refreshSolverMonitor(); - } - } - update( m - 1, m, H, s, V, x ); - - /**** - * r = M.solve(b - A * x); - */ - const RealType beta_old = beta; - if( this->preconditioner ) - { - this->matrix->vectorProduct( x, _M_tmp ); - _M_tmp.addVector( b, ( RealType ) 1.0, -1.0 ); - this->preconditioner->solve( _M_tmp, r ); - } - else - { - this->matrix->vectorProduct( x, r ); - r.addVector( b, ( RealType ) 1.0, -1.0 ); - } - beta = r.lpNorm( ( RealType ) 2.0 ); - this->setResidue( beta / normb ); - -// std::cout << " x = " << x <<std::endl; -// std::cout << " beta = " << beta <<std::endl; -// std::cout << "residue = " << beta / normb <<std::endl; - - // update parameters for the adaptivity of the restarting parameter - ++restart_cycles; - beta_ratio = beta / beta_old; - } - this->refreshSolverMonitor( true ); - return this->checkConvergence(); -} - -#ifdef HAVE_CUDA -template< typename DestinationElement, - typename SourceElement, - typename Index > -__global__ void -copyTruncatedVectorKernel( DestinationElement* destination, - const SourceElement* source, - const Index from, - const Index size ) -{ - Index elementIdx = blockIdx.x * blockDim.x + threadIdx.x; - const Index gridSize = blockDim.x * gridDim.x; - - while( elementIdx < from ) - { - destination[ elementIdx ] = (DestinationElement) 0.0; - elementIdx += gridSize; - } - while( elementIdx < size ) - { - destination[ elementIdx ] = source[ elementIdx ]; - elementIdx += gridSize; - } -} -#endif - -template< typename Matrix > -void -CWYGMRES< Matrix >:: -hauseholder_generate( DeviceVector& Y, - HostVector& T, - const int& i, - DeviceVector& z ) -{ - DeviceVector y_i; - y_i.bind( &Y.getData()[ i * ldSize ], size ); - - // XXX: the upper-right triangle of Y will be full of zeros, which can be exploited for optimization - if( std::is_same< DeviceType, Devices::Host >::value ) { - for( IndexType j = 0; j < size; j++ ) { - if( j < i ) - y_i[ j ] = 0.0; - else - y_i[ j ] = z[ j ]; - } - } - if( std::is_same< DeviceType, Devices::Cuda >::value ) { -#ifdef HAVE_CUDA - dim3 blockSize( 256 ); - dim3 gridSize; - gridSize.x = min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( size, blockSize.x ) ); - - copyTruncatedVectorKernel<<< gridSize, blockSize >>>( y_i.getData(), - z.getData(), - i, - size ); - TNL_CHECK_CUDA_DEVICE; -#else - throw Exceptions::CudaSupportMissing(); -#endif - } - - // norm of the TRUNCATED vector z - const RealType normz = y_i.lpNorm( 2.0 ); - const RealType y_ii = y_i.getElement( i ); - if( y_ii > 0.0 ) { - y_i.setElement( i, y_ii + normz ); - } - else { - y_i.setElement( i, y_ii - normz ); - } - - // compute the norm of the y_i vector; equivalent to this calculation by definition: - // const RealType norm_yi = y_i.lpNorm( 2.0 ); - const RealType norm_yi = std::sqrt( 2 * (normz * normz + std::fabs( y_ii ) * normz) ); - - // XXX: normalization is slower, but more stable -// y_i *= 1.0 / norm_yi; -// const RealType t_i = 2.0; - // assuming it's stable enough... 
- const RealType t_i = 2.0 / (norm_yi * norm_yi); - - T[ i + i * (restarting_max + 1) ] = t_i; - if( i > 0 ) { - // aux = Y_{i-1}^T * y_i - RealType aux[ i ]; - Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; - Containers::Algorithms::Multireduction< DeviceType >::reduce - ( scalarProduct, - i, - size, - Y.getData(), - ldSize, - y_i.getData(), - aux ); - - // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux - for( int k = 0; k < i; k++ ) { - T[ k + i * (restarting_max + 1) ] = 0.0; - for( int j = k; j < i; j++ ) - T[ k + i * (restarting_max + 1) ] -= T[ k + j * (restarting_max + 1) ] * (t_i * aux[ j ]); - } - } -} - -template< typename Matrix > -void -CWYGMRES< Matrix >:: -hauseholder_apply_trunc( HostVector& out, - DeviceVector& Y, - HostVector& T, - const int& i, - DeviceVector& z ) -{ - DeviceVector y_i; - y_i.bind( &Y.getData()[ i * ldSize ], size ); - - const RealType aux = T[ i + i * (restarting_max + 1) ] * y_i.scalarProduct( z ); - if( std::is_same< DeviceType, Devices::Host >::value ) { - for( int k = 0; k <= i; k++ ) - out[ k ] = z[ k ] - y_i[ k ] * aux; - } - if( std::is_same< DeviceType, Devices::Cuda >::value ) { - // copy part of y_i to buffer on host - // here we duplicate the upper (m+1)x(m+1) submatrix of Y on host for fast access - RealType* host_yi = &YL[ i * (restarting_max + 1) ]; - RealType host_z[ i + 1 ]; - Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_yi, y_i.getData(), restarting_max + 1 ); - Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_z, z.getData(), i + 1 ); - for( int k = 0; k <= i; k++ ) - out[ k ] = host_z[ k ] - host_yi[ k ] * aux; - } -} - -template< typename Matrix > -void -CWYGMRES< Matrix >:: -hauseholder_cwy( DeviceVector& v, - DeviceVector& Y, - HostVector& T, - const int& i ) -{ - // aux = Y_i^T * e_i - RealType aux[ i + 1 ]; - if( std::is_same< DeviceType, Devices::Host >::value ) { - for( int k = 0; k <= i; k++ ) - aux[ k ] = Y[ i + k * ldSize ]; - } - if( std::is_same< DeviceType, Devices::Cuda >::value ) { - // the upper (m+1)x(m+1) submatrix of Y is duplicated on host for fast access - for( int k = 0; k <= i; k++ ) - aux[ k ] = YL[ i + k * (restarting_max + 1) ]; - } - - // aux = T_i * aux - // Note that T_i is upper triangular, so we can overwrite the aux vector with the result in place - for( int k = 0; k <= i; k++ ) { - RealType aux2 = 0.0; - for( int j = k; j <= i; j++ ) - aux2 += T[ k + j * (restarting_max + 1) ] * aux[ j ]; - aux[ k ] = aux2; - } - - // v = e_i - Y_i * aux - MatrixOperations< DeviceType >::gemv( size, i + 1, - -1.0, Y.getData(), ldSize, aux, - 0.0, v.getData() ); - v.setElement( i, 1.0 + v.getElement( i ) ); -} - -template< typename Matrix > -void -CWYGMRES< Matrix >:: -hauseholder_cwy_transposed( DeviceVector& z, - DeviceVector& Y, - HostVector& T, - const int& i, - DeviceVector& w ) -{ - // aux = Y_i^T * w - RealType aux[ i + 1 ]; - Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; - Containers::Algorithms::Multireduction< DeviceType >::reduce - ( scalarProduct, - i + 1, - size, - Y.getData(), - ldSize, - w.getData(), - aux ); - - // aux = T_i^T * aux - // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place - for( int k = i; k >= 0; k-- ) { - RealType aux2 = 0.0; - for( int j = 0; j <= k; j++ ) - aux2 += T[ j + k * (restarting_max + 1) ] * aux[ j ]; 
- aux[ k ] = aux2; - } - - // z = w - Y_i * aux - z = w; - MatrixOperations< DeviceType >::gemv( size, i + 1, - -1.0, Y.getData(), ldSize, aux, - 1.0, z.getData() ); -} - -template< typename Matrix > - template< typename Vector > -void -CWYGMRES< Matrix >:: -update( IndexType k, - IndexType m, - const HostVector& H, - const HostVector& s, - DeviceVector& v, - Vector& x ) -{ -// Containers::Vector< RealType, Devices::Host, IndexType > y; -// y.setSize( m + 1 ); - RealType y[ m + 1 ]; - - IndexType i, j; - for( i = 0; i <= m ; i ++ ) - y[ i ] = s[ i ]; - - // Backsolve: - for( i = k; i >= 0; i--) { - if( H[ i + i * ( m + 1 ) ] == 0 ) { -// for( int _i = 0; _i <= i; _i++ ) { -// for( int _j = 0; _j < i; _j++ ) -// std::cout << H[ _i + _j * (m+1) ] << " "; -// std::cout <<std::endl; -// } - std::cerr << "H.norm = " << H.lpNorm( 2.0 ) << std::endl; - std::cerr << "s = " << s << std::endl; - std::cerr << "k = " << k << ", m = " << m << std::endl; - throw 1; - } - y[ i ] /= H[ i + i * ( m + 1 ) ]; - for( j = i - 1; j >= 0; j--) - y[ j ] -= H[ j + i * ( m + 1 ) ] * y[ i ]; - } - - // x = V * y + x - MatrixOperations< DeviceType >::gemv( size, k + 1, - 1.0, v.getData(), ldSize, y, - 1.0, x.getData() ); -} - -template< typename Matrix > -void -CWYGMRES< Matrix >:: -generatePlaneRotation( RealType& dx, - RealType& dy, - RealType& cs, - RealType& sn ) -{ - if( dy == 0.0 ) - { - cs = 1.0; - sn = 0.0; - } - else - if( std::fabs( dy ) > std::fabs( dx ) ) - { - RealType temp = dx / dy; - sn = 1.0 / std::sqrt( 1.0 + temp * temp ); - cs = temp * sn; - } - else - { - RealType temp = dy / dx; - cs = 1.0 / std::sqrt( 1.0 + temp * temp ); - sn = temp * cs; - } -} - -template< typename Matrix > -void CWYGMRES< Matrix > :: -applyPlaneRotation( RealType& dx, - RealType& dy, - RealType& cs, - RealType& sn ) -{ - RealType temp = cs * dx + sn * dy; - dy = cs * dy - sn * dx; - dx = temp; -} - -template< typename Matrix > -void CWYGMRES< Matrix > :: setSize( IndexType _size, IndexType m ) -{ - if( size == _size && restarting_max == m ) - return; - size = _size; - if( std::is_same< DeviceType, Devices::Cuda >::value ) - // align each column to 256 bytes - optimal for CUDA - ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ); - else - // on the host, we add 1 to disrupt the cache false-sharing pattern - ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ) + 1; - restarting_max = m; - r.setSize( size ); - z.setSize( size ); - w.setSize( size ); - V.setSize( ldSize * ( m + 1 ) ); - Y.setSize( ldSize * ( m + 1 ) ); - T.setSize( (m + 1) * (m + 1) ); - YL.setSize( (m + 1) * (m + 1) ); - cs.setSize( m + 1 ); - sn.setSize( m + 1 ); - H.setSize( ( m + 1 ) * m ); - s.setSize( m + 1 ); - _M_tmp.setSize( size ); -} - -} // namespace Linear -} // namespace Solvers -} // namespace TNL diff --git a/src/TNL/Solvers/Linear/GMRES.h b/src/TNL/Solvers/Linear/GMRES.h index 05c171f622..280a7ad119 100644 --- a/src/TNL/Solvers/Linear/GMRES.h +++ b/src/TNL/Solvers/Linear/GMRES.h @@ -1,13 +1,15 @@ /*************************************************************************** GMRES.h - description ------------------- - begin : 2007/07/31 - copyright : (C) 2007 by Tomas Oberhuber + begin : May 13, 2016 + copyright : (C) 2016 by Tomas Oberhuber et al. 
email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ +// Implemented by: Jakub Klinkovsky + #pragma once #include "LinearSolver.h" @@ -38,18 +40,54 @@ public: bool setup( const Config::ParameterContainer& parameters, const String& prefix = "" ) override; - void setRestarting( IndexType rest ); - bool solve( ConstVectorViewType b, VectorViewType x ) override; protected: - template< typename VectorT > - void update( IndexType k, - IndexType m, - const Containers::Vector< RealType, Devices::Host, IndexType >& H, - const Containers::Vector< RealType, Devices::Host, IndexType >& s, - Containers::Vector< RealType, DeviceType, IndexType >& v, - VectorT& x ); + using ConstDeviceView = Containers::VectorView< const RealType, DeviceType, IndexType >; + using DeviceView = Containers::VectorView< RealType, DeviceType, IndexType >; + using HostView = Containers::VectorView< RealType, Devices::Host, IndexType >; + using DeviceVector = Containers::Vector< RealType, DeviceType, IndexType >; + using HostVector = Containers::Vector< RealType, Devices::Host, IndexType >; + + enum class Variant { MGS, MGSR, CWY }; + + int orthogonalize_MGS( const int m, const RealType normb, const RealType beta ); + + int orthogonalize_CWY( const int m, const RealType normb, const RealType beta ); + + void compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b ); + + void preconditioned_matvec( DeviceVector& w, ConstDeviceView v ); + + void hauseholder_generate( DeviceVector& Y, + HostVector& T, + const int i, + DeviceVector& w ); + + void hauseholder_apply_trunc( HostView out, + DeviceVector& Y, + HostVector& T, + const int i, + DeviceVector& w ); + + void hauseholder_cwy( DeviceVector& w, + DeviceVector& Y, + HostVector& T, + const int i ); + + void hauseholder_cwy_transposed( DeviceVector& w, + DeviceVector& Y, + HostVector& T, + const int i, + DeviceVector& z ); + + template< typename Vector > + void update( const int k, + const int m, + const HostVector& H, + const HostVector& s, + DeviceVector& v, + Vector& x ); void generatePlaneRotation( RealType& dx, RealType& dy, @@ -61,17 +99,29 @@ protected: RealType& cs, RealType& sn ); + void apply_givens_rotations( const int i, const int m ); + + + void setSize( const IndexType size ); - void setSize( IndexType _size, IndexType m ); + // selected GMRES variant + Variant variant = Variant::CWY; - Containers::Vector< RealType, DeviceType, IndexType > _r, w, _v, _M_tmp; - Containers::Vector< RealType, Devices::Host, IndexType > _s, _cs, _sn, _H; + // single vectors + DeviceVector r, w, z, _M_tmp; + // matrices (in column-major format) + DeviceVector V, Y; + // (CWY only) duplicate of the upper (m+1)x(m+1) submatrix of Y (it is lower triangular) for fast access + HostVector YL, T; + // host-only storage for Givens rotations and the least squares problem + HostVector cs, sn, H, s; IndexType size = 0; - IndexType restarting_min = 10; - IndexType restarting_max = 10; - IndexType restarting_step_min = 3; - IndexType restarting_step_max = 3; + IndexType ldSize = 0; + int restarting_min = 10; + int restarting_max = 10; + int restarting_step_min = 3; + int restarting_step_max = 3; }; } // namespace Linear diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h index 220cecf23e..4e5f0360f6 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -1,17 +1,24 @@ 
/***************************************************************************
                          GMRES_impl.h  -  description
                             -------------------
-    begin                : Nov 25, 2012
-    copyright            : (C) 2012 by Tomas Oberhuber
+    begin                : May 13, 2016
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/
 
 /* See Copyright Notice in tnl/Copyright */
 
+// Implemented by: Jakub Klinkovsky
+
 #pragma once
 
+#include <type_traits>
 #include <cmath>
 
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Containers/Algorithms/Multireduction.h>
+#include <TNL/Matrices/MatrixOperations.h>
+
 #include "GMRES.h"
 
 namespace TNL {
 namespace Solvers {
 namespace Linear {
@@ -34,6 +41,10 @@ GMRES< Matrix >::
 configSetup( Config::ConfigDescription& config,
              const String& prefix )
 {
+   config.addEntry< String >( prefix + "gmres-variant", "Variant of the GMRES solver: MGS (modified Gram-Schmidt), MGSR (MGS with reorthogonalization) or CWY (compact WY form of the Householder transformations).", "CWY" );
+   config.addEntryEnum( "MGS" );
+   config.addEntryEnum( "MGSR" );
+   config.addEntryEnum( "CWY" );
    config.addEntry< int >( prefix + "gmres-restarting-min", "Minimal number of iterations after which the GMRES restarts.", 10 );
    config.addEntry< int >( prefix + "gmres-restarting-max", "Maximal number of iterations after which the GMRES restarts.", 10 );
    config.addEntry< int >( prefix + "gmres-restarting-step-min", "Minimal adjusting step for the adaptivity of the GMRES restarting parameter.", 3 );
@@ -46,21 +57,22 @@ GMRES< Matrix >::
 setup( const Config::ParameterContainer& parameters,
        const String& prefix )
 {
+   const String var = parameters.getParameter< String >( "gmres-variant" );
+   if( var == "MGS" )
+      variant = Variant::MGS;
+   else if( var == "MGSR" )
+      variant = Variant::MGSR;
+   else if( var == "CWY" )
+      variant = Variant::CWY;
+   else
+      return false;
+
    restarting_min = parameters.getParameter< int >( "gmres-restarting-min" );
-   this->setRestarting( parameters.getParameter< int >( "gmres-restarting-max" ) );
+   restarting_max = parameters.getParameter< int >( "gmres-restarting-max" );
    restarting_step_min = parameters.getParameter< int >( "gmres-restarting-step-min" );
    restarting_step_max = parameters.getParameter< int >( "gmres-restarting-step-max" );
-   return LinearSolver< Matrix >::setup( parameters, prefix );
-}
-template< typename Matrix >
-void
-GMRES< Matrix >::
-setRestarting( IndexType rest )
-{
-   if( size != 0 )
-      setSize( size, rest );
-   restarting_max = rest;
+   return LinearSolver< Matrix >::setup( parameters, prefix );
 }
 
 template< typename Matrix >
@@ -69,58 +81,34 @@ GMRES< Matrix >::
 solve( ConstVectorViewType b, VectorViewType x )
 {
    TNL_ASSERT_TRUE( this->matrix, "No matrix was set in GMRES. Call setMatrix() before solve()."
); - if( restarting_min <= 0 || restarting_max <= 0 || restarting_min > restarting_max ) - { + if( restarting_min <= 0 || restarting_max <= 0 || restarting_min > restarting_max ) { std::cerr << "Wrong value for the GMRES restarting parameters: r_min = " << restarting_min << ", r_max = " << restarting_max << std::endl; return false; } - if( restarting_step_min < 0 || restarting_step_max < 0 || restarting_step_min > restarting_step_max ) - { + if( restarting_step_min < 0 || restarting_step_max < 0 || restarting_step_min > restarting_step_max ) { std::cerr << "Wrong value for the GMRES restarting adjustment parameters: d_min = " << restarting_step_min << ", d_max = " << restarting_step_max << std::endl; return false; } - setSize( this->matrix->getRows(), restarting_max ); - - IndexType _size = size; - - //RealType *w = _w.getData(); - RealType *s = _s.getData(); - RealType *cs = _cs.getData(); - RealType *sn = _sn.getData(); - RealType *v = _v.getData(); - RealType *H = _H.getData(); - RealType *M_tmp = _M_tmp.getData(); - - RealType normb( 0.0 ), beta( 0.0 ); - /**** - * 1. Solve r from M r = b - A x_0 - */ - if( this->preconditioner ) - { - this->preconditioner->solve( b, _M_tmp ); - normb = _M_tmp.lpNorm( ( RealType ) 2.0 ); + setSize( this->matrix->getRows() ); - this->matrix->vectorProduct( x, _M_tmp ); - _M_tmp.addVector( b, ( RealType ) 1.0, -1.0 ); - - this->preconditioner->solve( _M_tmp, _r ); + // initialize the norm of the preconditioned right-hand-side + RealType normb; + if( this->preconditioner ) { + this->preconditioner->solve( b, _M_tmp ); + normb = _M_tmp.lpNorm( 2.0 ); } else - { - this->matrix->vectorProduct( x, _r ); - normb = b.lpNorm( ( RealType ) 2.0 ); - _r.addVector( b, ( RealType ) 1.0, -1.0 ); - } - beta = _r.lpNorm( ( RealType ) 2.0 ); - - //cout << "norm b = " << normb << std::endl; - //cout << " beta = " << beta << std::endl; - + normb = b.lpNorm( 2.0 ); + if( normb == 0.0 ) + normb = 1.0; - if( normb == 0.0 ) normb = 1.0; + // r = M.solve(b - A * x); + compute_residue( r, x, b ); + RealType beta = r.lpNorm( 2.0 ); + // initialize stopping criterion this->resetIterations(); this->setResidue( beta / normb ); @@ -131,9 +119,7 @@ solve( ConstVectorViewType b, VectorViewType x ) int restart_cycles = 0; // counter of restart cycles int m = restarting_max; // current restarting parameter - Containers::Vector< RealType, DeviceType, IndexType > vi, vk; - while( this->checkNextIteration() ) - { + while( this->checkNextIteration() ) { // adaptivity of the restarting parameter // reference: A.H. Baker, E.R. Jessup, Tz.V. Kolev - A simple strategy for varying the restart parameter in GMRES(m) // http://www.sciencedirect.com/science/article/pii/S0377042709000132 @@ -153,185 +139,471 @@ solve( ConstVectorViewType b, VectorViewType x ) // set restarting_max when we hit restarting_min (see Baker et al. 
(2009)) m = restarting_max; } -// std::cerr << "restarting: cycle = " << restart_cycles << ", beta_ratio = " << beta_ratio << ", m = " << m << " " << std::endl; } - for( IndexType i = 0; i < m + 1; i ++ ) - H[ i ] = s[ i ] = cs[ i ] = sn[ i ] = 0.0; + // orthogonalization + int o_steps = 0; + switch( variant ) { + case Variant::MGS: + case Variant::MGSR: + o_steps = orthogonalize_MGS( m, normb, beta ); + break; + case Variant::CWY: + o_steps = orthogonalize_CWY( m, normb, beta ); + break; + } + + if( o_steps < m ) { + // exact solution has been reached early + update( o_steps, m, H, s, V, x ); + this->refreshSolverMonitor( true ); + return this->checkConvergence(); + } + + // update the solution approximation + update( m - 1, m, H, s, V, x ); + + // compute the new residual vector + compute_residue( r, x, b ); + const RealType beta_old = beta; + beta = r.lpNorm( 2.0 ); + this->setResidue( beta / normb ); + + // update parameters for the adaptivity of the restarting parameter + ++restart_cycles; + beta_ratio = beta / beta_old; + } + this->refreshSolverMonitor( true ); + return this->checkConvergence(); +} + +template< typename Matrix > +int +GMRES< Matrix >:: +orthogonalize_MGS( const int m, const RealType normb, const RealType beta ) +{ + DeviceView vi, vk; + + /*** + * v_0 = r / | r | = 1.0 / beta * r + */ + vi.bind( V.getData(), ldSize ); + vi.addVector( r, 1.0 / beta, 0.0 ); + + H.setValue( 0.0 ); + s.setValue( 0.0 ); + s[ 0 ] = beta; + + /**** + * Starting m-loop + */ + for( int i = 0; i < m && this->nextIteration(); i++ ) { + vi.bind( &( V.getData()[ i * ldSize ] ), size ); /**** - * v = 0 + * Solve w from M w = A v_i */ - _v.setValue( ( RealType ) 0.0 ); + preconditioned_matvec( w, vi ); + + for( int k = 0; k <= i; k++ ) + H[ k + i * ( m + 1 ) ] = 0.0; + const int reorthogonalize = (variant == Variant::MGSR) ? 2 : 1; + for( int l = 0; l < reorthogonalize; l++ ) + for( int k = 0; k <= i; k++ ) { + vk.bind( &( V.getData()[ k * ldSize ] ), size ); + /*** + * H_{k,i} = ( w, v_k ) + */ + RealType H_k_i = w.scalarProduct( vk ); + H[ k + i * ( m + 1 ) ] += H_k_i; + + /**** + * w = w - H_{k,i} v_k + */ + w.addVector( vk, -H_k_i ); + } + /*** + * H_{i+1,i} = |w| + */ + RealType normw = w.lpNorm( ( RealType ) 2.0 ); + H[ i + 1 + i * ( m + 1 ) ] = normw; /*** - * v_0 = r / | r | = 1.0 / beta * r + * v_{i+1} = w / |w| + */ + vi.bind( &( V.getData()[ ( i + 1 ) * ldSize ] ), size ); + vi.addVector( w, 1.0 / normw, 0.0 ); + + /**** + * Applying the Givens rotations G_0, ..., G_i */ - vi.bind( _v.getData(), size ); - vi.addVector( _r, ( RealType ) 1.0 / beta ); + apply_givens_rotations( i, m ); - _s.setValue( ( RealType ) 0.0 ); - _s[ 0 ] = beta; + this->setResidue( std::fabs( s[ i + 1 ] ) / normb ); + if( ! this->checkNextIteration() ) + return i; + else + this->refreshSolverMonitor(); + } + return m; +} +template< typename Matrix > +int +GMRES< Matrix >:: +orthogonalize_CWY( const int m, const RealType normb, const RealType beta ) +{ + DeviceVector vi, vk; + /*** + * z = r / | r | = 1.0 / beta * r + */ + // TODO: investigate normalization by beta and normb +// z.addVector( r, 1.0 / beta, 0.0 ); +// z.addVector( r, 1.0 / normb, 0.0 ); + z = r; + + H.setValue( 0.0 ); + s.setValue( 0.0 ); + T.setValue( 0.0 ); + + // NOTE: this is unstable, s[0] is set later in hauseholder_apply_trunc +// s[ 0 ] = beta; + + /**** + * Starting m-loop + */ + for( int i = 0; i <= m && this->nextIteration(); i++ ) { /**** - * Starting m-loop + * Generate new Hauseholder transformation from vector z. 
*/ - for( IndexType i = 0; i < m && this->nextIteration(); i++ ) - { - vi.bind( &( _v.getData()[ i * size ] ), size ); + hauseholder_generate( Y, T, i, z ); + + if( i == 0 ) { /**** - * Solve w from M w = A v_i + * s = e_1^T * P_i * z */ - if( this->preconditioner ) - { - this->matrix->vectorProduct( vi, _M_tmp ); - this->preconditioner->solve( _M_tmp, w ); - } - else - this->matrix->vectorProduct( vi, w ); - - //cout << " i = " << i << " vi = " << vi << std::endl; - - for( IndexType k = 0; k <= i; k++ ) - H[ k + i * ( m + 1 ) ] = 0.0; - for( IndexType l = 0; l < 2; l++ ) - for( IndexType k = 0; k <= i; k++ ) - { - vk.bind( &( _v.getData()[ k * _size ] ), _size ); - /*** - * H_{k,i} = ( w, v_k ) - */ - RealType H_k_i = w.scalarProduct( vk ); - H[ k + i * ( m + 1 ) ] += H_k_i; - - /**** - * w = w - H_{k,i} v_k - */ - w.addVector( vk, -H_k_i ); - - //cout << "H_ki = " << H_k_i << std::endl; - //cout << "w = " << w << std::endl; - } + hauseholder_apply_trunc( s, Y, T, i, z ); + } + else { /*** - * H_{i+1,i} = |w| + * H_{i-1} = P_i * z */ - RealType normw = w.lpNorm( ( RealType ) 2.0 ); - H[ i + 1 + i * ( m + 1 ) ] = normw; + HostView h( &H.getData()[ (i - 1) * (m + 1) ], m + 1 ); + hauseholder_apply_trunc( h, Y, T, i, z ); + } - //cout << "normw = " << normw << std::endl; - - /*** - * v_{i+1} = w / |w| - */ - vi.bind( &( _v.getData()[ ( i + 1 ) * size ] ), size ); - vi.addVector( w, ( RealType ) 1.0 / normw ); - - //cout << "vi = " << vi << std::endl; - + /*** + * Generate new basis vector v_i, using the compact WY representation: + * v_i = (I - Y_i * T_i Y_i^T) * e_i + */ + vi.bind( &V.getData()[ i * ldSize ], size ); + hauseholder_cwy( vi, Y, T, i ); + + if( i < m ) { /**** - * Applying the Givens rotations + * Solve w from M w = A v_i */ - for( IndexType k = 0; k < i; k++ ) - applyPlaneRotation( H[ k + i * ( m + 1 )], - H[ k + 1 + i * ( m + 1 ) ], - cs[ k ], - sn[ k ] ); - - generatePlaneRotation( H[ i + i * ( m + 1 ) ], - H[ i + 1 + i * ( m + 1 ) ], - cs[ i ], - sn[ i ]); - applyPlaneRotation( H[ i + i * ( m + 1 ) ], - H[ i + 1 + i * ( m + 1 ) ], - cs[ i ], - sn[ i ]); - applyPlaneRotation( s[ i ], - s[ i + 1 ], - cs[ i ], - sn[ i ] ); + preconditioned_matvec( w, vi ); - this->setResidue( std::fabs( s[ i + 1 ] ) / normb ); - if( ! this->checkNextIteration() ) { - update( i, m, _H, _s, _v, x ); - this->refreshSolverMonitor( true ); - return this->checkConvergence(); - } - else - { - this->refreshSolverMonitor(); - } + /**** + * Apply all previous Hauseholder transformations, using the compact WY representation: + * z = (I - Y_i * T_i^T * Y_i^T) * w + */ + hauseholder_cwy_transposed( z, Y, T, i, w ); } - //cout << "x = " << x << std::endl; - update( m - 1, m, _H, _s, _v, x ); - //cout << "x = " << x << std::endl; /**** - * r = M.solve(b - A * x); + * Applying the Givens rotations G_0, ..., G_{i-1} */ - const RealType beta_old = beta; - beta = 0.0; - if( this->preconditioner ) - { - this->matrix->vectorProduct( x, _M_tmp ); - _M_tmp.addVector( b, ( RealType ) 1.0, -1.0 ); - this->preconditioner->solve( _M_tmp, _r ); - beta = _r.lpNorm( ( RealType ) 2.0 ); - } + if( i > 0 ) + apply_givens_rotations( i - 1, m ); + + this->setResidue( std::fabs( s[ i ] ) / normb ); + if( ! 
this->checkNextIteration() ) + return i - 1; else - { - this->matrix->vectorProduct( x, _r ); - _r.addVector( b, ( RealType ) 1.0, -1.0 ); - beta = _r.lpNorm( ( RealType ) 2.0 ); + this->refreshSolverMonitor(); + } + + return m; +} + +template< typename Matrix > +void +GMRES< Matrix >:: +compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b ) +{ + /**** + * r = M.solve(b - A * x); + */ + if( this->preconditioner ) { + this->matrix->vectorProduct( x, _M_tmp ); + _M_tmp.addVector( b, 1.0, -1.0 ); + this->preconditioner->solve( _M_tmp, r ); + } + else { + this->matrix->vectorProduct( x, r ); + r.addVector( b, 1.0, -1.0 ); + } +} + +template< typename Matrix > +void +GMRES< Matrix >:: +preconditioned_matvec( DeviceVector& w, ConstDeviceView v ) +{ + /**** + * w = M.solve(A * v_i); + */ + if( this->preconditioner ) { + this->matrix->vectorProduct( v, _M_tmp ); + this->preconditioner->solve( _M_tmp, w ); + } + else + this->matrix->vectorProduct( v, w ); +} + +#ifdef HAVE_CUDA +template< typename DestinationElement, + typename SourceElement, + typename Index > +__global__ void +copyTruncatedVectorKernel( DestinationElement* destination, + const SourceElement* source, + const Index from, + const Index size ) +{ + Index elementIdx = blockIdx.x * blockDim.x + threadIdx.x; + const Index gridSize = blockDim.x * gridDim.x; + + while( elementIdx < from ) { + destination[ elementIdx ] = (DestinationElement) 0.0; + elementIdx += gridSize; + } + while( elementIdx < size ) { + destination[ elementIdx ] = source[ elementIdx ]; + elementIdx += gridSize; + } +} +#endif + +template< typename Matrix > +void +GMRES< Matrix >:: +hauseholder_generate( DeviceVector& Y, + HostVector& T, + const int i, + DeviceVector& z ) +{ + DeviceView y_i( &Y.getData()[ i * ldSize ], size ); + + // XXX: the upper-right triangle of Y will be full of zeros, which can be exploited for optimization + if( std::is_same< DeviceType, Devices::Host >::value ) { + for( IndexType j = 0; j < size; j++ ) { + if( j < i ) + y_i[ j ] = 0.0; + else + y_i[ j ] = z[ j ]; } - this->setResidue( beta / normb ); + } + if( std::is_same< DeviceType, Devices::Cuda >::value ) { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ); + dim3 gridSize; + gridSize.x = min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( size, blockSize.x ) ); + + copyTruncatedVectorKernel<<< gridSize, blockSize >>>( y_i.getData(), + z.getData(), + i, + size ); + TNL_CHECK_CUDA_DEVICE; +#else + throw Exceptions::CudaSupportMissing(); +#endif + } - //cout << " x = " << x << std::endl; - //cout << " beta = " << beta << std::endl; - //cout << "residue = " << beta / normb << std::endl; + // norm of the TRUNCATED vector z + const RealType normz = y_i.lpNorm( 2.0 ); + const RealType y_ii = y_i.getElement( i ); + if( y_ii > 0.0 ) + y_i.setElement( i, y_ii + normz ); + else + y_i.setElement( i, y_ii - normz ); + + // compute the norm of the y_i vector; equivalent to this calculation by definition: + // const RealType norm_yi = y_i.lpNorm( 2.0 ); + const RealType norm_yi = std::sqrt( 2 * (normz * normz + std::fabs( y_ii ) * normz) ); + + // XXX: normalization is slower, but more stable +// y_i *= 1.0 / norm_yi; +// const RealType t_i = 2.0; + // assuming it's stable enough... 
+ const RealType t_i = 2.0 / (norm_yi * norm_yi); + + T[ i + i * (restarting_max + 1) ] = t_i; + if( i > 0 ) { + // aux = Y_{i-1}^T * y_i + RealType aux[ i ]; + Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; + Containers::Algorithms::Multireduction< DeviceType >::reduce + ( scalarProduct, + i, + size, + Y.getData(), + ldSize, + y_i.getData(), + aux ); + + // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux + for( int k = 0; k < i; k++ ) { + T[ k + i * (restarting_max + 1) ] = 0.0; + for( int j = k; j < i; j++ ) + T[ k + i * (restarting_max + 1) ] -= T[ k + j * (restarting_max + 1) ] * (t_i * aux[ j ]); + } + } +} - // update parameters for the adaptivity of the restarting parameter - ++restart_cycles; - beta_ratio = beta / beta_old; +template< typename Matrix > +void +GMRES< Matrix >:: +hauseholder_apply_trunc( HostView out, + DeviceVector& Y, + HostVector& T, + const int i, + DeviceVector& z ) +{ + DeviceView y_i( &Y.getData()[ i * ldSize ], size ); + + const RealType aux = T[ i + i * (restarting_max + 1) ] * y_i.scalarProduct( z ); + if( std::is_same< DeviceType, Devices::Host >::value ) { + for( int k = 0; k <= i; k++ ) + out[ k ] = z[ k ] - y_i[ k ] * aux; + } + if( std::is_same< DeviceType, Devices::Cuda >::value ) { + // copy part of y_i to buffer on host + // here we duplicate the upper (m+1)x(m+1) submatrix of Y on host for fast access + RealType* host_yi = &YL[ i * (restarting_max + 1) ]; + RealType host_z[ i + 1 ]; + Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_yi, y_i.getData(), restarting_max + 1 ); + Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_z, z.getData(), i + 1 ); + for( int k = 0; k <= i; k++ ) + out[ k ] = host_z[ k ] - host_yi[ k ] * aux; } - this->refreshSolverMonitor( true ); - return this->checkConvergence(); } template< typename Matrix > - template< typename VectorT > void GMRES< Matrix >:: -update( IndexType k, - IndexType m, - const Containers::Vector< RealType, Devices::Host, IndexType >& H, - const Containers::Vector< RealType, Devices::Host, IndexType >& s, - Containers::Vector< RealType, DeviceType, IndexType >& v, - VectorT& x ) +hauseholder_cwy( DeviceVector& v, + DeviceVector& Y, + HostVector& T, + const int i ) { - Containers::Vector< RealType, Devices::Host, IndexType > y; - y.setSize( m + 1 ); + // aux = Y_i^T * e_i + RealType aux[ i + 1 ]; + if( std::is_same< DeviceType, Devices::Host >::value ) { + for( int k = 0; k <= i; k++ ) + aux[ k ] = Y[ i + k * ldSize ]; + } + if( std::is_same< DeviceType, Devices::Cuda >::value ) { + // the upper (m+1)x(m+1) submatrix of Y is duplicated on host for fast access + for( int k = 0; k <= i; k++ ) + aux[ k ] = YL[ i + k * (restarting_max + 1) ]; + } + + // aux = T_i * aux + // Note that T_i is upper triangular, so we can overwrite the aux vector with the result in place + for( int k = 0; k <= i; k++ ) { + RealType aux2 = 0.0; + for( int j = k; j <= i; j++ ) + aux2 += T[ k + j * (restarting_max + 1) ] * aux[ j ]; + aux[ k ] = aux2; + } - IndexType i, j; - for( i = 0; i <= m ; i ++ ) + // v = e_i - Y_i * aux + MatrixOperations< DeviceType >::gemv( size, i + 1, + -1.0, Y.getData(), ldSize, aux, + 0.0, v.getData() ); + v.setElement( i, 1.0 + v.getElement( i ) ); +} + +template< typename Matrix > +void +GMRES< Matrix >:: +hauseholder_cwy_transposed( DeviceVector& z, + DeviceVector& Y, + HostVector& T, + const int i, + DeviceVector& w 
) +{ + // aux = Y_i^T * w + RealType aux[ i + 1 ]; + Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; + Containers::Algorithms::Multireduction< DeviceType >::reduce + ( scalarProduct, + i + 1, + size, + Y.getData(), + ldSize, + w.getData(), + aux ); + + // aux = T_i^T * aux + // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place + for( int k = i; k >= 0; k-- ) { + RealType aux2 = 0.0; + for( int j = 0; j <= k; j++ ) + aux2 += T[ j + k * (restarting_max + 1) ] * aux[ j ]; + aux[ k ] = aux2; + } + + // z = w - Y_i * aux + z = w; + MatrixOperations< DeviceType >::gemv( size, i + 1, + -1.0, Y.getData(), ldSize, aux, + 1.0, z.getData() ); +} + +template< typename Matrix > + template< typename Vector > +void +GMRES< Matrix >:: +update( const int k, + const int m, + const HostVector& H, + const HostVector& s, + DeviceVector& v, + Vector& x ) +{ + RealType y[ m + 1 ]; + + for( int i = 0; i <= m ; i ++ ) y[ i ] = s[ i ]; // Backsolve: - for( i = k; i >= 0; i--) - { - //cout << " y = " << y << std::endl; + for( int i = k; i >= 0; i--) { + if( H[ i + i * ( m + 1 ) ] == 0 ) { +// for( int _i = 0; _i <= i; _i++ ) { +// for( int _j = 0; _j < i; _j++ ) +// std::cout << H[ _i + _j * (m+1) ] << " "; +// std::cout << std::endl; +// } + std::cerr << "H.norm = " << H.lpNorm( 2.0 ) << std::endl; + std::cerr << "s = " << s << std::endl; + std::cerr << "k = " << k << ", m = " << m << std::endl; + throw 1; + } y[ i ] /= H[ i + i * ( m + 1 ) ]; - for( j = i - 1; j >= 0; j--) + for( int j = i - 1; j >= 0; j--) y[ j ] -= H[ j + i * ( m + 1 ) ] * y[ i ]; } - Containers::Vector< RealType, DeviceType, IndexType > vi; - for( i = 0; i <= k; i++) - { - vi.bind( &( v.getData()[ i * this->size ] ), x.getSize() ); - x.addVector( vi, y[ i ] ); - } + // x = V * y + x + MatrixOperations< DeviceType >::gemv( size, k + 1, + 1.0, v.getData(), ldSize, y, + 1.0, x.getData() ); } template< typename Matrix > @@ -342,24 +614,20 @@ generatePlaneRotation( RealType& dx, RealType& cs, RealType& sn ) { - if( dy == 0.0 ) - { + if( dy == 0.0 ) { cs = 1.0; sn = 0.0; } - else - if( std::fabs( dy ) > std::fabs( dx ) ) - { - RealType temp = dx / dy; - sn = 1.0 / std::sqrt( 1.0 + temp * temp ); - cs = temp * sn; - } - else - { - RealType temp = dy / dx; - cs = 1.0 / std::sqrt( 1.0 + temp * temp ); - sn = temp * cs; - } + else if( std::fabs( dy ) > std::fabs( dx ) ) { + const RealType temp = dx / dy; + sn = 1.0 / std::sqrt( 1.0 + temp * temp ); + cs = temp * sn; + } + else { + const RealType temp = dy / dx; + cs = 1.0 / std::sqrt( 1.0 + temp * temp ); + sn = temp * cs; + } } template< typename Matrix > @@ -370,28 +638,68 @@ applyPlaneRotation( RealType& dx, RealType& cs, RealType& sn ) { - RealType temp = cs * dx + sn * dy; - dy = cs * dy - sn * dx; + const RealType temp = cs * dx + sn * dy; + dy = cs * dy - sn * dx; dx = temp; } template< typename Matrix > void GMRES< Matrix >:: -setSize( IndexType _size, IndexType m ) +apply_givens_rotations( int i, int m ) +{ + for( int k = 0; k < i; k++ ) + applyPlaneRotation( H[ k + i * (m + 1) ], + H[ k + 1 + i * (m + 1) ], + cs[ k ], + sn[ k ] ); + + if( H[ i + 1 + i * (m + 1) ] != 0.0 ) { + generatePlaneRotation( H[ i + i * (m + 1) ], + H[ i + 1 + i * (m + 1) ], + cs[ i ], + sn[ i ]); + applyPlaneRotation( H[ i + i * (m + 1) ], + H[ i + 1 + i * (m + 1) ], + cs[ i ], + sn[ i ]); + applyPlaneRotation( s[ i ], + s[ i + 1 ], + cs[ i ], + sn[ i ] ); + } +} + +template< typename Matrix > +void +GMRES< Matrix >:: +setSize( 
const IndexType size ) { - if( size == _size && restarting_max == m ) - return; - size = _size; - restarting_max = m; - _r.setSize( size ); + this->size = size; + if( std::is_same< DeviceType, Devices::Cuda >::value ) + // align each column to 256 bytes - optimal for CUDA + ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ); + else + // on the host, we add 1 to disrupt the cache false-sharing pattern + ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ) + 1; + + const int m = restarting_max; + r.setSize( size ); w.setSize( size ); - _s.setSize( m + 1 ); - _cs.setSize( m + 1 ); - _sn.setSize( m + 1 ); - _v.setSize( size * ( m + 1 ) ); - _H.setSize( ( m + 1 ) * m ); + V.setSize( ldSize * ( m + 1 ) ); + cs.setSize( m + 1 ); + sn.setSize( m + 1 ); + H.setSize( ( m + 1 ) * m ); + s.setSize( m + 1 ); _M_tmp.setSize( size ); + + // CWY-specific storage + if( variant == Variant::CWY ) { + z.setSize( size ); + Y.setSize( ldSize * ( m + 1 ) ); + T.setSize( (m + 1) * (m + 1) ); + YL.setSize( (m + 1) * (m + 1) ); + } } } // namespace Linear diff --git a/src/TNL/Solvers/LinearSolverTypeResolver.h b/src/TNL/Solvers/LinearSolverTypeResolver.h index 8842038009..61a141f504 100644 --- a/src/TNL/Solvers/LinearSolverTypeResolver.h +++ b/src/TNL/Solvers/LinearSolverTypeResolver.h @@ -19,7 +19,6 @@ #include <TNL/Solvers/Linear/BICGStab.h> #include <TNL/Solvers/Linear/BICGStabL.h> #include <TNL/Solvers/Linear/GMRES.h> -#include <TNL/Solvers/Linear/CWYGMRES.h> #include <TNL/Solvers/Linear/TFQMR.h> #include <TNL/Solvers/Linear/UmfpackWrapper.h> #include <TNL/Solvers/Linear/Preconditioners/Diagonal.h> @@ -45,8 +44,6 @@ getLinearSolver( const Config::ParameterContainer& parameters ) return std::make_shared< Linear::BICGStabL< MatrixType > >(); if( discreteSolver == "gmres" ) return std::make_shared< Linear::GMRES< MatrixType > >(); - if( discreteSolver == "cwygmres" ) - return std::make_shared< Linear::CWYGMRES< MatrixType > >(); if( discreteSolver == "tfqmr" ) return std::make_shared< Linear::TFQMR< MatrixType > >(); #ifdef HAVE_UMFPACK @@ -54,7 +51,7 @@ getLinearSolver( const Config::ParameterContainer& parameters ) return std::make_shared< Linear::UmfpackWrapper< MatrixType > >(); #endif - std::cerr << "Unknown semi-implicit discrete solver " << discreteSolver << ". It can be only: sor, cg, bicgstab, bicgstabl, gmres, cwygmres, tfqmr"; + std::cerr << "Unknown semi-implicit discrete solver " << discreteSolver << ". 
It can be only: sor, cg, bicgstab, bicgstabl, gmres, tfqmr"; #ifdef HAVE_UMFPACK std::cerr << ", umfpack" #endif diff --git a/src/TNL/Solvers/SolverConfig_impl.h b/src/TNL/Solvers/SolverConfig_impl.h index bc86767e16..598cee285a 100644 --- a/src/TNL/Solvers/SolverConfig_impl.h +++ b/src/TNL/Solvers/SolverConfig_impl.h @@ -21,7 +21,6 @@ #include <TNL/Solvers/Linear/BICGStab.h> #include <TNL/Solvers/Linear/BICGStabL.h> #include <TNL/Solvers/Linear/GMRES.h> -#include <TNL/Solvers/Linear/CWYGMRES.h> #include <TNL/Solvers/Linear/TFQMR.h> #include <TNL/Solvers/Linear/UmfpackWrapper.h> #include <TNL/Solvers/Linear/Preconditioners/Diagonal.h> @@ -135,7 +134,6 @@ bool SolverConfig< ConfigTag, ProblemConfig >::configSetup( Config::ConfigDescri config.addEntryEnum( "cg" ); config.addEntryEnum( "bicgstab" ); config.addEntryEnum( "bicgstabl" ); - config.addEntryEnum( "cwygmres" ); config.addEntryEnum( "gmres" ); config.addEntryEnum( "tfqmr" ); config.addEntryEnum( "sor" ); @@ -174,10 +172,7 @@ bool SolverConfig< ConfigTag, ProblemConfig >::configSetup( Config::ConfigDescri Linear::CG< MatrixType >::configSetup( config ); Linear::BICGStab< MatrixType >::configSetup( config ); Linear::BICGStabL< MatrixType >::configSetup( config ); - - // GMRES and CWYGMRES have the same options Linear::GMRES< MatrixType >::configSetup( config ); - Linear::TFQMR< MatrixType >::configSetup( config ); Linear::SOR< MatrixType >::configSetup( config ); -- GitLab
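
Note for reviewers (illustration only, not part of the patch): the sketch below shows how the unified solver might be driven after this change. It relies only on the interface visible in the diff, namely configSetup(), setup(), solve() and the new "gmres-variant" config entry. The concrete matrix type, the includes, the way the ParameterContainer gets filled, and the exact setMatrix() signature are assumptions made for the example.

// Hypothetical usage sketch -- not part of the patch. Lines marked
// "assumption" are illustrative guesses; only configSetup(), setup(),
// solve() and the gmres-* config entries are taken from the diff above.
#include <memory>
#include <TNL/Solvers/Linear/GMRES.h>
#include <TNL/Matrices/CSR.h>   // assumption: any TNL matrix type would do here

using MatrixType = TNL::Matrices::CSR< double, TNL::Devices::Host, int >;   // assumption
using SolverType = TNL::Solvers::Linear::GMRES< MatrixType >;

bool
runGMRES( const std::shared_ptr< MatrixType >& matrix,   // assumption: pointer type accepted by setMatrix()
          SolverType::ConstVectorViewType b,
          SolverType::VectorViewType x,
          const TNL::Config::ParameterContainer& parameters )
{
   // Register the solver options, including the new "gmres-variant" entry
   // ("MGS", "MGSR" or "CWY"; the patch defaults to "CWY").
   TNL::Config::ConfigDescription config;
   SolverType::configSetup( config );

   // setup() now also selects the orthogonalization variant and returns
   // false when "gmres-variant" holds an unknown value.
   SolverType solver;
   if( ! solver.setup( parameters ) )
      return false;

   solver.setMatrix( matrix );   // must precede solve(), per the assert in solve()
   return solver.solve( b, x );
}

With TNL's usual option handling, the registered entries would presumably surface on the command line as --gmres-variant, --gmres-restarting-min and so on, so the variant can be switched without recompiling; exercising all three variants (MGS, MGSR, CWY) against the old CWYGMRES results seems worth a test before merging.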