Loading src/TNL/Solvers/Linear/GMRES.h +48 −27 Original line number Diff line number Diff line Loading @@ -14,8 +14,6 @@ #include "LinearSolver.h" #include <TNL/Containers/Vector.h> namespace TNL { namespace Solvers { namespace Linear { Loading @@ -25,12 +23,19 @@ class GMRES : public LinearSolver< Matrix > { using Base = LinearSolver< Matrix >; // compatibility shortcuts using Traits = Linear::Traits< Matrix >; using CommunicatorType = typename Traits::CommunicatorType; public: using RealType = typename Base::RealType; using DeviceType = typename Base::DeviceType; using IndexType = typename Base::IndexType; // distributed vectors/views using VectorViewType = typename Base::VectorViewType; using ConstVectorViewType = typename Base::ConstVectorViewType; using VectorType = typename Traits::VectorType; String getType() const; Loading @@ -43,11 +48,12 @@ public: bool solve( ConstVectorViewType b, VectorViewType x ) override; protected: using ConstDeviceView = Containers::VectorView< const RealType, DeviceType, IndexType >; using DeviceView = Containers::VectorView< RealType, DeviceType, IndexType >; using HostView = Containers::VectorView< RealType, Devices::Host, IndexType >; using DeviceVector = Containers::Vector< RealType, DeviceType, IndexType >; using HostVector = Containers::Vector< RealType, Devices::Host, IndexType >; // local vectors/views using ConstDeviceView = typename Traits::ConstLocalVectorViewType; using DeviceView = typename Traits::LocalVectorViewType; using HostView = typename DeviceView::HostType; using DeviceVector = typename Traits::LocalVectorType; using HostVector = typename DeviceVector::HostType; enum class Variant { MGS, MGSR, CWY }; Loading @@ -57,36 +63,37 @@ protected: void compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b ); void preconditioned_matvec( DeviceVector& w, ConstDeviceView v ); void preconditioned_matvec( VectorViewType w, ConstVectorViewType v ); void hauseholder_generate( DeviceVector& Y, HostVector& T, const int i, DeviceVector& w ); // nvcc allows __cuda_callable__ lambdas only in public methods #ifdef __NVCC__ public: #endif void hauseholder_generate( const int i, VectorViewType y_i, ConstVectorViewType z ); #ifdef __NVCC__ protected: #endif void hauseholder_apply_trunc( HostView out, DeviceVector& Y, HostVector& T, const int i, DeviceVector& w ); VectorViewType y_i, ConstVectorViewType z ); void hauseholder_cwy( DeviceVector& w, DeviceVector& Y, HostVector& T, void hauseholder_cwy( VectorViewType v, const int i ); void hauseholder_cwy_transposed( DeviceVector& w, DeviceVector& Y, HostVector& T, void hauseholder_cwy_transposed( VectorViewType z, const int i, DeviceVector& z ); ConstVectorViewType w ); template< typename Vector > void update( const int k, const int m, const HostVector& H, const HostVector& s, DeviceVector& v, DeviceVector& V, Vector& x ); void generatePlaneRotation( RealType& dx, Loading @@ -101,15 +108,28 @@ protected: void apply_givens_rotations( const int i, const int m ); void setSize( const VectorViewType& x ); // Specialized methods to distinguish between normal and distributed matrices // in the implementation. template< typename M > static IndexType getLocalOffset( const M& m ) { return 0; } void setSize( const IndexType size ); template< typename M > static IndexType getLocalOffset( const DistributedContainers::DistributedMatrix< M >& m ) { return m.getLocalRowRange().getBegin(); } // selected GMRES variant Variant variant = Variant::CWY; // single vectors DeviceVector r, w, z, _M_tmp; // matrices (in column-major format) // single vectors (distributed) VectorType r, w, z, _M_tmp; // matrices (in column-major format) (local) DeviceVector V, Y; // (CWY only) duplicate of the upper (m+1)x(m+1) submatrix of Y (it is lower triangular) for fast access HostVector YL, T; Loading @@ -118,6 +138,7 @@ protected: IndexType size = 0; IndexType ldSize = 0; IndexType localOffset = 0; int restarting_min = 10; int restarting_max = 10; int restarting_step_min = 3; Loading src/TNL/Solvers/Linear/GMRES_impl.h +104 −124 Original line number Diff line number Diff line Loading @@ -91,7 +91,7 @@ solve( ConstVectorViewType b, VectorViewType x ) << ", d_max = " << restarting_step_max << std::endl; return false; } setSize( this->matrix->getRows() ); setSize( x ); // initialize the norm of the preconditioned right-hand-side RealType normb; Loading Loading @@ -183,13 +183,15 @@ int GMRES< Matrix >:: orthogonalize_MGS( const int m, const RealType normb, const RealType beta ) { DeviceView vi, vk; // initial binding to _M_tmp sets the correct local range, global size and // communication group for distributed views VectorViewType v_i( _M_tmp ), v_k( _M_tmp ); /*** * v_0 = r / | r | = 1.0 / beta * r */ vi.bind( V.getData(), ldSize ); vi.addVector( r, 1.0 / beta, 0.0 ); v_i.bind( V.getData(), size ); v_i.addVector( r, 1.0 / beta, 0.0 ); H.setValue( 0.0 ); s.setValue( 0.0 ); Loading @@ -199,28 +201,28 @@ orthogonalize_MGS( const int m, const RealType normb, const RealType beta ) * Starting m-loop */ for( int i = 0; i < m && this->nextIteration(); i++ ) { vi.bind( &( V.getData()[ i * ldSize ] ), size ); v_i.bind( &V.getData()[ i * ldSize ], size ); /**** * Solve w from M w = A v_i */ preconditioned_matvec( w, vi ); preconditioned_matvec( w, v_i ); for( int k = 0; k <= i; k++ ) H[ k + i * ( m + 1 ) ] = 0.0; const int reorthogonalize = (variant == Variant::MGSR) ? 2 : 1; for( int l = 0; l < reorthogonalize; l++ ) for( int k = 0; k <= i; k++ ) { vk.bind( &( V.getData()[ k * ldSize ] ), size ); v_k.bind( &V.getData()[ k * ldSize ], size ); /*** * H_{k,i} = ( w, v_k ) */ RealType H_k_i = w.scalarProduct( vk ); RealType H_k_i = w.scalarProduct( v_k ); H[ k + i * ( m + 1 ) ] += H_k_i; /**** * w = w - H_{k,i} v_k */ w.addVector( vk, -H_k_i ); w.addVector( v_k, -H_k_i ); } /*** * H_{i+1,i} = |w| Loading @@ -231,8 +233,8 @@ orthogonalize_MGS( const int m, const RealType normb, const RealType beta ) /*** * v_{i+1} = w / |w| */ vi.bind( &( V.getData()[ ( i + 1 ) * ldSize ] ), size ); vi.addVector( w, 1.0 / normw, 0.0 ); v_i.bind( &V.getData()[ ( i + 1 ) * ldSize ], size ); v_i.addVector( w, 1.0 / normw, 0.0 ); /**** * Applying the Givens rotations G_0, ..., G_i Loading @@ -254,7 +256,9 @@ int GMRES< Matrix >:: orthogonalize_CWY( const int m, const RealType normb, const RealType beta ) { DeviceVector vi, vk; // initial binding to _M_tmp sets the correct local range, global size and // communication group for distributed views VectorViewType v_i( _M_tmp ), y_i( _M_tmp ); /*** * z = r / | r | = 1.0 / beta * r Loading @@ -278,40 +282,41 @@ orthogonalize_CWY( const int m, const RealType normb, const RealType beta ) /**** * Generate new Hauseholder transformation from vector z. */ hauseholder_generate( Y, T, i, z ); y_i.bind( &Y.getData()[ i * ldSize ], size ); hauseholder_generate( i, y_i, z ); if( i == 0 ) { /**** * s = e_1^T * P_i * z */ hauseholder_apply_trunc( s, Y, T, i, z ); hauseholder_apply_trunc( s, i, y_i, z ); } else { /*** * H_{i-1} = P_i * z */ HostView h( &H.getData()[ (i - 1) * (m + 1) ], m + 1 ); hauseholder_apply_trunc( h, Y, T, i, z ); hauseholder_apply_trunc( h, i, y_i, z ); } /*** * Generate new basis vector v_i, using the compact WY representation: * v_i = (I - Y_i * T_i Y_i^T) * e_i */ vi.bind( &V.getData()[ i * ldSize ], size ); hauseholder_cwy( vi, Y, T, i ); v_i.bind( &V.getData()[ i * ldSize ], size ); hauseholder_cwy( v_i, i ); if( i < m ) { /**** * Solve w from M w = A v_i */ preconditioned_matvec( w, vi ); preconditioned_matvec( w, v_i ); /**** * Apply all previous Hauseholder transformations, using the compact WY representation: * z = (I - Y_i * T_i^T * Y_i^T) * w */ hauseholder_cwy_transposed( z, Y, T, i, w ); hauseholder_cwy_transposed( z, i, w ); } /**** Loading @@ -321,7 +326,7 @@ orthogonalize_CWY( const int m, const RealType normb, const RealType beta ) apply_givens_rotations( i - 1, m ); this->setResidue( std::fabs( s[ i ] ) / normb ); if( ! this->checkNextIteration() ) if( i > 0 && ! this->checkNextIteration() ) return i - 1; else this->refreshSolverMonitor(); Loading Loading @@ -352,7 +357,7 @@ compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b template< typename Matrix > void GMRES< Matrix >:: preconditioned_matvec( DeviceVector& w, ConstDeviceView v ) preconditioned_matvec( VectorViewType w, ConstVectorViewType v ) { /**** * w = M.solve(A * v_i); Loading @@ -365,67 +370,35 @@ preconditioned_matvec( DeviceVector& w, ConstDeviceView v ) this->matrix->vectorProduct( v, w ); } #ifdef HAVE_CUDA template< typename DestinationElement, typename SourceElement, typename Index > __global__ void copyTruncatedVectorKernel( DestinationElement* destination, const SourceElement* source, const Index from, const Index size ) { Index elementIdx = blockIdx.x * blockDim.x + threadIdx.x; const Index gridSize = blockDim.x * gridDim.x; while( elementIdx < from ) { destination[ elementIdx ] = (DestinationElement) 0.0; elementIdx += gridSize; } while( elementIdx < size ) { destination[ elementIdx ] = source[ elementIdx ]; elementIdx += gridSize; } } #endif template< typename Matrix > void GMRES< Matrix >:: hauseholder_generate( DeviceVector& Y, HostVector& T, const int i, DeviceVector& z ) hauseholder_generate( const int i, VectorViewType y_i, ConstVectorViewType z ) { DeviceView y_i( &Y.getData()[ i * ldSize ], size ); // XXX: the upper-right triangle of Y will be full of zeros, which can be exploited for optimization if( std::is_same< DeviceType, Devices::Host >::value ) { for( IndexType j = 0; j < size; j++ ) { if( localOffset == 0 ) { TNL_ASSERT_LT( i, size, "upper-right triangle of Y is not on rank 0" ); auto kernel_truncation = [=] __cuda_callable__ ( IndexType j ) mutable { if( j < i ) y_i[ j ] = 0.0; else y_i[ j ] = z[ j ]; }; ParallelFor< DeviceType >::exec( (IndexType) 0, size, kernel_truncation ); } } if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA dim3 blockSize( 256 ); dim3 gridSize; gridSize.x = min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( size, blockSize.x ) ); copyTruncatedVectorKernel<<< gridSize, blockSize >>>( y_i.getData(), z.getData(), i, size ); TNL_CHECK_CUDA_DEVICE; #else throw Exceptions::CudaSupportMissing(); #endif else { ConstDeviceView z_local = Traits::getLocalVectorView( z ); DeviceView y_i_local = Traits::getLocalVectorView( y_i ); y_i_local = z_local; } // norm of the TRUNCATED vector z const RealType normz = y_i.lpNorm( 2.0 ); RealType norm_yi = 0; if( localOffset == 0 ) { const RealType y_ii = y_i.getElement( i ); if( y_ii > 0.0 ) y_i.setElement( i, y_ii + normz ); Loading @@ -434,7 +407,10 @@ hauseholder_generate( DeviceVector& Y, // compute the norm of the y_i vector; equivalent to this calculation by definition: // const RealType norm_yi = y_i.lpNorm( 2.0 ); const RealType norm_yi = std::sqrt( 2 * (normz * normz + std::fabs( y_ii ) * normz) ); norm_yi = std::sqrt( 2 * (normz * normz + std::fabs( y_ii ) * normz) ); } // no-op if the problem is not distributed CommunicatorType::Bcast( &norm_yi, 1, 0, Traits::getCommunicationGroup( *this->matrix ) ); // XXX: normalization is slower, but more stable // y_i *= 1.0 / norm_yi; Loading @@ -453,8 +429,10 @@ hauseholder_generate( DeviceVector& Y, size, Y.getData(), ldSize, y_i.getData(), Traits::getLocalVectorView( y_i ).getData(), aux ); // no-op if the problem is not distributed CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux for( int k = 0; k < i; k++ ) { Loading @@ -469,49 +447,49 @@ template< typename Matrix > void GMRES< Matrix >:: hauseholder_apply_trunc( HostView out, DeviceVector& Y, HostVector& T, const int i, DeviceVector& z ) VectorViewType y_i, ConstVectorViewType z ) { DeviceView y_i( &Y.getData()[ i * ldSize ], size ); // copy part of y_i to the YL buffer // The upper (m+1)x(m+1) submatrix of Y is duplicated in the YL buffer, // which resides on host and is broadcasted from rank 0 to all processes. HostView YL_i( &YL[ i * (restarting_max + 1) ], restarting_max + 1 ); Containers::Algorithms::ArrayOperations< Devices::Host, DeviceType >::copyMemory( YL_i.getData(), Traits::getLocalVectorView( y_i ).getData(), YL_i.getSize() ); // no-op if the problem is not distributed CommunicatorType::Bcast( YL_i.getData(), YL_i.getSize(), 0, Traits::getCommunicationGroup( *this->matrix ) ); // TODO: is aux always 1? const RealType aux = T[ i + i * (restarting_max + 1) ] * y_i.scalarProduct( z ); if( localOffset == 0 ) { if( std::is_same< DeviceType, Devices::Host >::value ) { for( int k = 0; k <= i; k++ ) out[ k ] = z[ k ] - y_i[ k ] * aux; } if( std::is_same< DeviceType, Devices::Cuda >::value ) { // copy part of y_i to buffer on host // here we duplicate the upper (m+1)x(m+1) submatrix of Y on host for fast access RealType* host_yi = &YL[ i * (restarting_max + 1) ]; RealType host_z[ i + 1 ]; Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_yi, y_i.getData(), restarting_max + 1 ); Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_z, z.getData(), i + 1 ); Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( host_z, Traits::getLocalVectorView( z ).getData(), i + 1 ); for( int k = 0; k <= i; k++ ) out[ k ] = host_z[ k ] - host_yi[ k ] * aux; out[ k ] = host_z[ k ] - YL_i[ k ] * aux; } } // no-op if the problem is not distributed CommunicatorType::Bcast( out.getData(), i + 1, 0, Traits::getCommunicationGroup( *this->matrix ) ); } template< typename Matrix > void GMRES< Matrix >:: hauseholder_cwy( DeviceVector& v, DeviceVector& Y, HostVector& T, hauseholder_cwy( VectorViewType v, const int i ) { // aux = Y_i^T * e_i RealType aux[ i + 1 ]; if( std::is_same< DeviceType, Devices::Host >::value ) { for( int k = 0; k <= i; k++ ) aux[ k ] = Y[ i + k * ldSize ]; } if( std::is_same< DeviceType, Devices::Cuda >::value ) { // the upper (m+1)x(m+1) submatrix of Y is duplicated on host for fast access // the upper (m+1)x(m+1) submatrix of Y is duplicated on host // (faster access than from the device and it is broadcasted to all processes) for( int k = 0; k <= i; k++ ) aux[ k ] = YL[ i + k * (restarting_max + 1) ]; } // aux = T_i * aux // Note that T_i is upper triangular, so we can overwrite the aux vector with the result in place Loading @@ -525,18 +503,17 @@ hauseholder_cwy( DeviceVector& v, // v = e_i - Y_i * aux MatrixOperations< DeviceType >::gemv( size, i + 1, -1.0, Y.getData(), ldSize, aux, 0.0, v.getData() ); 0.0, Traits::getLocalVectorView( v ).getData() ); if( localOffset == 0 ) v.setElement( i, 1.0 + v.getElement( i ) ); } template< typename Matrix > void GMRES< Matrix >:: hauseholder_cwy_transposed( DeviceVector& z, DeviceVector& Y, HostVector& T, hauseholder_cwy_transposed( VectorViewType z, const int i, DeviceVector& w ) ConstVectorViewType w ) { // aux = Y_i^T * w RealType aux[ i + 1 ]; Loading @@ -547,8 +524,10 @@ hauseholder_cwy_transposed( DeviceVector& z, size, Y.getData(), ldSize, w.getData(), Traits::getLocalVectorView( w ).getData(), aux ); // no-op if the problem is not distributed Traits::CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); // aux = T_i^T * aux // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place Loading @@ -563,7 +542,7 @@ hauseholder_cwy_transposed( DeviceVector& z, z = w; MatrixOperations< DeviceType >::gemv( size, i + 1, -1.0, Y.getData(), ldSize, aux, 1.0, z.getData() ); 1.0, Traits::getLocalVectorView( z ).getData() ); } template< typename Matrix > Loading @@ -574,7 +553,7 @@ update( const int k, const int m, const HostVector& H, const HostVector& s, DeviceVector& v, DeviceVector& V, Vector& x ) { RealType y[ m + 1 ]; Loading Loading @@ -602,8 +581,8 @@ update( const int k, // x = V * y + x MatrixOperations< DeviceType >::gemv( size, k + 1, 1.0, v.getData(), ldSize, y, 1.0, x.getData() ); 1.0, V.getData(), ldSize, y, 1.0, Traits::getLocalVectorView( x ).getData() ); } template< typename Matrix > Loading Loading @@ -673,29 +652,30 @@ apply_givens_rotations( int i, int m ) template< typename Matrix > void GMRES< Matrix >:: setSize( const IndexType size ) setSize( const VectorViewType& x ) { this->size = size; this->size = Traits::getLocalVectorView( x ).getSize(); if( std::is_same< DeviceType, Devices::Cuda >::value ) // align each column to 256 bytes - optimal for CUDA ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ); else // on the host, we add 1 to disrupt the cache false-sharing pattern ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ) + 1; localOffset = getLocalOffset( *this->matrix ); const int m = restarting_max; r.setSize( size ); w.setSize( size ); r.setLike( x ); w.setLike( x ); _M_tmp.setLike( x ); V.setSize( ldSize * ( m + 1 ) ); cs.setSize( m + 1 ); sn.setSize( m + 1 ); H.setSize( ( m + 1 ) * m ); s.setSize( m + 1 ); _M_tmp.setSize( size ); // CWY-specific storage if( variant == Variant::CWY ) { z.setSize( size ); z.setLike( x ); Y.setSize( ldSize * ( m + 1 ) ); T.setSize( (m + 1) * (m + 1) ); YL.setSize( (m + 1) * (m + 1) ); Loading src/TNL/Solvers/Linear/Traits.h +11 −0 Original line number Diff line number Diff line Loading @@ -12,7 +12,10 @@ #pragma once #include <TNL/Communicators/NoDistrCommunicator.h> #include <TNL/Containers/Vector.h> #include <TNL/Containers/VectorView.h> #include <TNL/DistributedContainers/DistributedVector.h> #include <TNL/DistributedContainers/DistributedVectorView.h> #include <TNL/DistributedContainers/DistributedMatrix.h> Loading @@ -23,6 +26,8 @@ namespace Linear { template< typename Matrix > struct Traits { using CommunicatorType = Communicators::NoDistrCommunicator; using VectorType = Containers::Vector < typename Matrix::RealType, typename Matrix::DeviceType, Loading @@ -45,11 +50,15 @@ struct Traits static const Matrix& getLocalMatrix( const Matrix& m ) { return m; } static ConstLocalVectorViewType getLocalVectorView( ConstVectorViewType v ) { return v; } static LocalVectorViewType getLocalVectorView( VectorViewType v ) { return v; } static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrix& m ) { return CommunicatorType::AllGroup; } }; template< typename Matrix, typename Communicator > struct Traits< DistributedContainers::DistributedMatrix< Matrix, Communicator > > { using CommunicatorType = Communicator; using VectorType = DistributedContainers::DistributedVector < typename Matrix::RealType, typename Matrix::DeviceType, Loading Loading @@ -84,6 +93,8 @@ struct Traits< DistributedContainers::DistributedMatrix< Matrix, Communicator > { return m.getLocalMatrix(); } static ConstLocalVectorViewType getLocalVectorView( ConstVectorViewType v ) { return v.getLocalVectorView(); } static LocalVectorViewType getLocalVectorView( VectorViewType v ) { return v.getLocalVectorView(); } static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const DistributedContainers::DistributedMatrix< Matrix, Communicator >& m ) { return m.getCommunicationGroup(); } }; } // namespace Linear Loading Loading
src/TNL/Solvers/Linear/GMRES.h +48 −27 Original line number Diff line number Diff line Loading @@ -14,8 +14,6 @@ #include "LinearSolver.h" #include <TNL/Containers/Vector.h> namespace TNL { namespace Solvers { namespace Linear { Loading @@ -25,12 +23,19 @@ class GMRES : public LinearSolver< Matrix > { using Base = LinearSolver< Matrix >; // compatibility shortcuts using Traits = Linear::Traits< Matrix >; using CommunicatorType = typename Traits::CommunicatorType; public: using RealType = typename Base::RealType; using DeviceType = typename Base::DeviceType; using IndexType = typename Base::IndexType; // distributed vectors/views using VectorViewType = typename Base::VectorViewType; using ConstVectorViewType = typename Base::ConstVectorViewType; using VectorType = typename Traits::VectorType; String getType() const; Loading @@ -43,11 +48,12 @@ public: bool solve( ConstVectorViewType b, VectorViewType x ) override; protected: using ConstDeviceView = Containers::VectorView< const RealType, DeviceType, IndexType >; using DeviceView = Containers::VectorView< RealType, DeviceType, IndexType >; using HostView = Containers::VectorView< RealType, Devices::Host, IndexType >; using DeviceVector = Containers::Vector< RealType, DeviceType, IndexType >; using HostVector = Containers::Vector< RealType, Devices::Host, IndexType >; // local vectors/views using ConstDeviceView = typename Traits::ConstLocalVectorViewType; using DeviceView = typename Traits::LocalVectorViewType; using HostView = typename DeviceView::HostType; using DeviceVector = typename Traits::LocalVectorType; using HostVector = typename DeviceVector::HostType; enum class Variant { MGS, MGSR, CWY }; Loading @@ -57,36 +63,37 @@ protected: void compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b ); void preconditioned_matvec( DeviceVector& w, ConstDeviceView v ); void preconditioned_matvec( VectorViewType w, ConstVectorViewType v ); void hauseholder_generate( DeviceVector& Y, HostVector& T, const int i, DeviceVector& w ); // nvcc allows __cuda_callable__ lambdas only in public methods #ifdef __NVCC__ public: #endif void hauseholder_generate( const int i, VectorViewType y_i, ConstVectorViewType z ); #ifdef __NVCC__ protected: #endif void hauseholder_apply_trunc( HostView out, DeviceVector& Y, HostVector& T, const int i, DeviceVector& w ); VectorViewType y_i, ConstVectorViewType z ); void hauseholder_cwy( DeviceVector& w, DeviceVector& Y, HostVector& T, void hauseholder_cwy( VectorViewType v, const int i ); void hauseholder_cwy_transposed( DeviceVector& w, DeviceVector& Y, HostVector& T, void hauseholder_cwy_transposed( VectorViewType z, const int i, DeviceVector& z ); ConstVectorViewType w ); template< typename Vector > void update( const int k, const int m, const HostVector& H, const HostVector& s, DeviceVector& v, DeviceVector& V, Vector& x ); void generatePlaneRotation( RealType& dx, Loading @@ -101,15 +108,28 @@ protected: void apply_givens_rotations( const int i, const int m ); void setSize( const VectorViewType& x ); // Specialized methods to distinguish between normal and distributed matrices // in the implementation. template< typename M > static IndexType getLocalOffset( const M& m ) { return 0; } void setSize( const IndexType size ); template< typename M > static IndexType getLocalOffset( const DistributedContainers::DistributedMatrix< M >& m ) { return m.getLocalRowRange().getBegin(); } // selected GMRES variant Variant variant = Variant::CWY; // single vectors DeviceVector r, w, z, _M_tmp; // matrices (in column-major format) // single vectors (distributed) VectorType r, w, z, _M_tmp; // matrices (in column-major format) (local) DeviceVector V, Y; // (CWY only) duplicate of the upper (m+1)x(m+1) submatrix of Y (it is lower triangular) for fast access HostVector YL, T; Loading @@ -118,6 +138,7 @@ protected: IndexType size = 0; IndexType ldSize = 0; IndexType localOffset = 0; int restarting_min = 10; int restarting_max = 10; int restarting_step_min = 3; Loading
src/TNL/Solvers/Linear/GMRES_impl.h +104 −124 Original line number Diff line number Diff line Loading @@ -91,7 +91,7 @@ solve( ConstVectorViewType b, VectorViewType x ) << ", d_max = " << restarting_step_max << std::endl; return false; } setSize( this->matrix->getRows() ); setSize( x ); // initialize the norm of the preconditioned right-hand-side RealType normb; Loading Loading @@ -183,13 +183,15 @@ int GMRES< Matrix >:: orthogonalize_MGS( const int m, const RealType normb, const RealType beta ) { DeviceView vi, vk; // initial binding to _M_tmp sets the correct local range, global size and // communication group for distributed views VectorViewType v_i( _M_tmp ), v_k( _M_tmp ); /*** * v_0 = r / | r | = 1.0 / beta * r */ vi.bind( V.getData(), ldSize ); vi.addVector( r, 1.0 / beta, 0.0 ); v_i.bind( V.getData(), size ); v_i.addVector( r, 1.0 / beta, 0.0 ); H.setValue( 0.0 ); s.setValue( 0.0 ); Loading @@ -199,28 +201,28 @@ orthogonalize_MGS( const int m, const RealType normb, const RealType beta ) * Starting m-loop */ for( int i = 0; i < m && this->nextIteration(); i++ ) { vi.bind( &( V.getData()[ i * ldSize ] ), size ); v_i.bind( &V.getData()[ i * ldSize ], size ); /**** * Solve w from M w = A v_i */ preconditioned_matvec( w, vi ); preconditioned_matvec( w, v_i ); for( int k = 0; k <= i; k++ ) H[ k + i * ( m + 1 ) ] = 0.0; const int reorthogonalize = (variant == Variant::MGSR) ? 2 : 1; for( int l = 0; l < reorthogonalize; l++ ) for( int k = 0; k <= i; k++ ) { vk.bind( &( V.getData()[ k * ldSize ] ), size ); v_k.bind( &V.getData()[ k * ldSize ], size ); /*** * H_{k,i} = ( w, v_k ) */ RealType H_k_i = w.scalarProduct( vk ); RealType H_k_i = w.scalarProduct( v_k ); H[ k + i * ( m + 1 ) ] += H_k_i; /**** * w = w - H_{k,i} v_k */ w.addVector( vk, -H_k_i ); w.addVector( v_k, -H_k_i ); } /*** * H_{i+1,i} = |w| Loading @@ -231,8 +233,8 @@ orthogonalize_MGS( const int m, const RealType normb, const RealType beta ) /*** * v_{i+1} = w / |w| */ vi.bind( &( V.getData()[ ( i + 1 ) * ldSize ] ), size ); vi.addVector( w, 1.0 / normw, 0.0 ); v_i.bind( &V.getData()[ ( i + 1 ) * ldSize ], size ); v_i.addVector( w, 1.0 / normw, 0.0 ); /**** * Applying the Givens rotations G_0, ..., G_i Loading @@ -254,7 +256,9 @@ int GMRES< Matrix >:: orthogonalize_CWY( const int m, const RealType normb, const RealType beta ) { DeviceVector vi, vk; // initial binding to _M_tmp sets the correct local range, global size and // communication group for distributed views VectorViewType v_i( _M_tmp ), y_i( _M_tmp ); /*** * z = r / | r | = 1.0 / beta * r Loading @@ -278,40 +282,41 @@ orthogonalize_CWY( const int m, const RealType normb, const RealType beta ) /**** * Generate new Hauseholder transformation from vector z. */ hauseholder_generate( Y, T, i, z ); y_i.bind( &Y.getData()[ i * ldSize ], size ); hauseholder_generate( i, y_i, z ); if( i == 0 ) { /**** * s = e_1^T * P_i * z */ hauseholder_apply_trunc( s, Y, T, i, z ); hauseholder_apply_trunc( s, i, y_i, z ); } else { /*** * H_{i-1} = P_i * z */ HostView h( &H.getData()[ (i - 1) * (m + 1) ], m + 1 ); hauseholder_apply_trunc( h, Y, T, i, z ); hauseholder_apply_trunc( h, i, y_i, z ); } /*** * Generate new basis vector v_i, using the compact WY representation: * v_i = (I - Y_i * T_i Y_i^T) * e_i */ vi.bind( &V.getData()[ i * ldSize ], size ); hauseholder_cwy( vi, Y, T, i ); v_i.bind( &V.getData()[ i * ldSize ], size ); hauseholder_cwy( v_i, i ); if( i < m ) { /**** * Solve w from M w = A v_i */ preconditioned_matvec( w, vi ); preconditioned_matvec( w, v_i ); /**** * Apply all previous Hauseholder transformations, using the compact WY representation: * z = (I - Y_i * T_i^T * Y_i^T) * w */ hauseholder_cwy_transposed( z, Y, T, i, w ); hauseholder_cwy_transposed( z, i, w ); } /**** Loading @@ -321,7 +326,7 @@ orthogonalize_CWY( const int m, const RealType normb, const RealType beta ) apply_givens_rotations( i - 1, m ); this->setResidue( std::fabs( s[ i ] ) / normb ); if( ! this->checkNextIteration() ) if( i > 0 && ! this->checkNextIteration() ) return i - 1; else this->refreshSolverMonitor(); Loading Loading @@ -352,7 +357,7 @@ compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b template< typename Matrix > void GMRES< Matrix >:: preconditioned_matvec( DeviceVector& w, ConstDeviceView v ) preconditioned_matvec( VectorViewType w, ConstVectorViewType v ) { /**** * w = M.solve(A * v_i); Loading @@ -365,67 +370,35 @@ preconditioned_matvec( DeviceVector& w, ConstDeviceView v ) this->matrix->vectorProduct( v, w ); } #ifdef HAVE_CUDA template< typename DestinationElement, typename SourceElement, typename Index > __global__ void copyTruncatedVectorKernel( DestinationElement* destination, const SourceElement* source, const Index from, const Index size ) { Index elementIdx = blockIdx.x * blockDim.x + threadIdx.x; const Index gridSize = blockDim.x * gridDim.x; while( elementIdx < from ) { destination[ elementIdx ] = (DestinationElement) 0.0; elementIdx += gridSize; } while( elementIdx < size ) { destination[ elementIdx ] = source[ elementIdx ]; elementIdx += gridSize; } } #endif template< typename Matrix > void GMRES< Matrix >:: hauseholder_generate( DeviceVector& Y, HostVector& T, const int i, DeviceVector& z ) hauseholder_generate( const int i, VectorViewType y_i, ConstVectorViewType z ) { DeviceView y_i( &Y.getData()[ i * ldSize ], size ); // XXX: the upper-right triangle of Y will be full of zeros, which can be exploited for optimization if( std::is_same< DeviceType, Devices::Host >::value ) { for( IndexType j = 0; j < size; j++ ) { if( localOffset == 0 ) { TNL_ASSERT_LT( i, size, "upper-right triangle of Y is not on rank 0" ); auto kernel_truncation = [=] __cuda_callable__ ( IndexType j ) mutable { if( j < i ) y_i[ j ] = 0.0; else y_i[ j ] = z[ j ]; }; ParallelFor< DeviceType >::exec( (IndexType) 0, size, kernel_truncation ); } } if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA dim3 blockSize( 256 ); dim3 gridSize; gridSize.x = min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( size, blockSize.x ) ); copyTruncatedVectorKernel<<< gridSize, blockSize >>>( y_i.getData(), z.getData(), i, size ); TNL_CHECK_CUDA_DEVICE; #else throw Exceptions::CudaSupportMissing(); #endif else { ConstDeviceView z_local = Traits::getLocalVectorView( z ); DeviceView y_i_local = Traits::getLocalVectorView( y_i ); y_i_local = z_local; } // norm of the TRUNCATED vector z const RealType normz = y_i.lpNorm( 2.0 ); RealType norm_yi = 0; if( localOffset == 0 ) { const RealType y_ii = y_i.getElement( i ); if( y_ii > 0.0 ) y_i.setElement( i, y_ii + normz ); Loading @@ -434,7 +407,10 @@ hauseholder_generate( DeviceVector& Y, // compute the norm of the y_i vector; equivalent to this calculation by definition: // const RealType norm_yi = y_i.lpNorm( 2.0 ); const RealType norm_yi = std::sqrt( 2 * (normz * normz + std::fabs( y_ii ) * normz) ); norm_yi = std::sqrt( 2 * (normz * normz + std::fabs( y_ii ) * normz) ); } // no-op if the problem is not distributed CommunicatorType::Bcast( &norm_yi, 1, 0, Traits::getCommunicationGroup( *this->matrix ) ); // XXX: normalization is slower, but more stable // y_i *= 1.0 / norm_yi; Loading @@ -453,8 +429,10 @@ hauseholder_generate( DeviceVector& Y, size, Y.getData(), ldSize, y_i.getData(), Traits::getLocalVectorView( y_i ).getData(), aux ); // no-op if the problem is not distributed CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux for( int k = 0; k < i; k++ ) { Loading @@ -469,49 +447,49 @@ template< typename Matrix > void GMRES< Matrix >:: hauseholder_apply_trunc( HostView out, DeviceVector& Y, HostVector& T, const int i, DeviceVector& z ) VectorViewType y_i, ConstVectorViewType z ) { DeviceView y_i( &Y.getData()[ i * ldSize ], size ); // copy part of y_i to the YL buffer // The upper (m+1)x(m+1) submatrix of Y is duplicated in the YL buffer, // which resides on host and is broadcasted from rank 0 to all processes. HostView YL_i( &YL[ i * (restarting_max + 1) ], restarting_max + 1 ); Containers::Algorithms::ArrayOperations< Devices::Host, DeviceType >::copyMemory( YL_i.getData(), Traits::getLocalVectorView( y_i ).getData(), YL_i.getSize() ); // no-op if the problem is not distributed CommunicatorType::Bcast( YL_i.getData(), YL_i.getSize(), 0, Traits::getCommunicationGroup( *this->matrix ) ); // TODO: is aux always 1? const RealType aux = T[ i + i * (restarting_max + 1) ] * y_i.scalarProduct( z ); if( localOffset == 0 ) { if( std::is_same< DeviceType, Devices::Host >::value ) { for( int k = 0; k <= i; k++ ) out[ k ] = z[ k ] - y_i[ k ] * aux; } if( std::is_same< DeviceType, Devices::Cuda >::value ) { // copy part of y_i to buffer on host // here we duplicate the upper (m+1)x(m+1) submatrix of Y on host for fast access RealType* host_yi = &YL[ i * (restarting_max + 1) ]; RealType host_z[ i + 1 ]; Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_yi, y_i.getData(), restarting_max + 1 ); Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_z, z.getData(), i + 1 ); Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( host_z, Traits::getLocalVectorView( z ).getData(), i + 1 ); for( int k = 0; k <= i; k++ ) out[ k ] = host_z[ k ] - host_yi[ k ] * aux; out[ k ] = host_z[ k ] - YL_i[ k ] * aux; } } // no-op if the problem is not distributed CommunicatorType::Bcast( out.getData(), i + 1, 0, Traits::getCommunicationGroup( *this->matrix ) ); } template< typename Matrix > void GMRES< Matrix >:: hauseholder_cwy( DeviceVector& v, DeviceVector& Y, HostVector& T, hauseholder_cwy( VectorViewType v, const int i ) { // aux = Y_i^T * e_i RealType aux[ i + 1 ]; if( std::is_same< DeviceType, Devices::Host >::value ) { for( int k = 0; k <= i; k++ ) aux[ k ] = Y[ i + k * ldSize ]; } if( std::is_same< DeviceType, Devices::Cuda >::value ) { // the upper (m+1)x(m+1) submatrix of Y is duplicated on host for fast access // the upper (m+1)x(m+1) submatrix of Y is duplicated on host // (faster access than from the device and it is broadcasted to all processes) for( int k = 0; k <= i; k++ ) aux[ k ] = YL[ i + k * (restarting_max + 1) ]; } // aux = T_i * aux // Note that T_i is upper triangular, so we can overwrite the aux vector with the result in place Loading @@ -525,18 +503,17 @@ hauseholder_cwy( DeviceVector& v, // v = e_i - Y_i * aux MatrixOperations< DeviceType >::gemv( size, i + 1, -1.0, Y.getData(), ldSize, aux, 0.0, v.getData() ); 0.0, Traits::getLocalVectorView( v ).getData() ); if( localOffset == 0 ) v.setElement( i, 1.0 + v.getElement( i ) ); } template< typename Matrix > void GMRES< Matrix >:: hauseholder_cwy_transposed( DeviceVector& z, DeviceVector& Y, HostVector& T, hauseholder_cwy_transposed( VectorViewType z, const int i, DeviceVector& w ) ConstVectorViewType w ) { // aux = Y_i^T * w RealType aux[ i + 1 ]; Loading @@ -547,8 +524,10 @@ hauseholder_cwy_transposed( DeviceVector& z, size, Y.getData(), ldSize, w.getData(), Traits::getLocalVectorView( w ).getData(), aux ); // no-op if the problem is not distributed Traits::CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); // aux = T_i^T * aux // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place Loading @@ -563,7 +542,7 @@ hauseholder_cwy_transposed( DeviceVector& z, z = w; MatrixOperations< DeviceType >::gemv( size, i + 1, -1.0, Y.getData(), ldSize, aux, 1.0, z.getData() ); 1.0, Traits::getLocalVectorView( z ).getData() ); } template< typename Matrix > Loading @@ -574,7 +553,7 @@ update( const int k, const int m, const HostVector& H, const HostVector& s, DeviceVector& v, DeviceVector& V, Vector& x ) { RealType y[ m + 1 ]; Loading Loading @@ -602,8 +581,8 @@ update( const int k, // x = V * y + x MatrixOperations< DeviceType >::gemv( size, k + 1, 1.0, v.getData(), ldSize, y, 1.0, x.getData() ); 1.0, V.getData(), ldSize, y, 1.0, Traits::getLocalVectorView( x ).getData() ); } template< typename Matrix > Loading Loading @@ -673,29 +652,30 @@ apply_givens_rotations( int i, int m ) template< typename Matrix > void GMRES< Matrix >:: setSize( const IndexType size ) setSize( const VectorViewType& x ) { this->size = size; this->size = Traits::getLocalVectorView( x ).getSize(); if( std::is_same< DeviceType, Devices::Cuda >::value ) // align each column to 256 bytes - optimal for CUDA ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ); else // on the host, we add 1 to disrupt the cache false-sharing pattern ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ) + 1; localOffset = getLocalOffset( *this->matrix ); const int m = restarting_max; r.setSize( size ); w.setSize( size ); r.setLike( x ); w.setLike( x ); _M_tmp.setLike( x ); V.setSize( ldSize * ( m + 1 ) ); cs.setSize( m + 1 ); sn.setSize( m + 1 ); H.setSize( ( m + 1 ) * m ); s.setSize( m + 1 ); _M_tmp.setSize( size ); // CWY-specific storage if( variant == Variant::CWY ) { z.setSize( size ); z.setLike( x ); Y.setSize( ldSize * ( m + 1 ) ); T.setSize( (m + 1) * (m + 1) ); YL.setSize( (m + 1) * (m + 1) ); Loading
src/TNL/Solvers/Linear/Traits.h +11 −0 Original line number Diff line number Diff line Loading @@ -12,7 +12,10 @@ #pragma once #include <TNL/Communicators/NoDistrCommunicator.h> #include <TNL/Containers/Vector.h> #include <TNL/Containers/VectorView.h> #include <TNL/DistributedContainers/DistributedVector.h> #include <TNL/DistributedContainers/DistributedVectorView.h> #include <TNL/DistributedContainers/DistributedMatrix.h> Loading @@ -23,6 +26,8 @@ namespace Linear { template< typename Matrix > struct Traits { using CommunicatorType = Communicators::NoDistrCommunicator; using VectorType = Containers::Vector < typename Matrix::RealType, typename Matrix::DeviceType, Loading @@ -45,11 +50,15 @@ struct Traits static const Matrix& getLocalMatrix( const Matrix& m ) { return m; } static ConstLocalVectorViewType getLocalVectorView( ConstVectorViewType v ) { return v; } static LocalVectorViewType getLocalVectorView( VectorViewType v ) { return v; } static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrix& m ) { return CommunicatorType::AllGroup; } }; template< typename Matrix, typename Communicator > struct Traits< DistributedContainers::DistributedMatrix< Matrix, Communicator > > { using CommunicatorType = Communicator; using VectorType = DistributedContainers::DistributedVector < typename Matrix::RealType, typename Matrix::DeviceType, Loading Loading @@ -84,6 +93,8 @@ struct Traits< DistributedContainers::DistributedMatrix< Matrix, Communicator > { return m.getLocalMatrix(); } static ConstLocalVectorViewType getLocalVectorView( ConstVectorViewType v ) { return v.getLocalVectorView(); } static LocalVectorViewType getLocalVectorView( VectorViewType v ) { return v.getLocalVectorView(); } static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const DistributedContainers::DistributedMatrix< Matrix, Communicator >& m ) { return m.getCommunicationGroup(); } }; } // namespace Linear Loading