diff --git a/src/TNL/Solvers/Linear/GMRES.h b/src/TNL/Solvers/Linear/GMRES.h
index 280a7ad119947f3850435d25d202d165a6f1d4f7..8ad3fad2e083c91e793c0178c57999ab0086f354 100644
--- a/src/TNL/Solvers/Linear/GMRES.h
+++ b/src/TNL/Solvers/Linear/GMRES.h
@@ -14,8 +14,6 @@
 
 #include "LinearSolver.h"
 
-#include <TNL/Containers/Vector.h>
-
 namespace TNL {
 namespace Solvers {
 namespace Linear {
@@ -25,12 +23,19 @@ class GMRES
 : public LinearSolver< Matrix >
 {
    using Base = LinearSolver< Matrix >;
+
+   // compatibility shortcuts
+   using Traits = Linear::Traits< Matrix >;
+   using CommunicatorType = typename Traits::CommunicatorType;
+
 public:
    using RealType = typename Base::RealType;
    using DeviceType = typename Base::DeviceType;
    using IndexType = typename Base::IndexType;
+   // distributed vectors/views
    using VectorViewType = typename Base::VectorViewType;
    using ConstVectorViewType = typename Base::ConstVectorViewType;
+   using VectorType = typename Traits::VectorType;
 
    String getType() const;
 
@@ -43,11 +48,12 @@ public:
 
    bool solve( ConstVectorViewType b, VectorViewType x ) override;
 
 protected:
-   using ConstDeviceView = Containers::VectorView< const RealType, DeviceType, IndexType >;
-   using DeviceView = Containers::VectorView< RealType, DeviceType, IndexType >;
-   using HostView = Containers::VectorView< RealType, Devices::Host, IndexType >;
-   using DeviceVector = Containers::Vector< RealType, DeviceType, IndexType >;
-   using HostVector = Containers::Vector< RealType, Devices::Host, IndexType >;
+   // local vectors/views
+   using ConstDeviceView = typename Traits::ConstLocalVectorViewType;
+   using DeviceView = typename Traits::LocalVectorViewType;
+   using HostView = typename DeviceView::HostType;
+   using DeviceVector = typename Traits::LocalVectorType;
+   using HostVector = typename DeviceVector::HostType;
 
    enum class Variant { MGS, MGSR, CWY };
@@ -57,36 +63,37 @@ protected:
 
    void compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b );
 
-   void preconditioned_matvec( DeviceVector& w, ConstDeviceView v );
+   void preconditioned_matvec( VectorViewType w, ConstVectorViewType v );
 
-   void hauseholder_generate( DeviceVector& Y,
-                              HostVector& T,
-                              const int i,
-                              DeviceVector& w );
+// nvcc allows __cuda_callable__ lambdas only in public methods
+#ifdef __NVCC__
+public:
+#endif
+   void hauseholder_generate( const int i,
+                              VectorViewType y_i,
+                              ConstVectorViewType z );
+#ifdef __NVCC__
+protected:
+#endif
 
    void hauseholder_apply_trunc( HostView out,
-                                 DeviceVector& Y,
-                                 HostVector& T,
                                  const int i,
-                                 DeviceVector& w );
+                                 VectorViewType y_i,
+                                 ConstVectorViewType z );
 
-   void hauseholder_cwy( DeviceVector& w,
-                         DeviceVector& Y,
-                         HostVector& T,
+   void hauseholder_cwy( VectorViewType v,
                          const int i );
 
-   void hauseholder_cwy_transposed( DeviceVector& w,
-                                    DeviceVector& Y,
-                                    HostVector& T,
+   void hauseholder_cwy_transposed( VectorViewType z,
                                     const int i,
-                                    DeviceVector& z );
+                                    ConstVectorViewType w );
 
    template< typename Vector >
    void update( const int k,
                 const int m,
                 const HostVector& H,
                 const HostVector& s,
-                DeviceVector& v,
+                DeviceVector& V,
                 Vector& x );
 
    void generatePlaneRotation( RealType& dx,
@@ -101,15 +108,28 @@ protected:
    void apply_givens_rotations( const int i, const int m );
 
+   void setSize( const VectorViewType& x );
+
+   // Specialized methods to distinguish between normal and distributed matrices
+   // in the implementation.
+   template< typename M >
+   static IndexType getLocalOffset( const M& m )
+   {
+      return 0;
+   }
 
-   void setSize( const IndexType size );
+   template< typename M >
+   static IndexType getLocalOffset( const DistributedContainers::DistributedMatrix< M >& m )
+   {
+      return m.getLocalRowRange().getBegin();
+   }
 
    // selected GMRES variant
    Variant variant = Variant::CWY;
 
-   // single vectors
-   DeviceVector r, w, z, _M_tmp;
-   // matrices (in column-major format)
+   // single vectors (distributed)
+   VectorType r, w, z, _M_tmp;
+   // matrices (in column-major format) (local)
    DeviceVector V, Y;
    // (CWY only) duplicate of the upper (m+1)x(m+1) submatrix of Y (it is lower triangular) for fast access
    HostVector YL, T;
@@ -118,6 +138,7 @@
    IndexType size = 0;
    IndexType ldSize = 0;
+   IndexType localOffset = 0;
 
    int restarting_min = 10;
    int restarting_max = 10;
    int restarting_step_min = 3;
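A note on the `getLocalOffset` overload pair above: it is the only place where `GMRES` distinguishes a distributed matrix from a plain one, and it does so purely through overload resolution. The `DistributedContainers::DistributedMatrix< M >` overload is more specialized, so it wins whenever a distributed matrix is passed; anything else falls through to the generic template, which returns 0. Here is a minimal self-contained sketch of the same dispatch pattern, using hypothetical stand-in types rather than the real TNL headers:

```cpp
#include <cstddef>
#include <iostream>

// Hypothetical stand-ins for the TNL types, for illustration only.
struct LocalRowRange
{
   std::size_t begin;
   std::size_t getBegin() const { return begin; }
};

template< typename Matrix >
struct DistributedMatrix
{
   LocalRowRange localRowRange;
   LocalRowRange getLocalRowRange() const { return localRowRange; }
};

struct CsrMatrix {};  // a "normal" (non-distributed) matrix

// Generic overload: a non-distributed matrix owns all rows, so the offset is 0.
template< typename M >
std::size_t getLocalOffset( const M& )
{
   return 0;
}

// More specialized overload: preferred by overload resolution whenever the
// argument is a DistributedMatrix; returns the first locally owned row.
template< typename M >
std::size_t getLocalOffset( const DistributedMatrix< M >& m )
{
   return m.getLocalRowRange().getBegin();
}

int main()
{
   CsrMatrix a;
   DistributedMatrix< CsrMatrix > b{ { 100 } };
   std::cout << getLocalOffset( a ) << '\n';   // prints 0
   std::cout << getLocalOffset( b ) << '\n';   // prints 100
}
```

In the solver, `localOffset == 0` identifies the rank that owns the upper (m+1)x(m+1) corner of the Householder matrix Y, which is why the pivot-element updates in GMRES_impl.h below are guarded by that test.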
diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h
index 4e5f0360f6325732421c9b786cecc69a46025c94..2e412398e579cfa62d59f79eec40299aa0945d68 100644
--- a/src/TNL/Solvers/Linear/GMRES_impl.h
+++ b/src/TNL/Solvers/Linear/GMRES_impl.h
@@ -91,7 +91,7 @@ solve( ConstVectorViewType b, VectorViewType x )
                 << ", d_max = " << restarting_step_max << std::endl;
       return false;
    }
-   setSize( this->matrix->getRows() );
+   setSize( x );
 
    // initialize the norm of the preconditioned right-hand-side
    RealType normb;
@@ -183,13 +183,15 @@ int
 GMRES< Matrix >::
 orthogonalize_MGS( const int m, const RealType normb, const RealType beta )
 {
-   DeviceView vi, vk;
+   // initial binding to _M_tmp sets the correct local range, global size and
+   // communication group for distributed views
+   VectorViewType v_i( _M_tmp ), v_k( _M_tmp );
 
    /***
    * v_0 = r / | r | = 1.0 / beta * r
    */
-   vi.bind( V.getData(), ldSize );
-   vi.addVector( r, 1.0 / beta, 0.0 );
+   v_i.bind( V.getData(), size );
+   v_i.addVector( r, 1.0 / beta, 0.0 );
 
    H.setValue( 0.0 );
    s.setValue( 0.0 );
@@ -199,28 +201,28 @@
    * Starting m-loop
    */
   for( int i = 0; i < m && this->nextIteration(); i++ ) {
-      vi.bind( &( V.getData()[ i * ldSize ] ), size );
+      v_i.bind( &V.getData()[ i * ldSize ], size );
 
      /****
       * Solve w from M w = A v_i
       */
-      preconditioned_matvec( w, vi );
+      preconditioned_matvec( w, v_i );
 
      for( int k = 0; k <= i; k++ )
         H[ k + i * ( m + 1 ) ] = 0.0;
      const int reorthogonalize = (variant == Variant::MGSR) ? 2 : 1;
      for( int l = 0; l < reorthogonalize; l++ )
         for( int k = 0; k <= i; k++ ) {
-            vk.bind( &( V.getData()[ k * ldSize ] ), size );
+            v_k.bind( &V.getData()[ k * ldSize ], size );
            /***
             * H_{k,i} = ( w, v_k )
             */
-            RealType H_k_i = w.scalarProduct( vk );
+            RealType H_k_i = w.scalarProduct( v_k );
            H[ k + i * ( m + 1 ) ] += H_k_i;
 
            /****
             * w = w - H_{k,i} v_k
             */
-            w.addVector( vk, -H_k_i );
+            w.addVector( v_k, -H_k_i );
         }
      /***
       * H_{i+1,i} = |w|
@@ -231,8 +233,8 @@
      /***
       * v_{i+1} = w / |w|
       */
-      vi.bind( &( V.getData()[ ( i + 1 ) * ldSize ] ), size );
-      vi.addVector( w, 1.0 / normw, 0.0 );
+      v_i.bind( &V.getData()[ ( i + 1 ) * ldSize ], size );
+      v_i.addVector( w, 1.0 / normw, 0.0 );
 
      /****
       * Applying the Givens rotations G_0, ..., G_i
@@ -254,7 +256,9 @@ int
 GMRES< Matrix >::
 orthogonalize_CWY( const int m, const RealType normb, const RealType beta )
 {
-   DeviceVector vi, vk;
+   // initial binding to _M_tmp sets the correct local range, global size and
+   // communication group for distributed views
+   VectorViewType v_i( _M_tmp ), y_i( _M_tmp );
 
    /***
    * z = r / | r | = 1.0 / beta * r
@@ -278,40 +282,41 @@
      /****
       * Generate new Hauseholder transformation from vector z.
       */
-      hauseholder_generate( Y, T, i, z );
+      y_i.bind( &Y.getData()[ i * ldSize ], size );
+      hauseholder_generate( i, y_i, z );
 
      if( i == 0 ) {
         /****
          * s = e_1^T * P_i * z
          */
-         hauseholder_apply_trunc( s, Y, T, i, z );
+         hauseholder_apply_trunc( s, i, y_i, z );
      }
      else {
         /***
         * H_{i-1} = P_i * z
         */
         HostView h( &H.getData()[ (i - 1) * (m + 1) ], m + 1 );
-         hauseholder_apply_trunc( h, Y, T, i, z );
+         hauseholder_apply_trunc( h, i, y_i, z );
      }
 
      /***
      * Generate new basis vector v_i, using the compact WY representation:
      * v_i = (I - Y_i * T_i Y_i^T) * e_i
      */
-      vi.bind( &V.getData()[ i * ldSize ], size );
-      hauseholder_cwy( vi, Y, T, i );
+      v_i.bind( &V.getData()[ i * ldSize ], size );
+      hauseholder_cwy( v_i, i );
 
      if( i < m ) {
         /****
          * Solve w from M w = A v_i
          */
-         preconditioned_matvec( w, vi );
+         preconditioned_matvec( w, v_i );
 
        /****
        * Apply all previous Hauseholder transformations, using the compact WY representation:
        * z = (I - Y_i * T_i^T * Y_i^T) * w
        */
-         hauseholder_cwy_transposed( z, Y, T, i, w );
+         hauseholder_cwy_transposed( z, i, w );
      }
 
      /****
@@ -321,7 +326,7 @@
      apply_givens_rotations( i - 1, m );
 
      this->setResidue( std::fabs( s[ i ] ) / normb );
-      if( ! this->checkNextIteration() )
+      if( i > 0 && ! this->checkNextIteration() )
        return i - 1;
      else
        this->refreshSolverMonitor();
@@ -352,7 +357,7 @@ compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b
 template< typename Matrix >
 void
 GMRES< Matrix >::
-preconditioned_matvec( DeviceVector& w, ConstDeviceView v )
+preconditioned_matvec( VectorViewType w, ConstVectorViewType v )
 {
    /****
    * w = M.solve(A * v_i);
@@ -365,76 +370,47 @@
      this->matrix->vectorProduct( v, w );
 }
 
-#ifdef HAVE_CUDA
-template< typename DestinationElement,
-          typename SourceElement,
-          typename Index >
-__global__ void
-copyTruncatedVectorKernel( DestinationElement* destination,
-                           const SourceElement* source,
-                           const Index from,
-                           const Index size )
-{
-   Index elementIdx = blockIdx.x * blockDim.x + threadIdx.x;
-   const Index gridSize = blockDim.x * gridDim.x;
-
-   while( elementIdx < from ) {
-      destination[ elementIdx ] = (DestinationElement) 0.0;
-      elementIdx += gridSize;
-   }
-   while( elementIdx < size ) {
-      destination[ elementIdx ] = source[ elementIdx ];
-      elementIdx += gridSize;
-   }
-}
-#endif
-
 template< typename Matrix >
 void
 GMRES< Matrix >::
-hauseholder_generate( DeviceVector& Y,
-                      HostVector& T,
-                      const int i,
-                      DeviceVector& z )
+hauseholder_generate( const int i,
+                      VectorViewType y_i,
+                      ConstVectorViewType z )
 {
-   DeviceView y_i( &Y.getData()[ i * ldSize ], size );
-
    // XXX: the upper-right triangle of Y will be full of zeros, which can be exploited for optimization
-   if( std::is_same< DeviceType, Devices::Host >::value ) {
-      for( IndexType j = 0; j < size; j++ ) {
+   if( localOffset == 0 ) {
+      TNL_ASSERT_LT( i, size, "upper-right triangle of Y is not on rank 0" );
+      auto kernel_truncation = [=] __cuda_callable__ ( IndexType j ) mutable
+      {
         if( j < i )
            y_i[ j ] = 0.0;
        else
           y_i[ j ] = z[ j ];
-      }
+      };
+      ParallelFor< DeviceType >::exec( (IndexType) 0, size, kernel_truncation );
   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
-#ifdef HAVE_CUDA
-      dim3 blockSize( 256 );
-      dim3 gridSize;
-      gridSize.x = min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( size, blockSize.x ) );
-
-      copyTruncatedVectorKernel<<< gridSize, blockSize >>>( y_i.getData(),
-                                                            z.getData(),
-                                                            i,
-                                                            size );
-      TNL_CHECK_CUDA_DEVICE;
-#else
-      throw Exceptions::CudaSupportMissing();
-#endif
+   else {
+      ConstDeviceView z_local = Traits::getLocalVectorView( z );
+      DeviceView y_i_local = Traits::getLocalVectorView( y_i );
+      y_i_local = z_local;
   }
 
    // norm of the TRUNCATED vector z
    const RealType normz = y_i.lpNorm( 2.0 );
-   const RealType y_ii = y_i.getElement( i );
-   if( y_ii > 0.0 )
-      y_i.setElement( i, y_ii + normz );
-   else
-      y_i.setElement( i, y_ii - normz );
+   RealType norm_yi = 0;
+   if( localOffset == 0 ) {
+      const RealType y_ii = y_i.getElement( i );
+      if( y_ii > 0.0 )
+         y_i.setElement( i, y_ii + normz );
+      else
+         y_i.setElement( i, y_ii - normz );
 
-   // compute the norm of the y_i vector; equivalent to this calculation by definition:
-   // const RealType norm_yi = y_i.lpNorm( 2.0 );
-   const RealType norm_yi = std::sqrt( 2 * (normz * normz + std::fabs( y_ii ) * normz) );
+      // compute the norm of the y_i vector; equivalent to this calculation by definition:
+      // const RealType norm_yi = y_i.lpNorm( 2.0 );
+      norm_yi = std::sqrt( 2 * (normz * normz + std::fabs( y_ii ) * normz) );
+   }
+   // no-op if the problem is not distributed
+   CommunicatorType::Bcast( &norm_yi, 1, 0, Traits::getCommunicationGroup( *this->matrix ) );
 
    // XXX: normalization is slower, but more stable
    // y_i *= 1.0 / norm_yi;
@@ -453,8 +429,10 @@ hauseholder_generate( DeviceVector& Y,
                                          size,
                                          Y.getData(), ldSize,
-                                         y_i.getData(),
+                                         Traits::getLocalVectorView( y_i ).getData(),
                                          aux );
+   // no-op if the problem is not distributed
+   CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
 
    // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux
    for( int k = 0; k < i; k++ ) {
@@ -469,49 +447,49 @@
 template< typename Matrix >
 void
 GMRES< Matrix >::
 hauseholder_apply_trunc( HostView out,
-                         DeviceVector& Y,
-                         HostVector& T,
                          const int i,
-                         DeviceVector& z )
+                         VectorViewType y_i,
+                         ConstVectorViewType z )
 {
-   DeviceView y_i( &Y.getData()[ i * ldSize ], size );
-
+   // copy part of y_i to the YL buffer
+   // The upper (m+1)x(m+1) submatrix of Y is duplicated in the YL buffer,
+   // which resides on host and is broadcasted from rank 0 to all processes.
+   HostView YL_i( &YL[ i * (restarting_max + 1) ], restarting_max + 1 );
+   Containers::Algorithms::ArrayOperations< Devices::Host, DeviceType >::copyMemory( YL_i.getData(), Traits::getLocalVectorView( y_i ).getData(), YL_i.getSize() );
+   // no-op if the problem is not distributed
+   CommunicatorType::Bcast( YL_i.getData(), YL_i.getSize(), 0, Traits::getCommunicationGroup( *this->matrix ) );
+
+   // TODO: is aux always 1?
    const RealType aux = T[ i + i * (restarting_max + 1) ] * y_i.scalarProduct( z );
-   if( std::is_same< DeviceType, Devices::Host >::value ) {
-      for( int k = 0; k <= i; k++ )
-         out[ k ] = z[ k ] - y_i[ k ] * aux;
-   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
-      // copy part of y_i to buffer on host
-      // here we duplicate the upper (m+1)x(m+1) submatrix of Y on host for fast access
-      RealType* host_yi = &YL[ i * (restarting_max + 1) ];
-      RealType host_z[ i + 1 ];
-      Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_yi, y_i.getData(), restarting_max + 1 );
-      Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_z, z.getData(), i + 1 );
-      for( int k = 0; k <= i; k++ )
-         out[ k ] = host_z[ k ] - host_yi[ k ] * aux;
+   if( localOffset == 0 ) {
+      if( std::is_same< DeviceType, Devices::Host >::value ) {
+         for( int k = 0; k <= i; k++ )
+            out[ k ] = z[ k ] - y_i[ k ] * aux;
+      }
+      if( std::is_same< DeviceType, Devices::Cuda >::value ) {
+         RealType host_z[ i + 1 ];
+         Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( host_z, Traits::getLocalVectorView( z ).getData(), i + 1 );
+         for( int k = 0; k <= i; k++ )
+            out[ k ] = host_z[ k ] - YL_i[ k ] * aux;
+      }
   }
+
+   // no-op if the problem is not distributed
+   CommunicatorType::Bcast( out.getData(), i + 1, 0, Traits::getCommunicationGroup( *this->matrix ) );
 }
 
 template< typename Matrix >
 void
 GMRES< Matrix >::
-hauseholder_cwy( DeviceVector& v,
-                 DeviceVector& Y,
-                 HostVector& T,
+hauseholder_cwy( VectorViewType v,
                  const int i )
 {
    // aux = Y_i^T * e_i
    RealType aux[ i + 1 ];
-   if( std::is_same< DeviceType, Devices::Host >::value ) {
-      for( int k = 0; k <= i; k++ )
-         aux[ k ] = Y[ i + k * ldSize ];
-   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
-      // the upper (m+1)x(m+1) submatrix of Y is duplicated on host for fast access
-      for( int k = 0; k <= i; k++ )
-         aux[ k ] = YL[ i + k * (restarting_max + 1) ];
-   }
+   // the upper (m+1)x(m+1) submatrix of Y is duplicated on host
+   // (faster access than from the device and it is broadcasted to all processes)
+   for( int k = 0; k <= i; k++ )
+      aux[ k ] = YL[ i + k * (restarting_max + 1) ];
 
    // aux = T_i * aux
    // Note that T_i is upper triangular, so we can overwrite the aux vector with the result in place
@@ -525,18 +503,17 @@
    // v = e_i - Y_i * aux
    MatrixOperations< DeviceType >::gemv( size, i + 1,
                                          -1.0, Y.getData(), ldSize, aux,
-                                         0.0, v.getData() );
-   v.setElement( i, 1.0 + v.getElement( i ) );
+                                         0.0, Traits::getLocalVectorView( v ).getData() );
+   if( localOffset == 0 )
+      v.setElement( i, 1.0 + v.getElement( i ) );
 }
 
 template< typename Matrix >
 void
 GMRES< Matrix >::
-hauseholder_cwy_transposed( DeviceVector& z,
-                            DeviceVector& Y,
-                            HostVector& T,
+hauseholder_cwy_transposed( VectorViewType z,
                             const int i,
-                            DeviceVector& w )
+                            ConstVectorViewType w )
 {
    // aux = Y_i^T * w
    RealType aux[ i + 1 ];
@@ -547,8 +524,10 @@ hauseholder_cwy_transposed( DeviceVector& z,
                                          size,
                                          Y.getData(), ldSize,
-                                         w.getData(),
+                                         Traits::getLocalVectorView( w ).getData(),
                                          aux );
+   // no-op if the problem is not distributed
+   Traits::CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
 
    // aux = T_i^T * aux
    // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place
@@ -563,7 +542,7 @@
    z = w;
    MatrixOperations< DeviceType >::gemv( size, i + 1,
                                          -1.0, Y.getData(), ldSize, aux,
-                                         1.0, z.getData() );
+                                         1.0, Traits::getLocalVectorView( z ).getData() );
 }
 
 template< typename Matrix >
@@ -574,7 +553,7 @@ update( const int k,
         const int m,
         const HostVector& H,
         const HostVector& s,
-        DeviceVector& v,
+        DeviceVector& V,
         Vector& x )
 {
    RealType y[ m + 1 ];
@@ -602,8 +581,8 @@
 
    // x = V * y + x
    MatrixOperations< DeviceType >::gemv( size, k + 1,
-                                         1.0, v.getData(), ldSize, y,
-                                         1.0, x.getData() );
+                                         1.0, V.getData(), ldSize, y,
+                                         1.0, Traits::getLocalVectorView( x ).getData() );
 }
 
 template< typename Matrix >
@@ -673,29 +652,30 @@ apply_givens_rotations( int i, int m )
 template< typename Matrix >
 void
 GMRES< Matrix >::
-setSize( const IndexType size )
+setSize( const VectorViewType& x )
 {
-   this->size = size;
+   this->size = Traits::getLocalVectorView( x ).getSize();
    if( std::is_same< DeviceType, Devices::Cuda >::value )
      // align each column to 256 bytes - optimal for CUDA
      ldSize = roundToMultiple( size, 256 / sizeof( RealType ) );
   else
      // on the host, we add 1 to disrupt the cache false-sharing pattern
      ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ) + 1;
+   localOffset = getLocalOffset( *this->matrix );
 
   const int m = restarting_max;
-   r.setSize( size );
-   w.setSize( size );
+   r.setLike( x );
+   w.setLike( x );
+   _M_tmp.setLike( x );
   V.setSize( ldSize * ( m + 1 ) );
   cs.setSize( m + 1 );
   sn.setSize( m + 1 );
   H.setSize( ( m + 1 ) * m );
   s.setSize( m + 1 );
-   _M_tmp.setSize( size );
 
   // CWY-specific storage
   if( variant == Variant::CWY ) {
-      z.setSize( size );
+      z.setLike( x );
      Y.setSize( ldSize * ( m + 1 ) );
      T.setSize( (m + 1) * (m + 1) );
      YL.setSize( (m + 1) * (m + 1) );
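A note on the communication pattern in the hunks above: every global operation the CWY variant needs is reduced to a collective over a short host array of at most m+1 scalars; the distributed Krylov basis vectors themselves are never exchanged. Each rank computes its local contribution with `gemv` over its local rows, and a single `Allreduce` sums the partial results, exactly as in `hauseholder_cwy_transposed`. A minimal sketch of that pattern in plain MPI (the function name and data layout below are illustrative, not the TNL API):

```cpp
#include <mpi.h>
#include <vector>

// Local rows of the column-major matrix Y (localRows x (i+1)) and of the
// vector w live on each rank; only the short (i+1)-vector aux = Y^T * w is
// ever communicated, never the distributed basis vectors themselves.
std::vector< double >
transposedMatvecAllreduce( const std::vector< double >& Y,
                           const std::vector< double >& w,
                           int localRows,
                           int i,
                           MPI_Comm comm )
{
   std::vector< double > aux( i + 1, 0.0 );
   // local part: aux[ k ] = sum_j Y[ j + k * localRows ] * w[ j ]
   for( int k = 0; k <= i; k++ )
      for( int j = 0; j < localRows; j++ )
         aux[ k ] += Y[ j + k * localRows ] * w[ j ];
   // global part: one reduction of i+1 scalars per orthogonalization step
   MPI_Allreduce( MPI_IN_PLACE, aux.data(), i + 1, MPI_DOUBLE, MPI_SUM, comm );
   return aux;
}
```

On a single rank the same call sites go through `Communicators::NoDistrCommunicator`, whose collectives do nothing, which is what the "no-op if the problem is not distributed" comments rely on; the Traits changes below wire this up.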
diff --git a/src/TNL/Solvers/Linear/Traits.h b/src/TNL/Solvers/Linear/Traits.h
index 343fae5c8ae717117612ebecfdc0db638dbc21cf..5fd61c0156bea20895b35eff5b0cf99d5fc1704b 100644
--- a/src/TNL/Solvers/Linear/Traits.h
+++ b/src/TNL/Solvers/Linear/Traits.h
@@ -12,7 +12,10 @@
 
 #pragma once
 
+#include <TNL/Communicators/NoDistrCommunicator.h>
+#include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
+#include <TNL/DistributedContainers/DistributedVector.h>
 #include <TNL/DistributedContainers/DistributedVectorView.h>
 #include <TNL/DistributedContainers/DistributedMatrix.h>
@@ -23,6 +26,8 @@ namespace Linear {
 template< typename Matrix >
 struct Traits
 {
+   using CommunicatorType = Communicators::NoDistrCommunicator;
+
    using VectorType = Containers::Vector
          < typename Matrix::RealType,
           typename Matrix::DeviceType,
@@ -45,11 +50,15 @@ struct Traits
    static const Matrix& getLocalMatrix( const Matrix& m ) { return m; }
    static ConstLocalVectorViewType getLocalVectorView( ConstVectorViewType v ) { return v; }
    static LocalVectorViewType getLocalVectorView( VectorViewType v ) { return v; }
+
+   static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrix& m ) { return CommunicatorType::AllGroup; }
 };
 
 template< typename Matrix, typename Communicator >
 struct Traits< DistributedContainers::DistributedMatrix< Matrix, Communicator > >
 {
+   using CommunicatorType = Communicator;
+
    using VectorType = DistributedContainers::DistributedVector
          < typename Matrix::RealType,
           typename Matrix::DeviceType,
@@ -84,6 +93,8 @@
    { return m.getLocalMatrix(); }
    static ConstLocalVectorViewType getLocalVectorView( ConstVectorViewType v ) { return v.getLocalVectorView(); }
    static LocalVectorViewType getLocalVectorView( VectorViewType v ) { return v.getLocalVectorView(); }
+
+   static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const DistributedContainers::DistributedMatrix< Matrix, Communicator >& m ) { return m.getCommunicationGroup(); }
 };
 
 } // namespace Linear
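A closing note on `Communicators::NoDistrCommunicator`, which the generic `Traits` binds as `CommunicatorType`: its whole purpose is to let the solver call `Bcast` and `Allreduce` unconditionally, with the sequential build compiling the calls down to no-ops, while `Traits< DistributedMatrix< ... > >` substitutes an MPI-backed communicator exposing the same static interface. A rough sketch of the idea (the interface below is illustrative, not the actual TNL declaration):

```cpp
// The collectives are callable but do nothing, so the GMRES call sites above
// need no #ifdef or runtime branch on whether the problem is distributed.
struct NoDistrCommunicator
{
   using CommunicationGroup = int;                  // dummy group handle
   static constexpr CommunicationGroup AllGroup = 0;

   template< typename T >
   static void Bcast( T*, int /*count*/, int /*root*/, CommunicationGroup ) {}

   template< typename T, typename Op >
   static void Allreduce( T*, int /*count*/, Op /*op*/, CommunicationGroup ) {}
};
```

The design choice mirrors the rest of the patch: distribution-awareness lives entirely in `Traits` (vector types, local views, communication groups), so `GMRES` itself stays a single code path for sequential, CUDA, and MPI-distributed runs.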