diff --git a/src/TNL/Solvers/Linear/GMRES.h b/src/TNL/Solvers/Linear/GMRES.h
index 280a7ad119947f3850435d25d202d165a6f1d4f7..8ad3fad2e083c91e793c0178c57999ab0086f354 100644
--- a/src/TNL/Solvers/Linear/GMRES.h
+++ b/src/TNL/Solvers/Linear/GMRES.h
@@ -14,8 +14,6 @@
 
 #include "LinearSolver.h"
 
-#include <TNL/Containers/Vector.h>
-
 namespace TNL {
 namespace Solvers {
 namespace Linear {
@@ -25,12 +23,19 @@ class GMRES
 : public LinearSolver< Matrix >
 {
    using Base = LinearSolver< Matrix >;
+
+   // compatibility shortcuts
+   using Traits = Linear::Traits< Matrix >;
+   using CommunicatorType = typename Traits::CommunicatorType;
+
 public:
    using RealType = typename Base::RealType;
    using DeviceType = typename Base::DeviceType;
    using IndexType = typename Base::IndexType;
+   // distributed vectors/views
    using VectorViewType = typename Base::VectorViewType;
    using ConstVectorViewType = typename Base::ConstVectorViewType;
+   using VectorType = typename Traits::VectorType;
 
    String getType() const;
 
@@ -43,11 +48,12 @@ public:
    bool solve( ConstVectorViewType b, VectorViewType x ) override;
 
 protected:
-   using ConstDeviceView = Containers::VectorView< const RealType, DeviceType, IndexType >;
-   using DeviceView = Containers::VectorView< RealType, DeviceType, IndexType >;
-   using HostView = Containers::VectorView< RealType, Devices::Host, IndexType >;
-   using DeviceVector = Containers::Vector< RealType, DeviceType, IndexType >;
-   using HostVector = Containers::Vector< RealType, Devices::Host, IndexType >;
+   // local vectors/views
+   using ConstDeviceView = typename Traits::ConstLocalVectorViewType;
+   using DeviceView = typename Traits::LocalVectorViewType;
+   using HostView = typename DeviceView::HostType;
+   using DeviceVector = typename Traits::LocalVectorType;
+   using HostVector = typename DeviceVector::HostType;
 
    enum class Variant { MGS, MGSR, CWY };
 
@@ -57,36 +63,37 @@ protected:
 
    void compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b );
 
-   void preconditioned_matvec( DeviceVector& w, ConstDeviceView v );
+   void preconditioned_matvec( VectorViewType w, ConstVectorViewType v );
 
-   void hauseholder_generate( DeviceVector& Y,
-                              HostVector& T,
-                              const int i,
-                              DeviceVector& w );
+// nvcc allows __cuda_callable__ lambdas only in public methods
+#ifdef __NVCC__
+public:
+#endif
+   void hauseholder_generate( const int i,
+                              VectorViewType y_i,
+                              ConstVectorViewType z );
+#ifdef __NVCC__
+protected:
+#endif
 
    void hauseholder_apply_trunc( HostView out,
-                                 DeviceVector& Y,
-                                 HostVector& T,
                                  const int i,
-                                 DeviceVector& w );
+                                 VectorViewType y_i,
+                                 ConstVectorViewType z );
 
-   void hauseholder_cwy( DeviceVector& w,
-                         DeviceVector& Y,
-                         HostVector& T,
+   void hauseholder_cwy( VectorViewType v,
                          const int i );
 
-   void hauseholder_cwy_transposed( DeviceVector& w,
-                                    DeviceVector& Y,
-                                    HostVector& T,
+   void hauseholder_cwy_transposed( VectorViewType z,
                                     const int i,
-                                    DeviceVector& z );
+                                    ConstVectorViewType w );
 
    template< typename Vector >
    void update( const int k,
                 const int m,
                 const HostVector& H,
                 const HostVector& s,
-                DeviceVector& v,
+                DeviceVector& V,
                 Vector& x );
 
    void generatePlaneRotation( RealType& dx,
@@ -101,15 +108,28 @@ protected:
 
    void apply_givens_rotations( const int i, const int m );
 
+   void setSize( const VectorViewType& x );
+
+   // Overloaded helpers to distinguish between regular and distributed
+   // matrices in the implementation.
+   template< typename M >
+   static IndexType getLocalOffset( const M& m )
+   {
+      return 0;
+   }
 
-   void setSize( const IndexType size );
+   template< typename M >
+   static IndexType getLocalOffset( const DistributedContainers::DistributedMatrix< M >& m )
+   {
+      return m.getLocalRowRange().getBegin();
+   }
 
    // selected GMRES variant
    Variant variant = Variant::CWY;
 
-   // single vectors
-   DeviceVector r, w, z, _M_tmp;
-   // matrices (in column-major format)
+   // single vectors (distributed)
+   VectorType r, w, z, _M_tmp;
+   // matrices in column-major format (local)
    DeviceVector V, Y;
    // (CWY only) duplicate of the upper (m+1)x(m+1) submatrix of Y (it is lower triangular) for fast access
    HostVector YL, T;
@@ -118,6 +138,7 @@ protected:
 
    IndexType size = 0;
    IndexType ldSize = 0;
+   IndexType localOffset = 0;
    int restarting_min = 10;
    int restarting_max = 10;
    int restarting_step_min = 3;
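
Note: the two getLocalOffset helpers above rely on plain function-template overloading rather than class-template specialization — the overload taking DistributedContainers::DistributedMatrix< M > is more specialized, so overload resolution selects it for distributed matrices and falls back to the generic zero-offset version otherwise. A minimal standalone sketch of the same dispatch, with hypothetical Dense and Distributed stand-ins for the TNL types:

```cpp
#include <iostream>

// Hypothetical stand-ins for a regular and a distributed matrix type.
struct Dense {};
template< typename M >
struct Distributed { int beginRow; };

// Generic overload: a non-distributed matrix always starts at row 0.
template< typename M >
int getLocalOffset( const M& ) { return 0; }

// More specialized overload: preferred by partial ordering for Distributed< M >.
template< typename M >
int getLocalOffset( const Distributed< M >& m ) { return m.beginRow; }

int main()
{
   Dense a;
   Distributed< Dense > b{ 42 };
   std::cout << getLocalOffset( a ) << '\n';   // prints 0
   std::cout << getLocalOffset( b ) << '\n';   // prints 42
}
```
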
diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h
index 4e5f0360f6325732421c9b786cecc69a46025c94..2e412398e579cfa62d59f79eec40299aa0945d68 100644
--- a/src/TNL/Solvers/Linear/GMRES_impl.h
+++ b/src/TNL/Solvers/Linear/GMRES_impl.h
@@ -91,7 +91,7 @@ solve( ConstVectorViewType b, VectorViewType x )
                 << ", d_max = " << restarting_step_max << std::endl;
       return false;
    }
-   setSize( this->matrix->getRows() );
+   setSize( x );
 
    // initialize the norm of the preconditioned right-hand-side
    RealType normb;
@@ -183,13 +183,15 @@ int
 GMRES< Matrix >::
 orthogonalize_MGS( const int m, const RealType normb, const RealType beta )
 {
-   DeviceView vi, vk;
+   // initial binding to _M_tmp sets the correct local range, global size and
+   // communication group for distributed views
+   VectorViewType v_i( _M_tmp ), v_k( _M_tmp );
 
    /***
     * v_0 = r / | r | =  1.0 / beta * r
     */
-   vi.bind( V.getData(), ldSize );
-   vi.addVector( r, 1.0 / beta, 0.0 );
+   v_i.bind( V.getData(), size );
+   v_i.addVector( r, 1.0 / beta, 0.0 );
 
    H.setValue( 0.0 );
    s.setValue( 0.0 );
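
Note: the i-loop that follows performs one Arnoldi step with modified Gram–Schmidt: each coefficient H_{k,i} is computed against the current, already partially deflated w rather than against the original vector, which is what distinguishes MGS from classical Gram–Schmidt (and why the MGSR variant can simply run the k-loop twice). A minimal dense sketch of the recurrence, with a hypothetical mgsStep helper:

```cpp
#include <cstddef>
#include <vector>

// One MGS step: deflate w against the basis vectors v_0..v_i in sequence,
// recording the coefficients in the i-th column of H (hypothetical sketch).
void mgsStep( const std::vector< std::vector< double > >& V,
              std::vector< double >& w,
              std::vector< double >& H_col,
              const int i )
{
   for( int k = 0; k <= i; k++ ) {
      double h = 0;
      for( std::size_t j = 0; j < w.size(); j++ )
         h += w[ j ] * V[ k ][ j ];     // H_{k,i} = ( w, v_k )
      H_col[ k ] = h;
      for( std::size_t j = 0; j < w.size(); j++ )
         w[ j ] -= h * V[ k ][ j ];     // w = w - H_{k,i} v_k
   }
}
```
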
@@ -199,28 +201,28 @@ orthogonalize_MGS( const int m, const RealType normb, const RealType beta )
     * Starting m-loop
     */
    for( int i = 0; i < m && this->nextIteration(); i++ ) {
-      vi.bind( &( V.getData()[ i * ldSize ] ), size );
+      v_i.bind( &V.getData()[ i * ldSize ], size );
       /****
        * Solve w from M w = A v_i
        */
-      preconditioned_matvec( w, vi );
+      preconditioned_matvec( w, v_i );
 
       for( int k = 0; k <= i; k++ )
          H[ k + i * ( m + 1 ) ] = 0.0;
       const int reorthogonalize = (variant == Variant::MGSR) ? 2 : 1;
       for( int l = 0; l < reorthogonalize; l++ )
          for( int k = 0; k <= i; k++ ) {
-            vk.bind( &( V.getData()[ k * ldSize ] ), size );
+            v_k.bind( &V.getData()[ k * ldSize ], size );
             /***
              * H_{k,i} = ( w, v_k )
              */
-            RealType H_k_i = w.scalarProduct( vk );
+            RealType H_k_i = w.scalarProduct( v_k );
             H[ k + i * ( m + 1 ) ] += H_k_i;
 
             /****
              * w = w - H_{k,i} v_k
              */
-            w.addVector( vk, -H_k_i );
+            w.addVector( v_k, -H_k_i );
          }
       /***
        * H_{i+1,i} = |w|
@@ -231,8 +233,8 @@ orthogonalize_MGS( const int m, const RealType normb, const RealType beta )
       /***
        * v_{i+1} = w / |w|
        */
-      vi.bind( &( V.getData()[ ( i + 1 ) * ldSize ] ), size );
-      vi.addVector( w, 1.0 / normw, 0.0 );
+      v_i.bind( &V.getData()[ ( i + 1 ) * ldSize ], size );
+      v_i.addVector( w, 1.0 / normw, 0.0 );
 
       /****
        * Applying the Givens rotations G_0, ..., G_i
@@ -254,7 +256,9 @@ int
 GMRES< Matrix >::
 orthogonalize_CWY( const int m, const RealType normb, const RealType beta )
 {
-   DeviceVector vi, vk;
+   // initial binding to _M_tmp sets the correct local range, global size and
+   // communication group for distributed views
+   VectorViewType v_i( _M_tmp ), y_i( _M_tmp );
 
    /***
     * z = r / | r | =  1.0 / beta * r
@@ -278,40 +282,41 @@ orthogonalize_CWY( const int m, const RealType normb, const RealType beta )
       /****
        * Generate new Hauseholder transformation from vector z.
        */
-      hauseholder_generate( Y, T, i, z );
+      y_i.bind( &Y.getData()[ i * ldSize ], size );
+      hauseholder_generate( i, y_i, z );
 
       if( i == 0 ) {
          /****
           * s = e_1^T * P_i * z
           */
-         hauseholder_apply_trunc( s, Y, T, i, z );
+         hauseholder_apply_trunc( s, i, y_i, z );
       }
       else {
          /***
           * H_{i-1} = P_i * z
           */
          HostView h( &H.getData()[ (i - 1) * (m + 1) ], m + 1 );
-         hauseholder_apply_trunc( h, Y, T, i, z );
+         hauseholder_apply_trunc( h, i, y_i, z );
       }
 
       /***
        * Generate new basis vector v_i, using the compact WY representation:
        *     v_i = (I - Y_i * T_i Y_i^T) * e_i
        */
-      vi.bind( &V.getData()[ i * ldSize ], size );
-      hauseholder_cwy( vi, Y, T, i );
+      v_i.bind( &V.getData()[ i * ldSize ], size );
+      hauseholder_cwy( v_i, i );
 
       if( i < m ) {
          /****
           * Solve w from M w = A v_i
           */
-         preconditioned_matvec( w, vi );
+         preconditioned_matvec( w, v_i );
 
          /****
           * Apply all previous Hauseholder transformations, using the compact WY representation:
           *     z = (I - Y_i * T_i^T * Y_i^T) * w
           */
-         hauseholder_cwy_transposed( z, Y, T, i, w );
+         hauseholder_cwy_transposed( z, i, w );
       }
 
       /****
@@ -321,7 +326,7 @@ orthogonalize_CWY( const int m, const RealType normb, const RealType beta )
          apply_givens_rotations( i - 1, m );
 
       this->setResidue( std::fabs( s[ i ] ) / normb );
-      if( ! this->checkNextIteration() )
+      if( i > 0 && ! this->checkNextIteration() )
          return i - 1;
       else
          this->refreshSolverMonitor();
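
Note: the residue update above uses the standard GMRES identity — once the Givens rotations have triangularized the Hessenberg matrix, the norm of the (preconditioned) residual can be read off the rotated right-hand side,

```latex
\| r_i \|_2 = \bigl| \bigl( G_i \cdots G_1 \, \beta e_1 \bigr)_{i+1} \bigr| ,
```

which is the value held in s[ i ] here (up to this implementation's 0-based indexing), so convergence can be monitored without an extra matrix-vector product.
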
@@ -352,7 +357,7 @@ compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b
 template< typename Matrix >
 void
 GMRES< Matrix >::
-preconditioned_matvec( DeviceVector& w, ConstDeviceView v )
+preconditioned_matvec( VectorViewType w, ConstVectorViewType v )
 {
    /****
     * w = M.solve(A * v_i);
@@ -365,76 +370,47 @@ preconditioned_matvec( DeviceVector& w, ConstDeviceView v )
       this->matrix->vectorProduct( v, w );
 }
 
-#ifdef HAVE_CUDA
-template< typename DestinationElement,
-          typename SourceElement,
-          typename Index >
-__global__ void
-copyTruncatedVectorKernel( DestinationElement* destination,
-                           const SourceElement* source,
-                           const Index from,
-                           const Index size )
-{
-   Index elementIdx = blockIdx.x * blockDim.x + threadIdx.x;
-   const Index gridSize = blockDim.x * gridDim.x;
-
-   while( elementIdx < from ) {
-      destination[ elementIdx ] = (DestinationElement) 0.0;
-      elementIdx += gridSize;
-   }
-   while( elementIdx < size ) {
-      destination[ elementIdx ] = source[ elementIdx ];
-      elementIdx += gridSize;
-   }
-}
-#endif
-
 template< typename Matrix >
 void
 GMRES< Matrix >::
-hauseholder_generate( DeviceVector& Y,
-                      HostVector& T,
-                      const int i,
-                      DeviceVector& z )
+hauseholder_generate( const int i,
+                      VectorViewType y_i,
+                      ConstVectorViewType z )
 {
-   DeviceView y_i( &Y.getData()[ i * ldSize ], size );
-
    // XXX: the upper-right triangle of Y will be full of zeros, which can be exploited for optimization
-   if( std::is_same< DeviceType, Devices::Host >::value ) {
-      for( IndexType j = 0; j < size; j++ ) {
+   if( localOffset == 0 ) {
+      TNL_ASSERT_LT( i, size, "upper-right triangle of Y is not on rank 0" );
+      auto kernel_truncation = [=] __cuda_callable__ ( IndexType j ) mutable
+      {
          if( j < i )
             y_i[ j ] = 0.0;
          else
             y_i[ j ] = z[ j ];
-      }
+      };
+      ParallelFor< DeviceType >::exec( (IndexType) 0, size, kernel_truncation );
    }
-   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
-#ifdef HAVE_CUDA
-      dim3 blockSize( 256 );
-      dim3 gridSize;
-      gridSize.x = min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( size, blockSize.x ) );
-
-      copyTruncatedVectorKernel<<< gridSize, blockSize >>>( y_i.getData(),
-                                                            z.getData(),
-                                                            i,
-                                                            size );
-      TNL_CHECK_CUDA_DEVICE;
-#else
-      throw Exceptions::CudaSupportMissing();
-#endif
+   else {
+      ConstDeviceView z_local = Traits::getLocalVectorView( z );
+      DeviceView y_i_local = Traits::getLocalVectorView( y_i );
+      y_i_local = z_local;
    }
 
    // norm of the TRUNCATED vector z
    const RealType normz = y_i.lpNorm( 2.0 );
-   const RealType y_ii = y_i.getElement( i );
-   if( y_ii > 0.0 )
-      y_i.setElement( i, y_ii + normz );
-   else
-      y_i.setElement( i, y_ii - normz );
+   RealType norm_yi = 0;
+   if( localOffset == 0 ) {
+      const RealType y_ii = y_i.getElement( i );
+      if( y_ii > 0.0 )
+         y_i.setElement( i, y_ii + normz );
+      else
+         y_i.setElement( i, y_ii - normz );
 
-   // compute the norm of the y_i vector; equivalent to this calculation by definition:
-   //       const RealType norm_yi = y_i.lpNorm( 2.0 );
-   const RealType norm_yi = std::sqrt( 2 * (normz * normz + std::fabs( y_ii ) * normz) );
+      // compute the norm of the y_i vector; equivalent to this calculation by definition:
+      //       const RealType norm_yi = y_i.lpNorm( 2.0 );
+      norm_yi = std::sqrt( 2 * (normz * normz + std::fabs( y_ii ) * normz) );
+   }
+   // no-op if the problem is not distributed
+   CommunicatorType::Bcast( &norm_yi, 1, 0, Traits::getCommunicationGroup( *this->matrix ) );
 
    // XXX: normalization is slower, but more stable
 //   y_i *= 1.0 / norm_yi;
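
Note: the closed form for norm_yi above equals the 2-norm of the updated vector by a short calculation. Writing ν for normz and ζ for the original diagonal entry y_ii, the vector y_i agrees with the truncated z everywhere except at position i, where the shift ±ν carries the sign of ζ, so the cross term accumulates instead of cancelling:

```latex
\| y_i \|_2^2 = \sum_{j > i} z_j^2 + ( \zeta \pm \nu )^2
             = ( \nu^2 - \zeta^2 ) + \zeta^2 \pm 2 \zeta \nu + \nu^2
             = 2 \, ( \nu^2 + | \zeta | \, \nu ) ,
```

hence norm_yi = sqrt( 2 * (normz * normz + fabs( y_ii ) * normz) ) without touching the vector again.
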
@@ -453,8 +429,10 @@ hauseholder_generate( DeviceVector& Y,
                  size,
                  Y.getData(),
                  ldSize,
-                 y_i.getData(),
+                 Traits::getLocalVectorView( y_i ).getData(),
                  aux );
+      // no-op if the problem is not distributed
+      CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
 
       // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux
       for( int k = 0; k < i; k++ ) {
@@ -469,49 +447,49 @@ template< typename Matrix >
 void
 GMRES< Matrix >::
 hauseholder_apply_trunc( HostView out,
-                         DeviceVector& Y,
-                         HostVector& T,
                          const int i,
-                         DeviceVector& z )
+                         VectorViewType y_i,
+                         ConstVectorViewType z )
 {
-   DeviceView y_i( &Y.getData()[ i * ldSize ], size );
-
+   // Copy part of y_i into the YL buffer: the upper (m+1)x(m+1) submatrix
+   // of Y is duplicated there; it resides on the host and is broadcast
+   // from rank 0 to all processes.
+   HostView YL_i( &YL[ i * (restarting_max + 1) ], restarting_max + 1 );
+   Containers::Algorithms::ArrayOperations< Devices::Host, DeviceType >::copyMemory( YL_i.getData(), Traits::getLocalVectorView( y_i ).getData(), YL_i.getSize() );
+   // no-op if the problem is not distributed
+   CommunicatorType::Bcast( YL_i.getData(), YL_i.getSize(), 0, Traits::getCommunicationGroup( *this->matrix ) );
+
+   // TODO: is aux always 1?
    const RealType aux = T[ i + i * (restarting_max + 1) ] * y_i.scalarProduct( z );
-   if( std::is_same< DeviceType, Devices::Host >::value ) {
-      for( int k = 0; k <= i; k++ )
-         out[ k ] = z[ k ] - y_i[ k ] * aux;
-   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
-      // copy part of y_i to buffer on host
-      // here we duplicate the upper (m+1)x(m+1) submatrix of Y on host for fast access
-      RealType* host_yi = &YL[ i * (restarting_max + 1) ];
-      RealType host_z[ i + 1 ];
-      Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_yi, y_i.getData(), restarting_max + 1 );
-      Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_z, z.getData(), i + 1 );
-      for( int k = 0; k <= i; k++ )
-         out[ k ] = host_z[ k ] - host_yi[ k ] * aux;
+   if( localOffset == 0 ) {
+      if( std::is_same< DeviceType, Devices::Host >::value ) {
+         for( int k = 0; k <= i; k++ )
+            out[ k ] = z[ k ] - y_i[ k ] * aux;
+      }
+      if( std::is_same< DeviceType, Devices::Cuda >::value ) {
+         RealType host_z[ i + 1 ];
+         Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( host_z, Traits::getLocalVectorView( z ).getData(), i + 1 );
+         for( int k = 0; k <= i; k++ )
+            out[ k ] = host_z[ k ] - YL_i[ k ] * aux;
+      }
    }
+
+   // no-op if the problem is not distributed
+   CommunicatorType::Bcast( out.getData(), i + 1, 0, Traits::getCommunicationGroup( *this->matrix ) );
 }
 
 template< typename Matrix >
 void
 GMRES< Matrix >::
-hauseholder_cwy( DeviceVector& v,
-                 DeviceVector& Y,
-                 HostVector& T,
+hauseholder_cwy( VectorViewType v,
                  const int i )
 {
    // aux = Y_i^T * e_i
    RealType aux[ i + 1 ];
-   if( std::is_same< DeviceType, Devices::Host >::value ) {
-      for( int k = 0; k <= i; k++ )
-         aux[ k ] = Y[ i + k * ldSize ];
-   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
-      // the upper (m+1)x(m+1) submatrix of Y is duplicated on host for fast access
-      for( int k = 0; k <= i; k++ )
-         aux[ k ] = YL[ i + k * (restarting_max + 1) ];
-   }
+   // the upper (m+1)x(m+1) submatrix of Y is duplicated on the host
+   // (faster to access than device memory, and already broadcast to all processes)
+   for( int k = 0; k <= i; k++ )
+      aux[ k ] = YL[ i + k * (restarting_max + 1) ];
 
    // aux = T_i * aux
    // Note that T_i is upper triangular, so we can overwrite the aux vector with the result in place
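
Note: the in-place overwrite noted above is safe because of the sweep direction — for an upper-triangular T, row k of the product reads only aux[ k.. ], so a top-down sweep never consumes an entry it has already replaced; the transposed (lower-triangular) product in hauseholder_cwy_transposed below has the mirror property for a bottom-up sweep. A hedged sketch of both, using column-major storage with leading dimension ld as T does here (function names are hypothetical):

```cpp
// aux = T * aux for upper-triangular T: sweep rows top-down,
// since row k reads only aux[ k..n-1 ], which is still unmodified.
template< typename Real >
void upperTriMatvecInPlace( const Real* T, const int ld, Real* aux, const int n )
{
   for( int k = 0; k < n; k++ ) {
      Real sum = 0;
      for( int j = k; j < n; j++ )
         sum += T[ k + j * ld ] * aux[ j ];
      aux[ k ] = sum;
   }
}

// aux = T^T * aux (lower-triangular): sweep rows bottom-up,
// since row k reads only aux[ 0..k ], which is still unmodified.
template< typename Real >
void lowerTriMatvecInPlace( const Real* T, const int ld, Real* aux, const int n )
{
   for( int k = n - 1; k >= 0; k-- ) {
      Real sum = 0;
      for( int j = 0; j <= k; j++ )
         sum += T[ j + k * ld ] * aux[ j ];  // (T^T)[ k ][ j ] == T[ j ][ k ]
      aux[ k ] = sum;
   }
}
```
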
@@ -525,18 +503,17 @@ hauseholder_cwy( DeviceVector& v,
    // v = e_i - Y_i * aux
    MatrixOperations< DeviceType >::gemv( size, i + 1,
                                          -1.0, Y.getData(), ldSize, aux,
-                                         0.0, v.getData() );
-   v.setElement( i, 1.0 + v.getElement( i ) );
+                                         0.0, Traits::getLocalVectorView( v ).getData() );
+   if( localOffset == 0 )
+      v.setElement( i, 1.0 + v.getElement( i ) );
 }
 
 template< typename Matrix >
 void
 GMRES< Matrix >::
-hauseholder_cwy_transposed( DeviceVector& z,
-                            DeviceVector& Y,
-                            HostVector& T,
+hauseholder_cwy_transposed( VectorViewType z,
                             const int i,
-                            DeviceVector& w )
+                            ConstVectorViewType w )
 {
    // aux = Y_i^T * w
    RealType aux[ i + 1 ];
@@ -547,8 +524,10 @@ hauseholder_cwy_transposed( DeviceVector& z,
               size,
               Y.getData(),
               ldSize,
-              w.getData(),
+              Traits::getLocalVectorView( w ).getData(),
               aux );
+   // no-op if the problem is not distributed
+   CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
 
    // aux = T_i^T * aux
    // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place
@@ -563,7 +542,7 @@ hauseholder_cwy_transposed( DeviceVector& z,
    z = w;
    MatrixOperations< DeviceType >::gemv( size, i + 1,
                                          -1.0, Y.getData(), ldSize, aux,
-                                         1.0, z.getData() );
+                                         1.0, Traits::getLocalVectorView( z ).getData() );
 }
 
 template< typename Matrix >
@@ -574,7 +553,7 @@ update( const int k,
         const int m,
         const HostVector& H,
         const HostVector& s,
-        DeviceVector& v,
+        DeviceVector& V,
         Vector& x )
 {
    RealType y[ m + 1 ];
@@ -602,8 +581,8 @@ update( const int k,
 
    // x = V * y + x
    MatrixOperations< DeviceType >::gemv( size, k + 1,
-                                         1.0, v.getData(), ldSize, y,
-                                         1.0, x.getData() );
+                                         1.0, V.getData(), ldSize, y,
+                                         1.0, Traits::getLocalVectorView( x ).getData() );
 }
 
 template< typename Matrix >
@@ -673,29 +652,30 @@ apply_givens_rotations( int i, int m )
 template< typename Matrix >
 void
 GMRES< Matrix >::
-setSize( const IndexType size )
+setSize( const VectorViewType& x )
 {
-   this->size = size;
+   this->size = Traits::getLocalVectorView( x ).getSize();
    if( std::is_same< DeviceType, Devices::Cuda >::value )
       // align each column to 256 bytes - optimal for CUDA
       ldSize = roundToMultiple( size, 256 / sizeof( RealType ) );
    else
       // on the host, we add 1 to disrupt the cache false-sharing pattern
       ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ) + 1;
+   localOffset = getLocalOffset( *this->matrix );
 
    const int m = restarting_max;
-   r.setSize( size );
-   w.setSize( size );
+   r.setLike( x );
+   w.setLike( x );
+   _M_tmp.setLike( x );
    V.setSize( ldSize * ( m + 1 ) );
    cs.setSize( m + 1 );
    sn.setSize( m + 1 );
    H.setSize( ( m + 1 ) * m );
    s.setSize( m + 1 );
-   _M_tmp.setSize( size );
 
    // CWY-specific storage
    if( variant == Variant::CWY ) {
-      z.setSize( size );
+      z.setLike( x );
       Y.setSize( ldSize * ( m + 1 ) );
       T.setSize( (m + 1) * (m + 1) );
       YL.setSize( (m + 1) * (m + 1) );
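
Note: for context on the ldSize computation in setSize above — each column of the column-major V and Y buffers is padded so that consecutive columns start at 256-byte-aligned offsets on CUDA, while the host variant adds one extra element to break the power-of-two stride. A hypothetical illustration of the arithmetic (roundToMultiple is assumed to round up to the nearest multiple):

```cpp
#include <cstddef>
#include <iostream>

// Assumed behavior of TNL's roundToMultiple: round value up to a multiple.
template< typename Index >
Index roundToMultiple( const Index value, const Index multiple )
{
   return multiple * ( ( value + multiple - 1 ) / multiple );
}

int main()
{
   using Real = double;
   const std::size_t size = 1000;
   const std::size_t elementsPer256B = 256 / sizeof( Real );          // 32 doubles
   std::cout << roundToMultiple( size, elementsPer256B ) << '\n';     // 1024 (CUDA)
   std::cout << roundToMultiple( size, elementsPer256B ) + 1 << '\n'; // 1025 (host)
}
```
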
diff --git a/src/TNL/Solvers/Linear/Traits.h b/src/TNL/Solvers/Linear/Traits.h
index 343fae5c8ae717117612ebecfdc0db638dbc21cf..5fd61c0156bea20895b35eff5b0cf99d5fc1704b 100644
--- a/src/TNL/Solvers/Linear/Traits.h
+++ b/src/TNL/Solvers/Linear/Traits.h
@@ -12,7 +12,10 @@
 
 #pragma once
 
+#include <TNL/Communicators/NoDistrCommunicator.h>
+#include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
+#include <TNL/DistributedContainers/DistributedVector.h>
 #include <TNL/DistributedContainers/DistributedVectorView.h>
 #include <TNL/DistributedContainers/DistributedMatrix.h>
 
@@ -23,6 +26,8 @@ namespace Linear {
 template< typename Matrix >
 struct Traits
 {
+   using CommunicatorType = Communicators::NoDistrCommunicator;
+
    using VectorType = Containers::Vector
          < typename Matrix::RealType,
            typename Matrix::DeviceType,
@@ -45,11 +50,15 @@ struct Traits
    static const Matrix& getLocalMatrix( const Matrix& m ) { return m; }
    static ConstLocalVectorViewType getLocalVectorView( ConstVectorViewType v ) { return v; }
    static LocalVectorViewType getLocalVectorView( VectorViewType v ) { return v; }
+
+   static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrix& m ) { return CommunicatorType::AllGroup; }
 };
 
 template< typename Matrix, typename Communicator >
 struct Traits< DistributedContainers::DistributedMatrix< Matrix, Communicator > >
 {
+   using CommunicatorType = Communicator;
+
    using VectorType = DistributedContainers::DistributedVector
          < typename Matrix::RealType,
            typename Matrix::DeviceType,
@@ -84,6 +93,8 @@ struct Traits< DistributedContainers::DistributedMatrix< Matrix, Communicator >
    { return m.getLocalMatrix(); }
    static ConstLocalVectorViewType getLocalVectorView( ConstVectorViewType v ) { return v.getLocalVectorView(); }
    static LocalVectorViewType getLocalVectorView( VectorViewType v ) { return v.getLocalVectorView(); }
+
+   static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const DistributedContainers::DistributedMatrix< Matrix, Communicator >& m ) { return m.getCommunicationGroup(); }
 };
 
 } // namespace Linear
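
Note: the net effect of the CommunicatorType additions is that GMRES can issue Bcast/Allreduce unconditionally — for a regular matrix the calls go to Communicators::NoDistrCommunicator, whose collectives are expected to do nothing on AllGroup. A minimal sketch of the idea with a hypothetical NoComm stand-in (not TNL's actual definition):

```cpp
// Hypothetical no-op communicator: collective calls compile to nothing
// for non-distributed problems, so the solver needs no #ifdef branches.
struct NoComm
{
   using CommunicationGroup = int;
   static constexpr CommunicationGroup AllGroup = 0;

   template< typename T >
   static void Bcast( T*, int count, int root, CommunicationGroup ) {}

   template< typename T, typename Op >
   static void Allreduce( T*, int count, Op, CommunicationGroup ) {}
};

// Usage mirrors the calls in GMRES_impl.h:
//    NoComm::Bcast( &norm_yi, 1, 0, NoComm::AllGroup );   // no-op
```
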