diff --git a/src/TNL/Containers/Segments/CSR.hpp b/src/TNL/Containers/Segments/CSR.hpp
index 8b8ddfff51d27e0139d3ac8ef2f4350739a15091..971754b5a4395c4abd460fa883496a24dbfd0118 100644
--- a/src/TNL/Containers/Segments/CSR.hpp
+++ b/src/TNL/Containers/Segments/CSR.hpp
@@ -225,8 +225,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
       const IndexType end = offsetsView[ i + 1 ];
       RealType aux( zero );
       bool compute( true );
+      IndexType localIdx( 0 );
       for( IndexType j = begin; j < end && compute; j++  )
-         reduction( aux, fetch( i, j, compute, args... ) );
+         reduction( aux, fetch( i, localIdx++, j, compute, args... ) );
       keeper( i, aux );
    };
    Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
diff --git a/src/TNL/Containers/Segments/SlicedEllpack.hpp b/src/TNL/Containers/Segments/SlicedEllpack.hpp
index 76790f393371a17da72759f9694129359d4a7495..31f417df26f77aa65d7b1b296c9c5dae61180e23 100644
--- a/src/TNL/Containers/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Containers/Segments/SlicedEllpack.hpp
@@ -354,8 +354,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
          const IndexType end = begin + segmentSize;
          RealType aux( zero );
          bool compute( true );
+         IndexType localIdx( 0 );
          for( IndexType globalIdx = begin; globalIdx< end; globalIdx++  )
-            reduction( aux, fetch( segmentIdx, globalIdx, compute, args... ) );
+            reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
@@ -370,8 +371,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
          const IndexType end = sliceOffsets_view[ sliceIdx + 1 ];
          RealType aux( zero );
          bool compute( true );
+         IndexType localIdx( 0 );
          for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize  )
-            reduction( aux, fetch( segmentIdx, globalIdx, compute, args... ) );
+            reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
diff --git a/src/TNL/Matrices/Dense.h b/src/TNL/Matrices/Dense.h
index 90aa57170b99373187bafcb1bfeeadc83bf39f28..2e283c9e89f9be42f133132ab5a00b9881114244 100644
--- a/src/TNL/Matrices/Dense.h
+++ b/src/TNL/Matrices/Dense.h
@@ -183,6 +183,12 @@ class Dense : public Matrix< Real, Device, Index >
                 typename = typename Enabler< Device2 >::type >
       Dense& operator=( const Dense< Real2, Device2, Index2 >& matrix );
 
+      template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
+      bool operator==( const Dense< Real_, Device_, Index_, RowMajorOrder >& matrix ) const;
+
+      template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
+      bool operator!=( const Dense< Real_, Device_, Index_, RowMajorOrder >& matrix ) const;
+      
       void save( const String& fileName ) const;
 
       void load( const String& fileName );
diff --git a/src/TNL/Matrices/Dense.hpp b/src/TNL/Matrices/Dense.hpp
index fe11d67593075ca7ec19c57881114896e1e73704..ecd5aec1c99749aa5c5830a377e8dfb0884d86a1 100644
--- a/src/TNL/Matrices/Dense.hpp
+++ b/src/TNL/Matrices/Dense.hpp
@@ -994,6 +994,33 @@ Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::operator=( const Den
    throw Exceptions::NotImplementedError("Cross-device assignment for the Dense format is not implemented yet.");
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
+bool
+Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+operator==( const Dense< Real_, Device_, Index_, RowMajorOrder >& matrix ) const
+{
+   return( this->getRows() == matrix.getRows() &&
+           this->getColumns() == matrix.getColumns() &&
+           this->getValues() == matrix.getValues() );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
+bool
+Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+operator!=( const Dense< Real_, Device_, Index_, RowMajorOrder >& matrix ) const
+{
+   return ! ( *this == matrix );
+}
 
 template< typename Real,
           typename Device,
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index c50f71612346a5ad7386f3eaa95e42fb2973a762..75d9179282add8a631596013a6291d3d7acb623e 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -175,6 +175,13 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        */
       SparseMatrix& operator=( const SparseMatrix& matrix );
 
+      /**
+       * \brief Assignment of dense matrix
+       */
+      template< typename Real_, typename Device_, typename Index_, bool RowMajorOrder, typename RealAllocator_ >
+      SparseMatrix& operator=( const Dense< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >& matrix );
+      
+      
       /**
        * \brief Assignment of any other matrix type.
        * @param matrix
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index 6c0655ce0122f32d884ee037f681f896f2b84511..d38b9de34ac608aaa2cf2d58773b2a10e37dd3dd 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -183,7 +183,7 @@ getCompressedRowLengths( Vector& rowLengths ) const
    rowLengths.setSize( this->getRows() );
    rowLengths = 0;
    auto rowLengths_view = rowLengths.getView();
-   auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, const RealType& value ) -> IndexType {
+   auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, IndexType globalIdx, const RealType& value ) -> IndexType {
       return ( value != 0.0 );
    };
    auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
@@ -448,7 +448,7 @@ rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Ke
    const auto columns_view = this->columnIndexes.getConstView();
    const auto values_view = this->values.getConstView();
    const IndexType paddingIndex_ = this->getPaddingIndex();
-   auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) {
+   auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) {
       IndexType columnIdx = columns_view[ globalIdx ];
       if( columnIdx != paddingIndex_ )
          return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
@@ -615,7 +615,108 @@ operator=( const SparseMatrix& matrix )
    return *this;
 }
 
-// cross-device copy assignment
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename, typename > class Segments,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename Real_, typename Device_, typename Index_, bool RowMajorOrder, typename RealAllocator_ >
+SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >&
+SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::operator=( const Dense< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >& matrix )
+{
+   using RHSMatrix = Dense< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >;
+   using RHSIndexType = typename RHSMatrix::IndexType;
+   using RHSRealType = typename RHSMatrix::RealType;
+   using RHSDeviceType = typename RHSMatrix::DeviceType;
+   using RHSRealAllocatorType = typename RHSMatrix::RealAllocatorType;
+
+   typename RHSMatrix::RowsCapacitiesType rowLengths;
+   matrix.getCompressedRowLengths( rowLengths );
+   this->setDimensions( matrix.getRows(), matrix.getColumns() );
+   this->setCompressedRowLengths( rowLengths );
+
+   // TODO: use getConstView when it works
+   const auto matrixView = const_cast< RHSMatrix& >( matrix ).getView();
+   const IndexType paddingIndex = this->getPaddingIndex();
+   auto columns_view = this->columnIndexes.getView();
+   auto values_view = this->values.getView();
+   columns_view = paddingIndex;
+
+   if( std::is_same< DeviceType, RHSDeviceType >::value )
+   {
+      const auto this_segments_view = this->segments.getView();
+      const auto segments_view = this->segments.getView();
+      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
+         if( columnIndex != paddingIndex )
+         {
+            IndexType thisGlobalIdx = segments_view.getGlobalIndex( rowIdx, localIdx );
+            columns_view[ thisGlobalIdx ] = columnIndex;
+            values_view[ thisGlobalIdx ] = value;
+         }
+      };
+      matrix.forAllRows( f );
+   }
+   else
+   {
+      const IndexType maxRowLength = max( rowLengths );
+      const IndexType bufferRowsCount( 128 );
+      const size_t bufferSize = bufferRowsCount * maxRowLength;
+      Containers::Vector< RHSRealType, RHSDeviceType, RHSIndexType, RHSRealAllocatorType > matrixValuesBuffer( bufferSize );
+      Containers::Vector< RHSIndexType, RHSDeviceType, RHSIndexType > matrixColumnsBuffer( bufferSize );
+      Containers::Vector< RealType, DeviceType, IndexType, RealAllocatorType > thisValuesBuffer( bufferSize );
+      Containers::Vector< IndexType, DeviceType, IndexType, IndexAllocatorType > thisColumnsBuffer( bufferSize );
+      auto matrixValuesBuffer_view = matrixValuesBuffer.getView();
+      auto matrixColumnsBuffer_view = matrixColumnsBuffer.getView();
+      auto thisValuesBuffer_view = thisValuesBuffer.getView();
+      auto thisColumnsBuffer_view = thisColumnsBuffer.getView();
+
+      IndexType baseRow( 0 );
+      const IndexType rowsCount = this->getRows();
+      while( baseRow < rowsCount )
+      {
+         const IndexType lastRow = min( baseRow + bufferRowsCount, rowsCount );
+         thisColumnsBuffer = paddingIndex;
+         matrixColumnsBuffer_view = paddingIndex;
+
+         ////
+         // Copy matrix elements into buffer
+         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
+            if( columnIndex != paddingIndex )
+            {
+               const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
+               matrixColumnsBuffer_view[ bufferIdx ] = columnIndex;
+               matrixValuesBuffer_view[ bufferIdx ] = value;
+            }
+         };
+         matrix.forRows( baseRow, lastRow, f1 );
+
+         ////
+         // Copy the source matrix buffer to this matrix buffer
+         thisValuesBuffer_view = matrixValuesBuffer_view;
+         thisColumnsBuffer_view = matrixColumnsBuffer_view;
+
+         ////
+         // Copy matrix elements from the buffer to the matrix
+         auto f2 = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIndex, RealType& value  ) mutable {
+            const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
+            const IndexType column = thisColumnsBuffer_view[ bufferIdx ];
+            if( column != paddingIndex )
+            {
+               columnIndex = column;
+               value = thisValuesBuffer_view[ bufferIdx ];
+            }
+         };
+         this->forRows( baseRow, lastRow, f2 );
+         baseRow += bufferRowsCount;
+      }
+      //std::cerr << "This matrix = " << std::endl << *this << std::endl;
+   }
+   return *this;
+   
+}
+
 template< typename Real,
           typename Device,
           typename Index,
diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
index f00daf1f3bb16ad5d259405a88fd2b7ed2b65656..6c4f8b2615fd2699e4139b1c863778934242c91f 100644
--- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
@@ -14,6 +14,7 @@
 
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/MatrixType.h>
+#include <TNL/Matrices/Dense.h>
 #include <TNL/Containers/Segments/CSR.h>
 #include <TNL/Containers/Segments/Ellpack.h>
 #include <TNL/Containers/Segments/SlicedEllpack.h>
@@ -436,6 +437,55 @@ void testConversion()
    }
 }
 
+template< typename Matrix >
+void denseMatrixAssignment()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   
+   using DenseHost = TNL::Matrices::Dense< RealType, TNL::Devices::Host, IndexType >;
+   using DenseCuda = TNL::Matrices::Dense< RealType, TNL::Devices::Cuda, IndexType >;
+   
+   const IndexType rows( 10 ), columns( 10 );
+   DenseHost hostMatrix( rows, columns );
+   for( IndexType i = 0; i < columns; i++ )
+      for( IndexType j = 0; j <= i; j++ )
+         hostMatrix( i, j ) = i + j;
+   
+   Matrix matrix;
+   matrix = hostMatrix;
+   using RowCapacitiesType = typename Matrix::RowsCapacitiesType;
+   RowCapacitiesType rowCapacities;
+   matrix.getCompressedRowLengths( rowCapacities );
+   RowCapacitiesType exactRowLengths{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+   EXPECT_EQ( rowCapacities, exactRowLengths );
+   for( IndexType i = 0; i < columns; i++ )
+      for( IndexType j = 0; j < rows; j++ )
+      {
+         if( j > i )
+            EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
+         else
+            EXPECT_EQ( matrix.getElement( i, j ), i + j );
+      }
+   
+#ifdef HAVE_CUDA
+   DenseCuda cudaMatrix;
+   cudaMatrix = hostMatrix;
+   matrix = cudaMatrix;
+   matrix.getCompressedRowLengths( rowCapacities );
+   EXPECT_EQ( rowCapacities, exactRowLengths );
+   for( IndexType i = 0; i < columns; i++ )
+      for( IndexType j = 0; j < rows; j++ )
+      {
+         if( j > i )
+            EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
+         else
+            EXPECT_EQ( matrix.getElement( i, j ), i + j );
+      }
+#endif
+}
+
 TEST( SparseMatrixCopyTest, CSR_HostToHost )
 {
    testCopyAssignment< CSR_host, CSR_host >();
@@ -568,6 +618,39 @@ TEST( SparseMatrixCopyTest, SlicedEllpack_to_Ellpack_cuda )
 }
 #endif
 
-#endif
+// Dense matrix assignment test
+TEST( SparseMatrixCopyTest, DenseMatrixAssignment_to_CSR_host )
+{
+   denseMatrixAssignment< CSR_host >();
+}
+
+TEST( SparseMatrixCopyTest, DenseMatrixAssignment_to_Ellpack_host )
+{
+   denseMatrixAssignment< E_host >();
+}
+
+TEST( SparseMatrixCopyTest, DenseMatrixAssignment_to_SlicedEllpack_host )
+{
+   denseMatrixAssignment< SE_host >();
+}
+
+#ifdef HAVE_CUDA
+TEST( SparseMatrixCopyTest, DenseMatrixAssignment_to_CSR_cuda )
+{
+   denseMatrixAssignment< CSR_cuda >();
+}
+
+TEST( SparseMatrixCopyTest, DenseMatrixAssignment_to_Ellpack_cuda )
+{
+   denseMatrixAssignment< E_cuda >();
+}
+
+TEST( SparseMatrixCopyTest, DenseMatrixAssignment_to_SlicedEllpack_cuda )
+{
+   denseMatrixAssignment< SE_cuda >();
+}
+#endif // HAVE_CUDA
+
+#endif //HAVE_GTEST
 
 #include "../main.h"