From 7049c029b5218291c7cca95ad9b5d93fa44745e9 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Sat, 4 Jan 2020 22:10:15 +0100
Subject: [PATCH] Added DenseMatrixView.

---
 src/TNL/Matrices/Dense.h                 |    7 +
 src/TNL/Matrices/Dense.hpp               |   34 +-
 src/TNL/Matrices/DenseMatrixView.h       |  207 +++++
 src/TNL/Matrices/DenseMatrixView.hpp     | 1068 ++++++++++++++++++++++
 src/UnitTests/Matrices/DenseMatrixTest.h |    4 +-
 5 files changed, 1317 insertions(+), 3 deletions(-)
 create mode 100644 src/TNL/Matrices/DenseMatrixView.h
 create mode 100644 src/TNL/Matrices/DenseMatrixView.hpp

diff --git a/src/TNL/Matrices/Dense.h b/src/TNL/Matrices/Dense.h
index 51308280db..18249a7b16 100644
--- a/src/TNL/Matrices/Dense.h
+++ b/src/TNL/Matrices/Dense.h
@@ -14,6 +14,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Matrices/DenseMatrixRowView.h>
 #include <TNL/Matrices/Matrix.h>
+#include <TNL/Matrices/DenseMatrixView.h>
 #include <TNL/Containers/Segments/Ellpack.h>
 
 namespace TNL {
@@ -47,6 +48,8 @@ class Dense : public Matrix< Real, Device, Index >
       using ValuesViewType = typename ValuesType::ViewType;
       using SegmentsType = Containers::Segments::Ellpack< DeviceType, IndexType, typename Allocators::Default< Device >::template Allocator< IndexType >, RowMajorOrder, 1 >;
       using SegmentViewType = typename SegmentsType::SegmentViewType;
+      using ViewType = DenseMatrixView< Real, Device, Index, MatrixType, SegmentsViewTemplate >;
+      using ConstViewType = DenseMatrixView< typename std::add_const< Real >::type, Device, Index, MatrixType, SegmentsViewTemplate >;
       using RowView = DenseMatrixRowView< SegmentViewType, ValuesViewType >;
 
       // TODO: remove this
@@ -61,6 +64,10 @@ class Dense : public Matrix< Real, Device, Index >
       Dense();
 
       Dense( const IndexType rows, const IndexType columns );
+      
+      ViewType getView();
+
+      ConstViewType getConstView() const;
 
       static String getSerializationType();
 
diff --git a/src/TNL/Matrices/Dense.hpp b/src/TNL/Matrices/Dense.hpp
index ebf2c03b95..85f1b560d0 100644
--- a/src/TNL/Matrices/Dense.hpp
+++ b/src/TNL/Matrices/Dense.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          Dense_impl.h  -  description
+                          Dense.hpp  -  description
                              -------------------
     begin                : Nov 29, 2013
     copyright            : (C) 2013 by Tomas Oberhuber
@@ -37,6 +37,38 @@ Dense( const IndexType rows, const IndexType columns )
    this->setDimensions( rows, columns );
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          template< typename, typename, typename > class Segments,
+          typename RealAllocator >
+auto
+Dense< Real, Device, Index, RowMajorOrder, Segments, RealAllocator >::
+getView() -> ViewType
+{
+   return ViewType( this->getRows(), 
+                    this->getColumns(),
+                    this->getValues().getView(),
+                    this->segments.getView() );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          template< typename, typename, typename > class Segments,
+          typename RealAllocator >
+auto
+Dense< Real, Device, Index, RowMajorOrder, Segments, RealAllocator >::
+getConstView() const -> ConstViewType
+{
+   return ConstViewType( this->getRows(),
+                         this->getColumns(),
+                         this->getValues().getConstView(),
+                         this->segments.getConstView() );
+}
+
 template< typename Real,
           typename Device,
           typename Index,
diff --git a/src/TNL/Matrices/DenseMatrixView.h b/src/TNL/Matrices/DenseMatrixView.h
new file mode 100644
index 0000000000..2334eb6364
--- /dev/null
+++ b/src/TNL/Matrices/DenseMatrixView.h
@@ -0,0 +1,207 @@
+/***************************************************************************
+                          DenseMatrixView.h  -  description
+                             -------------------
+    begin                : Nov 29, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Allocators/Default.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Matrices/DenseMatrixRowView.h>
+#include <TNL/Matrices/MatrixView.h>
+#include <TNL/Containers/Segments/EllpackView.h>
+
+namespace TNL {
+namespace Matrices {
+
+//template< typename Device >
+//class DenseDeviceDependentCode;
+
+template< typename Real = double,
+          typename Device = Devices::Host,
+          typename Index = int,
+          bool RowMajorOrder = std::is_same< Device, Devices::Host >::value >
+class DenseMatrixView : public MatrixView< Real, Device, Index >
+{
+   private:
+      // convenient template alias for controlling the selection of copy-assignment operator
+      template< typename Device2 >
+      using Enabler = std::enable_if< ! std::is_same< Device2, Device >::value >;
+
+      // friend class will be needed for templated assignment operators
+      //template< typename Real2, typename Device2, typename Index2 >
+      //friend class Dense;
+
+   public:
+      using RealType = Real;
+      using DeviceType = Device;
+      using IndexType = Index;
+      using BaseType = Matrix< Real, Device, Index >;
+      using ValuesType = typename BaseType::ValuesVector;
+      using ValuesViewType = typename ValuesType::ViewType;
+      using SegmentsType = Containers::Segments::Ellpack< DeviceType, IndexType, typename Allocators::Default< Device >::template Allocator< IndexType >, RowMajorOrder, 1 >;
+      using SegmentsViewType = typename SegmentsType::ViewType;
+      using SegmentViewType = typename SegmentsType::SegmentViewType;
+      using RowView = DenseMatrixRowView< SegmentViewType, ValuesViewType >;
+
+      // TODO: remove this
+      using CompressedRowLengthsVector = typename Matrix< Real, Device, Index >::CompressedRowLengthsVector;
+      using ConstCompressedRowLengthsVectorView = typename Matrix< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView;
+
+      template< typename _Real = Real,
+                typename _Device = Device,
+                typename _Index = Index >
+      using Self = Dense< _Real, _Device, _Index >;
+
+      __cuda_callable__
+      DenseMatrixView();
+
+      __cuda_callable__
+      DenseMatrixView( const IndexType rows,
+                       const IndexType columns,
+                       const ValuesViewType& values,
+                       const SegmentsViewType& segments );
+
+      __cuda_callable__
+      DenseMatrixView( const DenseMatrixView& m ) = default;
+
+      __cuda_callable__
+      ViewType getView();
+
+      __cuda_callable__
+      ConstViewType getConstView() const;
+
+      static String getSerializationType();
+
+      virtual String getSerializationTypeVirtual() const;
+
+      template< typename Vector >
+      void getCompressedRowLengths( Vector& rowLengths ) const;
+
+      [[deprecated]]
+      IndexType getRowLength( const IndexType row ) const;
+
+      IndexType getMaxRowLength() const;
+
+      IndexType getNumberOfMatrixElements() const;
+
+      IndexType getNumberOfNonzeroMatrixElements() const;
+
+      void reset();
+
+      __cuda_callable__
+      const RowView getRow( const IndexType& rowIdx ) const;
+
+      __cuda_callable__
+      RowView getRow( const IndexType& rowIdx );
+
+
+      void setValue( const RealType& v );
+
+      __cuda_callable__
+      Real& operator()( const IndexType row,
+                        const IndexType column );
+
+      __cuda_callable__
+      const Real& operator()( const IndexType row,
+                              const IndexType column ) const;
+
+      bool setElement( const IndexType row,
+                       const IndexType column,
+                       const RealType& value );
+
+      bool addElement( const IndexType row,
+                       const IndexType column,
+                       const RealType& value,
+                       const RealType& thisElementMultiplicator = 1.0 );
+
+      Real getElement( const IndexType row,
+                       const IndexType column ) const;
+
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+
+      template< typename Function >
+      void forRows( IndexType first, IndexType last, Function& function ) const;
+
+      template< typename Function >
+      void forRows( IndexType first, IndexType last, Function& function );
+
+      template< typename Function >
+      void forAllRows( Function& function ) const;
+
+      template< typename Function >
+      void forAllRows( Function& function );
+
+      template< typename Vector >
+      __cuda_callable__
+      typename Vector::RealType rowVectorProduct( const IndexType row,
+                                                  const Vector& vector ) const;
+
+      template< typename InVector, typename OutVector >
+      void vectorProduct( const InVector& inVector,
+                          OutVector& outVector ) const;
+
+      template< typename Matrix >
+      void addMatrix( const Matrix& matrix,
+                      const RealType& matrixMultiplicator = 1.0,
+                      const RealType& thisMatrixMultiplicator = 1.0 );
+
+      template< typename Matrix1, typename Matrix2, int tileDim = 32 >
+      void getMatrixProduct( const Matrix1& matrix1,
+                          const Matrix2& matrix2,
+                          const RealType& matrix1Multiplicator = 1.0,
+                          const RealType& matrix2Multiplicator = 1.0 );
+
+      template< typename Matrix, int tileDim = 32 >
+      void getTransposition( const Matrix& matrix,
+                             const RealType& matrixMultiplicator = 1.0 );
+
+      template< typename Vector1, typename Vector2 >
+      void performSORIteration( const Vector1& b,
+                                const IndexType row,
+                                Vector2& x,
+                                const RealType& omega = 1.0 ) const;
+
+      // copy assignment
+      Dense& operator=( const Dense& matrix );
+
+      // cross-device copy assignment
+      template< typename Real2, typename Device2, typename Index2,
+                typename = typename Enabler< Device2 >::type >
+      Dense& operator=( const Dense< Real2, Device2, Index2 >& matrix );
+
+      void save( const String& fileName ) const;
+
+      void load( const String& fileName );
+
+      void save( File& file ) const;
+
+      void load( File& file );
+
+      void print( std::ostream& str ) const;
+
+   protected:
+
+      __cuda_callable__
+      IndexType getElementIndex( const IndexType row,
+                                 const IndexType column ) const;
+
+      typedef DenseDeviceDependentCode< DeviceType > DeviceDependentCode;
+      friend class DenseDeviceDependentCode< DeviceType >;
+
+      SegmentsViewType segments;
+};
+
+} // namespace Matrices
+} // namespace TNL
+
+#include <TNL/Matrices/DenseMatrixView.hpp>
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
new file mode 100644
index 0000000000..18d6574ac3
--- /dev/null
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -0,0 +1,1068 @@
+/***************************************************************************
+                          DenseMatrixView.hpp  -  description
+                             -------------------
+    begin                : Nov 29, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Matrices/Dense.h>
+#include <TNL/Exceptions/NotImplementedError.h>
+
+namespace TNL {
+namespace Matrices {   
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename > class SegmentsView >
+__cuda_callable__
+DenseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
+DenseMatrixView()
+{
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename > class SegmentsView >
+__cuda_callable__
+DenseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
+DenseMatrixView( const IndexType rows,
+                  const IndexType columns,
+                  const ValuesViewType& values,
+                  const ColumnsIndexesViewType& columnIndexes,
+                  const SegmentsViewType& segments )
+ : MatrixView< Real, Device, Index >( rows, columns, values ), columnIndexes( columnIndexes ), segments( segments )
+{
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename > class SegmentsView >
+__cuda_callable__
+auto
+DenseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
+getView() -> ViewType
+{
+   return ViewType( this->getRows(), 
+                    this->getColumns(),
+                    this->getValues().getView(),
+                    this->columnIndexes.getView(),
+                    this->segments.getView() );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename > class SegmentsView >
+__cuda_callable__
+auto
+DenseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
+getConstView() const -> ConstViewType
+{
+   return ConstViewType( this->getRows(),
+                         this->getColumns(),
+                         this->getValues().getConstView(),
+                         this->getColumnsIndexes().getConstView(),
+                         this->segments.getConstView() );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder >
+String
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+getSerializationType()
+{
+   return String( "Matrices::Dense< " ) +
+          getType< RealType >() + ", " +
+          getType< Device >() + ", " +
+          getType< IndexType >() + " >";
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+String
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+getSerializationTypeVirtual() const
+{
+   return this->getSerializationType();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Vector >
+void
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+getCompressedRowLengths( Vector& rowLengths ) const
+{
+   rowLengths.setSize( this->getRows() );
+   rowLengths = 0;
+   auto rowLengths_view = rowLengths.getView();
+   auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, const RealType& value ) -> IndexType {
+      return ( value != 0.0 );
+   };
+   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
+      aux += a;
+   };
+   auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
+      rowLengths_view[ rowIdx ] = value;
+   };
+   this->allRowsReduction( fetch, reduce, keep, 0 );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getRowLength( const IndexType row ) const
+{
+   return this->getColumns();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getMaxRowLength() const
+{
+   return this->getColumns();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getNumberOfMatrixElements() const
+{
+   return this->getRows() * this->getColumns();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getNumberOfNonzeroMatrixElements() const
+{
+   const auto values_view = this->values.getConstView();
+   auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
+      return ( values_view[ i ] != 0.0 );
+   };
+   return Algorithms::Reduction< DeviceType >::reduce( this->values.getSize(), std::plus<>{}, fetch, 0 );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+void DenseMatrixView< Real, Device, Index, RowMajorOrder >::reset()
+{
+   Matrix< Real, Device, Index >::reset();
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+void DenseMatrixView< Real, Device, Index, RowMajorOrder >::setValue( const Real& value )
+{
+   this->values = value;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+__cuda_callable__ auto
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+getRow( const IndexType& rowIdx ) const -> const RowView
+{
+   TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." );
+   return RowView( this->segments.getSegmentView( rowIdx ), this->values.getView() );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+__cuda_callable__ auto
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+getRow( const IndexType& rowIdx ) -> RowView
+{
+   TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." );
+   return RowView( this->segments.getSegmentView( rowIdx ), this->values.getView() );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+__cuda_callable__
+Real& DenseMatrixView< Real, Device, Index, RowMajorOrder >::operator()( const IndexType row,
+                                                const IndexType column )
+{
+   TNL_ASSERT_GE( row, 0, "Row index must be non-negative." );
+   TNL_ASSERT_LT( row, this->getRows(), "Row index is out of bounds." );
+   TNL_ASSERT_GE( column, 0, "Column index must be non-negative." );
+   TNL_ASSERT_LT( column, this->getColumns(), "Column index is out of bounds." );
+
+   return this->values.operator[]( this->getElementIndex( row, column ) );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+__cuda_callable__
+const Real& DenseMatrixView< Real, Device, Index, RowMajorOrder >::operator()( const IndexType row,
+                                                      const IndexType column ) const
+{
+   TNL_ASSERT_GE( row, 0, "Row index must be non-negative." );
+   TNL_ASSERT_LT( row, this->getRows(), "Row index is out of bounds." );
+   TNL_ASSERT_GE( column, 0, "Column index must be non-negative." );
+   TNL_ASSERT_LT( column, this->getColumns(), "Column index is out of bounds." );
+
+   return this->values.operator[]( this->getElementIndex( row, column ) );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+bool DenseMatrixView< Real, Device, Index, RowMajorOrder >::setElement( const IndexType row,
+                                               const IndexType column,
+                                               const RealType& value )
+{
+   this->values.setElement( this->getElementIndex( row, column ), value );
+   return true;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+bool DenseMatrixView< Real, Device, Index, RowMajorOrder >::addElement( const IndexType row,
+                                                        const IndexType column,
+                                                        const RealType& value,
+                                                        const RealType& thisElementMultiplicator )
+{
+   const IndexType elementIndex = this->getElementIndex( row, column );
+   if( thisElementMultiplicator == 1.0 )
+      this->values.setElement( elementIndex,
+                               this->values.getElement( elementIndex ) + value );
+   else
+      this->values.setElement( elementIndex,
+                               thisElementMultiplicator * this->values.getElement( elementIndex ) + value );
+   return true;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+Real 
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+getElement( const IndexType row,
+            const IndexType column ) const
+{
+   return this->values.getElement( this->getElementIndex( row, column ) );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
+void
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+{
+   const auto values_view = this->values.getConstView();
+   auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) {
+         return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
+      return zero;
+   };
+   this->segments.segmentsReduction( first, last, fetch_, reduce, keep, zero );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+void
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+{
+   this->rowsReduction( 0, this->getRows(), fetch, reduce, keep, zero );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Function >
+void
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+forRows( IndexType first, IndexType last, Function& function ) const
+{
+   const auto values_view = this->values.getConstView();
+   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx ) mutable -> bool {
+      function( rowIdx, columnIdx, values_view[ globalIdx ] );
+      return true;
+   };
+   this->segments.forSegments( first, last, f );
+
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Function >
+void
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+forRows( IndexType first, IndexType last, Function& function )
+{
+   auto values_view = this->values.getView();
+   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx ) mutable -> bool {
+      function( rowIdx, columnIdx, values_view[ globalIdx ] );
+      return true;
+   };
+   this->segments.forSegments( first, last, f );
+
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Function >
+void
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+forAllRows( Function& function ) const
+{
+   this->forRows( 0, this->getRows(), function );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Function >
+void
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+forAllRows( Function& function )
+{
+   this->forRows( 0, this->getRows(), function );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Vector >
+__cuda_callable__
+typename Vector::RealType DenseMatrixView< Real, Device, Index, RowMajorOrder >::rowVectorProduct( const IndexType row,
+                                                                                   const Vector& vector ) const
+{
+   RealType sum( 0.0 );
+   // TODO: Fix this
+   //for( IndexType column = 0; column < this->getColumns(); column++ )
+   //   sum += this->getElementFast( row, column ) * vector[ column ];
+   return sum;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename InVector,
+             typename OutVector >
+void DenseMatrixView< Real, Device, Index, RowMajorOrder >::vectorProduct( const InVector& inVector,
+                                                           OutVector& outVector ) const
+{
+   TNL_ASSERT( this->getColumns() == inVector.getSize(),
+            std::cerr << "Matrix columns: " << this->getColumns() << std::endl
+                 << "Vector size: " << inVector.getSize() << std::endl );
+   TNL_ASSERT( this->getRows() == outVector.getSize(),
+               std::cerr << "Matrix rows: " << this->getRows() << std::endl
+                    << "Vector size: " << outVector.getSize() << std::endl );
+
+   DeviceDependentCode::vectorProduct( *this, inVector, outVector );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Matrix >
+void DenseMatrixView< Real, Device, Index, RowMajorOrder >::addMatrix( const Matrix& matrix,
+                                              const RealType& matrixMultiplicator,
+                                              const RealType& thisMatrixMultiplicator )
+{
+   TNL_ASSERT( this->getColumns() == matrix.getColumns() &&
+              this->getRows() == matrix.getRows(),
+            std::cerr << "This matrix columns: " << this->getColumns() << std::endl
+                 << "This matrix rows: " << this->getRows() << std::endl
+                 << "That matrix columns: " << matrix.getColumns() << std::endl
+                 << "That matrix rows: " << matrix.getRows() << std::endl );
+
+   if( thisMatrixMultiplicator == 1.0 )
+      this->values += matrixMultiplicator * matrix.values;
+   else
+      this->values = thisMatrixMultiplicator * this->values + matrixMultiplicator * matrix.values;
+}
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator,
+          typename Matrix1,
+          typename Matrix2,
+          int tileDim,
+          int tileRowBlockSize >
+__global__ void DenseMatrixProductKernel( Dense< Real, Devices::Cuda, Index >* resultMatrix,
+                                                   const Matrix1* matrixA,
+                                                   const Matrix2* matrixB,
+                                                   const Real matrixAMultiplicator,
+                                                   const Real matrixBMultiplicator,
+                                                   const Index gridIdx_x,
+                                                   const Index gridIdx_y )
+{
+   /****
+    * Here we compute product C = A * B. To profit from the fast
+    * shared memory we do it by tiles.
+    */
+
+   typedef Index IndexType;
+   typedef Real RealType;
+   __shared__ Real tileA[ tileDim*tileDim ];
+   __shared__ Real tileB[ tileDim*tileDim ];
+   __shared__ Real tileC[ tileDim*tileDim ];
+
+   const IndexType& matrixARows = matrixA->getRows();
+   const IndexType& matrixAColumns = matrixA->getColumns();
+   const IndexType& matrixBRows = matrixB->getRows();
+   const IndexType& matrixBColumns = matrixB->getColumns();
+
+   /****
+    * Reset the tile C
+    */
+   for( IndexType row = 0; row < tileDim; row += tileRowBlockSize )
+      tileC[ ( row + threadIdx.y )*tileDim + threadIdx.x ] = 0.0;
+
+   /****
+    * Compute the result tile coordinates
+    */
+   const IndexType resultTileRow = ( gridIdx_y*gridDim.y + blockIdx.y )*tileDim;
+   const IndexType resultTileColumn = ( gridIdx_x*gridDim.x + blockIdx.x )*tileDim;
+
+   /****
+    * Sum over the matrix tiles
+    */
+   for( IndexType i = 0; i < matrixAColumns; i += tileDim )
+   {
+      for( IndexType row = 0; row < tileDim; row += tileRowBlockSize )
+      {
+         const IndexType matrixARow = resultTileRow + threadIdx.y + row;
+         const IndexType matrixAColumn = i + threadIdx.x;
+         if( matrixARow < matrixARows && matrixAColumn < matrixAColumns )
+            tileA[ (threadIdx.y + row)*tileDim + threadIdx.x ] =
+               matrixAMultiplicator * matrixA->getElementFast( matrixARow,  matrixAColumn );
+
+         const IndexType matrixBRow = i + threadIdx.y + row;
+         const IndexType matrixBColumn = resultTileColumn + threadIdx.x;
+         if( matrixBRow < matrixBRows && matrixBColumn < matrixBColumns )
+            tileB[ (threadIdx.y + row)*tileDim + threadIdx.x ] =
+               matrixBMultiplicator * matrixB->getElementFast( matrixBRow, matrixBColumn );
+      }
+      __syncthreads();
+
+      const IndexType tileALastRow = tnlCudaMin( tileDim, matrixARows - resultTileRow );
+      const IndexType tileALastColumn = tnlCudaMin( tileDim, matrixAColumns - i );
+      const IndexType tileBLastRow = tnlCudaMin( tileDim, matrixBRows - i );
+      const IndexType tileBLastColumn =
+         tnlCudaMin( tileDim, matrixBColumns - resultTileColumn );
+
+      for( IndexType row = 0; row < tileALastRow; row += tileRowBlockSize )
+      {
+         RealType sum( 0.0 );
+         for( IndexType j = 0; j < tileALastColumn; j++ )
+            sum += tileA[ ( threadIdx.y + row )*tileDim + j ]*
+                      tileB[ j*tileDim + threadIdx.x ];
+         tileC[ ( row + threadIdx.y )*tileDim + threadIdx.x ] += sum;
+      }
+      __syncthreads();
+   }
+
+   /****
+    * Write the result tile to the result matrix
+    */
+   const IndexType& matrixCRows = resultMatrix->getRows();
+   const IndexType& matrixCColumns = resultMatrix->getColumns();
+   for( IndexType row = 0; row < tileDim; row += tileRowBlockSize )
+   {
+      const IndexType matrixCRow = resultTileRow + row + threadIdx.y;
+      const IndexType matrixCColumn = resultTileColumn + threadIdx.x;
+      if( matrixCRow < matrixCRows && matrixCColumn < matrixCColumns )
+         resultMatrix->setElementFast( matrixCRow,
+                                       matrixCColumn,
+                                       tileC[ ( row + threadIdx.y )*tileDim + threadIdx.x ] );
+   }
+
+}
+#endif
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Matrix1, typename Matrix2, int tileDim >
+void DenseMatrixView< Real, Device, Index, RowMajorOrder >::getMatrixProduct( const Matrix1& matrix1,
+                                                              const Matrix2& matrix2,
+                                                              const RealType& matrix1Multiplicator,
+                                                              const RealType& matrix2Multiplicator )
+{
+   TNL_ASSERT( matrix1.getColumns() == matrix2.getRows() &&
+              this->getRows() == matrix1.getRows() &&
+              this->getColumns() == matrix2.getColumns(),
+            std::cerr << "This matrix columns: " << this->getColumns() << std::endl
+                 << "This matrix rows: " << this->getRows() << std::endl
+                 << "Matrix1 columns: " << matrix1.getColumns() << std::endl
+                 << "Matrix1 rows: " << matrix1.getRows() << std::endl
+                 << "Matrix2 columns: " << matrix2.getColumns() << std::endl
+                 << "Matrix2 rows: " << matrix2.getRows() << std::endl );
+
+   if( std::is_same< Device, Devices::Host >::value )
+      for( IndexType i = 0; i < this->getRows(); i += tileDim )
+         for( IndexType j = 0; j < this->getColumns(); j += tileDim )
+         {
+            const IndexType tileRows = min( tileDim, this->getRows() - i );
+            const IndexType tileColumns = min( tileDim, this->getColumns() - j );
+            for( IndexType i1 = i; i1 < i + tileRows; i1++ )
+               for( IndexType j1 = j; j1 < j + tileColumns; j1++ )
+                  this->setElementFast( i1, j1, 0.0 );
+
+            for( IndexType k = 0; k < matrix1.getColumns(); k += tileDim )
+            {
+               const IndexType lastK = min( k + tileDim, matrix1.getColumns() );
+               for( IndexType i1 = 0; i1 < tileRows; i1++ )
+                  for( IndexType j1 = 0; j1 < tileColumns; j1++ )
+                     for( IndexType k1 = k; k1 < lastK; k1++ )
+                        this->addElementFast( i + i1, j + j1,
+                            matrix1.getElementFast( i + i1, k1 ) * matrix2.getElementFast( k1, j + j1 ) );
+            }
+         }
+   if( std::is_same< Device, Devices::Cuda >::value )
+   {
+#ifdef HAVE_CUDA
+      dim3 cudaBlockSize( 0 ), cudaGridSize( 0 );
+      const IndexType matrixProductCudaBlockSize( 256 );
+      const IndexType rowTiles = roundUpDivision( this->getRows(), tileDim );
+      const IndexType columnTiles = roundUpDivision( this->getColumns(), tileDim );
+      const IndexType cudaBlockColumns( tileDim );
+      const IndexType cudaBlockRows( matrixProductCudaBlockSize / tileDim );
+      cudaBlockSize.x = cudaBlockColumns;
+      cudaBlockSize.y = cudaBlockRows;
+      const IndexType rowGrids = roundUpDivision( rowTiles, Cuda::getMaxGridSize() );
+      const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() );
+
+      for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ )
+         for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ )
+         {
+            cudaGridSize.x = cudaGridSize.y = Cuda::getMaxGridSize();
+            if( gridIdx_x == columnGrids - 1 )
+               cudaGridSize.x = columnTiles % Cuda::getMaxGridSize();
+            if( gridIdx_y == rowGrids - 1 )
+               cudaGridSize.y = rowTiles % Cuda::getMaxGridSize();
+            Dense* this_kernel = Cuda::passToDevice( *this );
+            Matrix1* matrix1_kernel = Cuda::passToDevice( matrix1 );
+            Matrix2* matrix2_kernel = Cuda::passToDevice( matrix2 );
+            DenseMatrixProductKernel< Real,
+                                               Index,
+                                               Matrix1,
+                                               Matrix2,
+                                               tileDim,
+                                               cudaBlockRows >
+                                           <<< cudaGridSize,
+                                               cudaBlockSize,
+                                               3*tileDim*tileDim >>>
+                                             ( this_kernel,
+                                               matrix1_kernel,
+                                               matrix2_kernel,
+                                               matrix1Multiplicator,
+                                               matrix2Multiplicator,
+                                               gridIdx_x,
+                                               gridIdx_y );
+            Cuda::freeFromDevice( this_kernel );
+            Cuda::freeFromDevice( matrix1_kernel );
+            Cuda::freeFromDevice( matrix2_kernel );
+         }
+#endif
+   }
+}
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index,
+          typename Matrix,
+          bool RowMajorOrder,
+          typename RealAllocator,
+          int tileDim,
+          int tileRowBlockSize >
+__global__ void DenseTranspositionAlignedKernel( Dense< Real, Devices::Cuda, Index >* resultMatrix,
+                                                          const Matrix* inputMatrix,
+                                                          const Real matrixMultiplicator,
+                                                          const Index gridIdx_x,
+                                                          const Index gridIdx_y )
+{
+   __shared__ Real tile[ tileDim*tileDim ];
+
+   const Index columns = inputMatrix->getColumns();
+   const Index rows = inputMatrix->getRows();
+
+
+   /****
+    * Diagonal mapping of the CUDA blocks
+    */
+   Index blockIdx_x, blockIdx_y;
+   if( columns == rows )
+   {
+      blockIdx_y = blockIdx.x;
+      blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x;
+   }
+   else
+   {
+      Index bID = blockIdx.x + gridDim.x*blockIdx.y;
+      blockIdx_y = bID % gridDim.y;
+      blockIdx_x = ( ( bID / gridDim.y ) + blockIdx_y ) % gridDim.x;
+   }
+
+   /****
+    * Read the tile to the shared memory
+    */
+   const Index readRowPosition =
+      ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.y;
+   const Index readColumnPosition =
+      ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.x;
+   for( Index rowBlock = 0;
+        rowBlock < tileDim;
+        rowBlock += tileRowBlockSize )
+   {
+      tile[ Cuda::getInterleaving( threadIdx.x*tileDim +  threadIdx.y + rowBlock ) ] =
+               inputMatrix->getElementFast( readColumnPosition,
+                                            readRowPosition + rowBlock );
+   }
+   __syncthreads();
+
+   /****
+    * Write the tile to the global memory
+    */
+   const Index writeRowPosition =
+      ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.y;
+   const Index writeColumnPosition =
+      ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.x;
+   for( Index rowBlock = 0;
+        rowBlock < tileDim;
+        rowBlock += tileRowBlockSize )
+   {
+      resultMatrix->setElementFast( writeColumnPosition,
+                                    writeRowPosition + rowBlock,
+                                    matrixMultiplicator * tile[ Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] );
+
+   }
+
+}
+
+template< typename Real,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator,
+          typename Matrix,
+          int tileDim,
+          int tileRowBlockSize >
+__global__ void DenseTranspositionNonAlignedKernel( Dense< Real, Devices::Cuda, Index >* resultMatrix,
+                                                             const Matrix* inputMatrix,
+                                                             const Real matrixMultiplicator,
+                                                             const Index gridIdx_x,
+                                                             const Index gridIdx_y )
+{
+   __shared__ Real tile[ tileDim*tileDim ];
+
+   const Index columns = inputMatrix->getColumns();
+   const Index rows = inputMatrix->getRows();
+
+   /****
+    * Diagonal mapping of the CUDA blocks
+    */
+   Index blockIdx_x, blockIdx_y;
+   if( columns == rows )
+   {
+      blockIdx_y = blockIdx.x;
+      blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x;
+   }
+   else
+   {
+      Index bID = blockIdx.x + gridDim.x*blockIdx.y;
+      blockIdx_y = bID % gridDim.y;
+      blockIdx_x = ( ( bID / gridDim.y ) + blockIdx_y ) % gridDim.x;
+   }
+
+   /****
+    * Read the tile to the shared memory
+    */
+   const Index readRowPosition =
+      ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.y;
+   const Index readColumnPosition =
+      ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.x;
+   if( readColumnPosition < columns )
+   {
+      const Index readOffset = readRowPosition * columns + readColumnPosition;
+      for( Index rowBlock = 0;
+           rowBlock < tileDim;
+           rowBlock += tileRowBlockSize )
+      {
+         if( readRowPosition + rowBlock < rows )
+            tile[ Cuda::getInterleaving( threadIdx.x*tileDim +  threadIdx.y + rowBlock ) ] =
+               inputMatrix->getElementFast( readColumnPosition,
+                                            readRowPosition + rowBlock );
+      }
+   }
+   __syncthreads();
+
+   /****
+    * Write the tile to the global memory
+    */
+   const Index writeRowPosition =
+      ( gridIdx_x*gridDim.x + blockIdx_x )*tileDim + threadIdx.y;
+   const Index writeColumnPosition =
+      ( gridIdx_y*gridDim.y + blockIdx_y )*tileDim + threadIdx.x;
+   if( writeColumnPosition < rows )
+   {
+      const Index writeOffset = writeRowPosition * rows + writeColumnPosition;
+      for( Index rowBlock = 0;
+           rowBlock < tileDim;
+           rowBlock += tileRowBlockSize )
+      {
+         if( writeRowPosition + rowBlock < columns )
+            resultMatrix->setElementFast( writeColumnPosition,
+                                          writeRowPosition + rowBlock,
+                                          matrixMultiplicator * tile[ Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] );
+      }
+   }
+
+}
+
+
+#endif
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Matrix, int tileDim >
+void DenseMatrixView< Real, Device, Index, RowMajorOrder >::getTransposition( const Matrix& matrix,
+                                                              const RealType& matrixMultiplicator )
+{
+   TNL_ASSERT( this->getColumns() == matrix.getRows() &&
+              this->getRows() == matrix.getColumns(),
+               std::cerr << "This matrix columns: " << this->getColumns() << std::endl
+                    << "This matrix rows: " << this->getRows() << std::endl
+                    << "That matrix columns: " << matrix.getColumns() << std::endl
+                    << "That matrix rows: " << matrix.getRows() << std::endl );
+ 
+   if( std::is_same< Device, Devices::Host >::value )
+   {
+      const IndexType& rows = matrix.getRows();
+      const IndexType& columns = matrix.getColumns();
+      for( IndexType i = 0; i < rows; i += tileDim )
+         for( IndexType j = 0; j < columns; j += tileDim )
+            for( IndexType k = i; k < i + tileDim && k < rows; k++ )
+               for( IndexType l = j; l < j + tileDim && l < columns; l++ )
+                  this->setElement( l, k, matrixMultiplicator * matrix. getElement( k, l ) );
+   }
+   if( std::is_same< Device, Devices::Cuda >::value )
+   {
+#ifdef HAVE_CUDA
+      dim3 cudaBlockSize( 0 ), cudaGridSize( 0 );
+      const IndexType matrixProductCudaBlockSize( 256 );
+      const IndexType rowTiles = roundUpDivision( this->getRows(), tileDim );
+      const IndexType columnTiles = roundUpDivision( this->getColumns(), tileDim );
+      const IndexType cudaBlockColumns( tileDim );
+      const IndexType cudaBlockRows( matrixProductCudaBlockSize / tileDim );
+      cudaBlockSize.x = cudaBlockColumns;
+      cudaBlockSize.y = cudaBlockRows;
+      const IndexType rowGrids = roundUpDivision( rowTiles, Cuda::getMaxGridSize() );
+      const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() );
+      const IndexType sharedMemorySize = tileDim*tileDim + tileDim*tileDim/Cuda::getNumberOfSharedMemoryBanks();
+
+      Dense* this_device = Cuda::passToDevice( *this );
+      Matrix* matrix_device = Cuda::passToDevice( matrix );
+
+      for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ )
+         for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ )
+         {
+            cudaGridSize.x = cudaGridSize.y = Cuda::getMaxGridSize();
+            if( gridIdx_x == columnGrids - 1)
+               cudaGridSize.x = columnTiles % Cuda::getMaxGridSize();
+            if( gridIdx_y == rowGrids - 1 )
+               cudaGridSize.y = rowTiles % Cuda::getMaxGridSize();
+            if( ( gridIdx_x < columnGrids - 1 || matrix.getColumns() % tileDim == 0 ) &&
+                ( gridIdx_y < rowGrids - 1 || matrix.getRows() % tileDim == 0 ) )
+            {
+               DenseTranspositionAlignedKernel< Real,
+                                                         Index,
+                                                         Matrix,
+                                                         tileDim,
+                                                         cudaBlockRows >
+                                                     <<< cudaGridSize,
+                                                         cudaBlockSize,
+                                                         sharedMemorySize  >>>
+                                                       ( this_device,
+                                                         matrix_device,
+                                                         matrixMultiplicator,
+                                                         gridIdx_x,
+                                                         gridIdx_y );
+            }
+            else
+            {
+               DenseTranspositionNonAlignedKernel< Real,
+                                                         Index,
+                                                         Matrix,
+                                                         tileDim,
+                                                         cudaBlockRows >
+                                                     <<< cudaGridSize,
+                                                         cudaBlockSize,
+                                                         sharedMemorySize  >>>
+                                                       ( this_device,
+                                                         matrix_device,
+                                                         matrixMultiplicator,
+                                                         gridIdx_x,
+                                                         gridIdx_y );
+            }
+            TNL_CHECK_CUDA_DEVICE;
+         }
+      Cuda::freeFromDevice( this_device );
+      Cuda::freeFromDevice( matrix_device );
+#endif
+   }
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Vector1, typename Vector2 >
+void DenseMatrixView< Real, Device, Index, RowMajorOrder >::performSORIteration( const Vector1& b,
+                                                        const IndexType row,
+                                                        Vector2& x,
+                                                        const RealType& omega ) const
+{
+   RealType sum( 0.0 ), diagonalValue;
+   for( IndexType i = 0; i < this->getColumns(); i++ )
+   {
+      if( i == row )
+         diagonalValue = this->getElement( row, row );
+      else
+         sum += this->getElement( row, i ) * x[ i ];
+   }
+   x[ row ] = ( 1.0 - omega ) * x[ row ] + omega / diagonalValue * ( b[ row ] - sum );
+}
+
+
+// copy assignment
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+DenseMatrixView< Real, Device, Index, RowMajorOrder >&
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::operator=( const Dense& matrix )
+{
+   this->setLike( matrix );
+   this->values = matrix.values;
+   return *this;
+}
+
+// cross-device copy assignment
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+   template< typename Real2, typename Device2, typename Index2, typename >
+DenseMatrixView< Real, Device, Index, RowMajorOrder >&
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::operator=( const Dense< Real2, Device2, Index2 >& matrix )
+{
+   static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value,
+                  "unknown device" );
+   static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value,
+                  "unknown device" );
+
+   this->setLike( matrix );
+
+   throw Exceptions::NotImplementedError("Cross-device assignment for the Dense format is not implemented yet.");
+}
+
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+void DenseMatrixView< Real, Device, Index, RowMajorOrder >::save( const String& fileName ) const
+{
+   Object::save( fileName );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+void DenseMatrixView< Real, Device, Index, RowMajorOrder >::load( const String& fileName )
+{
+   Object::load( fileName );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+void DenseMatrixView< Real, Device, Index, RowMajorOrder >::save( File& file ) const
+{
+   Matrix< Real, Device, Index >::save( file );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+void DenseMatrixView< Real, Device, Index, RowMajorOrder >::load( File& file )
+{
+   Matrix< Real, Device, Index >::load( file );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+void DenseMatrixView< Real, Device, Index, RowMajorOrder >::print( std::ostream& str ) const
+{
+   for( IndexType row = 0; row < this->getRows(); row++ )
+   {
+      str <<"Row: " << row << " -> ";
+      for( IndexType column = 0; column < this->getColumns(); column++ )
+         str << " Col:" << column << "->" << this->getElement( row, column ) << "\t";
+      str << std::endl;
+   }
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+__cuda_callable__
+Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getElementIndex( const IndexType row,
+                                                              const IndexType column ) const
+{
+   return this->segments.getGlobalIndex( row, column );
+}
+
+template<>
+class DenseDeviceDependentCode< Devices::Host >
+{
+   public:
+
+      typedef Devices::Host Device;
+
+      template< typename Real,
+                typename Index,
+                bool RowMajorOrder,
+                typename RealAllocator,
+                typename InVector,
+                typename OutVector >
+      static void vectorProduct( const DenseMatrixView< Real, Device, Index, RowMajorOrder >& matrix,
+                                 const InVector& inVector,
+                                 OutVector& outVector )
+      {
+#ifdef HAVE_OPENMP
+#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
+#endif
+         for( Index row = 0; row < matrix.getRows(); row ++ )
+            outVector[ row ] = matrix.rowVectorProduct( row, inVector );
+      }
+};
+
+template<>
+class DenseDeviceDependentCode< Devices::Cuda >
+{
+   public:
+
+      typedef Devices::Cuda Device;
+
+      template< typename Real,
+                typename Index,
+                bool RowMajorOrder,
+                typename RealAllocator,
+                typename InVector,
+                typename OutVector >
+      static void vectorProduct( const DenseMatrixView< Real, Device, Index, RowMajorOrder >& matrix,
+                                 const InVector& inVector,
+                                 OutVector& outVector )
+      {
+         MatrixVectorProductCuda( matrix, inVector, outVector );
+      }
+};
+
+} // namespace Matrices
+} // namespace TNL
diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index c7ada1240d..c0f9b92ff0 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -597,8 +597,8 @@ void test_SetRow()
          { 0, 1, 2, 3, 4 },
          { 2, 3, 4, 5, 6 } };
       auto row = m_ptr->getRow( rowIdx );
-      for( IndexType i = 0; i < 5; i++ )
-      /   row.setElement( rowIdx, i ); //columnIndexes[ rowIdx ][ i ], values[ rowIdx ][ i ] );
+      //for( IndexType i = 0; i < 5; i++ )
+      ///   row.setElement( rowIdx, i ); //columnIndexes[ rowIdx ][ i ], values[ rowIdx ][ i ] );
    };
    TNL::Pointers::synchronizeSmartPointersOnDevice< DeviceType >();
    TNL::Algorithms::ParallelFor< DeviceType >::exec( 0, 3, f );
-- 
GitLab