From e0928ebba3c59d11b8716295b7aac572eaeedbfc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 1 Apr 2020 09:10:57 +0200
Subject: [PATCH 01/68] Bug: getRow() does not work on const matrices

---
 src/UnitTests/Matrices/SparseMatrixTest.hpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 8080d45e5..37724d94e 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -79,6 +79,16 @@ void test_Constructors()
       EXPECT_EQ( m2.getRow( 3 ).getValue( 0 ), 1 );   // 3rd row
       EXPECT_EQ( m2.getRow( 3 ).getValue( 1 ), 1 );
       EXPECT_EQ( m2.getRow( 4 ).getValue( 0 ), 1 );   // 4th row
+
+      const Matrix& mm = m2;
+      EXPECT_EQ( mm.getRow( 0 ).getValue( 0 ), 1 );   // 0th row
+      EXPECT_EQ( mm.getRow( 1 ).getValue( 0 ), 1 );   // 1st row
+      EXPECT_EQ( mm.getRow( 1 ).getValue( 1 ), 1 );
+      EXPECT_EQ( mm.getRow( 2 ).getValue( 0 ), 1 );   // 2nd row
+      EXPECT_EQ( mm.getRow( 2 ).getValue( 1 ), 1 );
+      EXPECT_EQ( mm.getRow( 3 ).getValue( 0 ), 1 );   // 3rd row
+      EXPECT_EQ( mm.getRow( 3 ).getValue( 1 ), 1 );
+      EXPECT_EQ( mm.getRow( 4 ).getValue( 0 ), 1 );   // 4th row
    }
 
    m2.getCompressedRowLengths( v1 );
-- 
GitLab


From 37d330d4d7d34f2bc30576220cb3b22b8568f462 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 3 Apr 2020 20:19:49 +0200
Subject: [PATCH 02/68] Fixed logging of matrix file names in SpMV benchmark.

---
 src/Benchmarks/SpMV/spmv-legacy.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h
index f690a50c6..44b4af468 100644
--- a/src/Benchmarks/SpMV/spmv-legacy.h
+++ b/src/Benchmarks/SpMV/spmv-legacy.h
@@ -218,7 +218,7 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    // Perform benchmark on host with CSR as a reference CPU format
    //
    benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-         { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
+         { "matrix name", convertToString( inputFileName ) },
          { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) },
          { "rows", convertToString( csrHostMatrix.getRows() ) },
          { "columns", convertToString( csrHostMatrix.getColumns() ) },
@@ -243,7 +243,7 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    //
 #ifdef HAVE_CUDA
    benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-         { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
+         { "matrix name", convertToString( inputFileName ) },
          { "non-zeros", convertToString( csrHostMatrix.getNumberOfNonzeroMatrixElements() ) },
          { "rows", convertToString( csrHostMatrix.getRows() ) },
          { "columns", convertToString( csrHostMatrix.getColumns() ) },
-- 
GitLab


From 61ef3528944127d0ce4540864bd840f162eafde1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 3 Apr 2020 20:29:07 +0200
Subject: [PATCH 03/68] Added unit test for SparseMatrix::getRow.

---
 src/UnitTests/Matrices/SparseMatrixTest.hpp | 135 +++++++++++++++++++-
 1 file changed, 134 insertions(+), 1 deletion(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 37724d94e..5490f34f8 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -105,7 +105,7 @@ void test_Constructors()
     *    \  0  0  0 12  0 /
     */
 
-   Matrix m3( 6, 5, {
+   const Matrix m3( 6, 5, {
       { 0, 0,  1.0 }, { 0, 1, 2.0 }, { 0, 2, 3.0 },
       { 1, 1,  4.0 }, { 1, 2, 5.0 }, { 1, 3, 6.0 },
       { 2, 2,  7.0 }, { 2, 3, 8.0 }, { 2, 4, 9.0 },
@@ -150,6 +150,27 @@ void test_Constructors()
    EXPECT_EQ( m3.getElement( 5, 3 ), 12 );
    EXPECT_EQ( m3.getElement( 5, 4 ),  0 );
 
+   if( std::is_same< DeviceType, TNL::Devices::Host >::value )
+   {
+      EXPECT_EQ( m3.getRow( 0 ).getValue( 0 ),  1 );
+      EXPECT_EQ( m3.getRow( 0 ).getValue( 1 ),  2 );
+      EXPECT_EQ( m3.getRow( 0 ).getValue( 2 ),  3 );
+
+      EXPECT_EQ( m3.getRow( 1 ).getValue( 0 ),  4 );
+      EXPECT_EQ( m3.getRow( 1 ).getValue( 1 ),  5 );
+      EXPECT_EQ( m3.getRow( 1 ).getValue( 2 ),  6 );
+
+      EXPECT_EQ( m3.getRow( 2 ).getValue( 0 ),  7 );
+      EXPECT_EQ( m3.getRow( 2 ).getValue( 1 ),  8 );
+      EXPECT_EQ( m3.getRow( 2 ).getValue( 2 ),  9 );
+
+      EXPECT_EQ( m3.getRow( 3 ).getValue( 0 ), 10 );
+
+      EXPECT_EQ( m3.getRow( 4 ).getValue( 0 ), 11 );
+
+      EXPECT_EQ( m3.getRow( 5 ).getValue( 0 ), 12 );
+   }
+
    std::map< std::pair< int, int >, float > map;
    map[ { 0, 0 } ] = 1.0;
    map[ { 0, 1 } ] = 2.0;
@@ -384,6 +405,118 @@ void test_GetRow()
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
 
+   Matrix m2( {1, 2, 2, 2, 1 }, 5 );
+   typename Matrix::RowsCapacitiesType v1, v2{ 1, 2, 2, 2, 1 };
+   m2.setElement( 0, 0, 1 );   // 0th row
+   m2.setElement( 1, 0, 1 );   // 1st row
+   m2.setElement( 1, 1, 1 );
+   m2.setElement( 2, 1, 1 );   // 2nd row
+   m2.setElement( 2, 2, 1 );
+   m2.setElement( 3, 2, 1 );   // 3rd row
+   m2.setElement( 3, 3, 1 );
+   m2.setElement( 4, 4, 1 );   // 4th row
+
+   EXPECT_EQ( m2.getElement( 0, 0 ), 1 );   // 0th row
+   EXPECT_EQ( m2.getElement( 1, 0 ), 1 );   // 1st row
+   EXPECT_EQ( m2.getElement( 1, 1 ), 1 );
+   EXPECT_EQ( m2.getElement( 2, 1 ), 1 );   // 2nd row
+   EXPECT_EQ( m2.getElement( 2, 2 ), 1 );
+   EXPECT_EQ( m2.getElement( 3, 2 ), 1 );   // 3rd row
+   EXPECT_EQ( m2.getElement( 3, 3 ), 1 );
+   EXPECT_EQ( m2.getElement( 4, 4 ), 1 );   // 4th row
+
+   if( std::is_same< DeviceType, TNL::Devices::Host >::value )
+   {
+      EXPECT_EQ( m2.getRow( 0 ).getValue( 0 ), 1 );   // 0th row
+      EXPECT_EQ( m2.getRow( 1 ).getValue( 0 ), 1 );   // 1st row
+      EXPECT_EQ( m2.getRow( 1 ).getValue( 1 ), 1 );
+      EXPECT_EQ( m2.getRow( 2 ).getValue( 0 ), 1 );   // 2nd row
+      EXPECT_EQ( m2.getRow( 2 ).getValue( 1 ), 1 );
+      EXPECT_EQ( m2.getRow( 3 ).getValue( 0 ), 1 );   // 3rd row
+      EXPECT_EQ( m2.getRow( 3 ).getValue( 1 ), 1 );
+      EXPECT_EQ( m2.getRow( 4 ).getValue( 0 ), 1 );   // 4th row
+   }
+
+   m2.getCompressedRowLengths( v1 );
+   EXPECT_EQ( v1, v2 );
+
+   /*
+    * Sets up the following 6x5 sparse matrix:
+    *
+    *    /  1  2  3  0  0 \
+    *    |  0  4  5  6  0 |
+    *    |  0  0  7  8  9 |
+    *    | 10  0  0  0  0 |
+    *    |  0 11  0  0  0 |
+    *    \  0  0  0 12  0 /
+    */
+
+   const Matrix m3( 6, 5, {
+      { 0, 0,  1.0 }, { 0, 1, 2.0 }, { 0, 2, 3.0 },
+      { 1, 1,  4.0 }, { 1, 2, 5.0 }, { 1, 3, 6.0 },
+      { 2, 2,  7.0 }, { 2, 3, 8.0 }, { 2, 4, 9.0 },
+      { 3, 0, 10.0 },
+      { 4, 1, 11.0 },
+      { 5, 3, 12.0 } } );
+
+   // Check the set elements
+   EXPECT_EQ( m3.getElement( 0, 0 ),  1 );
+   EXPECT_EQ( m3.getElement( 0, 1 ),  2 );
+   EXPECT_EQ( m3.getElement( 0, 2 ),  3 );
+   EXPECT_EQ( m3.getElement( 0, 3 ),  0 );
+   EXPECT_EQ( m3.getElement( 0, 4 ),  0 );
+
+   EXPECT_EQ( m3.getElement( 1, 0 ),  0 );
+   EXPECT_EQ( m3.getElement( 1, 1 ),  4 );
+   EXPECT_EQ( m3.getElement( 1, 2 ),  5 );
+   EXPECT_EQ( m3.getElement( 1, 3 ),  6 );
+   EXPECT_EQ( m3.getElement( 1, 4 ),  0 );
+
+   EXPECT_EQ( m3.getElement( 2, 0 ),  0 );
+   EXPECT_EQ( m3.getElement( 2, 1 ),  0 );
+   EXPECT_EQ( m3.getElement( 2, 2 ),  7 );
+   EXPECT_EQ( m3.getElement( 2, 3 ),  8 );
+   EXPECT_EQ( m3.getElement( 2, 4 ),  9 );
+
+   EXPECT_EQ( m3.getElement( 3, 0 ), 10 );
+   EXPECT_EQ( m3.getElement( 3, 1 ),  0 );
+   EXPECT_EQ( m3.getElement( 3, 2 ),  0 );
+   EXPECT_EQ( m3.getElement( 3, 3 ),  0 );
+   EXPECT_EQ( m3.getElement( 3, 4 ),  0 );
+
+   EXPECT_EQ( m3.getElement( 4, 0 ),  0 );
+   EXPECT_EQ( m3.getElement( 4, 1 ), 11 );
+   EXPECT_EQ( m3.getElement( 4, 2 ),  0 );
+   EXPECT_EQ( m3.getElement( 4, 3 ),  0 );
+   EXPECT_EQ( m3.getElement( 4, 4 ),  0 );
+
+   EXPECT_EQ( m3.getElement( 5, 0 ),  0 );
+   EXPECT_EQ( m3.getElement( 5, 1 ),  0 );
+   EXPECT_EQ( m3.getElement( 5, 2 ),  0 );
+   EXPECT_EQ( m3.getElement( 5, 3 ), 12 );
+   EXPECT_EQ( m3.getElement( 5, 4 ),  0 );
+
+   if( std::is_same< DeviceType, TNL::Devices::Host >::value )
+   {
+      EXPECT_EQ( m3.getRow( 0 ).getValue( 0 ),  1 );
+      EXPECT_EQ( m3.getRow( 0 ).getValue( 1 ),  2 );
+      EXPECT_EQ( m3.getRow( 0 ).getValue( 2 ),  3 );
+
+      EXPECT_EQ( m3.getRow( 1 ).getValue( 0 ),  4 );
+      EXPECT_EQ( m3.getRow( 1 ).getValue( 1 ),  5 );
+      EXPECT_EQ( m3.getRow( 1 ).getValue( 2 ),  6 );
+
+      EXPECT_EQ( m3.getRow( 2 ).getValue( 0 ),  7 );
+      EXPECT_EQ( m3.getRow( 2 ).getValue( 1 ),  8 );
+      EXPECT_EQ( m3.getRow( 2 ).getValue( 2 ),  9 );
+
+      EXPECT_EQ( m3.getRow( 3 ).getValue( 0 ), 10 );
+
+      EXPECT_EQ( m3.getRow( 4 ).getValue( 0 ), 11 );
+
+      EXPECT_EQ( m3.getRow( 5 ).getValue( 0 ), 12 );
+   }
+
    /*
     * Sets up the following 10x10 sparse matrix:
     *
-- 
GitLab


From 3aadca3af040b4743a72f1de37820e870e550154 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 4 Apr 2020 12:30:51 +0200
Subject: [PATCH 04/68] Deactivating not fully functional CSR kernels in SpMV
 benchmark.

---
 src/Benchmarks/SpMV/spmv-legacy.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h
index 44b4af468..17b0b8f0d 100644
--- a/src/Benchmarks/SpMV/spmv-legacy.h
+++ b/src/Benchmarks/SpMV/spmv-legacy.h
@@ -276,10 +276,10 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
 #endif
 
    benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar    >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector    >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light     >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive  >( benchmark, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Stream    >( benchmark, hostOutVector, inputFileName, verboseMR );
+   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector    >( benchmark, hostOutVector, inputFileName, verboseMR );
+   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light     >( benchmark, hostOutVector, inputFileName, verboseMR );
+   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive  >( benchmark, hostOutVector, inputFileName, verboseMR );
+   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Stream    >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_CSR                 >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, Matrices::Legacy::Ellpack        >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_Ellpack             >( benchmark, hostOutVector, inputFileName, verboseMR );
-- 
GitLab


From 740f7551a97f9abdd96fd77b54eac45400e5fccb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 4 Apr 2020 12:31:32 +0200
Subject: [PATCH 05/68] Fixed getRow method for constant SparseMatrix and
 SparseMatrixView.

---
 src/TNL/Matrices/SparseMatrix.h        | 5 ++++-
 src/TNL/Matrices/SparseMatrix.hpp      | 2 +-
 src/TNL/Matrices/SparseMatrixRowView.h | 3 +++
 src/TNL/Matrices/SparseMatrixView.h    | 5 ++++-
 src/TNL/Matrices/SparseMatrixView.hpp  | 4 ++--
 5 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 7dc554ae4..b3c90950a 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -59,11 +59,14 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       using ConstRowsCapacitiesView = typename RowsCapacitiesView::ConstViewType;
       using ValuesVectorType = typename Matrix< Real, Device, Index, RealAllocator >::ValuesVectorType;
       using ValuesViewType = typename ValuesVectorType::ViewType;
+      using ConstValuesViewType = typename ValuesViewType::ConstViewType;
       using ColumnsIndexesVectorType = Containers::Vector< IndexType, DeviceType, IndexType, IndexAllocatorType >;
       using ColumnsIndexesViewType = typename ColumnsIndexesVectorType::ViewType;
+      using ConstColumnsIndexesViewType = typename ColumnsIndexesViewType::ConstViewType;
       using ViewType = SparseMatrixView< Real, Device, Index, MatrixType, SegmentsViewTemplate >;
       using ConstViewType = SparseMatrixView< typename std::add_const< Real >::type, Device, Index, MatrixType, SegmentsViewTemplate >;
       using RowView = SparseMatrixRowView< SegmentViewType, ValuesViewType, ColumnsIndexesViewType, isBinary() >;
+      using ConstRowView = typename RowView::ConstViewType;
 
       // TODO: remove this - it is here only for compatibility with original matrix implementation
       typedef Containers::Vector< IndexType, DeviceType, IndexType > CompressedRowLengthsVector;
@@ -135,7 +138,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       void reset();
 
       __cuda_callable__
-      const RowView getRow( const IndexType& rowIdx ) const;
+      const ConstRowView getRow( const IndexType& rowIdx ) const;
 
       __cuda_callable__
       RowView getRow( const IndexType& rowIdx );
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index e143c014f..933177eae 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -364,7 +364,7 @@ template< typename Real,
           typename IndexAllocator >
 __cuda_callable__ auto
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
-getRow( const IndexType& rowIdx ) const -> const RowView
+getRow( const IndexType& rowIdx ) const -> const ConstRowView
 {
    return this->view.getRow( rowIdx );
 }
diff --git a/src/TNL/Matrices/SparseMatrixRowView.h b/src/TNL/Matrices/SparseMatrixRowView.h
index 8906ab5ae..0b89e685c 100644
--- a/src/TNL/Matrices/SparseMatrixRowView.h
+++ b/src/TNL/Matrices/SparseMatrixRowView.h
@@ -26,6 +26,9 @@ class SparseMatrixRowView
       using IndexType = typename SegmentViewType::IndexType;
       using ValuesViewType = ValuesView;
       using ColumnsIndexesViewType = ColumnsIndexesView;
+      using ConstValuesViewType = typename ValuesViewType::ConstViewType;
+      using ConstColumnsIndexesViewType = typename ColumnsIndexesViewType::ConstViewType;
+      using ConstViewType = SparseMatrixRowView< SegmentView, ConstValuesViewType, ConstColumnsIndexesViewType, isBinary_ >;
 
       static constexpr bool isBinary() { return isBinary_; };
 
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index 4fa65b70a..183d77929 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -41,10 +41,13 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       using RowsCapacitiesView = Containers::VectorView< IndexType, DeviceType, IndexType >;
       using ConstRowsCapacitiesView = typename RowsCapacitiesView::ConstViewType;
       using ValuesViewType = typename BaseType::ValuesView;
+      using ConstValuesViewType = typename ValuesViewType::ConstViewType;
       using ColumnsIndexesViewType = Containers::VectorView< IndexType, DeviceType, IndexType >;
+      using ConstColumnsIndexesViewType = typename ColumnsIndexesViewType::ConstViewType;
       using ViewType = SparseMatrixView< typename std::remove_const< Real >::type, Device, Index, MatrixType, SegmentsViewTemplate >;
       using ConstViewType = SparseMatrixView< typename std::add_const< Real >::type, Device, Index, MatrixType, SegmentsViewTemplate >;
       using RowView = SparseMatrixRowView< SegmentViewType, ValuesViewType, ColumnsIndexesViewType, isBinary() >;
+      using ConstRowView = typename RowView::ConstViewType;
 
       // TODO: remove this - it is here only for compatibility with original matrix implementation
       typedef Containers::Vector< IndexType, DeviceType, IndexType > CompressedRowLengthsVector;
@@ -88,7 +91,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       void reset();
 
       __cuda_callable__
-      const RowView getRow( const IndexType& rowIdx ) const;
+      const ConstRowView getRow( const IndexType& rowIdx ) const;
 
       __cuda_callable__
       RowView getRow( const IndexType& rowIdx );
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 2bae61f98..ef0f40625 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -193,10 +193,10 @@ template< typename Real,
           template< typename, typename > class SegmentsView >
 __cuda_callable__ auto
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
-getRow( const IndexType& rowIdx ) const -> const RowView
+getRow( const IndexType& rowIdx ) const -> const ConstRowView
 {
    TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." );
-   return RowView( this->segments.getSegmentView( rowIdx ), this->values.getView(), this->columnIndexes.getView() );
+   return ConstRowView( this->segments.getSegmentView( rowIdx ), this->values.getConstView(), this->columnIndexes.getConstView() );
 }
 
 template< typename Real,
-- 
GitLab


From 49853fdfe538151fbdb4f1cb17e0d08f0124a759 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 4 Apr 2020 17:44:21 +0200
Subject: [PATCH 06/68] Added smart lambdas to segments.

---
 src/TNL/Containers/Segments/CSRView.hpp       | 15 ++--
 .../Segments/ChunkedEllpackView.hpp           | 17 ++--
 src/TNL/Containers/Segments/EllpackView.hpp   | 20 ++---
 .../Containers/Segments/SlicedEllpackView.hpp |  8 +-
 .../Segments/details/CheckLambdas.h           | 81 +++++++++++++++++++
 .../Segments/details/LambdaAdapter.h          | 55 +++++++++++++
 src/TNL/Matrices/SparseMatrixView.hpp         | 17 +++-
 7 files changed, 184 insertions(+), 29 deletions(-)
 create mode 100644 src/TNL/Containers/Segments/details/CheckLambdas.h
 create mode 100644 src/TNL/Containers/Segments/details/LambdaAdapter.h

diff --git a/src/TNL/Containers/Segments/CSRView.hpp b/src/TNL/Containers/Segments/CSRView.hpp
index b94db8c88..54bac01a2 100644
--- a/src/TNL/Containers/Segments/CSRView.hpp
+++ b/src/TNL/Containers/Segments/CSRView.hpp
@@ -14,6 +14,7 @@
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Containers/Segments/CSRView.h>
 #include <TNL/Containers/Segments/details/CSR.h>
+#include <TNL/Containers/Segments/details/LambdaAdapter.h>
 
 namespace TNL {
    namespace Containers {
@@ -215,17 +216,17 @@ void
 CSRView< Device, Index >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    const auto offsetsView = this->offsets.getConstView();
-   auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
-      const IndexType begin = offsetsView[ i ];
-      const IndexType end = offsetsView[ i + 1 ];
+   auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+      const IndexType begin = offsetsView[ segmentIdx ];
+      const IndexType end = offsetsView[ segmentIdx + 1 ];
       RealType aux( zero );
       IndexType localIdx( 0 );
       bool compute( true );
-      for( IndexType j = begin; j < end && compute; j++  )
-         reduction( aux, fetch( i, localIdx++, j, compute, args... ) );
-      keeper( i, aux );
+      for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
+         reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+      keeper( segmentIdx, aux );
    };
    Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
 }
diff --git a/src/TNL/Containers/Segments/ChunkedEllpackView.hpp b/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
index b3b151624..de6de27d7 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
@@ -13,6 +13,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Containers/Segments/ChunkedEllpackView.h>
+#include <TNL/Containers/Segments/details/LambdaAdapter.h>
 //#include <TNL/Containers/Segments/details/ChunkedEllpack.h>
 
 namespace TNL {
@@ -401,7 +402,7 @@ void
 ChunkedEllpackView< Device, Index, RowMajorOrder >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( std::is_same< DeviceType, Devices::Host >::value )
    {
       //segmentsReductionKernel( 0, first, last, fetch, reduction, keeper, zero, args... );
@@ -428,8 +429,8 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
             IndexType begin = sliceOffset + firstChunkOfSegment * chunkSize;
             IndexType end = begin + segmentSize;
             IndexType localIdx( 0 );
-            for( IndexType j = begin; j < end && compute; j++ )
-               reduction( aux, fetch( segmentIdx, localIdx++, j, compute, args...) );
+            for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ )
+               reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          }
          else
          {
@@ -438,8 +439,8 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
                IndexType begin = sliceOffset + firstChunkOfSegment + chunkIdx;
                IndexType end = begin + chunksInSlice * chunkSize;
                IndexType localIdx( 0 );
-               for( IndexType j = begin; j < end && compute; j += chunksInSlice )
-                  reduction( aux, fetch( segmentIdx, localIdx++, j, compute, args...) );
+               for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx += chunksInSlice )
+                  reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
             }
          }
          keeper( segmentIdx, aux );
@@ -459,9 +460,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
       {
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
-            <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
-            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+         //ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
+         //   <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
+         //   ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
       }
 #endif
    }
diff --git a/src/TNL/Containers/Segments/EllpackView.hpp b/src/TNL/Containers/Segments/EllpackView.hpp
index 84086f380..f2cfaf590 100644
--- a/src/TNL/Containers/Segments/EllpackView.hpp
+++ b/src/TNL/Containers/Segments/EllpackView.hpp
@@ -13,6 +13,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Containers/Segments/EllpackView.h>
+#include <TNL/Containers/Segments/details/LambdaAdapter.h>
 
 namespace TNL {
    namespace Containers {
@@ -258,19 +259,20 @@ void
 EllpackView< Device, Index, RowMajorOrder, Alignment >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( RowMajorOrder )
    {
       const IndexType segmentSize = this->segmentSize;
-      auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
-         const IndexType begin = i * segmentSize;
+      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+         const IndexType begin = segmentIdx * segmentSize;
          const IndexType end = begin + segmentSize;
          RealType aux( zero );
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType j = begin; j < end && compute; j++  )
-            reduction( aux, fetch( i, localIdx++, j, compute, args... ) );
-         keeper( i, aux );
+            reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
+         keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
    }
@@ -278,15 +280,15 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
    {
       const IndexType storageSize = this->getStorageSize();
       const IndexType alignedSize = this->alignedSize;
-      auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
-         const IndexType begin = i;
+      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+         const IndexType begin = segmentIdx;
          const IndexType end = storageSize;
          RealType aux( zero );
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType j = begin; j < end && compute; j += alignedSize  )
-            reduction( aux, fetch( i, localIdx++, j, compute, args... ) );
-         keeper( i, aux );
+            reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
+         keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
    }
diff --git a/src/TNL/Containers/Segments/SlicedEllpackView.hpp b/src/TNL/Containers/Segments/SlicedEllpackView.hpp
index c4e03aada..50f1c65ee 100644
--- a/src/TNL/Containers/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Containers/Segments/SlicedEllpackView.hpp
@@ -13,6 +13,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Containers/Segments/SlicedEllpackView.h>
+#include <TNL/Containers/Segments/details/LambdaAdapter.h>
 
 #include "SlicedEllpackView.h"
 
@@ -306,7 +307,8 @@ void
 SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
    const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView();
    const auto sliceOffsets_view = this->sliceOffsets.getConstView();
    if( RowMajorOrder )
@@ -321,7 +323,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType globalIdx = begin; globalIdx< end; globalIdx++  )
-            reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) );
+            reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
@@ -338,7 +340,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize  )
-            reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) );
+            reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
diff --git a/src/TNL/Containers/Segments/details/CheckLambdas.h b/src/TNL/Containers/Segments/details/CheckLambdas.h
new file mode 100644
index 000000000..7e3955997
--- /dev/null
+++ b/src/TNL/Containers/Segments/details/CheckLambdas.h
@@ -0,0 +1,81 @@
+/***************************************************************************
+                          CheckLambdas.h -  description
+                             -------------------
+    begin                : Dpr 4, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+         namespace details {
+
+template< typename Index,
+          typename Lambda >
+class CheckFetchLambdaAcceptsSegmentIdxAndCompute
+{
+   private:
+       typedef char YesType[1];
+       typedef char NoType[2];
+
+       template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), Index(), std::declval< bool& >() ) ) );
+       template< typename C > static NoType& test(...);
+
+   public:
+       static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) );
+};
+
+template< typename Index,
+          typename Lambda >
+class CheckFetchLambdaAcceptsSegmentIdx
+{
+   private:
+       typedef char YesType[1];
+       typedef char NoType[2];
+
+       template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), Index() ) ) );
+       template< typename C > static NoType& test(...);
+
+   public:
+       static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) );
+};
+
+template< typename Index,
+          typename Lambda >
+class CheckFetchLambdaAcceptsCompute
+{
+   private:
+       typedef char YesType[1];
+       typedef char NoType[2];
+
+       template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), std::declval< bool& >() ) ) );
+       template< typename C > static NoType& test(...);
+
+   public:
+       static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) );
+};
+
+
+template< typename Index,
+          typename Lambda >
+class CheckFetchLambda
+{
+   static constexpr bool AcceptsSegmentIdxAndCompute = CheckFetchLambdaAcceptsSegmentIdxAndCompute< Index, Lambda >::value;
+   static constexpr bool AcceptsSegmentIdx = CheckFetchLambdaAcceptsSegmentIdx< Index, Lambda >::value;
+   static constexpr bool AcceptsCompute = CheckFetchLambdaAcceptsCompute< Index, Lambda >::value;
+
+   public:
+      static constexpr bool acceptsSegmentIdx() { return AcceptsSegmentIdxAndCompute || AcceptsSegmentIdx; };
+      static constexpr bool acceptsCompute() { return AcceptsSegmentIdxAndCompute || AcceptsCompute; };
+};
+
+         } // namespace details
+      } // namespace Segements
+   }  // namespace Conatiners
+} // namespace TNL
diff --git a/src/TNL/Containers/Segments/details/LambdaAdapter.h b/src/TNL/Containers/Segments/details/LambdaAdapter.h
new file mode 100644
index 000000000..affd9ac96
--- /dev/null
+++ b/src/TNL/Containers/Segments/details/LambdaAdapter.h
@@ -0,0 +1,55 @@
+/***************************************************************************
+                          LambdaAdapter.h -  description
+                             -------------------
+    begin                : Dpr 4, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+#include<TNL/Containers/Segments/details/CheckLambdas.h>
+
+#include "CheckLambdas.h"
+
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+         namespace details {
+
+template< typename Index,
+          typename Lambda,
+          bool AcceptsSegmentIdx = CheckFetchLambda< Index, Lambda >::acceptsSegmentIdx() >
+struct FetchLambdaAdapter
+{
+};
+
+template< typename Index,
+          typename Lambda >
+struct FetchLambdaAdapter< Index, Lambda, true >
+{
+   using ReturnType = decltype( std::declval< Lambda >()( Index(), Index(), Index(), std::declval< bool& >() ) );
+   
+   static ReturnType call( Lambda& f, Index segmentIdx, Index localIdx, Index globalIdx, bool& compute )
+   {
+      return f( segmentIdx, localIdx, globalIdx, compute );
+   }
+};
+
+template< typename Index,
+          typename Lambda >
+struct FetchLambdaAdapter< Index, Lambda, false >
+{
+   using ReturnType = decltype( std::declval< Lambda >()( Index(), Index(), std::declval< bool& >() ) );
+   static ReturnType call( Lambda& f, Index segmentIdx, Index localIdx, Index globalIdx, bool& compute )
+   {
+      return f( localIdx, globalIdx, compute );
+   }
+};
+
+         } // namespace details
+      } // namespace Segements
+   }  // namespace Conatiners
+} // namespace TNL
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index ef0f40625..e6f71c305 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -381,7 +381,7 @@ vectorProduct( const InVector& inVector,
    const IndexType paddingIndex = this->getPaddingIndex();
    if( isSymmetric() )
       outVector *= outVectorMultiplicator;
-   auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> RealType {
+   auto symmetricFetch = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> RealType {
       const IndexType column = columnIndexesView[ globalIdx ];
       compute = ( column != paddingIndex );
       if( ! compute )
@@ -397,6 +397,16 @@ vectorProduct( const InVector& inVector,
          return inVectorView[ column ];
       return valuesView[ globalIdx ] * inVectorView[ column ];
    };
+   auto fetch = [=] __cuda_callable__ ( IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> RealType {
+      const IndexType column = columnIndexesView[ globalIdx ];
+      compute = ( column != paddingIndex );
+      if( ! compute )
+         return 0.0;
+      if( isBinary() )
+         return inVectorView[ column ];
+      return valuesView[ globalIdx ] * inVectorView[ column ];
+   };
+
    auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) {
       sum += value;
    };
@@ -411,7 +421,10 @@ vectorProduct( const InVector& inVector,
             outVectorView[ row ] = outVectorMultiplicator * outVectorView[ row ] + matrixMultiplicator * value;
       }
    };
-   this->segments.segmentsReduction( 0, this->getRows(), fetch, reduction, keeper, ( RealType ) 0.0 );
+   if( isSymmetric() )
+      this->segments.segmentsReduction( 0, this->getRows(), symmetricFetch, reduction, keeper, ( RealType ) 0.0 );
+   else
+      this->segments.segmentsReduction( 0, this->getRows(), fetch, reduction, keeper, ( RealType ) 0.0 );
 
    /*const auto inVectorView = inVector.getConstView();
    auto outVectorView = outVector.getView();
-- 
GitLab


From e93c933e33a6eaa30239730fafc5e217afe3d982 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 4 Apr 2020 20:57:57 +0200
Subject: [PATCH 07/68] Changed smart lambdas in segments, implemented fast
 segments reduction in ChunkedEllpack.

---
 .../Containers/Segments/ChunkedEllpackView.h  |  18 ++
 .../Segments/ChunkedEllpackView.hpp           | 222 ++++++++++--------
 .../Segments/details/CheckLambdas.h           |  55 +----
 .../Segments/details/ChunkedEllpack.h         |  76 ++++++
 .../Segments/details/LambdaAdapter.h          |   6 +-
 src/TNL/Matrices/SparseMatrixView.hpp         |   2 +-
 6 files changed, 235 insertions(+), 144 deletions(-)

diff --git a/src/TNL/Containers/Segments/ChunkedEllpackView.h b/src/TNL/Containers/Segments/ChunkedEllpackView.h
index eaf2450b5..5b89b373f 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Containers/Segments/ChunkedEllpackView.h
@@ -148,6 +148,21 @@ class ChunkedEllpackView
    protected:
 
 #ifdef HAVE_CUDA
+      template< typename Fetch,
+                typename Reduction,
+                typename ResultKeeper,
+                typename Real,
+                typename... Args >
+      __device__
+      void segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+                                                     IndexType first,
+                                                     IndexType last,
+                                                     Fetch fetch,
+                                                     Reduction reduction,
+                                                     ResultKeeper keeper,
+                                                     Real zero,
+                                                     Args... args ) const;
+
       template< typename Fetch,
                 typename Reduction,
                 typename ResultKeeper,
@@ -206,6 +221,9 @@ class ChunkedEllpackView
                                                   ResultKeeper_ keeper,
                                                   Real_ zero,
                                                   Args_... args );
+
+      template< typename Index_, typename Fetch_, bool B_ >
+      friend struct details::ChunkedEllpackSegmentsReductionDispatcher;
 #endif
 };
       } // namespace Segements
diff --git a/src/TNL/Containers/Segments/ChunkedEllpackView.hpp b/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
index de6de27d7..9d572f9f7 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
@@ -20,30 +20,6 @@ namespace TNL {
    namespace Containers {
       namespace Segments {
 
-#ifdef HAVE_CUDA
-template< typename View,
-          typename Index,
-          typename Fetch,
-          typename Reduction,
-          typename ResultKeeper,
-          typename Real,
-          typename... Args >
-__global__
-void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack,
-                                            Index gridIdx,
-                                            Index first,
-                                            Index last,
-                                            Fetch fetch,
-                                            Reduction reduction,
-                                            ResultKeeper keeper,
-                                            Real zero,
-                                            Args... args )
-{
-   chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
-}
-#endif
-
-
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
@@ -460,9 +436,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
       {
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         //ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
-         //   <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
-         //   ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+         details::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
+            <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
+            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
       }
 #endif
    }
@@ -560,6 +536,84 @@ printStructure( std::ostream& str ) const
 }
 
 #ifdef HAVE_CUDA
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder >
+   template< typename Fetch,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real,
+             typename... Args >
+__device__
+void
+ChunkedEllpackView< Device, Index, RowMajorOrder >::
+segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+                                          IndexType first,
+                                          IndexType last,
+                                          Fetch fetch,
+                                          Reduction reduction,
+                                          ResultKeeper keeper,
+                                          Real zero,
+                                          Args... args ) const
+{
+   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+
+   const IndexType firstSlice = rowToSliceMapping[ first ];
+   const IndexType lastSlice = rowToSliceMapping[ last - 1 ];
+
+   const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
+   if( sliceIdx > lastSlice )
+      return;
+
+   RealType* chunksResults = Cuda::getSharedMemory< RealType >();
+   __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
+   if( threadIdx.x == 0 )
+      sliceInfo = this->slices[ sliceIdx ];
+   chunksResults[ threadIdx.x ] = zero;
+   __syncthreads();
+
+
+
+   const IndexType sliceOffset = sliceInfo.pointer;
+   const IndexType chunkSize = sliceInfo.chunkSize;
+   const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x;
+   const IndexType segmentIdx = this->chunksToSegmentsMapping[ chunkIdx ];
+   IndexType firstChunkOfSegment( 0 );
+   if( segmentIdx != sliceInfo.firstSegment )
+      firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ];
+   IndexType localIdx = ( threadIdx.x - firstChunkOfSegment ) * chunkSize;
+   bool compute( true );
+
+   if( RowMajorOrder )
+   {
+      IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
+      IndexType end = begin + chunkSize;
+      for( IndexType j = begin; j < end && compute; j++ )
+         reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
+   }
+   else
+   {
+      const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
+      const IndexType end = begin + chunksInSlice * chunkSize;
+         for( IndexType j = begin; j < end && compute; j += chunksInSlice )
+            reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
+   }
+   __syncthreads();
+   if( threadIdx.x < sliceInfo.size )
+   {
+      const IndexType row = sliceInfo.firstSegment + threadIdx.x;
+      IndexType chunkIndex( 0 );
+      if( threadIdx.x != 0 )
+         chunkIndex = this->rowToChunkMapping[ row - 1 ];
+      const IndexType lastChunk = this->rowToChunkMapping[ row ];
+      RealType result( zero );
+      while( chunkIndex < lastChunk )
+         reduction( result,  chunksResults[ chunkIndex++ ] );
+      if( row >= first && row < last )
+         keeper( row, result );
+   }
+}
+
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
@@ -580,73 +634,57 @@ segmentsReductionKernel( IndexType gridIdx,
                          Real zero,
                          Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = decltype( fetch( IndexType(), std::declval< bool& >(), args... ) );
+
+   const IndexType firstSlice = rowToSliceMapping[ first ];
+   const IndexType lastSlice = rowToSliceMapping[ last - 1 ];
 
-   const IndexType firstSlice = rowToChunkMapping[ first ] / chunksInSlice;
-   const IndexType lastSlice = rowToChunkMapping[ last - 1 ] / chunksInSlice;
-   //for( IndexType sliceIdx = firstSlice; sliceIdx < lastSlice; sliceIdx++ )
+   const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
+   if( sliceIdx > lastSlice )
+      return;
+
+   RealType* chunksResults = Cuda::getSharedMemory< RealType >();
+   __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
+
+   if( threadIdx.x == 0 )
+      sliceInfo = this->slices[ sliceIdx ];
+   chunksResults[ threadIdx.x ] = zero;
+   __syncthreads();
+
+   const IndexType sliceOffset = sliceInfo.pointer;
+   const IndexType chunkSize = sliceInfo.chunkSize;
+   const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x;
+   bool compute( true );
+
+   if( RowMajorOrder )
    {
-      const IndexType sliceIdx = gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
-      if( sliceIdx >= lastSlice )
-         return;
-
-      RealType* chunksResults = Cuda::getSharedMemory< RealType >();
-      //for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ )
-      //{
-         __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
-         if( threadIdx.x == 0 )
-            sliceInfo = this->slices[ sliceIdx ];
-         chunksResults[ threadIdx.x ] = zero;
-         __syncthreads();
-
-   
-
-         const IndexType sliceOffset = sliceInfo.pointer;
-         const IndexType chunkSize = sliceInfo.chunkSize;
-         const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x;
-         const IndexType segmentIdx = this->chunksToSegmentsMapping[ chunkIdx ];
-         IndexType firstChunkOfSegment( 0 );
-         if( segmentIdx != sliceInfo.firstSegment )
-            firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ];
-         IndexType localIdx = ( threadIdx.x - firstChunkOfSegment ) * chunkSize;
-         bool compute( true );
-          
-         if( RowMajorOrder )
-         {
-            IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
-            IndexType end = begin + chunkSize;
-            for( IndexType j = begin; j < end && compute; j++ )
-               reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) );
-         }
-         else
-         {
-            const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
-            const IndexType end = begin + chunksInSlice * chunkSize;
-               for( IndexType j = begin; j < end && compute; j += chunksInSlice )
-                  reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) );
-         }
-         __syncthreads();
-      //}
-
-      //details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
-      //for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ )
-      //{
-         //if( threadIdx == 0 )
-         //   sliceInfo = this->slices[ sliceIdx ];
-         if( threadIdx.x < sliceInfo.size )
-         {
-            const IndexType row = sliceInfo.firstSegment + threadIdx.x;
-            IndexType chunkIndex( 0 );
-            if( threadIdx.x != 0 )
-               chunkIndex = this->rowToChunkMapping[ row - 1 ];
-            const IndexType lastChunk = this->rowToChunkMapping[ row ];
-            RealType result( zero );
-            while( chunkIndex < lastChunk )
-               reduction( result,  chunksResults[ chunkIndex++ ] );
-            keeper( row, result );
-         }
-      //} // threadIdx
-   } // sliceIdx
+      IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
+      IndexType end = begin + chunkSize;
+      for( IndexType j = begin; j < end && compute; j++ )
+         reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
+   }
+   else
+   {
+      const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
+      const IndexType end = begin + chunksInSlice * chunkSize;
+         for( IndexType j = begin; j < end && compute; j += chunksInSlice )
+            reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
+   }
+   __syncthreads();
+
+   if( threadIdx.x < sliceInfo.size )
+   {
+      const IndexType row = sliceInfo.firstSegment + threadIdx.x;
+      IndexType chunkIndex( 0 );
+      if( threadIdx.x != 0 )
+         chunkIndex = this->rowToChunkMapping[ row - 1 ];
+      const IndexType lastChunk = this->rowToChunkMapping[ row ];
+      RealType result( zero );
+      while( chunkIndex < lastChunk )
+         reduction( result,  chunksResults[ chunkIndex++ ] );
+      if( row >= first && row < last )
+         keeper( row, result );
+   }
 }
 #endif
 
diff --git a/src/TNL/Containers/Segments/details/CheckLambdas.h b/src/TNL/Containers/Segments/details/CheckLambdas.h
index 7e3955997..498e85a7a 100644
--- a/src/TNL/Containers/Segments/details/CheckLambdas.h
+++ b/src/TNL/Containers/Segments/details/CheckLambdas.h
@@ -18,61 +18,20 @@ namespace TNL {
 
 template< typename Index,
           typename Lambda >
-class CheckFetchLambdaAcceptsSegmentIdxAndCompute
-{
-   private:
-       typedef char YesType[1];
-       typedef char NoType[2];
-
-       template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), Index(), std::declval< bool& >() ) ) );
-       template< typename C > static NoType& test(...);
-
-   public:
-       static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) );
-};
-
-template< typename Index,
-          typename Lambda >
-class CheckFetchLambdaAcceptsSegmentIdx
+class CheckFetchLambda
 {
    private:
-       typedef char YesType[1];
-       typedef char NoType[2];
-
-       template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), Index() ) ) );
-       template< typename C > static NoType& test(...);
-
-   public:
-       static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) );
-};
+      typedef char YesType[1];
+      typedef char NoType[2];
 
-template< typename Index,
-          typename Lambda >
-class CheckFetchLambdaAcceptsCompute
-{
-   private:
-       typedef char YesType[1];
-       typedef char NoType[2];
+      template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), Index(), std::declval< bool& >() ) ) );
+      template< typename C > static NoType& test(...);
 
-       template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), std::declval< bool& >() ) ) );
-       template< typename C > static NoType& test(...);
+      static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) );
 
    public:
-       static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) );
-};
-
 
-template< typename Index,
-          typename Lambda >
-class CheckFetchLambda
-{
-   static constexpr bool AcceptsSegmentIdxAndCompute = CheckFetchLambdaAcceptsSegmentIdxAndCompute< Index, Lambda >::value;
-   static constexpr bool AcceptsSegmentIdx = CheckFetchLambdaAcceptsSegmentIdx< Index, Lambda >::value;
-   static constexpr bool AcceptsCompute = CheckFetchLambdaAcceptsCompute< Index, Lambda >::value;
-
-   public:
-      static constexpr bool acceptsSegmentIdx() { return AcceptsSegmentIdxAndCompute || AcceptsSegmentIdx; };
-      static constexpr bool acceptsCompute() { return AcceptsSegmentIdxAndCompute || AcceptsCompute; };
+      static constexpr bool hasAllParameters() { return value; };
 };
 
          } // namespace details
diff --git a/src/TNL/Containers/Segments/details/ChunkedEllpack.h b/src/TNL/Containers/Segments/details/ChunkedEllpack.h
index 95ae00c88..14e181c7e 100644
--- a/src/TNL/Containers/Segments/details/ChunkedEllpack.h
+++ b/src/TNL/Containers/Segments/details/ChunkedEllpack.h
@@ -13,6 +13,7 @@
 #include <type_traits>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/Segments/ChunkedEllpackSegmentView.h>
+#include <TNL/Containers/Segments/details/CheckLambdas.h>
 
 namespace TNL {
    namespace Containers {
@@ -223,6 +224,81 @@ class ChunkedEllpack
                                     chunksInSlice );
       }
 };
+
+#ifdef HAVE_CUDA
+template< typename Index,
+          typename Fetch,
+          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+struct ChunkedEllpackSegmentsReductionDispatcher{};
+
+template< typename Index, typename Fetch >
+struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, true >
+{
+   template< typename View,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real,
+             typename... Args >
+   __device__
+   static void exec( View chunkedEllpack,
+                     Index gridIdx,
+                     Index first,
+                     Index last,
+                     Fetch fetch,
+                     Reduction reduction,
+                     ResultKeeper keeper,
+                     Real zero,
+                     Args... args )
+   {
+      chunkedEllpack.segmentsReductionKernelWithAllParameters( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+   }
+};
+
+template< typename Index, typename Fetch >
+struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, false >
+{
+   template< typename View,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real,
+             typename... Args >
+   __device__
+   static void exec( View chunkedEllpack,
+                     Index gridIdx,
+                     Index first,
+                     Index last,
+                     Fetch fetch,
+                     Reduction reduction,
+                     ResultKeeper keeper,
+                     Real zero,
+                     Args... args )
+   {
+      chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+   }
+};
+
+template< typename View,
+          typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real,
+          typename... Args >
+__global__
+void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack,
+                                            Index gridIdx,
+                                            Index first,
+                                            Index last,
+                                            Fetch fetch,
+                                            Reduction reduction,
+                                            ResultKeeper keeper,
+                                            Real zero,
+                                            Args... args )
+{
+   ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch >::exec( chunkedEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+}
+#endif
+
          } //namespace details
       } //namespace Segments
    } //namespace Containers
diff --git a/src/TNL/Containers/Segments/details/LambdaAdapter.h b/src/TNL/Containers/Segments/details/LambdaAdapter.h
index affd9ac96..a2d118fdd 100644
--- a/src/TNL/Containers/Segments/details/LambdaAdapter.h
+++ b/src/TNL/Containers/Segments/details/LambdaAdapter.h
@@ -21,7 +21,7 @@ namespace TNL {
 
 template< typename Index,
           typename Lambda,
-          bool AcceptsSegmentIdx = CheckFetchLambda< Index, Lambda >::acceptsSegmentIdx() >
+          bool AllParameters = CheckFetchLambda< Index, Lambda >::hasAllParameters() >
 struct FetchLambdaAdapter
 {
 };
@@ -42,10 +42,10 @@ template< typename Index,
           typename Lambda >
 struct FetchLambdaAdapter< Index, Lambda, false >
 {
-   using ReturnType = decltype( std::declval< Lambda >()( Index(), Index(), std::declval< bool& >() ) );
+   using ReturnType = decltype( std::declval< Lambda >()( Index(), std::declval< bool& >() ) );
    static ReturnType call( Lambda& f, Index segmentIdx, Index localIdx, Index globalIdx, bool& compute )
    {
-      return f( localIdx, globalIdx, compute );
+      return f( globalIdx, compute );
    }
 };
 
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index e6f71c305..9f6580a59 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -397,7 +397,7 @@ vectorProduct( const InVector& inVector,
          return inVectorView[ column ];
       return valuesView[ globalIdx ] * inVectorView[ column ];
    };
-   auto fetch = [=] __cuda_callable__ ( IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> RealType {
+   auto fetch = [=] __cuda_callable__ ( IndexType globalIdx, bool& compute ) mutable -> RealType {
       const IndexType column = columnIndexesView[ globalIdx ];
       compute = ( column != paddingIndex );
       if( ! compute )
-- 
GitLab


From ba6492b2a11e1d97266a322ebfd3fed3d0caf3a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 5 Apr 2020 16:57:22 +0200
Subject: [PATCH 08/68] Fixed ConstView for segments and deleted duplicit code
 in ChunkedEllpack.

---
 src/TNL/Containers/Segments/CSR.h             |   8 +-
 src/TNL/Containers/Segments/CSR.hpp           |  61 +++-------
 src/TNL/Containers/Segments/CSRView.h         |   8 +-
 src/TNL/Containers/Segments/CSRView.hpp       |  34 ++----
 src/TNL/Containers/Segments/ChunkedEllpack.h  |   8 +-
 .../Containers/Segments/ChunkedEllpack.hpp    |  38 ++----
 .../Containers/Segments/ChunkedEllpackView.h  |   8 +-
 .../Segments/ChunkedEllpackView.hpp           |  36 ++----
 src/TNL/Containers/Segments/Ellpack.h         |   6 +-
 src/TNL/Containers/Segments/Ellpack.hpp       | 111 ++++--------------
 src/TNL/Containers/Segments/EllpackView.h     |   6 +-
 src/TNL/Containers/Segments/EllpackView.hpp   |  62 ++++------
 src/TNL/Containers/Segments/SlicedEllpack.h   |   6 +-
 src/TNL/Containers/Segments/SlicedEllpack.hpp | 105 +++--------------
 .../Containers/Segments/SlicedEllpackView.h   |   8 +-
 .../Containers/Segments/SlicedEllpackView.hpp |  34 ++----
 .../Containers/Segments/SegmentsTest.hpp      |   2 +-
 17 files changed, 151 insertions(+), 390 deletions(-)

diff --git a/src/TNL/Containers/Segments/CSR.h b/src/TNL/Containers/Segments/CSR.h
index 89cad0c6a..ac9a7063e 100644
--- a/src/TNL/Containers/Segments/CSR.h
+++ b/src/TNL/Containers/Segments/CSR.h
@@ -28,13 +28,13 @@ class CSR
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, typename std::remove_const< IndexType >::type, IndexAllocator >;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
       using SegmentsSizes = OffsetsHolder;
       template< typename Device_, typename Index_ >
       using ViewTemplate = CSRView< Device_, Index_ >;
       using ViewType = CSRView< Device, Index >;
-      using ConstViewType = CSRView< Device, std::add_const_t< Index > >;
+      using ConstViewType = CSRView< Device, std::add_const_t< IndexType > >;
       using SegmentViewType = SegmentView< IndexType, true >;
 
       CSR();
@@ -57,7 +57,7 @@ class CSR
 
       ViewType getView();
 
-      ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       /**
        * \brief Number segments.
diff --git a/src/TNL/Containers/Segments/CSR.hpp b/src/TNL/Containers/Segments/CSR.hpp
index 9a948b04e..d8ba81461 100644
--- a/src/TNL/Containers/Segments/CSR.hpp
+++ b/src/TNL/Containers/Segments/CSR.hpp
@@ -98,9 +98,9 @@ getView()
 template< typename Device,
           typename Index,
           typename IndexAllocator >
-typename CSR< Device, Index, IndexAllocator >::ConstViewType
+auto
 CSR< Device, Index, IndexAllocator >::
-getConstView() const
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( this->offsets.getConstView() );
 }
@@ -108,10 +108,8 @@ getConstView() const
 template< typename Device,
           typename Index,
           typename IndexAllocator >
-__cuda_callable__
-Index
-CSR< Device, Index, IndexAllocator >::
-getSegmentsCount() const
+__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+getSegmentsCount() const -> IndexType
 {
    return this->offsets.getSize() - 1;
 }
@@ -119,10 +117,8 @@ getSegmentsCount() const
 template< typename Device,
           typename Index,
           typename IndexAllocator >
-__cuda_callable__
-Index
-CSR< Device, Index, IndexAllocator >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
 }
@@ -130,10 +126,8 @@ getSegmentSize( const IndexType segmentIdx ) const
 template< typename Device,
           typename Index,
           typename IndexAllocator >
-__cuda_callable__
-Index
-CSR< Device, Index, IndexAllocator >::
-getSize() const
+__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+getSize() const -> IndexType
 {
    return this->getStorageSize();
 }
@@ -141,10 +135,8 @@ getSize() const
 template< typename Device,
           typename Index,
           typename IndexAllocator >
-__cuda_callable__
-Index
-CSR< Device, Index, IndexAllocator >::
-getStorageSize() const
+__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+getStorageSize() const -> IndexType
 {
    return details::CSR< Device, Index >::getStorageSize( this->offsets );
 }
@@ -152,10 +144,8 @@ getStorageSize() const
 template< typename Device,
           typename Index,
           typename IndexAllocator >
-__cuda_callable__
-Index
-CSR< Device, Index, IndexAllocator >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto CSR< Device, Index, IndexAllocator >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( ! std::is_same< DeviceType, Devices::Host >::value )
    {
@@ -197,16 +187,7 @@ void
 CSR< Device, Index, IndexAllocator >::
 forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 {
-   const auto offsetsView = this->offsets.getConstView();
-   auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-      const IndexType begin = offsetsView[ segmentIdx ];
-      const IndexType end = offsetsView[ segmentIdx + 1 ];
-      IndexType localIdx( 0 );
-      for( IndexType globalIdx = begin; globalIdx < end; globalIdx++  )
-         if( ! f( segmentIdx, localIdx++, globalIdx, args... ) )
-            break;
-   };
-   Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+   this->getConstView().forSegments( first, last, f, args... );
 }
 
 template< typename Device,
@@ -228,19 +209,7 @@ void
 CSR< Device, Index, IndexAllocator >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
-   const auto offsetsView = this->offsets.getConstView();
-   auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
-      const IndexType begin = offsetsView[ i ];
-      const IndexType end = offsetsView[ i + 1 ];
-      RealType aux( zero );
-      bool compute( true );
-      IndexType localIdx( 0 );
-      for( IndexType j = begin; j < end && compute; j++  )
-         reduction( aux, fetch( i, localIdx++, j, compute, args... ) );
-      keeper( i, aux );
-   };
-   Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
@@ -279,7 +248,7 @@ save( File& file ) const
 template< typename Device,
           typename Index,
           typename IndexAllocator >
-void
+void 
 CSR< Device, Index, IndexAllocator >::
 load( File& file )
 {
diff --git a/src/TNL/Containers/Segments/CSRView.h b/src/TNL/Containers/Segments/CSRView.h
index 4e53bd204..fa091583d 100644
--- a/src/TNL/Containers/Segments/CSRView.h
+++ b/src/TNL/Containers/Segments/CSRView.h
@@ -26,9 +26,9 @@ class CSRView
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
-      using OffsetsView = typename Containers::VectorView< IndexType, DeviceType, typename std::remove_const< IndexType >::type >;
-      using ConstOffsetsView = typename Containers::Vector< IndexType, DeviceType, typename std::remove_const< IndexType >::type >::ConstViewType;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
+      using ConstOffsetsView = typename Containers::Vector< Index, DeviceType,IndexType >::ConstViewType;
       using ViewType = CSRView;
       template< typename Device_, typename Index_ >
       using ViewTemplate = CSRView< Device_, Index_ >;
@@ -58,7 +58,7 @@ class CSRView
       ViewType getView();
 
       __cuda_callable__
-      ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       /**
        * \brief Number segments.
diff --git a/src/TNL/Containers/Segments/CSRView.hpp b/src/TNL/Containers/Segments/CSRView.hpp
index 54bac01a2..4dde78a24 100644
--- a/src/TNL/Containers/Segments/CSRView.hpp
+++ b/src/TNL/Containers/Segments/CSRView.hpp
@@ -96,59 +96,49 @@ getView()
 template< typename Device,
           typename Index >
 __cuda_callable__
-typename CSRView< Device, Index >::ConstViewType
+auto
 CSRView< Device, Index >::
-getConstView() const
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( this->offsets.getConstView() );
 }
 
 template< typename Device,
           typename Index >
-__cuda_callable__
-Index
-CSRView< Device, Index >::
-getSegmentsCount() const
+__cuda_callable__ auto CSRView< Device, Index >::
+getSegmentsCount() const -> IndexType
 {
    return this->offsets.getSize() - 1;
 }
 
 template< typename Device,
           typename Index >
-__cuda_callable__
-Index
-CSRView< Device, Index >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto CSRView< Device, Index >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
 }
 
 template< typename Device,
           typename Index >
-__cuda_callable__
-Index
-CSRView< Device, Index >::
-getSize() const
+__cuda_callable__ auto CSRView< Device, Index >::
+getSize() const -> IndexType
 {
    return this->getStorageSize();
 }
 
 template< typename Device,
           typename Index >
-__cuda_callable__
-Index
-CSRView< Device, Index >::
-getStorageSize() const
+__cuda_callable__ auto CSRView< Device, Index >::
+getStorageSize() const -> IndexType
 {
    return details::CSR< Device, Index >::getStorageSize( this->offsets );
 }
 
 template< typename Device,
           typename Index >
-__cuda_callable__
-Index
-CSRView< Device, Index >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto CSRView< Device, Index >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( ! std::is_same< DeviceType, Devices::Host >::value )
    {
diff --git a/src/TNL/Containers/Segments/ChunkedEllpack.h b/src/TNL/Containers/Segments/ChunkedEllpack.h
index c6c7812db..8c2e94264 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpack.h
+++ b/src/TNL/Containers/Segments/ChunkedEllpack.h
@@ -28,13 +28,13 @@ class ChunkedEllpack
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, typename std::remove_const< IndexType >::type, IndexAllocator >;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
       static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
       using ViewType = ChunkedEllpackView< Device, Index, RowMajorOrder >;
       template< typename Device_, typename Index_ >
       using ViewTemplate = ChunkedEllpackView< Device_, Index_, RowMajorOrder >;
-      using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< Index >, RowMajorOrder >;
+      using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< IndexType >, RowMajorOrder >;
       using SegmentViewType = ChunkedEllpackSegmentView< IndexType, RowMajorOrder >;
       using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >;
       //TODO: using ChunkedEllpackSliceInfoAllocator = typename IndexAllocatorType::retype< ChunkedEllpackSliceInfoType >;
@@ -55,7 +55,7 @@ class ChunkedEllpack
 
       ViewType getView();
 
-      ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       /**
        * \brief Set sizes of particular segments.
diff --git a/src/TNL/Containers/Segments/ChunkedEllpack.hpp b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
index 6d0cf6fe7..9eea0bbab 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
@@ -111,9 +111,8 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           bool RowMajorOrder >
-typename ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::ConstViewType
-ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-getConstView() const
+auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( size, storageSize, chunksInSlice, desiredChunkSize,
                          rowToChunkMapping.getConstView(),
@@ -306,10 +305,8 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-getSegmentsCount() const
+__cuda_callable__ auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+getSegmentsCount() const -> IndexType
 {
    return this->segmentsCount;
 }
@@ -318,9 +315,8 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           bool RowMajorOrder >
-Index
-ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-getSegmentSize( const IndexType segmentIdx ) const
+auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSize(
       rowToSliceMapping.getView(),
@@ -333,10 +329,8 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-getSize() const
+__cuda_callable__ auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+getSize() const -> IndexType
 {
    return this->size;
 }
@@ -345,10 +339,8 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-getStorageSize() const
+__cuda_callable__ auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+getStorageSize() const -> IndexType
 {
    return this->storageSize;
 }
@@ -357,10 +349,8 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
       return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndex(
          rowToSliceMapping,
@@ -375,9 +365,7 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           bool RowMajorOrder >
-__cuda_callable__
-auto
-ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+__cuda_callable__ auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
 getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 {
 }
diff --git a/src/TNL/Containers/Segments/ChunkedEllpackView.h b/src/TNL/Containers/Segments/ChunkedEllpackView.h
index 5b89b373f..2735c9914 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Containers/Segments/ChunkedEllpackView.h
@@ -29,9 +29,9 @@ class ChunkedEllpackView
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
-      using OffsetsView = typename Containers::VectorView< IndexType, DeviceType, typename std::remove_const< IndexType >::type >;
-      using ConstOffsetsView = typename Containers::Vector< IndexType, DeviceType, typename std::remove_const< IndexType >::type >::ConstViewType;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
+      using ConstOffsetsView = typename OffsetsView::ConstViewType;
       using ViewType = ChunkedEllpackView;
       template< typename Device_, typename Index_ >
       using ViewTemplate = ChunkedEllpackView< Device_, Index_ >;
@@ -83,7 +83,7 @@ class ChunkedEllpackView
       ViewType getView();
 
       __cuda_callable__
-      ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       /**
        * \brief Number segments.
diff --git a/src/TNL/Containers/Segments/ChunkedEllpackView.hpp b/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
index 9d572f9f7..d0a9372d1 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
@@ -154,10 +154,8 @@ getView()
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
-__cuda_callable__
-typename ChunkedEllpackView< Device, Index, RowMajorOrder >::ConstViewType
-ChunkedEllpackView< Device, Index, RowMajorOrder >::
-getConstView() const
+__cuda_callable__ auto ChunkedEllpackView< Device, Index, RowMajorOrder >::
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( size, chunksInSlice, desiredChunkSize,
                          rowToChunkMapping.getConstView(),
@@ -171,10 +169,8 @@ getConstView() const
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpackView< Device, Index, RowMajorOrder >::
-getSegmentsCount() const
+__cuda_callable__ auto ChunkedEllpackView< Device, Index, RowMajorOrder >::
+getSegmentsCount() const -> IndexType
 {
    return this->size;
 }
@@ -182,10 +178,8 @@ getSegmentsCount() const
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpackView< Device, Index, RowMajorOrder >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto ChunkedEllpackView< Device, Index, RowMajorOrder >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    if( std::is_same< DeviceType, Devices::Host >::value )
       return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSizeDirect(
@@ -214,10 +208,8 @@ getSegmentSize( const IndexType segmentIdx ) const
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpackView< Device, Index, RowMajorOrder >::
-getSize() const
+__cuda_callable__ auto ChunkedEllpackView< Device, Index, RowMajorOrder >::
+getSize() const -> IndexType
 {
    return this->size;
 }
@@ -225,10 +217,8 @@ getSize() const
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpackView< Device, Index, RowMajorOrder >::
-getStorageSize() const
+__cuda_callable__ auto ChunkedEllpackView< Device, Index, RowMajorOrder >::
+getStorageSize() const -> IndexType
 {
    return this->storageSize;
 }
@@ -236,10 +226,8 @@ getStorageSize() const
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
-__cuda_callable__
-Index
-ChunkedEllpackView< Device, Index, RowMajorOrder >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto ChunkedEllpackView< Device, Index, RowMajorOrder >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( std::is_same< DeviceType, Devices::Host >::value )
       return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndexDirect(
diff --git a/src/TNL/Containers/Segments/Ellpack.h b/src/TNL/Containers/Segments/Ellpack.h
index a1188a854..f02ef1523 100644
--- a/src/TNL/Containers/Segments/Ellpack.h
+++ b/src/TNL/Containers/Segments/Ellpack.h
@@ -28,7 +28,7 @@ class Ellpack
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
+      using IndexType = std::remove_const_t< Index >;
       static constexpr int getAlignment() { return Alignment; }
       static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
       using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
@@ -36,7 +36,7 @@ class Ellpack
       template< typename Device_, typename Index_ >
       using ViewTemplate = EllpackView< Device_, Index_, RowMajorOrder, Alignment >;
       using ViewType = EllpackView< Device, Index, RowMajorOrder, Alignment >;
-      //using ConstViewType = EllpackView< Device, std::add_const_t< Index >, RowMajorOrder, Alignment >;
+      using ConstViewType = typename ViewType::ConstViewType;
       using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
 
       Ellpack();
@@ -55,7 +55,7 @@ class Ellpack
 
       ViewType getView();
 
-      //ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       /**
        * \brief Set sizes of particular segments.
diff --git a/src/TNL/Containers/Segments/Ellpack.hpp b/src/TNL/Containers/Segments/Ellpack.hpp
index 9c59c5529..dedc41a41 100644
--- a/src/TNL/Containers/Segments/Ellpack.hpp
+++ b/src/TNL/Containers/Segments/Ellpack.hpp
@@ -105,23 +105,24 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-typename Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::ViewType
+auto
 Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-getView()
+getView() -> ViewType
 {
    return ViewType( segmentSize, size, alignedSize );
 }
 
-/*template< typename Device,
+template< typename Device,
           typename Index,
+          typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-typename Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::ConstViewType
+auto
 Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-getConstView() const
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( segmentSize, size, alignedSize );
-}*/
+}
 
 template< typename Device,
           typename Index,
@@ -164,10 +165,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-getSegmentsCount() const
+__cuda_callable__ auto Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+getSegmentsCount() const -> IndexType
 {
    return this->size;
 }
@@ -177,10 +176,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    return this->segmentSize;
 }
@@ -190,10 +187,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-getSize() const
+__cuda_callable__ auto Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+getSize() const  -> IndexType
 {
    return this->size * this->segmentSize;
 }
@@ -204,10 +199,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-getStorageSize() const
+__cuda_callable__ auto Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+getStorageSize() const -> IndexType
 {
    return this->alignedSize * this->segmentSize;
 }
@@ -217,10 +210,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( RowMajorOrder )
       return segmentIdx * this->segmentSize + localIdx;
@@ -233,9 +224,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-void
-Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+__cuda_callable__ 
+void Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
 getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const
 {
 }
@@ -245,9 +235,7 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-auto
-Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+__cuda_callable__ auto Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
 getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 {
    if( RowMajorOrder )
@@ -266,33 +254,7 @@ void
 Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
 forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 {
-   if( RowMajorOrder )
-   {
-      const IndexType segmentSize = this->segmentSize;
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-         const IndexType begin = segmentIdx * segmentSize;
-         const IndexType end = begin + segmentSize;
-         IndexType localIdx( 0 );
-         for( IndexType globalIdx = begin; globalIdx < end; globalIdx++  )
-            if( ! f( segmentIdx, localIdx++, globalIdx,  args... ) )
-               break;
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
-   else
-   {
-      const IndexType storageSize = this->getStorageSize();
-      const IndexType alignedSize = this->alignedSize;
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-         const IndexType begin = segmentIdx;
-         const IndexType end = storageSize;
-         IndexType localIdx( 0 );
-         for( IndexType globalIdx = begin; globalIdx < end; globalIdx += alignedSize )
-            if( ! f( segmentIdx, localIdx++, globalIdx, args... ) )
-               break;
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
+   this->getConstView().forSegments( first, last, f, args... );
 }
 
 template< typename Device,
@@ -318,36 +280,7 @@ void
 Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
-   if( RowMajorOrder )
-   {
-      const IndexType segmentSize = this->segmentSize;
-      auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
-         const IndexType begin = i * segmentSize;
-         const IndexType end = begin + segmentSize;
-         RealType aux( zero );
-         bool compute( true );
-         for( IndexType j = begin, localIdx = 0; j < end && compute; j++, localIdx++  )
-            reduction( aux, fetch( i, localIdx, j, compute, args... ) );
-         keeper( i, aux );
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
-   else
-   {
-      const IndexType storageSize = this->getStorageSize();
-      const IndexType alignedSize = this->alignedSize;
-      auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
-         const IndexType begin = i;
-         const IndexType end = storageSize;
-         RealType aux( zero );
-         bool compute( true );
-         for( IndexType j = begin, localIdx = 0; j < end && compute; j += alignedSize, localIdx++  )
-            reduction( aux, fetch( i, localIdx, j, compute, args... ) );
-         keeper( i, aux );
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
+   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
diff --git a/src/TNL/Containers/Segments/EllpackView.h b/src/TNL/Containers/Segments/EllpackView.h
index 10a89bd7b..c9c52dd5d 100644
--- a/src/TNL/Containers/Segments/EllpackView.h
+++ b/src/TNL/Containers/Segments/EllpackView.h
@@ -29,7 +29,7 @@ class EllpackView
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
+      using IndexType = std::remove_const_t< Index >;
       static constexpr int getAlignment() { return Alignment; }
       static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
       using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
@@ -37,7 +37,7 @@ class EllpackView
       template< typename Device_, typename Index_ >
       using ViewTemplate = EllpackView< Device_, Index_, RowMajorOrder, Alignment >;
       using ViewType = EllpackView;
-      using ConstViewType = EllpackView< Device, std::add_const_t< Index > >;
+      using ConstViewType = ViewType;
       using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
 
       __cuda_callable__
@@ -60,7 +60,7 @@ class EllpackView
       ViewType getView();
 
       __cuda_callable__
-      ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       /**
        * \brief Number segments.
diff --git a/src/TNL/Containers/Segments/EllpackView.hpp b/src/TNL/Containers/Segments/EllpackView.hpp
index f2cfaf590..d7654402f 100644
--- a/src/TNL/Containers/Segments/EllpackView.hpp
+++ b/src/TNL/Containers/Segments/EllpackView.hpp
@@ -103,9 +103,9 @@ template< typename Device,
           bool RowMajorOrder,
           int Alignment >
 __cuda_callable__
-typename EllpackView< Device, Index, RowMajorOrder, Alignment >::ConstViewType
+auto
 EllpackView< Device, Index, RowMajorOrder, Alignment >::
-getConstView() const
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( segmentSize, size, alignedSize );
 }
@@ -114,10 +114,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
-getSegmentsCount() const
+__cuda_callable__ auto EllpackView< Device, Index, RowMajorOrder, Alignment >::
+getSegmentsCount() const -> IndexType
 {
    return this->size;
 }
@@ -126,10 +124,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto EllpackView< Device, Index, RowMajorOrder, Alignment >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    return this->segmentSize;
 }
@@ -138,10 +134,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
-getSize() const
+__cuda_callable__ auto EllpackView< Device, Index, RowMajorOrder, Alignment >::
+getSize() const -> IndexType
 {
    return this->size * this->segmentSize;
 }
@@ -151,10 +145,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
-getStorageSize() const
+__cuda_callable__ auto EllpackView< Device, Index, RowMajorOrder, Alignment >::
+getStorageSize() const -> IndexType
 {
    return this->alignedSize * this->segmentSize;
 }
@@ -163,10 +155,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-Index
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto EllpackView< Device, Index, RowMajorOrder, Alignment >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( RowMajorOrder )
       return segmentIdx * this->segmentSize + localIdx;
@@ -178,9 +168,7 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-void
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
+__cuda_callable__ void EllpackView< Device, Index, RowMajorOrder, Alignment >::
 getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const
 {
 }
@@ -189,9 +177,7 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-__cuda_callable__
-auto
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
+__cuda_callable__ auto EllpackView< Device, Index, RowMajorOrder, Alignment >::
 getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 {
    if( RowMajorOrder )
@@ -205,8 +191,7 @@ template< typename Device,
           bool RowMajorOrder,
           int Alignment >
    template< typename Function, typename... Args >
-void
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
+void EllpackView< Device, Index, RowMajorOrder, Alignment >::
 forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 {
    if( RowMajorOrder )
@@ -218,7 +203,7 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-            f( segmentIdx, localIdx++, globalIdx, compute, args... );
+            f( segmentIdx, localIdx++, globalIdx, compute );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
    }
@@ -243,8 +228,7 @@ template< typename Device,
           bool RowMajorOrder,
           int Alignment >
    template< typename Function, typename... Args >
-void
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
+void EllpackView< Device, Index, RowMajorOrder, Alignment >::
 forAll( Function& f, Args... args ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f, args... );
@@ -255,8 +239,7 @@ template< typename Device,
           bool RowMajorOrder,
           int Alignment >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-void
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
+void EllpackView< Device, Index, RowMajorOrder, Alignment >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
@@ -299,8 +282,7 @@ template< typename Device,
           bool RowMajorOrder,
           int Alignment >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-void
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
+void EllpackView< Device, Index, RowMajorOrder, Alignment >::
 allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
@@ -324,8 +306,7 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-void
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
+void EllpackView< Device, Index, RowMajorOrder, Alignment >::
 save( File& file ) const
 {
    file.save( &segmentSize );
@@ -337,8 +318,7 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int Alignment >
-void
-EllpackView< Device, Index, RowMajorOrder, Alignment >::
+void EllpackView< Device, Index, RowMajorOrder, Alignment >::
 load( File& file )
 {
    file.load( &segmentSize );
diff --git a/src/TNL/Containers/Segments/SlicedEllpack.h b/src/TNL/Containers/Segments/SlicedEllpack.h
index 2027f1d78..c3967bc6b 100644
--- a/src/TNL/Containers/Segments/SlicedEllpack.h
+++ b/src/TNL/Containers/Segments/SlicedEllpack.h
@@ -29,8 +29,8 @@ class SlicedEllpack
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, typename std::remove_const< IndexType >::type, IndexAllocator >;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
       static constexpr int getSliceSize() { return SliceSize; }
       static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
       using ViewType = SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >;
@@ -53,7 +53,7 @@ class SlicedEllpack
 
       ViewType getView();
 
-      ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       /**
        * \brief Set sizes of particular segments.
diff --git a/src/TNL/Containers/Segments/SlicedEllpack.hpp b/src/TNL/Containers/Segments/SlicedEllpack.hpp
index 9ba1276e3..b01543e61 100644
--- a/src/TNL/Containers/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Containers/Segments/SlicedEllpack.hpp
@@ -110,9 +110,9 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int SliceSize >
-typename SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::ConstViewType
+auto
 SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-getConstView() const
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( size, alignedSize, segmentsCount, sliceOffsets.getConstView(), sliceSegmentSizes.getConstView() );
 }
@@ -162,10 +162,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-getSegmentsCount() const
+__cuda_callable__ auto SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
+getSegmentsCount() const -> IndexType
 {
    return this->segmentsCount;
 }
@@ -175,10 +173,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    const Index sliceIdx = segmentIdx / SliceSize;
    if( std::is_same< DeviceType, Devices::Host >::value )
@@ -198,10 +194,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-getSize() const
+__cuda_callable__ auto SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
+getSize() const -> IndexType
 {
    return this->size;
 }
@@ -211,10 +205,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-getStorageSize() const
+__cuda_callable__ auto SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
+getStorageSize() const -> IndexType
 {
    return this->alignedSize;
 }
@@ -224,10 +216,8 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    const IndexType sliceIdx = segmentIdx / SliceSize;
    const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
@@ -296,38 +286,7 @@ void
 SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
 forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 {
-   const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView();
-   const auto sliceOffsets_view = this->sliceOffsets.getConstView();
-   if( RowMajorOrder )
-   {
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-         const IndexType sliceIdx = segmentIdx / SliceSize;
-         const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
-         const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
-         const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize;
-         const IndexType end = begin + segmentSize;
-         IndexType localIdx( 0 );
-         for( IndexType globalIdx = begin; globalIdx < end; globalIdx++  )
-            if( ! f( segmentIdx, localIdx++, globalIdx, args... ) )
-               break;
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
-   else
-   {
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-         const IndexType sliceIdx = segmentIdx / SliceSize;
-         const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
-         const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
-         const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx;
-         const IndexType end = sliceOffsets_view[ sliceIdx + 1 ];
-         IndexType localIdx( 0 );
-         for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize )
-            if( ! f( segmentIdx, localIdx++, globalIdx, args... ) )
-               break;
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
+   this->getConstView().forSegments( first, last, f, args... );
 }
 
 template< typename Device,
@@ -353,43 +312,7 @@ void
 SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
-   const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView();
-   const auto sliceOffsets_view = this->sliceOffsets.getConstView();
-   if( RowMajorOrder )
-   {
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-         const IndexType sliceIdx = segmentIdx / SliceSize;
-         const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
-         const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
-         const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize;
-         const IndexType end = begin + segmentSize;
-         RealType aux( zero );
-         bool compute( true );
-         IndexType localIdx( 0 );
-         for( IndexType globalIdx = begin; globalIdx< end; globalIdx++  )
-            reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) );
-         keeper( segmentIdx, aux );
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
-   else
-   {
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
-         const IndexType sliceIdx = segmentIdx / SliceSize;
-         const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
-         const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
-         const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx;
-         const IndexType end = sliceOffsets_view[ sliceIdx + 1 ];
-         RealType aux( zero );
-         bool compute( true );
-         IndexType localIdx( 0 );
-         for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize  )
-            reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) );
-         keeper( segmentIdx, aux );
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
-   }
+   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
diff --git a/src/TNL/Containers/Segments/SlicedEllpackView.h b/src/TNL/Containers/Segments/SlicedEllpackView.h
index 6e2e55bbc..fe1f035d0 100644
--- a/src/TNL/Containers/Segments/SlicedEllpackView.h
+++ b/src/TNL/Containers/Segments/SlicedEllpackView.h
@@ -28,14 +28,14 @@ class SlicedEllpackView
    public:
 
       using DeviceType = Device;
-      using IndexType = Index;
-      using OffsetsView = typename Containers::VectorView< IndexType, DeviceType, typename std::remove_const < IndexType >::type >;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
       static constexpr int getSliceSize() { return SliceSize; }
       static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
       template< typename Device_, typename Index_ >
       using ViewTemplate = SlicedEllpackView< Device_, Index_, RowMajorOrder, SliceSize >;
       using ViewType = SlicedEllpackView;
-      using ConstViewType = SlicedEllpackView< Device, std::add_const_t< Index > >;
+      using ConstViewType = ViewType;
       using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
 
       __cuda_callable__
@@ -62,7 +62,7 @@ class SlicedEllpackView
       ViewType getView();
 
       __cuda_callable__
-      ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
       __cuda_callable__
       IndexType getSegmentsCount() const;
diff --git a/src/TNL/Containers/Segments/SlicedEllpackView.hpp b/src/TNL/Containers/Segments/SlicedEllpackView.hpp
index 50f1c65ee..dc755bb59 100644
--- a/src/TNL/Containers/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Containers/Segments/SlicedEllpackView.hpp
@@ -114,9 +114,9 @@ template< typename Device,
           bool RowMajorOrder,
           int SliceSize >
 __cuda_callable__
-typename SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::ConstViewType
+auto
 SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-getConstView() const
+getConstView() const -> const ConstViewType
 {
    return ConstViewType( size, alignedSize, segmentsCount, sliceOffsets.getConstView(), sliceSegmentSizes.getConstView() );
 }
@@ -125,10 +125,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-getSegmentsCount() const
+__cuda_callable__ auto SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
+getSegmentsCount() const -> IndexType
 {
    return this->segmentsCount;
 }
@@ -137,10 +135,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-getSegmentSize( const IndexType segmentIdx ) const
+__cuda_callable__ auto SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    const Index sliceIdx = segmentIdx / SliceSize;
    if( std::is_same< DeviceType, Devices::Host >::value )
@@ -159,10 +155,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-getSize() const
+__cuda_callable__ auto SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
+getSize() const -> IndexType
 {
    return this->size;
 }
@@ -171,10 +165,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-getStorageSize() const
+__cuda_callable__ auto SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
+getStorageSize() const -> IndexType
 {
    return this->alignedSize;
 }
@@ -183,10 +175,8 @@ template< typename Device,
           typename Index,
           bool RowMajorOrder,
           int SliceSize >
-__cuda_callable__
-Index
-SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
+__cuda_callable__ auto SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    const IndexType sliceIdx = segmentIdx / SliceSize;
    const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
diff --git a/src/UnitTests/Containers/Segments/SegmentsTest.hpp b/src/UnitTests/Containers/Segments/SegmentsTest.hpp
index 6d4692dbe..59ef44c2c 100644
--- a/src/UnitTests/Containers/Segments/SegmentsTest.hpp
+++ b/src/UnitTests/Containers/Segments/SegmentsTest.hpp
@@ -128,7 +128,7 @@ void test_AllReduction_MaximumInSegments()
    TNL::Containers::Vector< IndexType, DeviceType, IndexType > v( segments.getStorageSize() );
 
    auto view = v.getView();
-   auto init = [=] __cuda_callable__ ( const IndexType segmentIdx, const IndexType localIdx, const IndexType globalIdx ) mutable -> bool {
+   auto init = [=] __cuda_callable__ ( const IndexType segmentIdx, const IndexType localIdx, const IndexType globalIdx, bool& compute ) mutable -> bool {
       view[ globalIdx ] =  segmentIdx * 5 + localIdx + 1;
       return true;
    };
-- 
GitLab


From 429d197a9bc6d63ee50b39bfa9d0349ad5ef079f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 5 Apr 2020 20:35:28 +0200
Subject: [PATCH 09/68] Added ChunkedEllpack to SpMV benchmark.

---
 src/Benchmarks/SpMV/spmv-legacy.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h
index 17b0b8f0d..31d14c6a3 100644
--- a/src/Benchmarks/SpMV/spmv-legacy.h
+++ b/src/Benchmarks/SpMV/spmv-legacy.h
@@ -33,6 +33,7 @@
 #include <TNL/Containers/Segments/CSR.h>
 #include <TNL/Containers/Segments/Ellpack.h>
 #include <TNL/Containers/Segments/SlicedEllpack.h>
+#include <TNL/Containers/Segments/ChunkedEllpack.h>
 using namespace TNL::Matrices;
 
 #include "cusparseCSRMatrix.h"
@@ -61,6 +62,12 @@ using SlicedEllpackSegments = Containers::Segments::SlicedEllpack< Device, Index
 template< typename Real, typename Device, typename Index >
 using SparseMatrix_SlicedEllpack = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, SlicedEllpackSegments >;
 
+template< typename Device, typename Index, typename IndexAllocator >
+using ChunkedEllpackSegments = Containers::Segments::ChunkedEllpack< Device, Index, IndexAllocator >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrix_ChunkedEllpack = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, ChunkedEllpackSegments >;
+
 // Legacy formats
 template< typename Real, typename Device, typename Index >
 using SparseMatrixLegacy_CSR_Scalar = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRScalar >;
@@ -286,6 +293,7 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    benchmarkSpMV< Real, SlicedEllpackAlias               >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_SlicedEllpack       >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrix_ChunkedEllpack      >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, Matrices::Legacy::BiEllpack      >( benchmark, hostOutVector, inputFileName, verboseMR );
    /* AdEllpack is broken
    benchmarkSpMV< Real, Matrices::AdEllpack              >( benchmark, hostOutVector, inputFileName, verboseMR );
-- 
GitLab


From 36d81639eb8a98415e91f7012b78c400e3c7ff14 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 7 Apr 2020 12:10:28 +0200
Subject: [PATCH 10/68] Added BiEllpack segments.

---
 src/TNL/Containers/Segments/BiEllpack.h       | 138 +++++
 src/TNL/Containers/Segments/BiEllpack.hpp     | 320 ++++++++++++
 .../Segments/BiEllpackSegmentView.h           |  94 ++++
 src/TNL/Containers/Segments/BiEllpackView.h   | 204 ++++++++
 src/TNL/Containers/Segments/BiEllpackView.hpp | 490 ++++++++++++++++++
 src/TNL/Containers/Segments/ChunkedEllpack.h  |   2 +-
 .../Containers/Segments/ChunkedEllpack.hpp    |   3 +-
 .../Segments/ChunkedEllpackSegmentView.h      |   2 +-
 .../Containers/Segments/ChunkedEllpackView.h  |   2 -
 .../Segments/ChunkedEllpackView.hpp           |  19 -
 .../Containers/Segments/details/BiEllpack.h   | 162 ++++++
 src/UnitTests/Matrices/CMakeLists.txt         |   8 +
 .../Matrices/SparseMatrixTest_BiEllpack.cpp   |  11 +
 .../Matrices/SparseMatrixTest_BiEllpack.cu    |  11 +
 .../Matrices/SparseMatrixTest_BiEllpack.h     |  57 ++
 15 files changed, 1498 insertions(+), 25 deletions(-)
 create mode 100644 src/TNL/Containers/Segments/BiEllpack.h
 create mode 100644 src/TNL/Containers/Segments/BiEllpack.hpp
 create mode 100644 src/TNL/Containers/Segments/BiEllpackSegmentView.h
 create mode 100644 src/TNL/Containers/Segments/BiEllpackView.h
 create mode 100644 src/TNL/Containers/Segments/BiEllpackView.hpp
 create mode 100644 src/TNL/Containers/Segments/details/BiEllpack.h
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.cpp
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h

diff --git a/src/TNL/Containers/Segments/BiEllpack.h b/src/TNL/Containers/Segments/BiEllpack.h
new file mode 100644
index 000000000..30827b719
--- /dev/null
+++ b/src/TNL/Containers/Segments/BiEllpack.h
@@ -0,0 +1,138 @@
+/***************************************************************************
+                          BiEllpack.h -  description
+                             -------------------
+    begin                : Apr 5, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Allocators/Default.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/Segments/BiEllpackView.h>
+#include <TNL/Containers/Segments/SegmentView.h>
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index >,
+          bool RowMajorOrder = std::is_same< Device, Devices::Host >::value,
+          int WarpSize = 32 >
+class BiEllpack
+{
+   public:
+
+      using DeviceType = Device;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
+      static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
+      using ViewType = BiEllpackView< Device, Index, RowMajorOrder >;
+      template< typename Device_, typename Index_ >
+      using ViewTemplate = BiEllpackView< Device_, Index_, RowMajorOrder >;
+      using ConstViewType = BiEllpackView< Device, std::add_const_t< IndexType >, RowMajorOrder >;
+      using SegmentViewType = BiEllpackSegmentView< IndexType, RowMajorOrder >;
+
+      BiEllpack() = default;
+
+      BiEllpack( const Vector< IndexType, DeviceType, IndexType >& sizes );
+
+      BiEllpack( const BiEllpack& segments );
+
+      BiEllpack( const BiEllpack&& segments );
+
+      static String getSerializationType();
+
+      static String getSegmentsType();
+
+      ViewType getView();
+
+      const ConstViewType getConstView() const;
+
+      /**
+       * \brief Set sizes of particular segments.
+       */
+      template< typename SizesHolder = OffsetsHolder >
+      void setSegmentsSizes( const SizesHolder& sizes );
+
+      __cuda_callable__
+      IndexType getSegmentsCount() const;
+
+      IndexType getSegmentSize( const IndexType segmentIdx ) const;
+
+      /**
+       * \brief Number segments.
+       */
+      __cuda_callable__
+      IndexType getSize() const;
+
+      __cuda_callable__
+      IndexType getStorageSize() const;
+
+      __cuda_callable__
+      IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const;
+
+      __cuda_callable__
+      SegmentViewType getSegmentView( const IndexType segmentIdx ) const;
+
+      /***
+       * \brief Go over all segments and for each segment element call
+       * function 'f' with arguments 'args'. The return type of 'f' is bool.
+       * When its true, the for-loop continues. Once 'f' returns false, the for-loop
+       * is terminated.
+       */
+      template< typename Function, typename... Args >
+      void forSegments( IndexType first, IndexType last, Function& f, Args... args ) const;
+
+      template< typename Function, typename... Args >
+      void forAll( Function& f, Args... args ) const;
+
+
+      /***
+       * \brief Go over all segments and perform a reduction in each of them.
+       */
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+
+      BiEllpack& operator=( const BiEllpack& source ) = default;
+
+      template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_ >
+      BiEllpack& operator=( const BiEllpack< Device_, Index_, IndexAllocator_, RowMajorOrder_, WarpSize >& source );
+
+      void save( File& file ) const;
+
+      void load( File& file );
+
+      void printStructure( std::ostream& str ); // TODO const;
+
+   protected:
+
+      static constexpr int getWarpSize() { return WarpSize; };
+
+      static constexpr int getLogWarpSize() { return std::log( WarpSize ); };
+
+      IndexType size = 0, storageSize = 0;
+
+      IndexType virtualRows = 0;
+
+      OffsetsHolder rowPermArray;
+
+      OffsetsHolder groupPointers;
+
+      template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_, int WarpSize_ >
+      friend class BiEllpack;
+};
+
+      } // namespace Segements
+   }  // namespace Conatiners
+} // namespace TNL
+
+#include <TNL/Containers/Segments/BiEllpack.hpp>
diff --git a/src/TNL/Containers/Segments/BiEllpack.hpp b/src/TNL/Containers/Segments/BiEllpack.hpp
new file mode 100644
index 000000000..d99d883e0
--- /dev/null
+++ b/src/TNL/Containers/Segments/BiEllpack.hpp
@@ -0,0 +1,320 @@
+/***************************************************************************
+                          BiEllpack.hpp -  description
+                             -------------------
+    begin                : Apr 5, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Containers/Segments/BiEllpack.h>
+#include <TNL/Containers/Segments/Ellpack.h>
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+BiEllpack( const Vector< IndexType, DeviceType, IndexType >& sizes )
+{
+   this->setSegmentsSizes( sizes );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+BiEllpack( const BiEllpack& biEllpack )
+   : size( biEllpack.size ),
+     storageSize( biEllpack.storageSize ),
+     virtualRows( biEllpack.virtualRows ),
+     rowPermArray( biEllpack.rowPermArray ),
+     groupPointers( biEllpack.groupPointers )
+{
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+BiEllpack( const BiEllpack&& biEllpack )
+   : size( biEllpack.size ),
+     storageSize( biEllpack.storageSize ),
+     virtualRows( biEllpack.virtualRows ),
+     rowPermArray( std::move( biEllpack.rowPermArray ) ),
+     groupPointers( std::move( biEllpack.groupPointers ) )
+{
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+String
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getSerializationType()
+{
+   return "BiEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+String
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getSegmentsType()
+{
+   return ViewType::getSegmentsType();
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+typename BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::ViewType
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getView()
+{
+   return ViewType( size, storageSize, virtualRows, rowPermArray.getView(), groupPointers.getView() );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getConstView() const -> const ConstViewType
+{
+   return ConstViewType( size, storageSize, virtualRows, rowPermArray.getConstView(), groupPointers.getConstView() );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename SizesHolder >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+setSegmentsSizes( const SizesHolder& segmentsSizes )
+{
+   if( std::is_same< DeviceType, Devices::Host >::value )
+   {
+   }
+   else
+   {
+      BiEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< Index >, RowMajorOrder > hostSegments;
+      Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes( segmentsSizes );
+      hostSegments.setSegmentsSizes( hostSegmentsSizes );
+      *this = hostSegments;
+   }
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getSegmentsCount() const -> IndexType
+{
+   return this->segmentsCount;
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
+{
+   return details::BiEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSize(
+      rowPermArray.getConstView(),
+      groupPointers.getConstView(),
+      segmentIdx );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getSize() const -> IndexType
+{
+   return this->size;
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getStorageSize() const -> IndexType
+{
+   return this->storageSize;
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
+{
+      return details::BiEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndex(
+         rowPermArray.getConstView(),
+         groupPointers.getConstView(),
+         segmentIdx,
+         localIdx );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
+{
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename Function, typename... Args >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
+{
+   this->getConstView().forSegments( first, last, f, args... );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename Function, typename... Args >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+forAll( Function& f, Args... args ) const
+{
+   this->forSegments( 0, this->getSegmentsCount(), f, args... );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+{
+   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+{
+   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_ >
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >&
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+operator=( const BiEllpack< Device_, Index_, IndexAllocator_, RowMajorOrder_, WarpSize >& source )
+{
+   this->size = source.size;
+   this->storageSize = source.storageSize;
+   this->virtualRows = source.virtualRows;
+   this->rowPermArray = source.rowPermArray;
+   this->groupPointers = source.groupPointers;
+   return *this;
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+save( File& file ) const
+{
+   file.save( &this->size );
+   file.save( &this->storageSize );
+   file.save( &this->virtualRows );
+   file << this->rowPermArray
+        << this->groupPointers;
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+load( File& file )
+{
+   file.load( &this->size );
+   file.load( &this->storageSize );
+   file.load( &this->virtualRows );
+   file >> this->rowPermArray
+        >> this->groupPointers;
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+printStructure( std::ostream& str )
+{
+   this->getView().printStructure( str );
+}
+
+      } // namespace Segments
+   }  // namespace Conatiners
+} // namespace TNL
diff --git a/src/TNL/Containers/Segments/BiEllpackSegmentView.h b/src/TNL/Containers/Segments/BiEllpackSegmentView.h
new file mode 100644
index 000000000..6c049e6f8
--- /dev/null
+++ b/src/TNL/Containers/Segments/BiEllpackSegmentView.h
@@ -0,0 +1,94 @@
+/***************************************************************************
+                          BiEllpackSegmentView.h -  description
+                             -------------------
+    begin                : Apr 7, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+
+template< typename Index,
+          bool RowMajorOrder = false >
+class BiEllpackSegmentView;
+
+template< typename Index >
+class BiEllpackSegmentView< Index, false >
+{
+   public:
+
+      using IndexType = Index;
+
+      __cuda_callable__
+      BiEllpackSegmentView( const IndexType offset,
+                                 const IndexType size,
+                                 const IndexType chunkSize,      // this is only for compatibility with the following specialization
+                                 const IndexType chunksInSlice ) // this one as well - both can be replaced when we could use constexprif in C++17
+      : segmentOffset( offset ), segmentSize( size ){};
+
+      __cuda_callable__
+      BiEllpackSegmentView( const BiEllpackSegmentView& view )
+      : segmentOffset( view.segmentOffset ), segmentSize( view.segmentSize ){};
+
+      __cuda_callable__
+      IndexType getSize() const
+      {
+         return this->segmentSize;
+      };
+
+      __cuda_callable__
+      IndexType getGlobalIndex( const IndexType localIndex ) const
+      {
+         TNL_ASSERT_LT( localIndex, segmentSize, "Local index exceeds segment bounds." );
+         return segmentOffset + localIndex;
+      };
+
+      protected:
+         
+         IndexType segmentOffset, segmentSize;
+};
+
+template< typename Index >
+class BiEllpackSegmentView< Index, true >
+{
+   public:
+
+      using IndexType = Index;
+
+      __cuda_callable__
+      BiEllpackSegmentView( const IndexType offset,
+                                 const IndexType size,
+                                 const IndexType chunkSize,
+                                 const IndexType chunksInSlice )
+      : segmentOffset( offset ), segmentSize( size ),
+        chunkSize( chunkSize ), chunksInSlice( chunksInSlice ){};
+
+      __cuda_callable__
+      IndexType getSize() const
+      {
+         return this->segmentSize;
+      };
+
+      __cuda_callable__
+      IndexType getGlobalIndex( const IndexType localIdx ) const
+      {
+         TNL_ASSERT_LT( localIdx, segmentSize, "Local index exceeds segment bounds." );
+         const IndexType chunkIdx = localIdx / chunkSize;
+         const IndexType inChunkOffset = localIdx % chunkSize;
+         return segmentOffset + inChunkOffset * chunksInSlice + chunkIdx;
+      };
+
+      protected:
+         
+         IndexType segmentOffset, segmentSize, chunkSize, chunksInSlice;
+};
+
+      } //namespace Segments
+   } //namespace Containers
+} //namespace TNL
diff --git a/src/TNL/Containers/Segments/BiEllpackView.h b/src/TNL/Containers/Segments/BiEllpackView.h
new file mode 100644
index 000000000..54bda498c
--- /dev/null
+++ b/src/TNL/Containers/Segments/BiEllpackView.h
@@ -0,0 +1,204 @@
+/***************************************************************************
+                          BiEllpackView.h -  description
+                             -------------------
+    begin                : Apr 5, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <type_traits>
+
+#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/Segments/BiEllpackSegmentView.h>
+#include <TNL/Containers/Segments/details/BiEllpack.h>
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder = std::is_same< Device, Devices::Host >::value,
+          int WarpSize = 32 >
+class BiEllpackView
+{
+   public:
+
+      using DeviceType = Device;
+      using IndexType = std::remove_const_t< Index >;
+      using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
+      using ConstOffsetsView = typename OffsetsView::ConstViewType;
+      using ViewType = BiEllpackView;
+      template< typename Device_, typename Index_ >
+      using ViewTemplate = BiEllpackView< Device_, Index_ >;
+      using ConstViewType = BiEllpackView< Device, std::add_const_t< Index > >;
+      using SegmentViewType = BiEllpackSegmentView< IndexType, RowMajorOrder >;
+
+      __cuda_callable__
+      BiEllpackView() = default;
+
+      __cuda_callable__
+      BiEllpackView( const IndexType size,
+                     const IndexType storageSize,
+                     const IndexType virtualRows,
+                     const OffsetsView& rowPermArray,
+                     const OffsetsView& groupPointers );
+
+      __cuda_callable__
+      BiEllpackView( const IndexType size,
+                     const IndexType storageSize,
+                     const IndexType virtualRows,
+                     const OffsetsView&& rowPermArray,
+                     const OffsetsView&& groupPointers );
+
+      __cuda_callable__
+      BiEllpackView( const BiEllpackView& chunked_ellpack_view );
+
+      __cuda_callable__
+      BiEllpackView( const BiEllpackView&& chunked_ellpack_view );
+
+      static String getSerializationType();
+
+      static String getSegmentsType();
+
+      __cuda_callable__
+      ViewType getView();
+
+      __cuda_callable__
+      const ConstViewType getConstView() const;
+
+      /**
+       * \brief Number segments.
+       */
+      __cuda_callable__
+      IndexType getSegmentsCount() const;
+
+      /***
+       * \brief Returns size of the segment number \r segmentIdx
+       */
+      __cuda_callable__
+      IndexType getSegmentSize( const IndexType segmentIdx ) const;
+
+      /***
+       * \brief Returns number of elements managed by all segments.
+       */
+      __cuda_callable__
+      IndexType getSize() const;
+
+      /***
+       * \brief Returns number of elements that needs to be allocated.
+       */
+      __cuda_callable__
+      IndexType getStorageSize() const;
+
+      __cuda_callable__
+      IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const;
+
+      __cuda_callable__
+      SegmentViewType getSegmentView( const IndexType segmentIdx ) const;
+
+      /***
+       * \brief Go over all segments and for each segment element call
+       * function 'f' with arguments 'args'. The return type of 'f' is bool.
+       * When its true, the for-loop continues. Once 'f' returns false, the for-loop
+       * is terminated.
+       */
+      template< typename Function, typename... Args >
+      void forSegments( IndexType first, IndexType last, Function& f, Args... args ) const;
+
+      template< typename Function, typename... Args >
+      void forAll( Function& f, Args... args ) const;
+
+
+      /***
+       * \brief Go over all segments and perform a reduction in each of them.
+       */
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+
+      BiEllpackView& operator=( const BiEllpackView& view );
+
+      void save( File& file ) const;
+
+      void load( File& file );
+
+   protected:
+
+      static constexpr int getWarpSize() { return WarpSize; };
+
+      static constexpr int getLogWarpSize() { return std::log( WarpSize ); };
+
+      IndexType size = 0, storageSize = 0;
+
+      IndexType virtualRows = 0;
+
+      OffsetsView rowPermArray;
+
+      OffsetsView groupPointers;
+
+#ifdef HAVE_CUDA
+      template< typename Fetch,
+                typename Reduction,
+                typename ResultKeeper,
+                typename Real,
+                typename... Args >
+      __device__
+      void segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+                                                     IndexType first,
+                                                     IndexType last,
+                                                     Fetch fetch,
+                                                     Reduction reduction,
+                                                     ResultKeeper keeper,
+                                                     Real zero,
+                                                     Args... args ) const;
+
+      template< typename Fetch,
+                typename Reduction,
+                typename ResultKeeper,
+                typename Real,
+                typename... Args >
+      __device__
+      void segmentsReductionKernel( IndexType gridIdx,
+                                    IndexType first,
+                                    IndexType last,
+                                    Fetch fetch,
+                                    Reduction reduction,
+                                    ResultKeeper keeper,
+                                    Real zero,
+                                    Args... args ) const;
+
+      template< typename View_,
+                typename Index_,
+                typename Fetch_,
+                typename Reduction_,
+                typename ResultKeeper_,
+                typename Real_,
+                typename... Args_ >
+      friend __global__
+      void BiEllpackSegmentsReductionKernel( View_ chunkedEllpack,
+                                                  Index_ gridIdx,
+                                                  Index_ first,
+                                                  Index_ last,
+                                                  Fetch_ fetch,
+                                                  Reduction_ reduction,
+                                                  ResultKeeper_ keeper,
+                                                  Real_ zero,
+                                                  Args_... args );
+
+      template< typename Index_, typename Fetch_, bool B_ >
+      friend struct details::BiEllpackSegmentsReductionDispatcher;
+#endif
+};
+      } // namespace Segements
+   }  // namespace Conatiners
+} // namespace TNL
+
+#include <TNL/Containers/Segments/BiEllpackView.hpp>
diff --git a/src/TNL/Containers/Segments/BiEllpackView.hpp b/src/TNL/Containers/Segments/BiEllpackView.hpp
new file mode 100644
index 000000000..9a939b3fc
--- /dev/null
+++ b/src/TNL/Containers/Segments/BiEllpackView.hpp
@@ -0,0 +1,490 @@
+/***************************************************************************
+                          BiEllpackView.hpp -  description
+                             -------------------
+    begin                : Apr 5, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Containers/Segments/BiEllpackView.h>
+#include <TNL/Containers/Segments/details/LambdaAdapter.h>
+//#include <TNL/Containers/Segments/details/BiEllpack.h>
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+BiEllpackView( const IndexType size,
+               const IndexType storageSize,
+               const IndexType virtualRows,
+               const OffsetsView& rowPermArray,
+               const OffsetsView& groupPointers )
+: size( size ),
+  storageSize( storageSize ),
+  virtualRows( virtualRows ),
+  rowPermArray( rowPermArray ),
+  groupPointers( groupPointers )
+{
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+BiEllpackView( const IndexType size,
+               const IndexType storageSize,
+               const IndexType virtualRows,
+               const OffsetsView&& rowPermArray,
+               const OffsetsView&& groupPointers )
+: size( size ),
+  storageSize( storageSize ),
+  virtualRows( virtualRows ),
+  rowPermArray( std::move( rowPermArray ) ),
+  groupPointers( std::move( groupPointers ) )
+{
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+BiEllpackView( const BiEllpackView& bi_ellpack_view )
+: size( bi_ellpack_view.size ),
+  storageSize( bi_ellpack_view.storageSize ),
+  virtualRows( bi_ellpack_view.virtualRows ),
+  rowPermArray( bi_ellpack_view.rowPermArray ),
+  groupPointers( bi_ellpack_view.groupPointers )
+{
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+BiEllpackView( const BiEllpackView&& bi_ellpack_view )
+: size( bi_ellpack_view.size ),
+  storageSize( bi_ellpack_view.storageSize ),
+  virtualRows( bi_ellpack_view.virtualRows ),
+  rowPermArray( std::move( bi_ellpack_view.rowPermArray ) ),
+  groupPointers( std::move( bi_ellpack_view.groupPointers ) )
+{
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+String
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getSerializationType()
+{
+   return "BiEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+String
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getSegmentsType()
+{
+   return "BiEllpack";
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__
+typename BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::ViewType
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getView()
+{
+   return ViewType( size, storageSize, virtualRows, rowPermArray.getView(), groupPointers.getView() );
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__ auto BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getConstView() const -> const ConstViewType
+{
+   return ConstViewType( size, storageSize, virtualRows, rowPermArray.getConstView(), groupPointers.getConstView() );
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__ auto BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getSegmentsCount() const -> IndexType
+{
+   return this->size;
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__ auto BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getSegmentSize( const IndexType segmentIdx ) const -> IndexType
+{
+   if( std::is_same< DeviceType, Devices::Host >::value )
+      return details::BiEllpack< IndexType, DeviceType, RowMajorOrder, WarpSize >::getSegmentSizeDirect(
+         rowPermArray,
+         groupPointers,
+         segmentIdx );
+   if( std::is_same< DeviceType, Devices::Cuda >::value )
+   {
+#ifdef __CUDA_ARCH__
+      return details::BiEllpack< IndexType, DeviceType, RowMajorOrder, WarpSize >::getSegmentSizeDirect(
+         rowPermArray,
+         groupPointers,
+         segmentIdx );
+#else
+      return details::BiEllpack< IndexType, DeviceType, RowMajorOrder, WarpSize >::getSegmentSize(
+         rowPermArray,
+         groupPointers,
+         segmentIdx );
+#endif
+   }
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__ auto BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getSize() const -> IndexType
+{
+   return this->size;
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__ auto BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getStorageSize() const -> IndexType
+{
+   return this->storageSize;
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__ auto BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
+{
+   if( std::is_same< DeviceType, Devices::Host >::value )
+      return details::BiEllpack< IndexType, DeviceType, RowMajorOrder, WarpSize >::getGlobalIndexDirect(
+         rowPermArray,
+         groupPointers,
+         segmentIdx,
+         localIdx );
+   if( std::is_same< DeviceType, Devices::Cuda >::value )
+   {
+#ifdef __CUDA_ARCH__
+      return details::BiEllpack< IndexType, DeviceType, RowMajorOrder, WarpSize >::getGlobalIndexDirect(
+         rowPermArray,
+         groupPointers,
+         segmentIdx,
+         localIdx );
+#else
+      return details::BiEllpack< IndexType, DeviceType, RowMajorOrder, WarpSize >::getGlobalIndex(
+         rowPermArray,
+         groupPointers,
+         segmentIdx,
+         localIdx );
+#endif
+   }
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+__cuda_callable__
+auto
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
+{
+   if( std::is_same< DeviceType, Devices::Host >::value )
+      return details::BiEllpack< IndexType, DeviceType, RowMajorOrder, WarpSize >::getSegmentViewDirect(
+         rowPermArray,
+         groupPointers,
+         segmentIdx );
+   if( std::is_same< DeviceType, Devices::Cuda >::value )
+   {
+#ifdef __CUDA_ARCH__
+      return details::BiEllpack< IndexType, DeviceType, RowMajorOrder, WarpSize >::getSegmentViewDirect(
+         rowPermArray,
+         groupPointers,
+         segmentIdx );
+#else
+      return details::BiEllpack< IndexType, DeviceType, RowMajorOrder, WarpSize >::getSegmentView(
+         rowPermArray,
+         groupPointers,
+         segmentIdx );
+#endif
+   }
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename Function, typename... Args >
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
+{
+   //Algorithms::ParallelFor< DeviceType >::exec( first, last , work, args... );
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename Function, typename... Args >
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+forAll( Function& f, Args... args ) const
+{
+   this->forSegments( 0, this->getSegmentsCount(), f, args... );
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+{
+   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+{
+   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >&
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+operator=( const BiEllpackView& source )
+{
+   this->size = source.size;
+   this->storageSize = source.storageSize;
+   this->virtualRows = source.virtualRows;
+   this->rowPermArray = source.rowPermArray;
+   this->groupPointers = source.groupPointers;
+   return *this;
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+save( File& file ) const
+{
+   file.save( &this->size );
+   file.save( &this->storageSize );
+   file.save( &this->virtualRows );
+   file << this->rowPermArray
+        << this->groupPointers;
+}
+
+#ifdef HAVE_CUDA
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename Fetch,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real,
+             typename... Args >
+__device__
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+                                          IndexType first,
+                                          IndexType last,
+                                          Fetch fetch,
+                                          Reduction reduction,
+                                          ResultKeeper keeper,
+                                          Real zero,
+                                          Args... args ) const
+{
+   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+
+   const IndexType firstSlice = rowToSliceMapping[ first ];
+   const IndexType lastSlice = rowToSliceMapping[ last - 1 ];
+
+   const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
+   if( sliceIdx > lastSlice )
+      return;
+
+   RealType* chunksResults = Cuda::getSharedMemory< RealType >();
+   __shared__ details::BiEllpackSliceInfo< IndexType > sliceInfo;
+   if( threadIdx.x == 0 )
+      sliceInfo = this->slices[ sliceIdx ];
+   chunksResults[ threadIdx.x ] = zero;
+   __syncthreads();
+
+
+
+   const IndexType sliceOffset = sliceInfo.pointer;
+   const IndexType chunkSize = sliceInfo.chunkSize;
+   const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x;
+   const IndexType segmentIdx = this->chunksToSegmentsMapping[ chunkIdx ];
+   IndexType firstChunkOfSegment( 0 );
+   if( segmentIdx != sliceInfo.firstSegment )
+      firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ];
+   IndexType localIdx = ( threadIdx.x - firstChunkOfSegment ) * chunkSize;
+   bool compute( true );
+
+   if( RowMajorOrder )
+   {
+      IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
+      IndexType end = begin + chunkSize;
+      for( IndexType j = begin; j < end && compute; j++ )
+         reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
+   }
+   else
+   {
+      const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
+      const IndexType end = begin + chunksInSlice * chunkSize;
+         for( IndexType j = begin; j < end && compute; j += chunksInSlice )
+            reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
+   }
+   __syncthreads();
+   if( threadIdx.x < sliceInfo.size )
+   {
+      const IndexType row = sliceInfo.firstSegment + threadIdx.x;
+      IndexType chunkIndex( 0 );
+      if( threadIdx.x != 0 )
+         chunkIndex = this->rowToChunkMapping[ row - 1 ];
+      const IndexType lastChunk = this->rowToChunkMapping[ row ];
+      RealType result( zero );
+      while( chunkIndex < lastChunk )
+         reduction( result,  chunksResults[ chunkIndex++ ] );
+      if( row >= first && row < last )
+         keeper( row, result );
+   }
+}
+
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename Fetch,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real,
+             typename... Args >
+__device__
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+segmentsReductionKernel( IndexType gridIdx,
+                         IndexType first,
+                         IndexType last,
+                         Fetch fetch,
+                         Reduction reduction,
+                         ResultKeeper keeper,
+                         Real zero,
+                         Args... args ) const
+{
+   using RealType = decltype( fetch( IndexType(), std::declval< bool& >(), args... ) );
+
+   const IndexType firstSlice = rowToSliceMapping[ first ];
+   const IndexType lastSlice = rowToSliceMapping[ last - 1 ];
+
+   const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
+   if( sliceIdx > lastSlice )
+      return;
+
+   RealType* chunksResults = Cuda::getSharedMemory< RealType >();
+   __shared__ details::BiEllpackSliceInfo< IndexType > sliceInfo;
+
+   if( threadIdx.x == 0 )
+      sliceInfo = this->slices[ sliceIdx ];
+   chunksResults[ threadIdx.x ] = zero;
+   __syncthreads();
+
+   const IndexType sliceOffset = sliceInfo.pointer;
+   const IndexType chunkSize = sliceInfo.chunkSize;
+   const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x;
+   bool compute( true );
+
+   if( RowMajorOrder )
+   {
+      IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
+      IndexType end = begin + chunkSize;
+      for( IndexType j = begin; j < end && compute; j++ )
+         reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
+   }
+   else
+   {
+      const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
+      const IndexType end = begin + chunksInSlice * chunkSize;
+         for( IndexType j = begin; j < end && compute; j += chunksInSlice )
+            reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
+   }
+   __syncthreads();
+
+   if( threadIdx.x < sliceInfo.size )
+   {
+      const IndexType row = sliceInfo.firstSegment + threadIdx.x;
+      IndexType chunkIndex( 0 );
+      if( threadIdx.x != 0 )
+         chunkIndex = this->rowToChunkMapping[ row - 1 ];
+      const IndexType lastChunk = this->rowToChunkMapping[ row ];
+      RealType result( zero );
+      while( chunkIndex < lastChunk )
+         reduction( result,  chunksResults[ chunkIndex++ ] );
+      if( row >= first && row < last )
+         keeper( row, result );
+   }
+}
+#endif
+
+      } // namespace Segments
+   }  // namespace Containers
+} // namespace TNL
diff --git a/src/TNL/Containers/Segments/ChunkedEllpack.h b/src/TNL/Containers/Segments/ChunkedEllpack.h
index 8c2e94264..624caae68 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpack.h
+++ b/src/TNL/Containers/Segments/ChunkedEllpack.h
@@ -150,7 +150,7 @@ class ChunkedEllpack
 
       ChunkedEllpackSliceInfoContainer slices;
 
-      IndexType numberOfSlices;
+      IndexType numberOfSlices = 0;
 
       template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_ >
       friend class ChunkedEllpack;
diff --git a/src/TNL/Containers/Segments/ChunkedEllpack.hpp b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
index 9eea0bbab..444360d66 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
@@ -25,7 +25,6 @@ template< typename Device,
           bool RowMajorOrder >
 ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
 ChunkedEllpack( const Vector< IndexType, DeviceType, IndexType >& sizes )
-   : size( 0 ), storageSize( 0 ), chunksInSlice( 0 ), desiredChunkSize( 0 )
 {
    this->setSegmentsSizes( sizes );
 }
@@ -41,7 +40,7 @@ ChunkedEllpack( const ChunkedEllpack& chunkedEllpack )
      chunksInSlice( chunkedEllpack.chunksInSlice ), 
      desiredChunkSize( chunkedEllpack.desiredChunkSize ),
      rowToChunkMapping( chunkedEllpack.rowToChunkMapping ),
-     rowToSliceMapping( chunkedEllpack.rowTopSliceMapping ),
+     rowToSliceMapping( chunkedEllpack.rowToSliceMapping ),
      chunksToSegmentsMapping( chunkedEllpack. chunksToSegmentsMapping ),
      rowPointers( chunkedEllpack.rowPointers ),
      slices( chunkedEllpack.slices ),
diff --git a/src/TNL/Containers/Segments/ChunkedEllpackSegmentView.h b/src/TNL/Containers/Segments/ChunkedEllpackSegmentView.h
index 9eba9dd68..93da55927 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpackSegmentView.h
+++ b/src/TNL/Containers/Segments/ChunkedEllpackSegmentView.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          ChunkedEllpackChunkedEllpackSegmentView.h -  description
+                          ChunkedEllpackSegmentView.h -  description
                              -------------------
     begin                : Mar 24, 2020
     copyright            : (C) 2020 by Tomas Oberhuber
diff --git a/src/TNL/Containers/Segments/ChunkedEllpackView.h b/src/TNL/Containers/Segments/ChunkedEllpackView.h
index 2735c9914..3661e209e 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Containers/Segments/ChunkedEllpackView.h
@@ -141,8 +141,6 @@ class ChunkedEllpackView
 
       void save( File& file ) const;
 
-      void load( File& file );
-
       void printStructure( std::ostream& str ) const;
 
    protected:
diff --git a/src/TNL/Containers/Segments/ChunkedEllpackView.hpp b/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
index d0a9372d1..ce77eccd9 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
@@ -482,25 +482,6 @@ save( File& file ) const
    file.save( &this->numberOfSlices );
 }
 
-template< typename Device,
-          typename Index,
-          bool RowMajorOrder >
-void
-ChunkedEllpackView< Device, Index, RowMajorOrder >::
-load( File& file )
-{
-   file.load( &this->size );
-   file.load( &this->storageSize );
-   file.load( &this->chunksInSlice );
-   file.load( &this->desiredChunkSize );
-   file >> this->rowToChunkMapping
-        >> this->chunksToSegmentsMapping
-        >> this->rowToSliceMapping
-        >> this->rowPointers
-        >> this->slices;
-   file.load( &this->numberOfSlices );
-}
-
 template< typename Device,
           typename Index,
           bool RowMajorOrder >
diff --git a/src/TNL/Containers/Segments/details/BiEllpack.h b/src/TNL/Containers/Segments/details/BiEllpack.h
new file mode 100644
index 000000000..ef95d50e3
--- /dev/null
+++ b/src/TNL/Containers/Segments/details/BiEllpack.h
@@ -0,0 +1,162 @@
+/***************************************************************************
+                          BiEllpack.h -  description
+                             -------------------
+    begin                : Apr 7, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <type_traits>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/Segments/BiEllpackSegmentView.h>
+#include <TNL/Containers/Segments/details/CheckLambdas.h>
+
+namespace TNL {
+   namespace Containers {
+      namespace Segments {
+         namespace details {
+
+template< typename Index,
+          typename Device,
+          bool RowMajorOrder = std::is_same< Device, Devices::Host >::value,
+          int WarpSize = 32 >
+class BiEllpack
+{
+   public:
+
+      using DeviceType = Device;
+      using IndexType = Index;
+      static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
+      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
+      using OffsetsHolderView = typename OffsetsHolder::ViewType;
+      using SegmentsSizes = OffsetsHolder;
+      using SegmentViewType = BiEllpackSegmentView< IndexType, RowMajorOrder >;
+
+      __cuda_callable__ static
+      IndexType getSegmentSizeDirect( const OffsetsHolderView& rowPermArray,
+                                      const OffsetsHolderView& groupPointers,
+                                      const IndexType segmentIdx )
+      {
+      }
+
+      static
+      IndexType getSegmentSize( const OffsetsHolderView& rowPermArray,
+                                const OffsetsHolderView& groupPointers,
+                                const IndexType segmentIdx )
+      {
+      }
+
+      __cuda_callable__ static
+      IndexType getGlobalIndexDirect( const OffsetsHolderView& rowPermArray,
+                                      const OffsetsHolderView& groupPointers,
+                                      const IndexType segmentIdx,
+                                      const IndexType localIdx )
+      {
+      }
+
+      static
+      IndexType getGlobalIndex( const OffsetsHolderView& rowPermArray,
+                                const OffsetsHolderView& groupPointers,
+                                const IndexType segmentIdx,
+                                const IndexType localIdx )
+      {
+      }
+
+      static __cuda_callable__
+      SegmentViewType getSegmentViewDirect( const OffsetsHolderView& rowPermArray,
+                                            const OffsetsHolderView& groupPointers,
+                                            const IndexType segmentIdx )
+      {
+      }
+
+      static __cuda_callable__
+      SegmentViewType getSegmentView( const OffsetsHolderView& rowPermArray,
+                                      const OffsetsHolderView& groupPointers,
+                                      const IndexType segmentIdx )
+      {
+      }
+};
+
+#ifdef HAVE_CUDA
+template< typename Index,
+          typename Fetch,
+          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters(),
+          int WarpSize = 32 >
+struct BiEllpackSegmentsReductionDispatcher{};
+
+template< typename Index, typename Fetch >
+struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, true >
+{
+   template< typename View,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real,
+             typename... Args >
+   __device__
+   static void exec( View chunkedEllpack,
+                     Index gridIdx,
+                     Index first,
+                     Index last,
+                     Fetch fetch,
+                     Reduction reduction,
+                     ResultKeeper keeper,
+                     Real zero,
+                     Args... args )
+   {
+      chunkedEllpack.segmentsReductionKernelWithAllParameters( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+   }
+};
+
+template< typename Index, typename Fetch >
+struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, false >
+{
+   template< typename View,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real,
+             typename... Args >
+   __device__
+   static void exec( View chunkedEllpack,
+                     Index gridIdx,
+                     Index first,
+                     Index last,
+                     Fetch fetch,
+                     Reduction reduction,
+                     ResultKeeper keeper,
+                     Real zero,
+                     Args... args )
+   {
+      chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+   }
+};
+
+template< typename View,
+          typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real,
+          typename... Args >
+__global__
+void BiEllpackSegmentsReductionKernel( View chunkedEllpack,
+                                            Index gridIdx,
+                                            Index first,
+                                            Index last,
+                                            Fetch fetch,
+                                            Reduction reduction,
+                                            ResultKeeper keeper,
+                                            Real zero,
+                                            Args... args )
+{
+   BiEllpackSegmentsReductionDispatcher< Index, Fetch >::exec( chunkedEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+}
+#endif
+
+         } //namespace details
+      } //namespace Segments
+   } //namespace Containers
+} //namepsace TNL
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index f2ffd0c4b..eb8e2e1d5 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -25,6 +25,9 @@ IF( BUILD_CUDA )
    CUDA_ADD_EXECUTABLE( SparseMatrixTest_ChunkedEllpack SparseMatrixTest_ChunkedEllpack.cu OPTIONS ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( SparseMatrixTest_ChunkedEllpack ${GTEST_BOTH_LIBRARIES} )
 
+   CUDA_ADD_EXECUTABLE( SparseMatrixTest_BiEllpack SparseMatrixTest_BiEllpack.cu OPTIONS ${CXX_TESTS_FLAGS} )
+   TARGET_LINK_LIBRARIES( SparseMatrixTest_BiEllpack ${GTEST_BOTH_LIBRARIES} )
+
    CUDA_ADD_EXECUTABLE( SparseMatrixCopyTest SparseMatrixCopyTest.cu OPTIONS ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( SparseMatrixCopyTest ${GTEST_BOTH_LIBRARIES} )
 
@@ -79,6 +82,10 @@ ELSE(  BUILD_CUDA )
    TARGET_COMPILE_OPTIONS( SparseMatrixTest_ChunkedEllpack PRIVATE ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( SparseMatrixTest_ChunkedEllpack ${GTEST_BOTH_LIBRARIES} )
 
+   ADD_EXECUTABLE( SparseMatrixTest_BiEllpack SparseMatrixTest_BiEllpack.cpp )
+   TARGET_COMPILE_OPTIONS( SparseMatrixTest_BiEllpack PRIVATE ${CXX_TESTS_FLAGS} )
+   TARGET_LINK_LIBRARIES( SparseMatrixTest_BiEllpack ${GTEST_BOTH_LIBRARIES} )
+
    ADD_EXECUTABLE( SparseMatrixCopyTest SparseMatrixCopyTest.cpp )
    TARGET_COMPILE_OPTIONS( SparseMatrixCopyTest PRIVATE ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( SparseMatrixCopyTest ${GTEST_BOTH_LIBRARIES} )
@@ -117,6 +124,7 @@ ADD_TEST( SparseMatrixTest_CSR ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_CSR${C
 ADD_TEST( SparseMatrixTest_Ellpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_Ellpack${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( SparseMatrixTest_SlicedEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_SlicedEllpack${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( SparseMatrixTest_ChunkedEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_ChunkedEllpack${CMAKE_EXECUTABLE_SUFFIX} )
+ADD_TEST( SparseMatrixTest_BiEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_BiEllpack${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( SparseMatrixCopyTest ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixCopyTest${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( BinarySparseMatrixTest_CSR ${EXECUTABLE_OUTPUT_PATH}/BinarySparseMatrixTest_CSR${CMAKE_EXECUTABLE_SUFFIX} )
 ADD_TEST( BinarySparseMatrixTest_Ellpack ${EXECUTABLE_OUTPUT_PATH}/BinarySparseMatrixTest_Ellpack${CMAKE_EXECUTABLE_SUFFIX} )
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.cpp b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.cpp
new file mode 100644
index 000000000..ba7f3cf8d
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_BiEllpack.cpp -  description
+                             -------------------
+    begin                : Apr 7, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_BiEllpack.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.cu b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.cu
new file mode 100644
index 000000000..1121477b5
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.cu
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_BiEllpack.cu -  description
+                             -------------------
+    begin                : Apr 7, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_BiEllpack.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
new file mode 100644
index 000000000..d75b1c3cc
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
@@ -0,0 +1,57 @@
+/***************************************************************************
+                          SparseMatrixTest_BiEllpack.h -  description
+                             -------------------
+    begin                : Apr 7, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Containers/Segments/BiEllpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_BiEllpack_segments";
+
+////
+// Row-major format is used for the host system
+template< typename Device, typename Index, typename IndexAllocator >
+using RowMajorBiEllpack = TNL::Containers::Segments::BiEllpack< Device, Index, IndexAllocator, true >;
+
+////
+// Column-major format is used for GPUs
+template< typename Device, typename Index, typename IndexAllocator >
+using ColumnMajorBiEllpack = TNL::Containers::Segments::BiEllpack< Device, Index, IndexAllocator, false >;
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixTest.h"
+#include "../main.h"
-- 
GitLab


From c3bf6918d97ee50f096bd43960bd22f44bee9e6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 9 Apr 2020 18:31:46 +0200
Subject: [PATCH 11/68] Implementing BiEllpack segments.

---
 src/TNL/Containers/Segments/BiEllpack.h       |  35 ++-
 src/TNL/Containers/Segments/BiEllpack.hpp     | 252 +++++++++++++++++-
 .../Segments/BiEllpackSegmentView.h           |  90 +++----
 src/TNL/Containers/Segments/BiEllpackView.h   |   2 +-
 src/TNL/Containers/Segments/BiEllpackView.hpp |   5 +-
 .../Containers/Segments/details/BiEllpack.h   | 182 ++++++++++++-
 src/TNL/Matrices/Legacy/BiEllpack_impl.h      |   1 +
 7 files changed, 491 insertions(+), 76 deletions(-)

diff --git a/src/TNL/Containers/Segments/BiEllpack.h b/src/TNL/Containers/Segments/BiEllpack.h
index 30827b719..3a52fe8ee 100644
--- a/src/TNL/Containers/Segments/BiEllpack.h
+++ b/src/TNL/Containers/Segments/BiEllpack.h
@@ -75,7 +75,7 @@ class BiEllpack
       IndexType getStorageSize() const;
 
       __cuda_callable__
-      IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const;
+      IndexType getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const;
 
       __cuda_callable__
       SegmentViewType getSegmentView( const IndexType segmentIdx ) const;
@@ -117,7 +117,23 @@ class BiEllpack
 
       static constexpr int getWarpSize() { return WarpSize; };
 
-      static constexpr int getLogWarpSize() { return std::log( WarpSize ); };
+      static constexpr int getLogWarpSize() { return std::log2( WarpSize ); };
+
+      template< typename SizesHolder = OffsetsHolder >
+      void performRowBubbleSort( const SizesHolder& segmentsSize );
+
+      template< typename SizesHolder = OffsetsHolder >
+      void computeColumnSizes( const SizesHolder& segmentsSizes );
+
+      template< typename SizesHolder = OffsetsHolder >
+      void verifyRowPerm( const SizesHolder& segmentsSizes );
+
+      template< typename SizesHolder = OffsetsHolder >
+      void verifyRowLengths( const SizesHolder& segmentsSizes );
+
+      IndexType getStripLength( const IndexType stripIdx ) const;
+
+      IndexType getGroupLength( const IndexType strip, const IndexType group ) const;
 
       IndexType size = 0, storageSize = 0;
 
@@ -127,6 +143,21 @@ class BiEllpack
 
       OffsetsHolder groupPointers;
 
+
+
+      // TODO: Replace later
+      __cuda_callable__ Index power( const IndexType number, const IndexType exponent ) const
+      {
+          if( exponent >= 0 )
+          {
+              IndexType result = 1;
+              for( IndexType i = 0; i < exponent; i++ )
+                  result *= number;
+              return result;
+          }
+          return 0;
+      };
+
       template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_, int WarpSize_ >
       friend class BiEllpack;
 };
diff --git a/src/TNL/Containers/Segments/BiEllpack.hpp b/src/TNL/Containers/Segments/BiEllpack.hpp
index d99d883e0..678f2b2a0 100644
--- a/src/TNL/Containers/Segments/BiEllpack.hpp
+++ b/src/TNL/Containers/Segments/BiEllpack.hpp
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include <math.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Containers/Segments/BiEllpack.h>
@@ -113,20 +114,240 @@ template< typename Device,
           bool RowMajorOrder,
           int WarpSize >
    template< typename SizesHolder >
-void
-BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
-setSegmentsSizes( const SizesHolder& segmentsSizes )
+void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+performRowBubbleSort( const SizesHolder& segmentsSizes )
 {
+   this->rowPermArray.evaluate( [] __cuda_callable__ ( const IndexType i ) -> IndexType { return i; } );
+
    if( std::is_same< DeviceType, Devices::Host >::value )
    {
+      IndexType strips = this->virtualRows / getWarpSize();
+      for( IndexType i = 0; i < strips; i++ )
+      {
+         IndexType begin = i * getWarpSize();
+         IndexType end = ( i + 1 ) * getWarpSize() - 1;
+         if(this->getSize() - 1 < end)
+            end = this->getSize() - 1;
+         bool sorted = false;
+         IndexType permIndex1, permIndex2, offset = 0;
+         while( !sorted )
+         {
+            sorted = true;
+            for( IndexType j = begin + offset; j < end - offset; j++ )
+            {
+               for( IndexType k = begin; k < end + 1; k++ )
+               {
+                  if( this->rowPermArray.getElement( k ) == j )
+                     permIndex1 = k;
+                  if( this->rowPermArray.getElement( k ) == j + 1 )
+                     permIndex2 = k;
+               }
+               if( segmentsSizes.getElement( permIndex1 ) < segmentsSizes.getElement( permIndex2 ) )
+               {
+                  IndexType temp = this->rowPermArray.getElement( permIndex1 );
+                  this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) );
+                  this->rowPermArray.setElement( permIndex2, temp );
+                  sorted = false;
+               }
+            }
+            for( IndexType j = end - 1 - offset; j > begin + offset; j-- )
+            {
+               for( IndexType k = begin; k < end + 1; k++ )
+               {
+                  if( this->rowPermArray.getElement( k ) == j )
+                     permIndex1 = k;
+                  if( this->rowPermArray.getElement( k ) == j - 1 )
+                     permIndex2 = k;
+               }
+               if( segmentsSizes.getElement( permIndex2 ) < segmentsSizes.getElement( permIndex1 ) )
+               {
+                  IndexType temp = this->rowPermArray.getElement( permIndex1 );
+                  this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) );
+                  this->rowPermArray.setElement( permIndex2, temp );
+                  sorted = false;
+               }
+            }
+            offset++;
+         }
+      }
+   }
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename SizesHolder >
+void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+computeColumnSizes( const SizesHolder& segmentsSizes )
+{
+   IndexType numberOfStrips = this->virtualRows / getWarpSize();
+   auto groupPointersView = this->groupPointers.getView();
+   auto segmentsPermutationView = this->rowPermArray.getView();
+   auto segmentsSizesView = segmentsSizes.getConstView();
+   const IndexType size = this->getSize();
+   Algorithms::ParallelFor< DeviceType >::exec(
+      ( IndexType ) 0,
+      this->virtualRows / getWarpSize(),
+      [=] __cuda_callable__ ( const IndexType strip ) mutable {
+
+         IndexType firstSegment = strip * getWarpSize();
+         IndexType groupBegin = strip * ( getLogWarpSize() + 1 );
+         IndexType emptyGroups = 0;
+
+         ////
+         // The last strip can be shorter
+         if( strip == numberOfStrips - 1 )
+         {
+            IndexType segmentsCount = size - firstSegment;
+            while( !( segmentsCount > TNL::pow( getLogWarpSize() - 1 - emptyGroups, 2 ) ) )
+               emptyGroups++;
+            for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ )
+               groupPointersView[ group ] = 0;
+         }
+
+         IndexType allocatedColumns = 0;
+         for( IndexType groupIdx = emptyGroups; groupIdx < getLogWarpSize(); groupIdx++ )
+         {
+            IndexType segmentIdx = TNL::pow( getLogWarpSize() - 1 - groupIdx, 2 );
+            IndexType permSegm = 0;
+            while( segmentsPermutationView[ permSegm + firstSegment ] != segmentIdx + firstSegment )
+               permSegm++;
+            const IndexType groupWidth = segmentsSizesView[ permSegm + firstSegment ] - allocatedColumns;
+            const IndexType groupHeight = TNL::pow( getLogWarpSize() - groupIdx, 2 );
+            const IndexType groupSize = groupWidth * groupHeight;
+            allocatedColumns = segmentsSizes[ permSegm + firstSegment ];
+            groupPointersView[ groupIdx + groupBegin ] = groupSize;
+         }
+      } );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename SizesHolder >
+void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+verifyRowPerm( const SizesHolder& segmentsSizes )
+{
+   bool ok = true;
+   IndexType numberOfStrips = this->virtualRows / getWarpSize();
+   for( IndexType strip = 0; strip < numberOfStrips; strip++ )
+   {
+      IndexType begin = strip * getWarpSize();
+      IndexType end = ( strip + 1 ) * getWarpSize();
+      if( this->getSize() < end )
+         end = this->getSize();
+      for( IndexType i = begin; i < end - 1; i++ )
+      {
+         IndexType permIndex1, permIndex2;
+         bool first = false;
+         bool second = false;
+         for( IndexType j = begin; j < end; j++ )
+         {
+            if( this->rowPermArray.getElement( j ) == i )
+            {
+               permIndex1 = j;
+               first = true;
+            }
+            if( this->rowPermArray.getElement( j ) == i + 1 )
+            {
+               permIndex2 = j;
+               second = true;
+            }
+         }
+         if( !first || !second )
+            std::cout << "Wrong permutation!" << std::endl;
+         if( segmentsSizes.getElement( permIndex1 ) >= segmentsSizes.getElement( permIndex2 ) )
+            continue;
+         else
+            ok = false;
+      }
+   }
+   if( !ok )
+      throw( std::logic_error( "Segments permutaion verification failed." ) );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename SizesHolder >
+void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+verifyRowLengths( const SizesHolder& segmentsSizes )
+{
+   bool ok = true;
+   for( IndexType segmentIdx = 0; segmentIdx < this->getSize(); segmentIdx++ )
+   {
+      const IndexType strip = segmentIdx / getWarpSize();
+      const IndexType stripLength = this->getStripLength( strip );
+      const IndexType groupBegin = ( getLogWarpSize() + 1 ) * strip;
+      const IndexType rowStripPerm = this->rowPermArray.getElement( segmentIdx ) - strip * getWarpSize();
+      const IndexType begin = this->groupPointers.getElement( groupBegin ) * getWarpSize() + rowStripPerm * stripLength;
+      IndexType elementPtr = begin;
+      IndexType rowLength = 0;
+      const IndexType groupsCount = details::BiEllpack< Index, Device, RowMajorOrder, WarpSize >::getActiveGroupsCount( this->rowPermArray.getConstView(), segmentIdx );
+      for( IndexType group = 0; group < groupsCount; group++ )
+      {
+         for( IndexType i = 0; i < this->getGroupLength( strip, group ); i++ )
+         {
+            IndexType biElementPtr = elementPtr;
+            for( IndexType j = 0; j < this->power( 2, group ); j++ )
+            {
+               rowLength++;
+               biElementPtr += this->power( 2, getLogWarpSize() - group ) * stripLength;
+            }
+            elementPtr++;
+         }
+      }
+      if( segmentsSizes.getElement( segmentIdx ) > rowLength )
+         ok = false;
    }
+   if( ! ok )
+      throw( std::logic_error( "Segments capacities verification failed." ) );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+   template< typename SizesHolder >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+setSegmentsSizes( const SizesHolder& segmentsSizes )
+{
+   //if( std::is_same< DeviceType, Devices::Host >::value )
+   // {
+      this->size = segmentsSizes.getSize();
+      if( this->size % WarpSize != 0 )
+         this->virtualRows = this->size + getWarpSize() - ( this->size % getWarpSize() );
+      else
+         this->virtualRows = this->size;
+      IndexType strips = this->virtualRows / getWarpSize();
+      this->rowPermArray.setSize( this->size );
+      this->groupPointers.setSize( strips * ( getLogWarpSize() + 1 ) + 1 );
+      this->groupPointers = 0;
+
+      this->performRowBubbleSort( segmentsSizes );
+      this->computeColumnSizes( segmentsSizes );
+
+      this->groupPointers.template scan< Algorithms::ScanType::Exclusive >();
+
+      this->verifyRowPerm( segmentsSizes );
+      this->verifyRowLengths( segmentsSizes );
+      this->storageSize =  getWarpSize() * this->groupPointers.getElement( strips * ( getLogWarpSize() + 1 ) );
+   /*}
    else
    {
-      BiEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< Index >, RowMajorOrder > hostSegments;
+      BiEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< IndexType >, RowMajorOrder > hostSegments;
       Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes( segmentsSizes );
       hostSegments.setSegmentsSizes( hostSegmentsSizes );
       *this = hostSegments;
-   }
+   }*/
 }
 
 template< typename Device,
@@ -182,7 +403,7 @@ template< typename Device,
           bool RowMajorOrder,
           int WarpSize >
 __cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
-getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
+getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const -> IndexType
 {
       return details::BiEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndex(
          rowPermArray.getConstView(),
@@ -308,11 +529,22 @@ template< typename Device,
           typename IndexAllocator,
           bool RowMajorOrder,
           int WarpSize >
-void
-BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
-printStructure( std::ostream& str )
+auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getStripLength( const IndexType stripIdx ) const -> IndexType
+{
+   return details::BiEllpack< Index, Device, RowMajorOrder, WarpSize >::getStripLength( this->groupPointers.getConstView(), stripIdx );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getGroupLength( const IndexType strip, const IndexType group ) const -> IndexType
 {
-   this->getView().printStructure( str );
+   return this->groupPointers.getElement( strip * ( getLogWarpSize() + 1 ) + group + 1 )
+           - this->groupPointers.getElement( strip * ( getLogWarpSize() + 1 ) + group );
 }
 
       } // namespace Segments
diff --git a/src/TNL/Containers/Segments/BiEllpackSegmentView.h b/src/TNL/Containers/Segments/BiEllpackSegmentView.h
index 6c049e6f8..427c06583 100644
--- a/src/TNL/Containers/Segments/BiEllpackSegmentView.h
+++ b/src/TNL/Containers/Segments/BiEllpackSegmentView.h
@@ -10,64 +10,42 @@
 
 #pragma once
 
+#include <TNL/Containers/StaticVector.h>
+
 namespace TNL {
    namespace Containers {
       namespace Segments {
 
 template< typename Index,
-          bool RowMajorOrder = false >
-class BiEllpackSegmentView;
-
-template< typename Index >
-class BiEllpackSegmentView< Index, false >
+          bool RowMajorOrder = false,
+          int WarpSize = 32 >
+class BiEllpackSegmentView
 {
    public:
+      
+      static constexpr int getWarpSize() { return WarpSize; };
 
-      using IndexType = Index;
-
-      __cuda_callable__
-      BiEllpackSegmentView( const IndexType offset,
-                                 const IndexType size,
-                                 const IndexType chunkSize,      // this is only for compatibility with the following specialization
-                                 const IndexType chunksInSlice ) // this one as well - both can be replaced when we could use constexprif in C++17
-      : segmentOffset( offset ), segmentSize( size ){};
-
-      __cuda_callable__
-      BiEllpackSegmentView( const BiEllpackSegmentView& view )
-      : segmentOffset( view.segmentOffset ), segmentSize( view.segmentSize ){};
-
-      __cuda_callable__
-      IndexType getSize() const
-      {
-         return this->segmentSize;
-      };
-
-      __cuda_callable__
-      IndexType getGlobalIndex( const IndexType localIndex ) const
-      {
-         TNL_ASSERT_LT( localIndex, segmentSize, "Local index exceeds segment bounds." );
-         return segmentOffset + localIndex;
-      };
+      static constexpr int getLogWarpSize() { return std::log2( WarpSize ); };
 
-      protected:
-         
-         IndexType segmentOffset, segmentSize;
-};
-
-template< typename Index >
-class BiEllpackSegmentView< Index, true >
-{
-   public:
+      static constexpr int getGroupsCount() { return getLogWarpSize() + 1; };
 
       using IndexType = Index;
+      using GroupsWidthType = Containers::StaticVector< getGroupsCount(), IndexType >;
 
+
+      /**
+       * \brief Constructor.
+       * 
+       * \param offset is offset of the first group of the strip the segment belongs to.
+       * \param size is the segment size
+       * \param inStripIdx is index of the segment within its strip.
+       * \param groupsWidth is a static vector containing widths of the strip groups
+       */
       __cuda_callable__
       BiEllpackSegmentView( const IndexType offset,
-                                 const IndexType size,
-                                 const IndexType chunkSize,
-                                 const IndexType chunksInSlice )
-      : segmentOffset( offset ), segmentSize( size ),
-        chunkSize( chunkSize ), chunksInSlice( chunksInSlice ){};
+                            const IndexType inStripIdx,
+                            const GroupsWidthType& groupsWidth )
+      : groupOffset( offset ), segmentSize( TNL::sum( groupsWidth ) ), inStripIdx( inStripIdx ), groupsWidth( groupsWidth ){};
 
       __cuda_callable__
       IndexType getSize() const
@@ -76,17 +54,27 @@ class BiEllpackSegmentView< Index, true >
       };
 
       __cuda_callable__
-      IndexType getGlobalIndex( const IndexType localIdx ) const
+      IndexType getGlobalIndex( IndexType localIdx ) const
       {
-         TNL_ASSERT_LT( localIdx, segmentSize, "Local index exceeds segment bounds." );
-         const IndexType chunkIdx = localIdx / chunkSize;
-         const IndexType inChunkOffset = localIdx % chunkSize;
-         return segmentOffset + inChunkOffset * chunksInSlice + chunkIdx;
+         IndexType i( 0 ), offset( groupOffset ), groupHeight( getWarpSize() );
+         while( localIdx > groupsWidth[ i ] )
+         {
+            localIdx -= groupsWidth[ i ];
+            offset += groupsWidth[ i++ ] * groupHeight;
+            groupHeight /= 2;
+         }
+         TNL_ASSERT_LE( i, TNL::log2( getWarpSize() - inStripIdx + 1 ), "Local index exceeds segment bounds." );
+         if( RowMajorOrder )
+            return offset + inStripIdx * groupsWidth[ i ] + localIdx;
+         else
+            return offset + inStripIdx + localIdx * groupHeight;
       };
 
       protected:
-         
-         IndexType segmentOffset, segmentSize, chunkSize, chunksInSlice;
+
+         IndexType groupOffset, inStripIdx, segmentSize;
+
+         GroupsWidthType groupsWidth;
 };
 
       } //namespace Segments
diff --git a/src/TNL/Containers/Segments/BiEllpackView.h b/src/TNL/Containers/Segments/BiEllpackView.h
index 54bda498c..a539cc92f 100644
--- a/src/TNL/Containers/Segments/BiEllpackView.h
+++ b/src/TNL/Containers/Segments/BiEllpackView.h
@@ -134,7 +134,7 @@ class BiEllpackView
 
       static constexpr int getWarpSize() { return WarpSize; };
 
-      static constexpr int getLogWarpSize() { return std::log( WarpSize ); };
+      static constexpr int getLogWarpSize() { return std::log2( WarpSize ); };
 
       IndexType size = 0, storageSize = 0;
 
diff --git a/src/TNL/Containers/Segments/BiEllpackView.hpp b/src/TNL/Containers/Segments/BiEllpackView.hpp
index 9a939b3fc..d05525ab8 100644
--- a/src/TNL/Containers/Segments/BiEllpackView.hpp
+++ b/src/TNL/Containers/Segments/BiEllpackView.hpp
@@ -285,6 +285,7 @@ BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   
 }
 
 template< typename Device,
@@ -310,8 +311,8 @@ operator=( const BiEllpackView& source )
    this->size = source.size;
    this->storageSize = source.storageSize;
    this->virtualRows = source.virtualRows;
-   this->rowPermArray = source.rowPermArray;
-   this->groupPointers = source.groupPointers;
+   this->rowPermArray.bind( source.rowPermArray );
+   this->groupPointers.bind( source.groupPointers );
    return *this;
 }
 
diff --git a/src/TNL/Containers/Segments/details/BiEllpack.h b/src/TNL/Containers/Segments/details/BiEllpack.h
index ef95d50e3..b8735af54 100644
--- a/src/TNL/Containers/Segments/details/BiEllpack.h
+++ b/src/TNL/Containers/Segments/details/BiEllpack.h
@@ -33,14 +33,60 @@ class BiEllpack
       static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
       using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
       using OffsetsHolderView = typename OffsetsHolder::ViewType;
+      using ConstOffsetsHolderView = typename OffsetsHolderView::ConstViewType;
       using SegmentsSizes = OffsetsHolder;
       using SegmentViewType = BiEllpackSegmentView< IndexType, RowMajorOrder >;
+      
+      static constexpr int getWarpSize() { return WarpSize; };
 
+      static constexpr int getLogWarpSize() { return std::log2( WarpSize ); };
+
+      static constexpr int getGroupsCount() { return getLogWarpSize() + 1; };
+
+      static IndexType getActiveGroupsCount( const ConstOffsetsHolderView& rowPermArray, const IndexType segmentIdx )
+      {
+         TNL_ASSERT_GE( segmentIdx, 0, "" );
+         //TNL_ASSERT_LT( segmentIdx, this->getSize(), "" );
+
+         IndexType strip = segmentIdx / getWarpSize();
+         IndexType rowStripPermutation = rowPermArray.getElement( segmentIdx ) - getWarpSize() * strip;
+         IndexType numberOfGroups = getLogWarpSize() + 1;
+         IndexType bisection = 1;
+         for( IndexType i = 0; i < getLogWarpSize() + 1; i++ )
+         {
+            if( rowStripPermutation < bisection )
+               return ( numberOfGroups - i );
+            bisection *= 2;
+         }
+         throw std::logic_error( "segmentIdx was not found" );
+      }
+
+      static IndexType getGroupLength( const ConstOffsetsHolderView& groupPointers,
+                                       const IndexType strip,
+                                       const IndexType group )
+      {
+         const IndexType groupOffset = strip * ( getLogWarpSize() + 1 ) + group;
+         return groupPointers.getElement( groupOffset + 1 ) - groupPointers.getElement( groupOffset );
+      }
       __cuda_callable__ static
       IndexType getSegmentSizeDirect( const OffsetsHolderView& rowPermArray,
                                       const OffsetsHolderView& groupPointers,
                                       const IndexType segmentIdx )
       {
+         const IndexType strip = segmentIdx / getWarpSize();
+         const IndexType groupIdx = strip * ( getLogWarpSize() + 1 );
+         const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize();
+         const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
+         IndexType groupHeight = getWarpSize();
+         IndexType segmentSize = 0;
+         for( IndexType group = 0; group < groupsCount; group++ )
+         {
+            const IndexType groupSize = getGroupLength( groupPointers, strip, group );
+            IndexType groupWidth =  groupSize / groupHeight;
+            segmentSize += groupWidth;
+            groupHeight /= 2;
+         }
+         return segmentSize;
       }
 
       static
@@ -48,22 +94,84 @@ class BiEllpack
                                 const OffsetsHolderView& groupPointers,
                                 const IndexType segmentIdx )
       {
+         const IndexType strip = segmentIdx / getWarpSize();
+         const IndexType groupIdx = strip * ( getLogWarpSize() + 1 );
+         const IndexType rowStripPerm = rowPermArray.getElement( segmentIdx ) - strip * getWarpSize();
+         const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
+         IndexType groupHeight = getWarpSize();
+         IndexType segmentSize = 0;
+         for( IndexType group = 0; group < groupsCount; group++ )
+         {
+            const IndexType groupSize = getGroupLength( groupPointers, strip, group );
+            IndexType groupWidth =  groupSize / groupHeight;
+            segmentSize += groupWidth;
+            groupHeight /= 2;
+         }
+         return segmentSize;
       }
 
       __cuda_callable__ static
       IndexType getGlobalIndexDirect( const OffsetsHolderView& rowPermArray,
                                       const OffsetsHolderView& groupPointers,
                                       const IndexType segmentIdx,
-                                      const IndexType localIdx )
+                                      IndexType localIdx )
       {
+         const IndexType strip = segmentIdx / getWarpSize();
+         const IndexType groupIdx = strip * ( getLogWarpSize() + 1 );
+         const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize();
+         const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
+         IndexType globalIdx = groupPointers[ groupIdx ] * getWarpSize();
+         IndexType groupHeight = getWarpSize();
+         for( IndexType group = 0; group < groupsCount; group++ )
+         {
+            const IndexType groupSize = getGroupLength( groupPointers, strip, group );
+            IndexType groupWidth =  groupSize / groupHeight;
+            if( localIdx > groupWidth )
+            {
+               localIdx -= groupWidth;
+               globalIdx += groupSize;
+            }
+            else
+            {
+               if( RowMajorOrder )
+                  return globalIdx + rowStripPerm * groupWidth + localIdx;
+               else
+                  return globalIdx + rowStripPerm + localIdx * groupHeight;
+            }
+            groupHeight /= 2;
+         }
       }
 
       static
-      IndexType getGlobalIndex( const OffsetsHolderView& rowPermArray,
-                                const OffsetsHolderView& groupPointers,
+      IndexType getGlobalIndex( const ConstOffsetsHolderView& rowPermArray,
+                                const ConstOffsetsHolderView& groupPointers,
                                 const IndexType segmentIdx,
-                                const IndexType localIdx )
+                                IndexType localIdx )
       {
+         const IndexType strip = segmentIdx / getWarpSize();
+         const IndexType groupIdx = strip * ( getLogWarpSize() + 1 );
+         const IndexType rowStripPerm = rowPermArray.getElement( segmentIdx ) - strip * getWarpSize();
+         const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
+         IndexType globalIdx = groupPointers.getElement( groupIdx ); // * getWarpSize();
+         IndexType groupHeight = getWarpSize();
+         for( IndexType group = 0; group < groupsCount; group++ )
+         {
+            const IndexType groupSize = getGroupLength( groupPointers, strip, group );
+            IndexType groupWidth =  groupSize / groupHeight;
+            if( localIdx > groupWidth )
+            {
+               localIdx -= groupWidth;
+               globalIdx += groupSize;
+            }
+            else
+            {
+               if( RowMajorOrder )
+                  return globalIdx + rowStripPerm * groupWidth + localIdx;
+               else
+                  return globalIdx + rowStripPerm + localIdx * groupHeight;
+            }
+            groupHeight /= 2;
+         }
       }
 
       static __cuda_callable__
@@ -71,6 +179,24 @@ class BiEllpack
                                             const OffsetsHolderView& groupPointers,
                                             const IndexType segmentIdx )
       {
+         using GroupsWidthType = typename SegmentViewType::GroupsWidthType;
+
+         const IndexType strip = segmentIdx / getWarpSize();
+         const IndexType groupIdx = strip * ( getLogWarpSize() + 1 );
+         const IndexType inStripIdx = rowPermArray[ segmentIdx ] - strip * getWarpSize();
+         const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
+         IndexType groupHeight = getWarpSize();
+         GroupsWidthType groupsWidth( 0 );
+         TNL_ASSERT_LE( groupsCount, getGroupsCount(), "" );
+         for( IndexType i = 0; i < groupsCount; i++ )
+         {
+            const IndexType groupSize = groupPointers[ groupIdx + i + 1 ] - groupPointers[ groupIdx + i ];
+            groupsWidth[ i ] = groupSize / groupHeight;
+            groupHeight /= 2;
+         }
+         return SegmentViewType( groupPointers[ groupIdx ],
+                                 inStripIdx,
+                                 groupsWidth );
       }
 
       static __cuda_callable__
@@ -78,7 +204,43 @@ class BiEllpack
                                       const OffsetsHolderView& groupPointers,
                                       const IndexType segmentIdx )
       {
+         using GroupsWidthType = typename SegmentViewType::GroupsWidthType;
+
+         const IndexType strip = segmentIdx / getWarpSize();
+         const IndexType groupIdx = strip * ( getLogWarpSize() + 1 );
+         const IndexType inStripIdx = rowPermArray.getElement( segmentIdx ) - strip * getWarpSize();
+         const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
+         IndexType groupHeight = getWarpSize();
+         GroupsWidthType groupsWidth( 0 );
+         for( IndexType i = 0; i < groupsCount; i++ )
+         {
+            const IndexType groupSize = groupPointers.getElement( groupIdx + i + 1 ) - groupPointers.getElement( groupIdx + i );
+            groupsWidth[ i ] = groupSize / groupHeight;
+            groupHeight /= 2;
+         }
+         return SegmentViewType( groupPointers[ groupIdx ],
+                                 inStripIdx,
+                                 groupsWidth );
+      }
+
+      static
+      Index getStripLength( const ConstOffsetsHolderView& groupPointers, const IndexType strip )
+      {
+         TNL_ASSERT( strip >= 0, std::cerr << "strip = " << strip );
+
+          return groupPointers.getElement( ( strip + 1 ) * ( getLogWarpSize() + 1 ) )
+                 - groupPointers.getElement( strip * ( getLogWarpSize() + 1 ) );
       }
+
+      static __cuda_callable__
+      Index getStripLengthDirect( const ConstOffsetsHolderView& groupPointers, const IndexType strip )
+      {
+         TNL_ASSERT( strip >= 0, std::cerr << "strip = " << strip );
+
+          return groupPointers[ ( strip + 1 ) * ( getLogWarpSize() + 1 ) ]
+                 - groupPointers[ strip * ( getLogWarpSize() + 1 ) ];
+      }
+
 };
 
 #ifdef HAVE_CUDA
@@ -97,7 +259,7 @@ struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, true >
              typename Real,
              typename... Args >
    __device__
-   static void exec( View chunkedEllpack,
+   static void exec( View biEllpack,
                      Index gridIdx,
                      Index first,
                      Index last,
@@ -107,7 +269,7 @@ struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, true >
                      Real zero,
                      Args... args )
    {
-      chunkedEllpack.segmentsReductionKernelWithAllParameters( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+      biEllpack.segmentsReductionKernelWithAllParameters( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
    }
 };
 
@@ -120,7 +282,7 @@ struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, false >
              typename Real,
              typename... Args >
    __device__
-   static void exec( View chunkedEllpack,
+   static void exec( View biEllpack,
                      Index gridIdx,
                      Index first,
                      Index last,
@@ -130,7 +292,7 @@ struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, false >
                      Real zero,
                      Args... args )
    {
-      chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+      biEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
    }
 };
 
@@ -142,7 +304,7 @@ template< typename View,
           typename Real,
           typename... Args >
 __global__
-void BiEllpackSegmentsReductionKernel( View chunkedEllpack,
+void BiEllpackSegmentsReductionKernel( View biEllpack,
                                             Index gridIdx,
                                             Index first,
                                             Index last,
@@ -152,7 +314,7 @@ void BiEllpackSegmentsReductionKernel( View chunkedEllpack,
                                             Real zero,
                                             Args... args )
 {
-   BiEllpackSegmentsReductionDispatcher< Index, Fetch >::exec( chunkedEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+   BiEllpackSegmentsReductionDispatcher< Index, Fetch >::exec( biEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
 }
 #endif
 
diff --git a/src/TNL/Matrices/Legacy/BiEllpack_impl.h b/src/TNL/Matrices/Legacy/BiEllpack_impl.h
index 1bb393bb9..c83c9e0fb 100644
--- a/src/TNL/Matrices/Legacy/BiEllpack_impl.h
+++ b/src/TNL/Matrices/Legacy/BiEllpack_impl.h
@@ -1070,6 +1070,7 @@ public:
 						if( matrix.rowPermArray.getElement( k ) == j + 1 )
 							permIndex2 = k;
 					}
+               std::cerr << "permIndex2 = " << permIndex2 << std::endl;
 					if( rowLengths.getElement( permIndex1 ) < rowLengths.getElement( permIndex2 ) )
 					{
 						Index temp = matrix.rowPermArray.getElement( permIndex1 );
-- 
GitLab


From a6d4e7d385b38ada2716e8999b499d080e662030 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 11 Apr 2020 13:36:40 +0200
Subject: [PATCH 12/68] BiEllpack works on CPU.

---
 src/TNL/Containers/Segments/BiEllpack.hpp     |  6 +-
 .../Segments/BiEllpackSegmentView.h           | 19 ++++--
 src/TNL/Containers/Segments/BiEllpackView.hpp | 39 +++++++++++-
 .../Containers/Segments/details/BiEllpack.h   | 61 +++++++++++++------
 src/UnitTests/Matrices/SparseMatrixTest.h     |  1 +
 .../Matrices/SparseMatrixTest_BiEllpack.h     |  2 +-
 6 files changed, 98 insertions(+), 30 deletions(-)

diff --git a/src/TNL/Containers/Segments/BiEllpack.hpp b/src/TNL/Containers/Segments/BiEllpack.hpp
index 678f2b2a0..b186f33a1 100644
--- a/src/TNL/Containers/Segments/BiEllpack.hpp
+++ b/src/TNL/Containers/Segments/BiEllpack.hpp
@@ -201,7 +201,7 @@ computeColumnSizes( const SizesHolder& segmentsSizes )
          if( strip == numberOfStrips - 1 )
          {
             IndexType segmentsCount = size - firstSegment;
-            while( !( segmentsCount > TNL::pow( getLogWarpSize() - 1 - emptyGroups, 2 ) ) )
+            while( !( segmentsCount > TNL::pow( 2, getLogWarpSize() - 1 - emptyGroups ) ) )
                emptyGroups++;
             for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ )
                groupPointersView[ group ] = 0;
@@ -210,12 +210,12 @@ computeColumnSizes( const SizesHolder& segmentsSizes )
          IndexType allocatedColumns = 0;
          for( IndexType groupIdx = emptyGroups; groupIdx < getLogWarpSize(); groupIdx++ )
          {
-            IndexType segmentIdx = TNL::pow( getLogWarpSize() - 1 - groupIdx, 2 );
+            IndexType segmentIdx = TNL::pow( 2, getLogWarpSize() - 1 - groupIdx ) - 1;
             IndexType permSegm = 0;
             while( segmentsPermutationView[ permSegm + firstSegment ] != segmentIdx + firstSegment )
                permSegm++;
             const IndexType groupWidth = segmentsSizesView[ permSegm + firstSegment ] - allocatedColumns;
-            const IndexType groupHeight = TNL::pow( getLogWarpSize() - groupIdx, 2 );
+            const IndexType groupHeight = TNL::pow( 2, getLogWarpSize() - groupIdx );
             const IndexType groupSize = groupWidth * groupHeight;
             allocatedColumns = segmentsSizes[ permSegm + firstSegment ];
             groupPointersView[ groupIdx + groupBegin ] = groupSize;
diff --git a/src/TNL/Containers/Segments/BiEllpackSegmentView.h b/src/TNL/Containers/Segments/BiEllpackSegmentView.h
index 427c06583..15352fc56 100644
--- a/src/TNL/Containers/Segments/BiEllpackSegmentView.h
+++ b/src/TNL/Containers/Segments/BiEllpackSegmentView.h
@@ -56,16 +56,23 @@ class BiEllpackSegmentView
       __cuda_callable__
       IndexType getGlobalIndex( IndexType localIdx ) const
       {
-         IndexType i( 0 ), offset( groupOffset ), groupHeight( getWarpSize() );
-         while( localIdx > groupsWidth[ i ] )
+         //std::cerr << "SegmentView: localIdx = " << localIdx << " groupWidth = " << groupsWidth << std::endl;
+         IndexType groupIdx( 0 ), offset( groupOffset ), groupHeight( getWarpSize() );
+         while( localIdx >= groupsWidth[ groupIdx ] )
          {
-            localIdx -= groupsWidth[ i ];
-            offset += groupsWidth[ i++ ] * groupHeight;
+            //std::cerr << "ROW: groupIdx = " << groupIdx << " groupWidth = " << groupsWidth[ groupIdx ]
+            //          << " groupSize = " << groupsWidth[ groupIdx ] * groupHeight << std::endl;
+            localIdx -= groupsWidth[ groupIdx ];
+            offset += groupsWidth[ groupIdx++ ] * groupHeight;
             groupHeight /= 2;
          }
-         TNL_ASSERT_LE( i, TNL::log2( getWarpSize() - inStripIdx + 1 ), "Local index exceeds segment bounds." );
+         TNL_ASSERT_LE( groupIdx, TNL::log2( getWarpSize() - inStripIdx + 1 ), "Local index exceeds segment bounds." );
          if( RowMajorOrder )
-            return offset + inStripIdx * groupsWidth[ i ] + localIdx;
+         {
+            //std::cerr << " offset = " << offset << " inStripIdx = " << inStripIdx << " localIdx = " << localIdx 
+            //          << " return = " << offset + inStripIdx * groupsWidth[ groupIdx ] + localIdx << std::endl;
+            return offset + inStripIdx * groupsWidth[ groupIdx ] + localIdx;
+         }
          else
             return offset + inStripIdx + localIdx * groupHeight;
       };
diff --git a/src/TNL/Containers/Segments/BiEllpackView.hpp b/src/TNL/Containers/Segments/BiEllpackView.hpp
index d05525ab8..edf23bb14 100644
--- a/src/TNL/Containers/Segments/BiEllpackView.hpp
+++ b/src/TNL/Containers/Segments/BiEllpackView.hpp
@@ -285,7 +285,44 @@ BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
-   
+   if( std::is_same< DeviceType, Devices::Host >::value )
+      for( IndexType segmentIdx = 0; segmentIdx < this->getSize(); segmentIdx++ )
+      {
+         const IndexType stripIdx = segmentIdx / getWarpSize();
+         const IndexType groupIdx = stripIdx * ( getLogWarpSize() + 1 );
+         const IndexType inStripIdx = rowPermArray[ segmentIdx ] - stripIdx * getWarpSize();
+         const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getActiveGroupsCount( rowPermArray, segmentIdx );
+         IndexType globalIdx = groupPointers[ groupIdx ];
+         IndexType groupHeight = getWarpSize();
+         IndexType localIdx( 0 );
+         RealType aux( zero );
+         bool compute( true );
+         for( IndexType group = 0; group < groupsCount && compute; group++ )
+         {
+            const IndexType groupSize = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getGroupSize( groupPointers, stripIdx, group );
+            IndexType groupWidth = groupSize / groupHeight;
+            const IndexType globalIdxBack = globalIdx;
+            if( RowMajorOrder )
+               globalIdx += inStripIdx * groupWidth;
+            else
+               globalIdx += inStripIdx;
+            for( IndexType j = 0; j < groupWidth && compute; j++ )
+            {
+               //std::cerr << "segmentIdx = " << segmentIdx << " groupIdx = " << groupIdx 
+               //         << " groupWidth = " << groupWidth << " groupHeight = " << groupHeight
+               //          << " localIdx = " << localIdx << " globalIdx = " << globalIdx 
+               //          << " fetch = " << details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) << std::endl;
+               reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+               if( RowMajorOrder )
+                  globalIdx ++;
+               else
+                  globalIdx += groupHeight;
+            }
+            globalIdx = globalIdxBack + groupSize;
+            groupHeight /= 2;
+         }
+         keeper( segmentIdx, aux );
+      }
 }
 
 template< typename Device,
diff --git a/src/TNL/Containers/Segments/details/BiEllpack.h b/src/TNL/Containers/Segments/details/BiEllpack.h
index b8735af54..4cae9a531 100644
--- a/src/TNL/Containers/Segments/details/BiEllpack.h
+++ b/src/TNL/Containers/Segments/details/BiEllpack.h
@@ -61,9 +61,18 @@ class BiEllpack
          throw std::logic_error( "segmentIdx was not found" );
       }
 
-      static IndexType getGroupLength( const ConstOffsetsHolderView& groupPointers,
-                                       const IndexType strip,
-                                       const IndexType group )
+      static IndexType getGroupSizeDirect( const ConstOffsetsHolderView& groupPointers,
+                                           const IndexType strip,
+                                           const IndexType group )
+      {
+         const IndexType groupOffset = strip * ( getLogWarpSize() + 1 ) + group;
+         return groupPointers[ groupOffset + 1 ] - groupPointers[ groupOffset ];
+      }
+
+      
+      static IndexType getGroupSize( const ConstOffsetsHolderView& groupPointers,
+                                     const IndexType strip,
+                                    const IndexType group )
       {
          const IndexType groupOffset = strip * ( getLogWarpSize() + 1 ) + group;
          return groupPointers.getElement( groupOffset + 1 ) - groupPointers.getElement( groupOffset );
@@ -79,13 +88,15 @@ class BiEllpack
          const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
          IndexType groupHeight = getWarpSize();
          IndexType segmentSize = 0;
-         for( IndexType group = 0; group < groupsCount; group++ )
+         for( IndexType groupIdx = 0; groupIdx < groupsCount; groupIdx++ )
          {
-            const IndexType groupSize = getGroupLength( groupPointers, strip, group );
+            const IndexType groupSize = getGroupSizeDirect( groupPointers, strip, groupIdx );
             IndexType groupWidth =  groupSize / groupHeight;
+            //std::cerr << " groupIdx = " << groupIdx << " groupWidth = " << groupWidth << std::endl;
             segmentSize += groupWidth;
             groupHeight /= 2;
          }
+         //std::cerr << "############### segmentIdx = " << segmentIdx << " segmentSize = " << segmentSize << std::endl;
          return segmentSize;
       }
 
@@ -102,7 +113,7 @@ class BiEllpack
          IndexType segmentSize = 0;
          for( IndexType group = 0; group < groupsCount; group++ )
          {
-            const IndexType groupSize = getGroupLength( groupPointers, strip, group );
+            const IndexType groupSize = getGroupSize( groupPointers, strip, group );
             IndexType groupWidth =  groupSize / groupHeight;
             segmentSize += groupWidth;
             groupHeight /= 2;
@@ -122,24 +133,35 @@ class BiEllpack
          const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
          IndexType globalIdx = groupPointers[ groupIdx ] * getWarpSize();
          IndexType groupHeight = getWarpSize();
+         //std::cerr << "segmentIdx = " << segmentIdx << " localIdx = " << localIdx << " rowstripPerm = " << rowStripPerm << std::endl;
          for( IndexType group = 0; group < groupsCount; group++ )
          {
-            const IndexType groupSize = getGroupLength( groupPointers, strip, group );
-            IndexType groupWidth =  groupSize / groupHeight;
-            if( localIdx > groupWidth )
+            const IndexType groupSize = getGroupSizeDirect( groupPointers, strip, group );
+            //std::cerr << "   groupIdx = " << groupIdx << " groupSize = " << groupSize << std::endl;
+            if(  groupSize )
             {
-               localIdx -= groupWidth;
-               globalIdx += groupSize;
-            }
-            else
-            {
-               if( RowMajorOrder )
-                  return globalIdx + rowStripPerm * groupWidth + localIdx;
+               IndexType groupWidth =  groupSize / groupHeight;
+               //std::cerr << "   groupWidth = " << groupWidth << std::endl;
+               if( localIdx >= groupWidth )
+               {
+                  localIdx -= groupWidth;
+                  globalIdx += groupSize;
+               }
                else
-                  return globalIdx + rowStripPerm + localIdx * groupHeight;
+               {
+                  if( RowMajorOrder )
+                  {
+                     // std::cerr << ">>>> globalIdx = " << globalIdx << " rowStriPerm = " <<  rowStripPerm << " localIdx = " <<  localIdx
+                     //          << " return = " << globalIdx + rowStripPerm * groupWidth + localIdx << std::endl;
+                     return globalIdx + rowStripPerm * groupWidth + localIdx;
+                  }
+                  else
+                     return globalIdx + rowStripPerm + localIdx * groupHeight;
+               }
             }
             groupHeight /= 2;
          }
+         TNL_ASSERT_TRUE( false, "Segment capacity exceeded, wrong localIdx." );
       }
 
       static
@@ -156,9 +178,9 @@ class BiEllpack
          IndexType groupHeight = getWarpSize();
          for( IndexType group = 0; group < groupsCount; group++ )
          {
-            const IndexType groupSize = getGroupLength( groupPointers, strip, group );
+            const IndexType groupSize = getGroupSize( groupPointers, strip, group );
             IndexType groupWidth =  groupSize / groupHeight;
-            if( localIdx > groupWidth )
+            if( localIdx >= groupWidth )
             {
                localIdx -= groupWidth;
                globalIdx += groupSize;
@@ -193,6 +215,7 @@ class BiEllpack
             const IndexType groupSize = groupPointers[ groupIdx + i + 1 ] - groupPointers[ groupIdx + i ];
             groupsWidth[ i ] = groupSize / groupHeight;
             groupHeight /= 2;
+            //std::cerr << " ROW INIT: groupIdx = " << i << " groupSize = " << groupSize << " groupWidth = " << groupsWidth[ i ] << std::endl;
          }
          return SegmentViewType( groupPointers[ groupIdx ],
                                  inStripIdx,
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.h b/src/UnitTests/Matrices/SparseMatrixTest.h
index 12cdbeef3..b2c81652a 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest.h
@@ -39,6 +39,7 @@ TYPED_TEST( MatrixTest, Constructors )
     test_Constructors< MatrixType >();
 }
 
+
 TYPED_TEST( MatrixTest, setDimensionsTest )
 {
     using MatrixType = typename TestFixture::MatrixType;
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
index d75b1c3cc..f697ad424 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
@@ -30,7 +30,7 @@ using ColumnMajorBiEllpack = TNL::Containers::Segments::BiEllpack< Device, Index
 // types for which MatrixTest is instantiated
 using MatrixTypes = ::testing::Types
 <
-    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    //TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
     TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
     TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
     TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
-- 
GitLab


From a92f219de2e16af395abf230bc78e4ab93e8b33f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 18 Apr 2020 20:53:54 +0200
Subject: [PATCH 13/68] BiEllpack works on GPU.

---
 src/TNL/Containers/Segments/BiEllpack.h       |  16 +-
 src/TNL/Containers/Segments/BiEllpack.hpp     |  87 +++---
 .../Segments/BiEllpackSegmentView.h           |   3 +-
 src/TNL/Containers/Segments/BiEllpackView.h   |  29 +-
 src/TNL/Containers/Segments/BiEllpackView.hpp | 281 ++++++++++++------
 .../Containers/Segments/details/BiEllpack.h   | 105 ++++---
 src/UnitTests/Matrices/SparseMatrixTest.h     |   2 -
 .../Matrices/SparseMatrixTest_BiEllpack.h     |   5 +-
 8 files changed, 332 insertions(+), 196 deletions(-)

diff --git a/src/TNL/Containers/Segments/BiEllpack.h b/src/TNL/Containers/Segments/BiEllpack.h
index 3a52fe8ee..06578312e 100644
--- a/src/TNL/Containers/Segments/BiEllpack.h
+++ b/src/TNL/Containers/Segments/BiEllpack.h
@@ -111,20 +111,22 @@ class BiEllpack
 
       void load( File& file );
 
-      void printStructure( std::ostream& str ); // TODO const;
-
-   protected:
-
-      static constexpr int getWarpSize() { return WarpSize; };
-
-      static constexpr int getLogWarpSize() { return std::log2( WarpSize ); };
+      void printStructure( std::ostream& str ) const;
 
+      // TODO: nvcc needs this public because of lambda function used inside
       template< typename SizesHolder = OffsetsHolder >
       void performRowBubbleSort( const SizesHolder& segmentsSize );
 
+      // TODO: the same as  above
       template< typename SizesHolder = OffsetsHolder >
       void computeColumnSizes( const SizesHolder& segmentsSizes );
 
+   protected:
+
+      static constexpr int getWarpSize() { return WarpSize; };
+
+      static constexpr int getLogWarpSize() { return std::log2( WarpSize ); };
+
       template< typename SizesHolder = OffsetsHolder >
       void verifyRowPerm( const SizesHolder& segmentsSizes );
 
diff --git a/src/TNL/Containers/Segments/BiEllpack.hpp b/src/TNL/Containers/Segments/BiEllpack.hpp
index b186f33a1..91ebea207 100644
--- a/src/TNL/Containers/Segments/BiEllpack.hpp
+++ b/src/TNL/Containers/Segments/BiEllpack.hpp
@@ -119,7 +119,7 @@ performRowBubbleSort( const SizesHolder& segmentsSizes )
 {
    this->rowPermArray.evaluate( [] __cuda_callable__ ( const IndexType i ) -> IndexType { return i; } );
 
-   if( std::is_same< DeviceType, Devices::Host >::value )
+   //if( std::is_same< DeviceType, Devices::Host >::value )
    {
       IndexType strips = this->virtualRows / getWarpSize();
       for( IndexType i = 0; i < strips; i++ )
@@ -187,40 +187,37 @@ computeColumnSizes( const SizesHolder& segmentsSizes )
    auto segmentsPermutationView = this->rowPermArray.getView();
    auto segmentsSizesView = segmentsSizes.getConstView();
    const IndexType size = this->getSize();
-   Algorithms::ParallelFor< DeviceType >::exec(
-      ( IndexType ) 0,
-      this->virtualRows / getWarpSize(),
-      [=] __cuda_callable__ ( const IndexType strip ) mutable {
-
-         IndexType firstSegment = strip * getWarpSize();
-         IndexType groupBegin = strip * ( getLogWarpSize() + 1 );
-         IndexType emptyGroups = 0;
-
-         ////
-         // The last strip can be shorter
-         if( strip == numberOfStrips - 1 )
-         {
-            IndexType segmentsCount = size - firstSegment;
-            while( !( segmentsCount > TNL::pow( 2, getLogWarpSize() - 1 - emptyGroups ) ) )
-               emptyGroups++;
-            for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ )
-               groupPointersView[ group ] = 0;
-         }
+   auto createGroups = [=] __cuda_callable__ ( const IndexType strip ) mutable {
+      IndexType firstSegment = strip * getWarpSize();
+      IndexType groupBegin = strip * ( getLogWarpSize() + 1 );
+      IndexType emptyGroups = 0;
+
+      ////
+      // The last strip can be shorter
+      if( strip == numberOfStrips - 1 )
+      {
+         IndexType segmentsCount = size - firstSegment;
+         while( !( segmentsCount > TNL::pow( 2, getLogWarpSize() - 1 - emptyGroups ) ) )
+            emptyGroups++;
+         for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ )
+            groupPointersView[ group ] = 0;
+      }
 
-         IndexType allocatedColumns = 0;
-         for( IndexType groupIdx = emptyGroups; groupIdx < getLogWarpSize(); groupIdx++ )
-         {
-            IndexType segmentIdx = TNL::pow( 2, getLogWarpSize() - 1 - groupIdx ) - 1;
-            IndexType permSegm = 0;
-            while( segmentsPermutationView[ permSegm + firstSegment ] != segmentIdx + firstSegment )
-               permSegm++;
-            const IndexType groupWidth = segmentsSizesView[ permSegm + firstSegment ] - allocatedColumns;
-            const IndexType groupHeight = TNL::pow( 2, getLogWarpSize() - groupIdx );
-            const IndexType groupSize = groupWidth * groupHeight;
-            allocatedColumns = segmentsSizes[ permSegm + firstSegment ];
-            groupPointersView[ groupIdx + groupBegin ] = groupSize;
-         }
-      } );
+      IndexType allocatedColumns = 0;
+      for( IndexType groupIdx = emptyGroups; groupIdx < getLogWarpSize(); groupIdx++ )
+      {
+         IndexType segmentIdx = TNL::pow( 2, getLogWarpSize() - 1 - groupIdx ) - 1;
+         IndexType permSegm = 0;
+         while( segmentsPermutationView[ permSegm + firstSegment ] != segmentIdx + firstSegment )
+            permSegm++;
+         const IndexType groupWidth = segmentsSizesView[ permSegm + firstSegment ] - allocatedColumns;
+         const IndexType groupHeight = TNL::pow( 2, getLogWarpSize() - groupIdx );
+         const IndexType groupSize = groupWidth * groupHeight;
+         allocatedColumns = segmentsSizesView[ permSegm + firstSegment ];
+         groupPointersView[ groupIdx + groupBegin ] = groupSize;
+      }
+   };
+   Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, this->virtualRows / getWarpSize(), createGroups );
 }
 
 template< typename Device,
@@ -267,7 +264,7 @@ verifyRowPerm( const SizesHolder& segmentsSizes )
       }
    }
    if( !ok )
-      throw( std::logic_error( "Segments permutaion verification failed." ) );
+      throw( std::logic_error( "Segments permutation verification failed." ) );
 }
 
 template< typename Device,
@@ -320,8 +317,8 @@ void
 BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
 setSegmentsSizes( const SizesHolder& segmentsSizes )
 {
-   //if( std::is_same< DeviceType, Devices::Host >::value )
-   // {
+   if( std::is_same< DeviceType, Devices::Host >::value )
+   {
       this->size = segmentsSizes.getSize();
       if( this->size % WarpSize != 0 )
          this->virtualRows = this->size + getWarpSize() - ( this->size % getWarpSize() );
@@ -340,14 +337,14 @@ setSegmentsSizes( const SizesHolder& segmentsSizes )
       this->verifyRowPerm( segmentsSizes );
       this->verifyRowLengths( segmentsSizes );
       this->storageSize =  getWarpSize() * this->groupPointers.getElement( strips * ( getLogWarpSize() + 1 ) );
-   /*}
+   }
    else
    {
       BiEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< IndexType >, RowMajorOrder > hostSegments;
       Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes( segmentsSizes );
       hostSegments.setSegmentsSizes( hostSegmentsSizes );
       *this = hostSegments;
-   }*/
+   }
 }
 
 template< typename Device,
@@ -524,6 +521,18 @@ load( File& file )
         >> this->groupPointers;
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+printStructure( std::ostream& str ) const
+{
+   this->view.printStructure( str );
+}
+
 template< typename Device,
           typename Index,
           typename IndexAllocator,
diff --git a/src/TNL/Containers/Segments/BiEllpackSegmentView.h b/src/TNL/Containers/Segments/BiEllpackSegmentView.h
index 15352fc56..b716fe4c0 100644
--- a/src/TNL/Containers/Segments/BiEllpackSegmentView.h
+++ b/src/TNL/Containers/Segments/BiEllpackSegmentView.h
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include <math.h>
 #include <TNL/Containers/StaticVector.h>
 
 namespace TNL {
@@ -25,7 +26,7 @@ class BiEllpackSegmentView
       
       static constexpr int getWarpSize() { return WarpSize; };
 
-      static constexpr int getLogWarpSize() { return std::log2( WarpSize ); };
+      static constexpr int getLogWarpSize() { static_assert( WarpSize == 32, "nvcc does not allow constexpr log2" ); return 5; }// TODO: return std::log2( WarpSize ); };
 
       static constexpr int getGroupsCount() { return getLogWarpSize() + 1; };
 
diff --git a/src/TNL/Containers/Segments/BiEllpackView.h b/src/TNL/Containers/Segments/BiEllpackView.h
index a539cc92f..20726c621 100644
--- a/src/TNL/Containers/Segments/BiEllpackView.h
+++ b/src/TNL/Containers/Segments/BiEllpackView.h
@@ -130,6 +130,8 @@ class BiEllpackView
 
       void load( File& file );
 
+      void printStructure( std::ostream& str ) const;
+
    protected:
 
       static constexpr int getWarpSize() { return WarpSize; };
@@ -149,6 +151,7 @@ class BiEllpackView
                 typename Reduction,
                 typename ResultKeeper,
                 typename Real,
+                int BlockDim,
                 typename... Args >
       __device__
       void segmentsReductionKernelWithAllParameters( IndexType gridIdx,
@@ -163,7 +166,8 @@ class BiEllpackView
       template< typename Fetch,
                 typename Reduction,
                 typename ResultKeeper,
-                typename Real,
+                typename Real_,
+                int BlockDim,
                 typename... Args >
       __device__
       void segmentsReductionKernel( IndexType gridIdx,
@@ -172,7 +176,7 @@ class BiEllpackView
                                     Fetch fetch,
                                     Reduction reduction,
                                     ResultKeeper keeper,
-                                    Real zero,
+                                    Real_ zero,
                                     Args... args ) const;
 
       template< typename View_,
@@ -181,19 +185,20 @@ class BiEllpackView
                 typename Reduction_,
                 typename ResultKeeper_,
                 typename Real_,
+                int BlockDim,
                 typename... Args_ >
       friend __global__
       void BiEllpackSegmentsReductionKernel( View_ chunkedEllpack,
-                                                  Index_ gridIdx,
-                                                  Index_ first,
-                                                  Index_ last,
-                                                  Fetch_ fetch,
-                                                  Reduction_ reduction,
-                                                  ResultKeeper_ keeper,
-                                                  Real_ zero,
-                                                  Args_... args );
-
-      template< typename Index_, typename Fetch_, bool B_ >
+                                             Index_ gridIdx,
+                                             Index_ first,
+                                             Index_ last,
+                                             Fetch_ fetch,
+                                             Reduction_ reduction,
+                                             ResultKeeper_ keeper,
+                                             Real_ zero,
+                                             Args_... args );
+
+      template< typename Index_, typename Fetch_, int BlockDim_, int WarpSize_, bool B_ >
       friend struct details::BiEllpackSegmentsReductionDispatcher;
 #endif
 };
diff --git a/src/TNL/Containers/Segments/BiEllpackView.hpp b/src/TNL/Containers/Segments/BiEllpackView.hpp
index edf23bb14..84651c638 100644
--- a/src/TNL/Containers/Segments/BiEllpackView.hpp
+++ b/src/TNL/Containers/Segments/BiEllpackView.hpp
@@ -260,7 +260,45 @@ void
 BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
 forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 {
-   //Algorithms::ParallelFor< DeviceType >::exec( first, last , work, args... );
+   const auto segmentsPermutationView = this->rowPermArray.getConstView();
+   const auto groupPointersView = this->groupPointers.getConstView();
+   auto work = [=] __cuda_callable__ ( IndexType segmentIdx, Args... args ) mutable {
+      const IndexType strip = segmentIdx / getWarpSize();
+      const IndexType firstGroupInStrip = strip * ( getLogWarpSize() + 1 );
+      const IndexType rowStripPerm = segmentsPermutationView[ segmentIdx ] - strip * getWarpSize();
+      const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getActiveGroupsCountDirect( segmentsPermutationView, segmentIdx );
+      IndexType groupHeight = getWarpSize();
+      //printf( "segmentIdx = %d strip = %d firstGroupInStrip = %d rowStripPerm = %d groupsCount = %d \n", segmentIdx, strip, firstGroupInStrip, rowStripPerm, groupsCount );
+      bool compute( true );
+      IndexType localIdx( 0 );
+      for( IndexType groupIdx = firstGroupInStrip; groupIdx < firstGroupInStrip + groupsCount && compute; groupIdx++ )
+      {
+         IndexType groupOffset = groupPointersView[ groupIdx ];
+         const IndexType groupSize = groupPointersView[ groupIdx + 1 ] - groupOffset;
+         //printf( "groupSize = %d \n", groupSize );
+         if( groupSize )
+         {
+            const IndexType groupWidth = groupSize / groupHeight;
+            for( IndexType i = 0; i < groupWidth; i++ )
+            {
+               if( RowMajorOrder )
+               {
+                  f( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute );
+               }
+               else
+               {
+                  /*printf( "segmentIdx = %d localIdx = %d globalIdx = %d groupIdx = %d groupSize = %d groupWidth = %d\n",
+                     segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight,
+                     groupIdx, groupSize, groupWidth );*/
+                  f( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute );
+               }
+               localIdx++;
+            }
+         }
+         groupHeight /= 2;
+      }
+   };
+   Algorithms::ParallelFor< DeviceType >::exec( first, last , work, args... );
 }
 
 template< typename Device,
@@ -323,6 +361,35 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
          }
          keeper( segmentIdx, aux );
       }
+   if( std::is_same< DeviceType, Devices::Cuda >::value )
+   {
+#ifdef HAVE_CUDA
+      //printStructure( std::cerr );
+      //for( IndexType i = first; i < last; i += getWarpSize() )
+      {
+         //IndexType first = i;
+         //IndexType last = TNL::min( this->getSize(), i + getWarpSize() );
+         constexpr int BlockDim = getWarpSize();
+         dim3 cudaBlockSize = BlockDim;
+         const IndexType stripsCount = roundUpDivision( last - first, getWarpSize() );
+         const IndexType cudaBlocks = roundUpDivision( stripsCount * getWarpSize(), cudaBlockSize.x );
+         const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
+         const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType );
+
+         for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
+         {
+            dim3 cudaGridSize = Cuda::getMaxGridSize();
+            if( gridIdx == cudaGrids - 1 )
+               cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
+            details::BiEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim, Args...  >
+               <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
+               ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+            cudaThreadSynchronize();
+            TNL_CHECK_CUDA_DEVICE;
+         }
+      }
+#endif
+   }
 }
 
 template< typename Device,
@@ -368,6 +435,31 @@ save( File& file ) const
         << this->groupPointers;
 }
 
+template< typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          int WarpSize >
+void
+BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
+printStructure( std::ostream& str ) const
+{
+   const IndexType stripsCount = roundUpDivision( this->getSize(), getWarpSize() );
+   for( IndexType stripIdx = 0; stripIdx < stripsCount; stripIdx++ )
+   {
+      str << "Strip: " << stripIdx << std::endl;
+      const IndexType firstGroupIdx = stripIdx * ( getLogWarpSize() + 1 );
+      const IndexType lastGroupIdx = firstGroupIdx + getLogWarpSize() + 1;
+      IndexType groupHeight = getWarpSize();
+      for( IndexType groupIdx = firstGroupIdx; groupIdx < lastGroupIdx; groupIdx ++ )
+      {
+         const IndexType groupSize = groupPointers.getElement( groupIdx + 1 ) - groupPointers.getElement( groupIdx );
+         const IndexType groupWidth = groupSize / groupHeight;
+         str << "\tGroup: " << groupIdx << " size = " << groupSize << " width = " << groupWidth << " height = " << groupHeight << std::endl;
+         groupHeight /= 2;
+      }
+   }
+}
+
 #ifdef HAVE_CUDA
 template< typename Device,
           typename Index,
@@ -377,6 +469,7 @@ template< typename Device,
              typename Reduction,
              typename ResultKeeper,
              typename Real,
+             int BlockDim,
              typename... Args >
 __device__
 void
@@ -391,61 +484,46 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
                                           Args... args ) const
 {
    using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
-
-   const IndexType firstSlice = rowToSliceMapping[ first ];
-   const IndexType lastSlice = rowToSliceMapping[ last - 1 ];
-
-   const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
-   if( sliceIdx > lastSlice )
+   const IndexType segmentIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + first;
+   if( segmentIdx >= last )
       return;
 
-   RealType* chunksResults = Cuda::getSharedMemory< RealType >();
-   __shared__ details::BiEllpackSliceInfo< IndexType > sliceInfo;
-   if( threadIdx.x == 0 )
-      sliceInfo = this->slices[ sliceIdx ];
-   chunksResults[ threadIdx.x ] = zero;
-   __syncthreads();
-
-
-
-   const IndexType sliceOffset = sliceInfo.pointer;
-   const IndexType chunkSize = sliceInfo.chunkSize;
-   const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x;
-   const IndexType segmentIdx = this->chunksToSegmentsMapping[ chunkIdx ];
-   IndexType firstChunkOfSegment( 0 );
-   if( segmentIdx != sliceInfo.firstSegment )
-      firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ];
-   IndexType localIdx = ( threadIdx.x - firstChunkOfSegment ) * chunkSize;
+   const IndexType strip = segmentIdx / getWarpSize();
+   const IndexType firstGroupInStrip = strip * ( getLogWarpSize() + 1 );
+   const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize();
+   const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getActiveGroupsCountDirect( rowPermArray, segmentIdx );
+   IndexType groupHeight = getWarpSize();
+   //printf( "segmentIdx = %d strip = %d firstGroupInStrip = %d rowStripPerm = %d groupsCount = %d \n", segmentIdx, strip, firstGroupInStrip, rowStripPerm, groupsCount );
    bool compute( true );
-
-   if( RowMajorOrder )
-   {
-      IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
-      IndexType end = begin + chunkSize;
-      for( IndexType j = begin; j < end && compute; j++ )
-         reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
-   }
-   else
+   IndexType localIdx( 0 );
+   RealType result( zero );
+   for( IndexType groupIdx = firstGroupInStrip; groupIdx < firstGroupInStrip + groupsCount && compute; groupIdx++ )
    {
-      const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
-      const IndexType end = begin + chunksInSlice * chunkSize;
-         for( IndexType j = begin; j < end && compute; j += chunksInSlice )
-            reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
-   }
-   __syncthreads();
-   if( threadIdx.x < sliceInfo.size )
-   {
-      const IndexType row = sliceInfo.firstSegment + threadIdx.x;
-      IndexType chunkIndex( 0 );
-      if( threadIdx.x != 0 )
-         chunkIndex = this->rowToChunkMapping[ row - 1 ];
-      const IndexType lastChunk = this->rowToChunkMapping[ row ];
-      RealType result( zero );
-      while( chunkIndex < lastChunk )
-         reduction( result,  chunksResults[ chunkIndex++ ] );
-      if( row >= first && row < last )
-         keeper( row, result );
+      IndexType groupOffset = groupPointers[ groupIdx ];
+      const IndexType groupSize = groupPointers[ groupIdx + 1 ] - groupOffset;
+      //printf( "groupSize = %d \n", groupSize );
+      if( groupSize )
+      {
+         const IndexType groupWidth = groupSize / groupHeight;
+         for( IndexType i = 0; i < groupWidth; i++ )
+         {
+            if( RowMajorOrder )
+            {
+               reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute ) );
+            }
+            else
+            {
+               /*printf( "segmentIdx = %d localIdx = %d globalIdx = %d groupIdx = %d groupSize = %d groupWidth = %d\n",
+                  segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight,
+                  groupIdx, groupSize, groupWidth );*/
+               reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute ) );
+            }
+            localIdx++;
+         }
+      }
+      groupHeight /= 2;
    }
+   keeper( segmentIdx, result );
 }
 
 template< typename Device,
@@ -456,6 +534,7 @@ template< typename Device,
              typename Reduction,
              typename ResultKeeper,
              typename Real,
+             int BlockDim,
              typename... Args >
 __device__
 void
@@ -470,56 +549,78 @@ segmentsReductionKernel( IndexType gridIdx,
                          Args... args ) const
 {
    using RealType = decltype( fetch( IndexType(), std::declval< bool& >(), args... ) );
+   Index segmentIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + first;
 
-   const IndexType firstSlice = rowToSliceMapping[ first ];
-   const IndexType lastSlice = rowToSliceMapping[ last - 1 ];
+   const IndexType strip = segmentIdx >> getLogWarpSize();
+   const IndexType warpStart = strip << getLogWarpSize();
+   const IndexType inWarpIdx = segmentIdx & ( getWarpSize() - 1 );
 
-   const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
-   if( sliceIdx > lastSlice )
+   if( warpStart >= last )
       return;
 
-   RealType* chunksResults = Cuda::getSharedMemory< RealType >();
-   __shared__ details::BiEllpackSliceInfo< IndexType > sliceInfo;
+   IndexType groupHeight = getWarpSize();
+   IndexType firstGroupIdx = strip * ( getLogWarpSize() + 1 );
 
-   if( threadIdx.x == 0 )
-      sliceInfo = this->slices[ sliceIdx ];
-   chunksResults[ threadIdx.x ] = zero;
-   __syncthreads();
+   RealType* temp( nullptr );
+   if( ! RowMajorOrder )
+      temp = Cuda::getSharedMemory< RealType >();
+   __shared__ RealType results[ BlockDim ];
+   results[ threadIdx.x ] = zero;
+   __shared__ IndexType sharedGroupPointers[ 7 ]; // TODO: getLogWarpSize() + 1 ];
 
-   const IndexType sliceOffset = sliceInfo.pointer;
-   const IndexType chunkSize = sliceInfo.chunkSize;
-   const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x;
+   if( threadIdx.x <= getLogWarpSize() + 1 )
+      sharedGroupPointers[ threadIdx.x ] = this->groupPointers[ firstGroupIdx + threadIdx.x ];
+   __syncthreads();
+         
    bool compute( true );
-
-   if( RowMajorOrder )
-   {
-      IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
-      IndexType end = begin + chunkSize;
-      for( IndexType j = begin; j < end && compute; j++ )
-         reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
-   }
-   else
+   for( IndexType group = 0; group < getLogWarpSize() + 1; group++ )
    {
-      const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
-      const IndexType end = begin + chunksInSlice * chunkSize;
-         for( IndexType j = begin; j < end && compute; j += chunksInSlice )
-            reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
+      IndexType groupBegin = sharedGroupPointers[ group ];
+      IndexType groupEnd = sharedGroupPointers[ group + 1 ];
+      if( groupEnd - groupBegin > 0 )
+      {
+         if( RowMajorOrder )
+         {
+            if( inWarpIdx < groupHeight )
+            {
+               const IndexType groupWidth = ( groupEnd - groupBegin ) / groupHeight;
+               IndexType globalIdx = groupBegin + inWarpIdx * groupWidth;
+               for( IndexType i = 0; i < groupWidth && compute; i++ )
+                  reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) );
+            }
+         }
+         else
+         {
+            temp[ threadIdx.x ] = zero;
+            IndexType globalIdx = groupBegin + inWarpIdx;
+            while( globalIdx < groupEnd )
+            {
+               reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) );
+               /*printf( "FETCH: globalIdx = %d fetch = %d result = %d groupEnd = %d \n", 
+                  globalIdx,
+                  ( int ) fetch( globalIdx, compute ),
+                  ( int ) temp[ threadIdx.x ], groupEnd );*/
+               globalIdx += getWarpSize();
+            }
+            // TODO: reduction via templates
+            IndexType bisection2 = getWarpSize();
+            for( IndexType i = 0; i < group; i++ )
+            {
+               bisection2 >>= 1;
+               if( inWarpIdx < bisection2 )
+                  reduction( temp[ threadIdx.x ], temp[ threadIdx.x + bisection2 ] );
+            }
+            if( inWarpIdx < groupHeight )
+               reduction( results[ threadIdx.x ], temp[ threadIdx.x ] );
+         }
+      }
+      groupHeight >>= 1;
    }
    __syncthreads();
+   if( warpStart + inWarpIdx >= last )
+      return;
 
-   if( threadIdx.x < sliceInfo.size )
-   {
-      const IndexType row = sliceInfo.firstSegment + threadIdx.x;
-      IndexType chunkIndex( 0 );
-      if( threadIdx.x != 0 )
-         chunkIndex = this->rowToChunkMapping[ row - 1 ];
-      const IndexType lastChunk = this->rowToChunkMapping[ row ];
-      RealType result( zero );
-      while( chunkIndex < lastChunk )
-         reduction( result,  chunksResults[ chunkIndex++ ] );
-      if( row >= first && row < last )
-         keeper( row, result );
-   }
+   keeper( warpStart + inWarpIdx, results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( blockDim.x - 1 ) ] );
 }
 #endif
 
diff --git a/src/TNL/Containers/Segments/details/BiEllpack.h b/src/TNL/Containers/Segments/details/BiEllpack.h
index 4cae9a531..a9d4eb97a 100644
--- a/src/TNL/Containers/Segments/details/BiEllpack.h
+++ b/src/TNL/Containers/Segments/details/BiEllpack.h
@@ -43,6 +43,24 @@ class BiEllpack
 
       static constexpr int getGroupsCount() { return getLogWarpSize() + 1; };
 
+      __cuda_callable__
+      static IndexType getActiveGroupsCountDirect( const ConstOffsetsHolderView& rowPermArray, const IndexType segmentIdx )
+      {
+         TNL_ASSERT_GE( segmentIdx, 0, "" );
+         //TNL_ASSERT_LT( segmentIdx, this->getSize(), "" );
+
+         IndexType strip = segmentIdx / getWarpSize();
+         IndexType rowStripPermutation = rowPermArray[ segmentIdx ] - getWarpSize() * strip;
+         IndexType numberOfGroups = getLogWarpSize() + 1;
+         IndexType bisection = 1;
+         for( IndexType i = 0; i < getLogWarpSize() + 1; i++ )
+         {
+            if( rowStripPermutation < bisection )
+               return numberOfGroups - i;
+            bisection *= 2;
+         }
+      }
+
       static IndexType getActiveGroupsCount( const ConstOffsetsHolderView& rowPermArray, const IndexType segmentIdx )
       {
          TNL_ASSERT_GE( segmentIdx, 0, "" );
@@ -55,12 +73,13 @@ class BiEllpack
          for( IndexType i = 0; i < getLogWarpSize() + 1; i++ )
          {
             if( rowStripPermutation < bisection )
-               return ( numberOfGroups - i );
+               return numberOfGroups - i;
             bisection *= 2;
          }
          throw std::logic_error( "segmentIdx was not found" );
       }
 
+      __cuda_callable__
       static IndexType getGroupSizeDirect( const ConstOffsetsHolderView& groupPointers,
                                            const IndexType strip,
                                            const IndexType group )
@@ -72,7 +91,7 @@ class BiEllpack
       
       static IndexType getGroupSize( const ConstOffsetsHolderView& groupPointers,
                                      const IndexType strip,
-                                    const IndexType group )
+                                     const IndexType group )
       {
          const IndexType groupOffset = strip * ( getLogWarpSize() + 1 ) + group;
          return groupPointers.getElement( groupOffset + 1 ) - groupPointers.getElement( groupOffset );
@@ -85,18 +104,16 @@ class BiEllpack
          const IndexType strip = segmentIdx / getWarpSize();
          const IndexType groupIdx = strip * ( getLogWarpSize() + 1 );
          const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize();
-         const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
+         const IndexType groupsCount = getActiveGroupsCountDirect( rowPermArray, segmentIdx );
          IndexType groupHeight = getWarpSize();
          IndexType segmentSize = 0;
          for( IndexType groupIdx = 0; groupIdx < groupsCount; groupIdx++ )
          {
             const IndexType groupSize = getGroupSizeDirect( groupPointers, strip, groupIdx );
             IndexType groupWidth =  groupSize / groupHeight;
-            //std::cerr << " groupIdx = " << groupIdx << " groupWidth = " << groupWidth << std::endl;
             segmentSize += groupWidth;
             groupHeight /= 2;
          }
-         //std::cerr << "############### segmentIdx = " << segmentIdx << " segmentSize = " << segmentSize << std::endl;
          return segmentSize;
       }
 
@@ -130,18 +147,15 @@ class BiEllpack
          const IndexType strip = segmentIdx / getWarpSize();
          const IndexType groupIdx = strip * ( getLogWarpSize() + 1 );
          const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize();
-         const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
-         IndexType globalIdx = groupPointers[ groupIdx ] * getWarpSize();
+         const IndexType groupsCount = getActiveGroupsCountDirect( rowPermArray, segmentIdx );
+         IndexType globalIdx = groupPointers[ groupIdx ];
          IndexType groupHeight = getWarpSize();
-         //std::cerr << "segmentIdx = " << segmentIdx << " localIdx = " << localIdx << " rowstripPerm = " << rowStripPerm << std::endl;
          for( IndexType group = 0; group < groupsCount; group++ )
          {
             const IndexType groupSize = getGroupSizeDirect( groupPointers, strip, group );
-            //std::cerr << "   groupIdx = " << groupIdx << " groupSize = " << groupSize << std::endl;
             if(  groupSize )
             {
                IndexType groupWidth =  groupSize / groupHeight;
-               //std::cerr << "   groupWidth = " << groupWidth << std::endl;
                if( localIdx >= groupWidth )
                {
                   localIdx -= groupWidth;
@@ -150,11 +164,7 @@ class BiEllpack
                else
                {
                   if( RowMajorOrder )
-                  {
-                     // std::cerr << ">>>> globalIdx = " << globalIdx << " rowStriPerm = " <<  rowStripPerm << " localIdx = " <<  localIdx
-                     //          << " return = " << globalIdx + rowStripPerm * groupWidth + localIdx << std::endl;
                      return globalIdx + rowStripPerm * groupWidth + localIdx;
-                  }
                   else
                      return globalIdx + rowStripPerm + localIdx * groupHeight;
                }
@@ -162,6 +172,7 @@ class BiEllpack
             groupHeight /= 2;
          }
          TNL_ASSERT_TRUE( false, "Segment capacity exceeded, wrong localIdx." );
+         return -1; // to avoid compiler warning
       }
 
       static
@@ -174,26 +185,32 @@ class BiEllpack
          const IndexType groupIdx = strip * ( getLogWarpSize() + 1 );
          const IndexType rowStripPerm = rowPermArray.getElement( segmentIdx ) - strip * getWarpSize();
          const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
-         IndexType globalIdx = groupPointers.getElement( groupIdx ); // * getWarpSize();
+         IndexType globalIdx = groupPointers.getElement( groupIdx );
          IndexType groupHeight = getWarpSize();
          for( IndexType group = 0; group < groupsCount; group++ )
          {
             const IndexType groupSize = getGroupSize( groupPointers, strip, group );
-            IndexType groupWidth =  groupSize / groupHeight;
-            if( localIdx >= groupWidth )
-            {
-               localIdx -= groupWidth;
-               globalIdx += groupSize;
-            }
-            else
+            if(  groupSize )
             {
-               if( RowMajorOrder )
-                  return globalIdx + rowStripPerm * groupWidth + localIdx;
+               IndexType groupWidth =  groupSize / groupHeight;
+               if( localIdx >= groupWidth )
+               {
+                  localIdx -= groupWidth;
+                  globalIdx += groupSize;
+               }
                else
-                  return globalIdx + rowStripPerm + localIdx * groupHeight;
+               {
+                  if( RowMajorOrder )
+                  {
+                     return globalIdx + rowStripPerm * groupWidth + localIdx;
+                  }
+                  else
+                     return globalIdx + rowStripPerm + localIdx * groupHeight;
+               }
             }
             groupHeight /= 2;
          }
+         TNL_ASSERT_TRUE( false, "Segment capacity exceeded, wrong localIdx." );
       }
 
       static __cuda_callable__
@@ -206,7 +223,7 @@ class BiEllpack
          const IndexType strip = segmentIdx / getWarpSize();
          const IndexType groupIdx = strip * ( getLogWarpSize() + 1 );
          const IndexType inStripIdx = rowPermArray[ segmentIdx ] - strip * getWarpSize();
-         const IndexType groupsCount = getActiveGroupsCount( rowPermArray, segmentIdx );
+         const IndexType groupsCount = getActiveGroupsCountDirect( rowPermArray, segmentIdx );
          IndexType groupHeight = getWarpSize();
          GroupsWidthType groupsWidth( 0 );
          TNL_ASSERT_LE( groupsCount, getGroupsCount(), "" );
@@ -269,12 +286,13 @@ class BiEllpack
 #ifdef HAVE_CUDA
 template< typename Index,
           typename Fetch,
-          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters(),
-          int WarpSize = 32 >
+          int BlockDim = 256,
+          int WarpSize = 32,
+          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
 struct BiEllpackSegmentsReductionDispatcher{};
 
-template< typename Index, typename Fetch >
-struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, true >
+template< typename Index, typename Fetch, int BlockDim, int WarpSize >
+struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim, WarpSize, true >
 {
    template< typename View,
              typename Reduction,
@@ -292,12 +310,12 @@ struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, true >
                      Real zero,
                      Args... args )
    {
-      biEllpack.segmentsReductionKernelWithAllParameters( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+      biEllpack.template segmentsReductionKernelWithAllParameters< Fetch, Reduction, ResultKeeper, Real, BlockDim, Args... >( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
    }
 };
 
-template< typename Index, typename Fetch >
-struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, false >
+template< typename Index, typename Fetch, int BlockDim, int WarpSize >
+struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim, WarpSize, false >
 {
    template< typename View,
              typename Reduction,
@@ -315,7 +333,7 @@ struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, false >
                      Real zero,
                      Args... args )
    {
-      biEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+      biEllpack.template segmentsReductionKernel< Fetch, Reduction, ResultKeeper, Real, BlockDim, Args... >( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
    }
 };
 
@@ -325,19 +343,20 @@ template< typename View,
           typename Reduction,
           typename ResultKeeper,
           typename Real,
+          int BlockDim,
           typename... Args >
 __global__
 void BiEllpackSegmentsReductionKernel( View biEllpack,
-                                            Index gridIdx,
-                                            Index first,
-                                            Index last,
-                                            Fetch fetch,
-                                            Reduction reduction,
-                                            ResultKeeper keeper,
-                                            Real zero,
-                                            Args... args )
+                                       Index gridIdx,
+                                       Index first,
+                                       Index last,
+                                       Fetch fetch,
+                                       Reduction reduction,
+                                       ResultKeeper keeper,
+                                       Real zero,
+                                       Args... args )
 {
-   BiEllpackSegmentsReductionDispatcher< Index, Fetch >::exec( biEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+   BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim >::exec( biEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
 }
 #endif
 
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.h b/src/UnitTests/Matrices/SparseMatrixTest.h
index b2c81652a..a00e69687 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest.h
@@ -39,7 +39,6 @@ TYPED_TEST( MatrixTest, Constructors )
     test_Constructors< MatrixType >();
 }
 
-
 TYPED_TEST( MatrixTest, setDimensionsTest )
 {
     using MatrixType = typename TestFixture::MatrixType;
@@ -116,5 +115,4 @@ TYPED_TEST( MatrixTest, printTest )
 
     test_Print< MatrixType >();
 }
-
 #endif
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
index f697ad424..03cc3646b 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h
@@ -30,7 +30,7 @@ using ColumnMajorBiEllpack = TNL::Containers::Segments::BiEllpack< Device, Index
 // types for which MatrixTest is instantiated
 using MatrixTypes = ::testing::Types
 <
-    //TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
     TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
     TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
     TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
@@ -47,7 +47,8 @@ using MatrixTypes = ::testing::Types
     TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
     TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
     TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
 #endif
 >;
 
-- 
GitLab


From e9f253366ba958074e6172f1d37a3c79b19c7582 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 19 Apr 2020 11:05:41 +0200
Subject: [PATCH 14/68] Refactoring BiEllapck SpMV CUDA kernel.

---
 src/TNL/Containers/Segments/BiEllpackView.hpp | 97 +++++++++----------
 1 file changed, 44 insertions(+), 53 deletions(-)

diff --git a/src/TNL/Containers/Segments/BiEllpackView.hpp b/src/TNL/Containers/Segments/BiEllpackView.hpp
index 84651c638..abf82313a 100644
--- a/src/TNL/Containers/Segments/BiEllpackView.hpp
+++ b/src/TNL/Containers/Segments/BiEllpackView.hpp
@@ -364,29 +364,25 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      //printStructure( std::cerr );
-      //for( IndexType i = first; i < last; i += getWarpSize() )
+      constexpr int BlockDim = 256;//getWarpSize();
+      dim3 cudaBlockSize = BlockDim;
+      const IndexType stripsCount = roundUpDivision( last - first, getWarpSize() );
+      const IndexType cudaBlocks = roundUpDivision( stripsCount * getWarpSize(), cudaBlockSize.x );
+      const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
+      IndexType sharedMemory = 0;
+      if( ! RowMajorOrder )
+         sharedMemory = cudaBlockSize.x * sizeof( RealType );
+
+      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
       {
-         //IndexType first = i;
-         //IndexType last = TNL::min( this->getSize(), i + getWarpSize() );
-         constexpr int BlockDim = getWarpSize();
-         dim3 cudaBlockSize = BlockDim;
-         const IndexType stripsCount = roundUpDivision( last - first, getWarpSize() );
-         const IndexType cudaBlocks = roundUpDivision( stripsCount * getWarpSize(), cudaBlockSize.x );
-         const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
-         const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType );
-
-         for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-         {
-            dim3 cudaGridSize = Cuda::getMaxGridSize();
-            if( gridIdx == cudaGrids - 1 )
-               cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-            details::BiEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim, Args...  >
-               <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
-               ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
-            cudaThreadSynchronize();
-            TNL_CHECK_CUDA_DEVICE;
-         }
+         dim3 cudaGridSize = Cuda::getMaxGridSize();
+         if( gridIdx == cudaGrids - 1 )
+            cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
+         details::BiEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim, Args...  >
+            <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
+            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+         cudaThreadSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
       }
 #endif
    }
@@ -493,7 +489,6 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
    const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize();
    const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getActiveGroupsCountDirect( rowPermArray, segmentIdx );
    IndexType groupHeight = getWarpSize();
-   //printf( "segmentIdx = %d strip = %d firstGroupInStrip = %d rowStripPerm = %d groupsCount = %d \n", segmentIdx, strip, firstGroupInStrip, rowStripPerm, groupsCount );
    bool compute( true );
    IndexType localIdx( 0 );
    RealType result( zero );
@@ -501,23 +496,15 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
    {
       IndexType groupOffset = groupPointers[ groupIdx ];
       const IndexType groupSize = groupPointers[ groupIdx + 1 ] - groupOffset;
-      //printf( "groupSize = %d \n", groupSize );
       if( groupSize )
       {
          const IndexType groupWidth = groupSize / groupHeight;
          for( IndexType i = 0; i < groupWidth; i++ )
          {
             if( RowMajorOrder )
-            {
                reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute ) );
-            }
             else
-            {
-               /*printf( "segmentIdx = %d localIdx = %d globalIdx = %d groupIdx = %d groupSize = %d groupWidth = %d\n",
-                  segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight,
-                  groupIdx, groupSize, groupWidth );*/
                reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute ) );
-            }
             localIdx++;
          }
       }
@@ -561,9 +548,6 @@ segmentsReductionKernel( IndexType gridIdx,
    IndexType groupHeight = getWarpSize();
    IndexType firstGroupIdx = strip * ( getLogWarpSize() + 1 );
 
-   RealType* temp( nullptr );
-   if( ! RowMajorOrder )
-      temp = Cuda::getSharedMemory< RealType >();
    __shared__ RealType results[ BlockDim ];
    results[ threadIdx.x ] = zero;
    __shared__ IndexType sharedGroupPointers[ 7 ]; // TODO: getLogWarpSize() + 1 ];
@@ -571,35 +555,42 @@ segmentsReductionKernel( IndexType gridIdx,
    if( threadIdx.x <= getLogWarpSize() + 1 )
       sharedGroupPointers[ threadIdx.x ] = this->groupPointers[ firstGroupIdx + threadIdx.x ];
    __syncthreads();
-         
+
    bool compute( true );
-   for( IndexType group = 0; group < getLogWarpSize() + 1; group++ )
+   if( RowMajorOrder )
    {
-      IndexType groupBegin = sharedGroupPointers[ group ];
-      IndexType groupEnd = sharedGroupPointers[ group + 1 ];
-      if( groupEnd - groupBegin > 0 )
+      for( IndexType group = 0; group < getLogWarpSize() + 1; group++ )
       {
-         if( RowMajorOrder )
+         IndexType groupBegin = sharedGroupPointers[ group ];
+         IndexType groupEnd = sharedGroupPointers[ group + 1 ];
+         if( groupEnd - groupBegin > 0 )
          {
-            if( inWarpIdx < groupHeight )
-            {
-               const IndexType groupWidth = ( groupEnd - groupBegin ) / groupHeight;
-               IndexType globalIdx = groupBegin + inWarpIdx * groupWidth;
-               for( IndexType i = 0; i < groupWidth && compute; i++ )
-                  reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) );
+
+               if( inWarpIdx < groupHeight )
+               {
+                  const IndexType groupWidth = ( groupEnd - groupBegin ) / groupHeight;
+                  IndexType globalIdx = groupBegin + inWarpIdx * groupWidth;
+                  for( IndexType i = 0; i < groupWidth && compute; i++ )
+                     reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) );
+               }
             }
-         }
-         else
+         groupHeight >>= 1;
+      }
+   }
+   else
+   {
+      RealType* temp = Cuda::getSharedMemory< RealType >();
+      for( IndexType group = 0; group < getLogWarpSize() + 1; group++ )
+      {
+         IndexType groupBegin = sharedGroupPointers[ group ];
+         IndexType groupEnd = sharedGroupPointers[ group + 1 ];
+         if( groupEnd - groupBegin > 0 )
          {
             temp[ threadIdx.x ] = zero;
             IndexType globalIdx = groupBegin + inWarpIdx;
             while( globalIdx < groupEnd )
             {
                reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) );
-               /*printf( "FETCH: globalIdx = %d fetch = %d result = %d groupEnd = %d \n", 
-                  globalIdx,
-                  ( int ) fetch( globalIdx, compute ),
-                  ( int ) temp[ threadIdx.x ], groupEnd );*/
                globalIdx += getWarpSize();
             }
             // TODO: reduction via templates
@@ -613,8 +604,8 @@ segmentsReductionKernel( IndexType gridIdx,
             if( inWarpIdx < groupHeight )
                reduction( results[ threadIdx.x ], temp[ threadIdx.x ] );
          }
+         groupHeight >>= 1;
       }
-      groupHeight >>= 1;
    }
    __syncthreads();
    if( warpStart + inWarpIdx >= last )
-- 
GitLab


From e945cfb9318257d3bebc47fc0e5750898391518f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 20 Apr 2020 10:36:04 +0200
Subject: [PATCH 15/68] Added BiEllpack to spmv-legacy benchmark.

---
 src/Benchmarks/SpMV/spmv-legacy.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h
index 31d14c6a3..a066b461e 100644
--- a/src/Benchmarks/SpMV/spmv-legacy.h
+++ b/src/Benchmarks/SpMV/spmv-legacy.h
@@ -34,6 +34,7 @@
 #include <TNL/Containers/Segments/Ellpack.h>
 #include <TNL/Containers/Segments/SlicedEllpack.h>
 #include <TNL/Containers/Segments/ChunkedEllpack.h>
+#include <TNL/Containers/Segments/BiEllpack.h>
 using namespace TNL::Matrices;
 
 #include "cusparseCSRMatrix.h"
@@ -68,6 +69,12 @@ using ChunkedEllpackSegments = Containers::Segments::ChunkedEllpack< Device, Ind
 template< typename Real, typename Device, typename Index >
 using SparseMatrix_ChunkedEllpack = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, ChunkedEllpackSegments >;
 
+template< typename Device, typename Index, typename IndexAllocator >
+using BiEllpackSegments = Containers::Segments::BiEllpack< Device, Index, IndexAllocator >;
+
+template< typename Real, typename Device, typename Index >
+using SparseMatrix_BiEllpack = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, BiEllpackSegments >;
+
 // Legacy formats
 template< typename Real, typename Device, typename Index >
 using SparseMatrixLegacy_CSR_Scalar = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRScalar >;
@@ -295,6 +302,7 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, SparseMatrix_ChunkedEllpack      >( benchmark, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, Matrices::Legacy::BiEllpack      >( benchmark, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, SparseMatrix_BiEllpack           >( benchmark, hostOutVector, inputFileName, verboseMR );
    /* AdEllpack is broken
    benchmarkSpMV< Real, Matrices::AdEllpack              >( benchmark, hostOutVector, inputFileName, verboseMR );
     */
-- 
GitLab


From 5fede9ab05a9e3e6243b8338a8adb6fc9061137c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 20 Apr 2020 18:52:22 +0200
Subject: [PATCH 16/68] Renaming Matrices::Dense to Matrices::DenseMatrix.

---
 src/TNL/Matrices/{Dense.h => DenseMatrix.h}   |  24 ++--
 .../Matrices/{Dense.hpp => DenseMatrix.hpp}   | 130 +++++++++---------
 src/TNL/Matrices/DenseMatrixView.hpp          |   4 +-
 src/TNL/Matrices/DistributedSpMV.h            |   6 +-
 src/TNL/Matrices/MatrixInfo.h                 |   6 +-
 src/TNL/Matrices/SparseMatrix.h               |   4 +-
 src/TNL/Matrices/SparseMatrix.hpp             |   4 +-
 .../Matrices/BinarySparseMatrixCopyTest.h     |   6 +-
 src/UnitTests/Matrices/DenseMatrixCopyTest.h  |  14 +-
 src/UnitTests/Matrices/DenseMatrixTest.h      |  78 +++++------
 src/UnitTests/Matrices/SparseMatrixCopyTest.h |   6 +-
 11 files changed, 141 insertions(+), 141 deletions(-)
 rename src/TNL/Matrices/{Dense.h => DenseMatrix.h} (90%)
 rename src/TNL/Matrices/{Dense.hpp => DenseMatrix.hpp} (88%)

diff --git a/src/TNL/Matrices/Dense.h b/src/TNL/Matrices/DenseMatrix.h
similarity index 90%
rename from src/TNL/Matrices/Dense.h
rename to src/TNL/Matrices/DenseMatrix.h
index 6a4795a7e..4b09d14c3 100644
--- a/src/TNL/Matrices/Dense.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          Dense.h  -  description
+                          DenseMatrix.h  -  description
                              -------------------
     begin                : Nov 29, 2013
     copyright            : (C) 2013 by Tomas Oberhuber
@@ -28,7 +28,7 @@ template< typename Real = double,
           typename Index = int,
           bool RowMajorOrder = std::is_same< Device, Devices::Host >::value,
           typename RealAllocator = typename Allocators::Default< Device >::template Allocator< Real > >
-class Dense : public Matrix< Real, Device, Index >
+class DenseMatrix : public Matrix< Real, Device, Index >
 {
    public:
       using RealType = Real;
@@ -51,13 +51,13 @@ class Dense : public Matrix< Real, Device, Index >
       template< typename _Real = Real,
                 typename _Device = Device,
                 typename _Index = Index >
-      using Self = Dense< _Real, _Device, _Index >;
+      using Self = DenseMatrix< _Real, _Device, _Index >;
 
-      Dense();
+      DenseMatrix();
 
-      Dense( const IndexType rows, const IndexType columns );
+      DenseMatrix( const IndexType rows, const IndexType columns );
 
-      Dense( std::initializer_list< std::initializer_list< RealType > > data );
+      DenseMatrix( std::initializer_list< std::initializer_list< RealType > > data );
 
       ViewType getView();
 
@@ -184,7 +184,7 @@ class Dense : public Matrix< Real, Device, Index >
        * @param matrix
        * @return 
        */
-      Dense& operator=( const Dense& matrix );
+      DenseMatrix& operator=( const DenseMatrix& matrix );
 
       /**
        * \brief Assignment operator for other dense matrices.
@@ -194,7 +194,7 @@ class Dense : public Matrix< Real, Device, Index >
        */
       template< typename RHSReal, typename RHSDevice, typename RHSIndex,
                  bool RHSRowMajorOrder, typename RHSRealAllocator >
-      Dense& operator=( const Dense< RHSReal, RHSDevice, RHSIndex, RHSRowMajorOrder, RHSRealAllocator >& matrix );
+      DenseMatrix& operator=( const DenseMatrix< RHSReal, RHSDevice, RHSIndex, RHSRowMajorOrder, RHSRealAllocator >& matrix );
 
       /**
        * \brief Assignment operator for other (sparse) types of matrices.
@@ -202,13 +202,13 @@ class Dense : public Matrix< Real, Device, Index >
        * @return 
        */
       template< typename RHSMatrix >
-      Dense& operator=( const RHSMatrix& matrix );
+      DenseMatrix& operator=( const RHSMatrix& matrix );
 
       template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
-      bool operator==( const Dense< Real_, Device_, Index_, RowMajorOrder >& matrix ) const;
+      bool operator==( const DenseMatrix< Real_, Device_, Index_, RowMajorOrder >& matrix ) const;
 
       template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
-      bool operator!=( const Dense< Real_, Device_, Index_, RowMajorOrder >& matrix ) const;
+      bool operator!=( const DenseMatrix< Real_, Device_, Index_, RowMajorOrder >& matrix ) const;
 
       void save( const String& fileName ) const;
 
@@ -237,4 +237,4 @@ class Dense : public Matrix< Real, Device, Index >
 } // namespace Matrices
 } // namespace TNL
 
-#include <TNL/Matrices/Dense.hpp>
+#include <TNL/Matrices/DenseMatrix.hpp>
diff --git a/src/TNL/Matrices/Dense.hpp b/src/TNL/Matrices/DenseMatrix.hpp
similarity index 88%
rename from src/TNL/Matrices/Dense.hpp
rename to src/TNL/Matrices/DenseMatrix.hpp
index 28f152444..f0d1ecc04 100644
--- a/src/TNL/Matrices/Dense.hpp
+++ b/src/TNL/Matrices/DenseMatrix.hpp
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Assert.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 
 namespace TNL {
@@ -22,7 +22,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::Dense()
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::DenseMatrix()
 {
 }
 
@@ -31,8 +31,8 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-Dense( const IndexType rows, const IndexType columns )
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix( const IndexType rows, const IndexType columns )
 {
    this->setDimensions( rows, columns );
 }
@@ -42,8 +42,8 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-Dense( std::initializer_list< std::initializer_list< RealType > > data )
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix( std::initializer_list< std::initializer_list< RealType > > data )
 {
    this->setElements( data );
 }
@@ -54,7 +54,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 setElements( std::initializer_list< std::initializer_list< RealType > > data )
 {
    IndexType rows = data.size();
@@ -64,7 +64,7 @@ setElements( std::initializer_list< std::initializer_list< RealType > > data )
    this->setDimensions( rows, columns );
    if( ! std::is_same< DeviceType, Devices::Host >::value )
    {
-      Dense< RealType, Devices::Host, IndexType > hostDense( rows, columns );
+      DenseMatrix< RealType, Devices::Host, IndexType > hostDense( rows, columns );
       IndexType rowIdx( 0 );
       for( auto row : data )
       {
@@ -94,7 +94,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 auto
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getView() -> ViewType
 {
    return ViewType( this->getRows(),
@@ -108,7 +108,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 auto
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getConstView() const -> ConstViewType
 {
    return ConstViewType( this->getRows(),
@@ -122,7 +122,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 String
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getSerializationType()
 {
    return ViewType::getSerializationType();
@@ -134,7 +134,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 String
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getSerializationTypeVirtual() const
 {
    return this->getSerializationType();
@@ -146,7 +146,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 setDimensions( const IndexType rows,
                const IndexType columns )
 {
@@ -164,7 +164,7 @@ template< typename Real,
           typename RealAllocator >
    template< typename Matrix_ >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 setLike( const Matrix_& matrix )
 {
    this->setDimensions( matrix.getRows(), matrix.getColumns() );
@@ -176,7 +176,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
 {
    TNL_ASSERT_EQ( rowLengths.getSize(), this->getRows(), "" );
@@ -190,7 +190,7 @@ template< typename Real,
           typename RealAllocator >
    template< typename Vector >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getCompressedRowLengths( Vector& rowLengths ) const
 {
    this->view.getCompressedRowLengths( rowLengths );
@@ -201,7 +201,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Index Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getRowLength( const IndexType row ) const
+Index DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getRowLength( const IndexType row ) const
 {
    return this->getColumns();
 }
@@ -211,7 +211,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Index Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getMaxRowLength() const
+Index DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getMaxRowLength() const
 {
    return this->getColumns();
 }
@@ -221,7 +221,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Index Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getNumberOfMatrixElements() const
+Index DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getNumberOfMatrixElements() const
 {
    return this->getRows() * this->getColumns();
 }
@@ -231,7 +231,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Index Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getNumberOfNonzeroMatrixElements() const
+Index DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getNumberOfNonzeroMatrixElements() const
 {
    return this->view.getNumberOfNonzeroMatrixElements();
 }
@@ -241,7 +241,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::reset()
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::reset()
 {
    Matrix< Real, Device, Index >::reset();
 }
@@ -251,7 +251,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::setValue( const Real& value )
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::setValue( const Real& value )
 {
    this->view.setValue( value );
 }
@@ -262,7 +262,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 __cuda_callable__ auto
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getRow( const IndexType& rowIdx ) const -> const RowView
 {
    return this->view.getRow( rowIdx );
@@ -274,7 +274,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 __cuda_callable__ auto
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getRow( const IndexType& rowIdx ) -> RowView
 {
    return this->view.getRow( rowIdx );
@@ -286,7 +286,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 __cuda_callable__
-Real& Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::operator()( const IndexType row,
+Real& DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::operator()( const IndexType row,
                                                 const IndexType column )
 {
    return this->view.operator()( row, column );
@@ -298,7 +298,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 __cuda_callable__
-const Real& Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::operator()( const IndexType row,
+const Real& DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::operator()( const IndexType row,
                                                       const IndexType column ) const
 {
    return this->view.operator()( row, column );
@@ -310,7 +310,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 setElement( const IndexType row,
             const IndexType column,
             const RealType& value )
@@ -324,7 +324,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 addElement( const IndexType row,
             const IndexType column,
             const RealType& value,
@@ -339,7 +339,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
 Real
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getElement( const IndexType row,
             const IndexType column ) const
 {
@@ -353,7 +353,7 @@ template< typename Real,
           typename RealAllocator >
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchValue& zero ) const
 {
    this->view.rowsReduction( first, last, fetch, reduce, keep, zero );
@@ -366,7 +366,7 @@ template< typename Real,
           typename RealAllocator >
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
 {
    this->rowsReduction( 0, this->getRows(), fetch, reduce, keep, zero );
@@ -379,7 +379,7 @@ template< typename Real,
           typename RealAllocator >
    template< typename Function >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 forRows( IndexType first, IndexType last, Function& function ) const
 {
    this->view.forRows( first, last, function );
@@ -392,7 +392,7 @@ template< typename Real,
           typename RealAllocator >
    template< typename Function >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 forRows( IndexType first, IndexType last, Function& function )
 {
    this->view.forRows( first, last, function );
@@ -405,7 +405,7 @@ template< typename Real,
           typename RealAllocator >
    template< typename Function >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 forAllRows( Function& function ) const
 {
    this->forRows( 0, this->getRows(), function );
@@ -418,7 +418,7 @@ template< typename Real,
           typename RealAllocator >
    template< typename Function >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 forAllRows( Function& function )
 {
    this->forRows( 0, this->getRows(), function );
@@ -431,7 +431,7 @@ template< typename Real,
           typename RealAllocator >
    template< typename Vector >
 __cuda_callable__
-typename Vector::RealType Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::rowVectorProduct( const IndexType row,
+typename Vector::RealType DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::rowVectorProduct( const IndexType row,
                                                                                    const Vector& vector ) const
 {
    return this->view.rowVectorProduct( row, vector );
@@ -445,7 +445,7 @@ template< typename Real,
    template< typename InVector,
              typename OutVector >
 void
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 vectorProduct( const InVector& inVector, OutVector& outVector ) const
 {
    this->view.vectorProduct( inVector, outVector );
@@ -457,7 +457,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
    template< typename Matrix >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::addMatrix( const Matrix& matrix,
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::addMatrix( const Matrix& matrix,
                                               const RealType& matrixMultiplicator,
                                               const RealType& thisMatrixMultiplicator )
 {
@@ -483,7 +483,7 @@ template< typename Real,
           typename Matrix2,
           int tileDim,
           int tileRowBlockSize >
-__global__ void DenseMatrixProductKernel( Dense< Real, Devices::Cuda, Index >* resultMatrix,
+__global__ void DenseMatrixProductKernel( DenseMatrix< Real, Devices::Cuda, Index >* resultMatrix,
                                                    const Matrix1* matrixA,
                                                    const Matrix2* matrixB,
                                                    const Real matrixAMultiplicator,
@@ -581,7 +581,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
    template< typename Matrix1, typename Matrix2, int tileDim >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getMatrixProduct( const Matrix1& matrix1,
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getMatrixProduct( const Matrix1& matrix1,
                                                               const Matrix2& matrix2,
                                                               const RealType& matrix1Multiplicator,
                                                               const RealType& matrix2Multiplicator )
@@ -638,7 +638,7 @@ void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getMatrixProduc
                cudaGridSize.x = columnTiles % Cuda::getMaxGridSize();
             if( gridIdx_y == rowGrids - 1 )
                cudaGridSize.y = rowTiles % Cuda::getMaxGridSize();
-            Dense* this_kernel = Cuda::passToDevice( *this );
+            DenseMatrix* this_kernel = Cuda::passToDevice( *this );
             Matrix1* matrix1_kernel = Cuda::passToDevice( matrix1 );
             Matrix2* matrix2_kernel = Cuda::passToDevice( matrix2 );
             DenseMatrixProductKernel< Real,
@@ -673,7 +673,7 @@ template< typename Real,
           typename RealAllocator,
           int tileDim,
           int tileRowBlockSize >
-__global__ void DenseTranspositionAlignedKernel( Dense< Real, Devices::Cuda, Index >* resultMatrix,
+__global__ void DenseTranspositionAlignedKernel( DenseMatrix< Real, Devices::Cuda, Index >* resultMatrix,
                                                           const Matrix* inputMatrix,
                                                           const Real matrixMultiplicator,
                                                           const Index gridIdx_x,
@@ -744,7 +744,7 @@ template< typename Real,
           typename Matrix,
           int tileDim,
           int tileRowBlockSize >
-__global__ void DenseTranspositionNonAlignedKernel( Dense< Real, Devices::Cuda, Index >* resultMatrix,
+__global__ void DenseTranspositionNonAlignedKernel( DenseMatrix< Real, Devices::Cuda, Index >* resultMatrix,
                                                              const Matrix* inputMatrix,
                                                              const Real matrixMultiplicator,
                                                              const Index gridIdx_x,
@@ -825,7 +825,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
    template< typename Matrix, int tileDim >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getTransposition( const Matrix& matrix,
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getTransposition( const Matrix& matrix,
                                                               const RealType& matrixMultiplicator )
 {
    TNL_ASSERT( this->getColumns() == matrix.getRows() &&
@@ -860,7 +860,7 @@ void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::getTranspositio
       const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() );
       const IndexType sharedMemorySize = tileDim*tileDim + tileDim*tileDim/Cuda::getNumberOfSharedMemoryBanks();
 
-      Dense* this_device = Cuda::passToDevice( *this );
+      DenseMatrix* this_device = Cuda::passToDevice( *this );
       Matrix* matrix_device = Cuda::passToDevice( matrix );
 
       for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ )
@@ -918,7 +918,7 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
    template< typename Vector1, typename Vector2 >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::performSORIteration( const Vector1& b,
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::performSORIteration( const Vector1& b,
                                                         const IndexType row,
                                                         Vector2& x,
                                                         const RealType& omega ) const
@@ -939,9 +939,9 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >&
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-operator=( const Dense< Real, Device, Index, RowMajorOrder, RealAllocator >& matrix )
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >&
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+operator=( const DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >& matrix )
 {
    setLike( matrix );
    this->values = matrix.values;
@@ -955,11 +955,11 @@ template< typename Real,
           typename RealAllocator >
    template< typename RHSReal, typename RHSDevice, typename RHSIndex,
              bool RHSRowMajorOrder, typename RHSRealAllocator >
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >&
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-operator=( const Dense< RHSReal, RHSDevice, RHSIndex, RHSRowMajorOrder, RHSRealAllocator >& matrix )
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >&
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+operator=( const DenseMatrix< RHSReal, RHSDevice, RHSIndex, RHSRowMajorOrder, RHSRealAllocator >& matrix )
 {
-   using RHSMatrix = Dense< RHSReal, RHSDevice, RHSIndex, RHSRowMajorOrder, RHSRealAllocator >;
+   using RHSMatrix = DenseMatrix< RHSReal, RHSDevice, RHSIndex, RHSRowMajorOrder, RHSRealAllocator >;
    using RHSIndexType = typename RHSMatrix::IndexType;
    using RHSRealType = typename RHSMatrix::RealType;
    using RHSDeviceType = typename RHSMatrix::DeviceType;
@@ -1027,8 +1027,8 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
    template< typename RHSMatrix >
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >&
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >&
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 operator=( const RHSMatrix& matrix )
 {
    using RHSIndexType = typename RHSMatrix::IndexType;
@@ -1118,8 +1118,8 @@ template< typename Real,
           typename RealAllocator >
    template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
 bool
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-operator==( const Dense< Real_, Device_, Index_, RowMajorOrder >& matrix ) const
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+operator==( const DenseMatrix< Real_, Device_, Index_, RowMajorOrder >& matrix ) const
 {
    return( this->getRows() == matrix.getRows() &&
            this->getColumns() == matrix.getColumns() &&
@@ -1133,8 +1133,8 @@ template< typename Real,
           typename RealAllocator >
    template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
 bool
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
-operator!=( const Dense< Real_, Device_, Index_, RowMajorOrder >& matrix ) const
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+operator!=( const DenseMatrix< Real_, Device_, Index_, RowMajorOrder >& matrix ) const
 {
    return ! ( *this == matrix );
 }
@@ -1144,7 +1144,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::save( const String& fileName ) const
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::save( const String& fileName ) const
 {
    this->view.save( fileName );
 }
@@ -1154,7 +1154,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::load( const String& fileName )
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::load( const String& fileName )
 {
    Object::load( fileName );
 }
@@ -1164,7 +1164,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::save( File& file ) const
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::save( File& file ) const
 {
    this->view.save( file );
 }
@@ -1174,7 +1174,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::load( File& file )
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::load( File& file )
 {
    Matrix< Real, Device, Index >::load( file );
    this->segments.load( file );
@@ -1186,7 +1186,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::print( std::ostream& str ) const
+void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::print( std::ostream& str ) const
 {
    this->view.print( str );
 }
@@ -1198,7 +1198,7 @@ template< typename Real,
           typename RealAllocator >
 __cuda_callable__
 Index
-Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
 getElementIndex( const IndexType row, const IndexType column ) const
 {
    return this->segments.getGlobalIndex( row, column );
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index ddd9c9328..e4e3b448a 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Assert.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 
 namespace TNL {
@@ -80,7 +80,7 @@ String
 DenseMatrixView< Real, Device, Index, RowMajorOrder >::
 getSerializationType()
 {
-   return String( "Matrices::Dense< " ) +
+   return String( "Matrices::DenseMatrix< " ) +
           TNL::getSerializationType< RealType >() + ", [any_device], " +
           TNL::getSerializationType< IndexType >() + ", " +
           ( RowMajorOrder ? "true" : "false" ) + ", [any_allocator] >";
diff --git a/src/TNL/Matrices/DistributedSpMV.h b/src/TNL/Matrices/DistributedSpMV.h
index 01e9c286f..083d7a606 100644
--- a/src/TNL/Matrices/DistributedSpMV.h
+++ b/src/TNL/Matrices/DistributedSpMV.h
@@ -20,7 +20,7 @@
 #include <utility>  // std::pair
 #include <limits>   // std::numeric_limits
 #include <TNL/Allocators/Host.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Matrices/ThreePartVector.h>
@@ -120,7 +120,7 @@ public:
 
       // copy the buffer into all rows of the preCommPattern* matrices
       // (in-place copy does not work with some OpenMPI configurations)
-      Matrices::Dense< IndexType, Devices::Host, int > preCommPatternStarts, preCommPatternEnds;
+      Matrices::DenseMatrix< IndexType, Devices::Host, int > preCommPatternStarts, preCommPatternEnds;
       preCommPatternStarts.setLike( commPatternStarts );
       preCommPatternEnds.setLike( commPatternEnds );
       for( int j = 0; j < nproc; j++ )
@@ -237,7 +237,7 @@ public:
 
 protected:
    // communication pattern
-   Matrices::Dense< IndexType, Devices::Host, int > commPatternStarts, commPatternEnds;
+   Matrices::DenseMatrix< IndexType, Devices::Host, int, true, Allocators::Host< IndexType > > commPatternStarts, commPatternEnds;
 
    // span of rows with only block-diagonal entries
    std::pair< IndexType, IndexType > localOnlySpan;
diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h
index ed999c9f2..e91e8a404 100644
--- a/src/TNL/Matrices/MatrixInfo.h
+++ b/src/TNL/Matrices/MatrixInfo.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/String.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Matrices/DenseMatrixView.h>
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/SparseMatrixView.h>
@@ -48,8 +48,8 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-struct MatrixInfo< Dense< Real, Device, Index, RowMajorOrder, RealAllocator > >
-: public MatrixInfo< typename Dense< Real, Device, Index, RowMajorOrder, RealAllocator >::ViewType >
+struct MatrixInfo< DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator > >
+: public MatrixInfo< typename DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::ViewType >
 {
 };
 
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index b3c90950a..12c9de4c9 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -17,7 +17,7 @@
 #include <TNL/Containers/Segments/CSR.h>
 #include <TNL/Matrices/SparseMatrixRowView.h>
 #include <TNL/Matrices/SparseMatrixView.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 
 namespace TNL {
 namespace Matrices {
@@ -215,7 +215,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Assignment of dense matrix
        */
       template< typename Real_, typename Device_, typename Index_, bool RowMajorOrder, typename RealAllocator_ >
-      SparseMatrix& operator=( const Dense< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >& matrix );
+      SparseMatrix& operator=( const DenseMatrix< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >& matrix );
 
 
       /**
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index 933177eae..156b370cb 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -684,9 +684,9 @@ template< typename Real,
    template< typename Real_, typename Device_, typename Index_, bool RowMajorOrder, typename RealAllocator_ >
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >&
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
-operator=( const Dense< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >& matrix )
+operator=( const DenseMatrix< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >& matrix )
 {
-   using RHSMatrix = Dense< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >;
+   using RHSMatrix = DenseMatrix< Real_, Device_, Index_, RowMajorOrder, RealAllocator_ >;
    using RHSIndexType = typename RHSMatrix::IndexType;
    using RHSRealType = typename RHSMatrix::RealType;
    using RHSDeviceType = typename RHSMatrix::DeviceType;
diff --git a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
index b901acbbd..d7a3a429d 100644
--- a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
@@ -14,7 +14,7 @@
 
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/MatrixType.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Matrices/Tridiagonal.h>
 #include <TNL/Matrices/Multidiagonal.h>
 #include <TNL/Containers/Segments/CSR.h>
@@ -539,8 +539,8 @@ void denseMatrixAssignment()
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
 
-   using DenseHost = TNL::Matrices::Dense< RealType, TNL::Devices::Host, IndexType >;
-   using DenseCuda = TNL::Matrices::Dense< RealType, TNL::Devices::Cuda, IndexType >;
+   using DenseHost = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Host, IndexType >;
+   using DenseCuda = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Cuda, IndexType >;
 
    const IndexType rows( 10 ), columns( 10 );
    DenseHost hostMatrix( rows, columns );
diff --git a/src/UnitTests/Matrices/DenseMatrixCopyTest.h b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
index 3ef31f107..5e4d42fe4 100644
--- a/src/UnitTests/Matrices/DenseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
@@ -14,7 +14,7 @@
 
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/MatrixType.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Matrices/Tridiagonal.h>
 #include <TNL/Matrices/Multidiagonal.h>
 #include <TNL/Containers/Segments/CSR.h>
@@ -33,10 +33,10 @@ using E_host   = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL:
 using E_cuda   = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, EllpackSegments >;
 using SE_host  = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, SlicedEllpackSegments >;
 using SE_cuda  = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, SlicedEllpackSegments >;
-using Dense_host               = TNL::Matrices::Dense< int, TNL::Devices::Host, int, false >;
-using Dense_host_RowMajorOrder = TNL::Matrices::Dense< int, TNL::Devices::Host, int, true >;
-using Dense_cuda               = TNL::Matrices::Dense< int, TNL::Devices::Cuda, int, false >;
-using Dense_cuda_RowMajorOrder = TNL::Matrices::Dense< int, TNL::Devices::Cuda, int, true >;
+using Dense_host               = TNL::Matrices::DenseMatrix< int, TNL::Devices::Host, int, false >;
+using Dense_host_RowMajorOrder = TNL::Matrices::DenseMatrix< int, TNL::Devices::Host, int, true >;
+using Dense_cuda               = TNL::Matrices::DenseMatrix< int, TNL::Devices::Cuda, int, false >;
+using Dense_cuda_RowMajorOrder = TNL::Matrices::DenseMatrix< int, TNL::Devices::Cuda, int, true >;
 
 
 #ifdef HAVE_GTEST
@@ -501,8 +501,8 @@ void denseMatrixAssignment()
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
 
-   using DenseHost = TNL::Matrices::Dense< RealType, TNL::Devices::Host, IndexType >;
-   using DenseCuda = TNL::Matrices::Dense< RealType, TNL::Devices::Cuda, IndexType >;
+   using DenseHost = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Host, IndexType >;
+   using DenseCuda = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Cuda, IndexType >;
 
    const IndexType rows( 10 ), columns( 10 );
    DenseHost hostMatrix( rows, columns );
diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index 8791b51fa..cb50738a9 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -10,7 +10,7 @@
 
 #include <TNL/Devices/Host.h>
 #include <TNL/Matrices/Matrix.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Containers/Array.h>
 
 #include <TNL/Containers/Vector.h>
@@ -18,11 +18,11 @@
 #include <TNL/Math.h>
 #include <iostream>
 
-using Dense_host_float = TNL::Matrices::Dense< float, TNL::Devices::Host, int >;
-using Dense_host_int = TNL::Matrices::Dense< int, TNL::Devices::Host, int >;
+using Dense_host_float = TNL::Matrices::DenseMatrix< float, TNL::Devices::Host, int >;
+using Dense_host_int = TNL::Matrices::DenseMatrix< int, TNL::Devices::Host, int >;
 
-using Dense_cuda_float = TNL::Matrices::Dense< float, TNL::Devices::Cuda, int >;
-using Dense_cuda_int = TNL::Matrices::Dense< int, TNL::Devices::Cuda, int >;
+using Dense_cuda_float = TNL::Matrices::DenseMatrix< float, TNL::Devices::Cuda, int >;
+using Dense_cuda_int = TNL::Matrices::DenseMatrix< int, TNL::Devices::Cuda, int >;
 
 static const char* TEST_FILE_NAME = "test_DenseMatrixTest.tnl";
 
@@ -33,14 +33,14 @@ static const char* TEST_FILE_NAME = "test_DenseMatrixTest.tnl";
 
 void test_GetSerializationType()
 {
-   EXPECT_EQ( ( TNL::Matrices::Dense< float, TNL::Devices::Host, int, true >::getSerializationType() ), TNL::String( "Matrices::Dense< float, [any_device], int, true, [any_allocator] >" ) );
-   EXPECT_EQ( ( TNL::Matrices::Dense< int,   TNL::Devices::Host, int, true >::getSerializationType() ), TNL::String( "Matrices::Dense< int, [any_device], int, true, [any_allocator] >" ) );
-   EXPECT_EQ( ( TNL::Matrices::Dense< float, TNL::Devices::Cuda, int, true >::getSerializationType() ), TNL::String( "Matrices::Dense< float, [any_device], int, true, [any_allocator] >" ) );
-   EXPECT_EQ( ( TNL::Matrices::Dense< int,   TNL::Devices::Cuda, int, true >::getSerializationType() ), TNL::String( "Matrices::Dense< int, [any_device], int, true, [any_allocator] >" ) );
-   EXPECT_EQ( ( TNL::Matrices::Dense< float, TNL::Devices::Host, int, false >::getSerializationType() ), TNL::String( "Matrices::Dense< float, [any_device], int, false, [any_allocator] >" ) );
-   EXPECT_EQ( ( TNL::Matrices::Dense< int,   TNL::Devices::Host, int, false >::getSerializationType() ), TNL::String( "Matrices::Dense< int, [any_device], int, false, [any_allocator] >" ) );
-   EXPECT_EQ( ( TNL::Matrices::Dense< float, TNL::Devices::Cuda, int, false >::getSerializationType() ), TNL::String( "Matrices::Dense< float, [any_device], int, false, [any_allocator] >" ) );
-   EXPECT_EQ( ( TNL::Matrices::Dense< int,   TNL::Devices::Cuda, int, false >::getSerializationType() ), TNL::String( "Matrices::Dense< int, [any_device], int, false, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< float, TNL::Devices::Host, int, true >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< float, [any_device], int, true, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< int,   TNL::Devices::Host, int, true >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< int, [any_device], int, true, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< float, TNL::Devices::Cuda, int, true >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< float, [any_device], int, true, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< int,   TNL::Devices::Cuda, int, true >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< int, [any_device], int, true, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< float, TNL::Devices::Host, int, false >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< float, [any_device], int, false, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< int,   TNL::Devices::Host, int, false >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< int, [any_device], int, false, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< float, TNL::Devices::Cuda, int, false >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< float, [any_device], int, false, [any_allocator] >" ) );
+   EXPECT_EQ( ( TNL::Matrices::DenseMatrix< int,   TNL::Devices::Cuda, int, false >::getSerializationType() ), TNL::String( "Matrices::DenseMatrix< int, [any_device], int, false, [any_allocator] >" ) );
 }
 
 template< typename Matrix >
@@ -1191,8 +1191,8 @@ void test_AssignmentOperator()
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
 
-   using DenseHost = TNL::Matrices::Dense< RealType, TNL::Devices::Host, IndexType >;
-   using DenseCuda = TNL::Matrices::Dense< RealType, TNL::Devices::Cuda, IndexType >;
+   using DenseHost = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Host, IndexType >;
+   using DenseCuda = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Cuda, IndexType >;
 
    const IndexType rows( 10 ), columns( 10 );
    DenseHost hostMatrix( rows, columns );
@@ -1363,31 +1363,31 @@ protected:
 // types for which MatrixTest is instantiated
 using MatrixTypes = ::testing::Types
 <
-    TNL::Matrices::Dense< int,    TNL::Devices::Host, short >,
-    TNL::Matrices::Dense< long,   TNL::Devices::Host, short >,
-    TNL::Matrices::Dense< float,  TNL::Devices::Host, short >,
-    TNL::Matrices::Dense< double, TNL::Devices::Host, short >,
-    TNL::Matrices::Dense< int,    TNL::Devices::Host, int >,
-    TNL::Matrices::Dense< long,   TNL::Devices::Host, int >,
-    TNL::Matrices::Dense< float,  TNL::Devices::Host, int >,
-    TNL::Matrices::Dense< double, TNL::Devices::Host, int >,
-    TNL::Matrices::Dense< int,    TNL::Devices::Host, long >,
-    TNL::Matrices::Dense< long,   TNL::Devices::Host, long >,
-    TNL::Matrices::Dense< float,  TNL::Devices::Host, long >,
-    TNL::Matrices::Dense< double, TNL::Devices::Host, long >
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Host, long >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Host, long >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Host, long >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Host, long >
 #ifdef HAVE_CUDA
-    ,TNL::Matrices::Dense< int,    TNL::Devices::Cuda, short >,
-    TNL::Matrices::Dense< long,   TNL::Devices::Cuda, short >,
-    TNL::Matrices::Dense< float,  TNL::Devices::Cuda, short >,
-    TNL::Matrices::Dense< double, TNL::Devices::Cuda, short >,
-    TNL::Matrices::Dense< int,    TNL::Devices::Cuda, int >,
-    TNL::Matrices::Dense< long,   TNL::Devices::Cuda, int >,
-    TNL::Matrices::Dense< float,  TNL::Devices::Cuda, int >,
-    TNL::Matrices::Dense< double, TNL::Devices::Cuda, int >,
-    TNL::Matrices::Dense< int,    TNL::Devices::Cuda, long >,
-    TNL::Matrices::Dense< long,   TNL::Devices::Cuda, long >,
-    TNL::Matrices::Dense< float,  TNL::Devices::Cuda, long >,
-    TNL::Matrices::Dense< double, TNL::Devices::Cuda, long >
+    ,TNL::Matrices::DenseMatrix< int,    TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Cuda, long >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Cuda, long >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Cuda, long >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Cuda, long >
 #endif
 >;
 
diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
index 829c30677..6f8a142a6 100644
--- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
@@ -14,7 +14,7 @@
 
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/MatrixType.h>
-#include <TNL/Matrices/Dense.h>
+#include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Matrices/Tridiagonal.h>
 #include <TNL/Matrices/Multidiagonal.h>
 #include <TNL/Containers/Segments/CSR.h>
@@ -542,8 +542,8 @@ void denseMatrixAssignment()
    using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
 
-   using DenseHost = TNL::Matrices::Dense< RealType, TNL::Devices::Host, IndexType >;
-   using DenseCuda = TNL::Matrices::Dense< RealType, TNL::Devices::Cuda, IndexType >;
+   using DenseHost = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Host, IndexType >;
+   using DenseCuda = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Cuda, IndexType >;
 
    const IndexType rows( 10 ), columns( 10 );
    DenseHost hostMatrix( rows, columns );
-- 
GitLab


From 6ca6f3d22fdd633d0efd36e0ea57b3519acc3477 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 21 Apr 2020 10:33:30 +0200
Subject: [PATCH 17/68] Writing documentation for dense matrix.

---
 src/TNL/Matrices/DenseMatrix.h | 97 ++++++++++++++++++++++++++++------
 1 file changed, 82 insertions(+), 15 deletions(-)

diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
index 4b09d14c3..37683d4bf 100644
--- a/src/TNL/Matrices/DenseMatrix.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -20,9 +20,19 @@
 namespace TNL {
 namespace Matrices {
 
-template< typename Device >
-class DenseDeviceDependentCode;
-
+//template< typename Device >
+//class DenseDeviceDependentCode;
+
+/**
+ * \brief Implementation of dense matrix, i.e. matrix storing explicitly all of its elements including zeros.
+ * 
+ * \tparam Real is a type of matrix elements.
+ * \tparam Device is a device where the matrix is allocated.
+ * \tparam Index is a type for indexing of the matrix elements.
+ * \tparam RowMajorOrder tells the ordering of matrix elements. If it is \e true the matrix elements
+ *         are stored in row major order. If it is \e false, the matrix elements are stored in column major order.
+ * \tparam RealAllocator is allocator for the matrix elements.
+ */
 template< typename Real = double,
           typename Device = Devices::Host,
           typename Index = int,
@@ -30,33 +40,90 @@ template< typename Real = double,
           typename RealAllocator = typename Allocators::Default< Device >::template Allocator< Real > >
 class DenseMatrix : public Matrix< Real, Device, Index >
 {
+   protected:
+      using BaseType = Matrix< Real, Device, Index, RealAllocator >;
+      using ValuesVectorType = typename BaseType::ValuesVectorType;
+      using ValuesViewType = typename ValuesVectorType::ViewType;
+      using SegmentsType = Containers::Segments::Ellpack< Device, Index, typename Allocators::Default< Device >::template Allocator< Index >, RowMajorOrder, 1 >;
+      using SegmentViewType = typename SegmentsType::SegmentViewType;
+
+
    public:
+
+      /**
+       * \brief The type of matrix elements.
+       */
       using RealType = Real;
+
+      /**
+       * \brief The device where the matrix is allocated.
+       */
       using DeviceType = Device;
+
+      /**
+       * \brief The type used for matrix elements indexing.
+       */
       using IndexType = Index;
+
+      /**
+       * \brief The allocator for matrix elements.
+       */
       using RealAllocatorType = RealAllocator;
-      using BaseType = Matrix< Real, Device, Index, RealAllocator >;
-      using ValuesVectorType = typename BaseType::ValuesVectorType;
-      using ValuesViewType = typename ValuesVectorType::ViewType;
-      using SegmentsType = Containers::Segments::Ellpack< DeviceType, IndexType, typename Allocators::Default< Device >::template Allocator< IndexType >, RowMajorOrder, 1 >;
-      using SegmentViewType = typename SegmentsType::SegmentViewType;
+
+      /**
+       * \brief Type of related matrix view. 
+       * 
+       * See \ref DenseMatrixView.
+       */
       using ViewType = DenseMatrixView< Real, Device, Index, RowMajorOrder >;
+
+      /**
+       * \brief Matrix view type for constant instances.
+       * 
+       * See \ref DenseMatrixView.
+       */
       using ConstViewType = DenseMatrixView< typename std::add_const< Real >::type, Device, Index, RowMajorOrder >;
+
+      /**
+       * \brief Type for accessing matrix row.
+       */
       using RowView = DenseMatrixRowView< SegmentViewType, ValuesViewType >;
 
+      /**
+       * \brief Helper type for getting self type or its variations.
+       */
+      template< typename _Real = Real,
+                typename _Device = Device,
+                typename _Index = Index,
+                bool RowMajorOrder_ = RowMajorOrder,
+                typename RealAllocator_ = RealAllocator >
+      using Self = DenseMatrix< _Real, _Device, _Index, RowMajorOrder_, RealAllocator_ >;
       // TODO: remove this
+
       using CompressedRowLengthsVector = typename Matrix< Real, Device, Index >::CompressedRowLengthsVector;
-      using ConstCompressedRowLengthsVectorView = typename Matrix< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView;
+      using ConstCompressedRowLengthsVectorView = typename Matrix< Real, Device, Index >::ConstCompressedRowLengthsVectorView;
 
-      template< typename _Real = Real,
-                typename _Device = Device,
-                typename _Index = Index >
-      using Self = DenseMatrix< _Real, _Device, _Index >;
 
+
+      /**
+       * \brief Constrictor without parameters.
+       */
       DenseMatrix();
 
+      /**
+       * \brief Constructor with matrix dimensions.
+       * 
+       * \param rows is number of matrix rows.
+       * \param columns is number of matrix columns.
+       */
       DenseMatrix( const IndexType rows, const IndexType columns );
 
+      /**
+       * \brief Constructor with initializer list.
+       * 
+       * \param data is a initializer list of initializer lists. The inner
+       * initializer list represents matrix rows.
+       */
       DenseMatrix( std::initializer_list< std::initializer_list< RealType > > data );
 
       ViewType getView();
@@ -226,8 +293,8 @@ class DenseMatrix : public Matrix< Real, Device, Index >
       IndexType getElementIndex( const IndexType row,
                                  const IndexType column ) const;
 
-      typedef DenseDeviceDependentCode< DeviceType > DeviceDependentCode;
-      friend class DenseDeviceDependentCode< DeviceType >;
+      //typedef DenseDeviceDependentCode< DeviceType > DeviceDependentCode;
+      //friend class DenseDeviceDependentCode< DeviceType >;
 
       SegmentsType segments;
 
-- 
GitLab


From 8e247844b226618d4795e87473b161318d14ba8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 21 Apr 2020 22:07:57 +0200
Subject: [PATCH 18/68] Writting documentation and refactoring DenseMatrix.

---
 Documentation/Examples/CMakeLists.txt         |   1 +
 .../Examples/Matrices/CMakeLists.txt          |  40 ++++++
 ...nseMatrixExample_Constructor_init_list.cpp |  24 ++++
 ...enseMatrixExample_Constructor_init_list.cu |   1 +
 ...eMatrixExample_getCompressedRowLengths.cpp |  21 ++++
 ...seMatrixExample_getCompressedRowLengths.cu |   1 +
 .../DenseMatrixExample_setElements.cpp        |  26 ++++
 .../DenseMatrixExample_setElements.cu         |   1 +
 src/TNL/Containers/Array.h                    |   2 +-
 src/TNL/Matrices/DenseMatrix.h                | 114 ++++++++++++++----
 src/TNL/Matrices/DenseMatrix.hpp              |  32 ++---
 src/TNL/Matrices/DenseMatrixView.hpp          |   7 +-
 src/TNL/Matrices/SparseMatrix.h               |   2 +-
 src/TNL/Matrices/SparseMatrix.hpp             |   2 +-
 src/UnitTests/Matrices/DenseMatrixCopyTest.h  |   6 +-
 src/UnitTests/Matrices/DenseMatrixTest.h      |  31 -----
 16 files changed, 237 insertions(+), 74 deletions(-)
 create mode 100644 Documentation/Examples/Matrices/CMakeLists.txt
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cpp
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cpp
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixExample_setElements.cpp
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu

diff --git a/Documentation/Examples/CMakeLists.txt b/Documentation/Examples/CMakeLists.txt
index 45689f9e9..ca8662ad0 100644
--- a/Documentation/Examples/CMakeLists.txt
+++ b/Documentation/Examples/CMakeLists.txt
@@ -1,6 +1,7 @@
 ADD_SUBDIRECTORY( Algorithms )
 ADD_SUBDIRECTORY( Containers )
 ADD_SUBDIRECTORY( Pointers )
+ADD_SUBDIRECTORY( Matrices )
 
 ADD_EXECUTABLE( FileExample FileExample.cpp )
 ADD_CUSTOM_COMMAND( COMMAND FileExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileExample.out OUTPUT FileExample.out )
diff --git a/Documentation/Examples/Matrices/CMakeLists.txt b/Documentation/Examples/Matrices/CMakeLists.txt
new file mode 100644
index 000000000..8cb519f7a
--- /dev/null
+++ b/Documentation/Examples/Matrices/CMakeLists.txt
@@ -0,0 +1,40 @@
+IF( BUILD_CUDA )
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list_cuda DenseMatrixExample_Constructor_init_list.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out 
+                       OUTPUT DenseMatrixExample_Constructor_init_list.out )
+
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_setElements_cuda DenseMatrixExample_setElements.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElements_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElements.out 
+                       OUTPUT DenseMatrixExample_setElements.out )
+
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_getCompressedRowLengths_cuda DenseMatrixExample_getCompressedRowLengths.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getCompressedRowLengths_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getCompressedRowLengths.out 
+                       OUTPUT DenseMatrixExample_getCompressedRowLengths.out )
+
+ELSE()
+   ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list DenseMatrixExample_Constructor_init_list.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out 
+                       OUTPUT DenseMatrixExample_Constructor_init_list.out )
+
+   ADD_EXECUTABLE( DenseMatrixExample_setElements DenseMatrixExample_setElements.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElements > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElements.out 
+                       OUTPUT DenseMatrixExample_setElements.out )
+
+   ADD_EXECUTABLE( DenseMatrixExample_getCompressedRowLengths DenseMatrixExample_getCompressedRowLengths.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getCompressedRowLengths > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getCompressedRowLengths.out 
+                       OUTPUT DenseMatrixExample_getCompressedRowLengths.out )
+ENDIF()
+
+IF( BUILD_CUDA )
+ADD_CUSTOM_TARGET( RunMatricesExamples-cuda ALL DEPENDS
+   DenseMatrixExample_Constructor_init_list.out
+   DenseMatrixExample_setElements.out
+   DenseMatrixExample_getCompressedRowLengths.out
+   )
+ELSE()
+ADD_CUSTOM_TARGET( RunMatricesExamples ALL DEPENDS
+   DenseMatrixExample_Constructor_init_list.out
+   DenseMatrixExample_setElements.out
+   DenseMatrixExample_getCompressedRowLengths.out
+   )
+ENDIF()
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cpp
new file mode 100644
index 000000000..c11178c46
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cpp
@@ -0,0 +1,24 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+int main( int argc, char* argv[] )
+{
+   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > matrix {
+      {  1,  2,  3,  4,  5,  6 },
+      {  7,  8,  9, 10, 11, 12 },
+      { 13, 14, 15, 16, 17, 18 }
+   };
+
+   std::cout << matrix << std::endl;
+
+   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > triangularMatrix {
+      {  1 },
+      {  2,  3 },
+      {  4,  5,  6 },
+      {  7,  8,  9, 10 },
+      { 11, 12, 13, 14, 15 }
+   };
+
+   std::cout << triangularMatrix << std::endl;
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu b/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu
new file mode 120000
index 000000000..91fa4f073
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu
@@ -0,0 +1 @@
+DenseMatrixExample_Constructor_init_list.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cpp
new file mode 100644
index 000000000..cb0abc6fd
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cpp
@@ -0,0 +1,21 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+int main( int argc, char* argv[] )
+{
+   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > triangularMatrix {
+      {  1 },
+      {  2,  3 },
+      {  4,  5,  6 },
+      {  7,  8,  9, 10 },
+      { 11, 12, 13, 14, 15 }
+   };
+
+   std::cout << triangularMatrix << std::endl;
+
+   TNL::Containers::Vector< int, TNL::Devices::Host > rowLengths;
+   triangularMatrix.getCompressedRowLengths( rowLengths );
+
+   std::cout << "Compressed row lengths are: " << rowLengths << std::endl;
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu b/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu
new file mode 120000
index 000000000..2b3cd6c13
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu
@@ -0,0 +1 @@
+DenseMatrixExample_getCompressedRowLengths.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cpp
new file mode 100644
index 000000000..bf96abf23
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cpp
@@ -0,0 +1,26 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+int main( int argc, char* argv[] )
+{
+   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > matrix;
+   matrix.setElements( {
+      {  1,  2,  3,  4,  5,  6 },
+      {  7,  8,  9, 10, 11, 12 },
+      { 13, 14, 15, 16, 17, 18 }
+   } );
+
+   std::cout << matrix << std::endl;
+
+   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > triangularMatrix;
+   triangularMatrix.setElements( {
+      {  1 },
+      {  2,  3 },
+      {  4,  5,  6 },
+      {  7,  8,  9, 10 },
+      { 11, 12, 13, 14, 15 }
+   } );
+
+   std::cout << triangularMatrix << std::endl;
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu b/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu
new file mode 120000
index 000000000..fa2487e27
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu
@@ -0,0 +1 @@
+DenseMatrixExample_setElements.cpp
\ No newline at end of file
diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index a73385eb1..25f8048d8 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -396,7 +396,7 @@ class Array
        * \brief Resets the array to the empty state.
        *
        * The current data will be deallocated, thus all pointers and views to
-       * the array alements will become invalid.
+       * the array elements will become invalid.
        */
       void reset();
 
diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
index 37683d4bf..40fc1302b 100644
--- a/src/TNL/Matrices/DenseMatrix.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -20,9 +20,6 @@
 namespace TNL {
 namespace Matrices {
 
-//template< typename Device >
-//class DenseDeviceDependentCode;
-
 /**
  * \brief Implementation of dense matrix, i.e. matrix storing explicitly all of its elements including zeros.
  * 
@@ -98,8 +95,8 @@ class DenseMatrix : public Matrix< Real, Device, Index >
                 bool RowMajorOrder_ = RowMajorOrder,
                 typename RealAllocator_ = RealAllocator >
       using Self = DenseMatrix< _Real, _Device, _Index, RowMajorOrder_, RealAllocator_ >;
-      // TODO: remove this
 
+      // TODO: remove this
       using CompressedRowLengthsVector = typename Matrix< Real, Device, Index >::CompressedRowLengthsVector;
       using ConstCompressedRowLengthsVectorView = typename Matrix< Real, Device, Index >::ConstCompressedRowLengthsVectorView;
 
@@ -119,46 +116,117 @@ class DenseMatrix : public Matrix< Real, Device, Index >
       DenseMatrix( const IndexType rows, const IndexType columns );
 
       /**
-       * \brief Constructor with initializer list.
+       * \brief Constructor with 2D initializer list.
        * 
-       * \param data is a initializer list of initializer lists. The inner
-       * initializer list represents matrix rows.
+       * The number of matrix rows is set to the outer list size and the number
+       * of matrix columns is set to maximum size of inner lists. Missing elements
+       * are filled in with zeros.
+       * 
+       * \param data is a initializer list of initializer lists representing
+       * list of matrix rows.
+       * 
+       * \par Example
+       * \include Matrices/DenseMatrixExample_Constructor_init_list.cpp
+       * \par Output
+       * \include DenseMatrixExample_Constructor_init_list.out
        */
       DenseMatrix( std::initializer_list< std::initializer_list< RealType > > data );
 
+      /**
+       * \brief Returns a modifiable view of the dense matrix.
+       * 
+       * See \ref DenseMatrixView.
+       * 
+       * \return dense matrix view.
+       */
       ViewType getView();
 
+      /**
+       * \brief Returns a non-modifiable view of the dense matrix.
+       * 
+       * See \ref DenseMatrixView.
+       * 
+       * \return dense matrix view.
+       */
       ConstViewType getConstView() const;
 
+      /**
+       * \brief Returns string with serialization type.
+       * 
+       * The string has a form \e `Matrices::DenseMatrix< RealType,  [any_device], IndexType, [any_allocator], true/false >`.
+       * 
+       * \return \e String with the serialization type.
+       */
       static String getSerializationType();
 
+      /**
+       * \brief Returns string with serialization type.
+       * 
+       * See \ref DenseMatrix::getSerializationType.
+       * 
+       * \return \e String with the serialization type.
+       */
       virtual String getSerializationTypeVirtual() const;
 
+      /**
+       * \brief Set number of rows and columns of this matrix.
+       * 
+       * \param rows is the number of matrix rows.
+       * \param columns is the number of matrix columns.
+       */
       void setDimensions( const IndexType rows,
                           const IndexType columns );
 
+      /**
+       * \brief Set the number of matrix rows and columns by the given matrix.
+       * 
+       * \tparam Matrix is matrix type. This can be any matrix having methods 
+       *  \ref getRows and \ref getColumns.
+       * 
+       * \param matrix in the input matrix dimensions of which are to be adopted.
+       */
       template< typename Matrix >
       void setLike( const Matrix& matrix );
 
       /**
-       * \brief This method creates dense matrix from 2D initializer list.
+       * \brief This method recreates the dense matrix from 2D initializer list.
+       * 
+       * The number of matrix rows is set to the outer list size and the number
+       * of matrix columns is set to maximum size of inner lists. Missing elements
+       * are filled in with zeros.
        * 
-       * The matrix dimensions will be adjusted by the input data.
+       * \param data is a initializer list of initializer lists representing
+       * list of matrix rows.
        * 
-       * @param data
+       * \par Example
+       * \include Matrices/DenseMatrixExample_setElements.cpp
+       * \par Output
+       * \include DenseMatrixExample_setElements.out
        */
       void setElements( std::initializer_list< std::initializer_list< RealType > > data );
-      
+
       /**
-       * This method is only for the compatibility with the sparse matrices.
+       * \brief This method is only for the compatibility with the sparse matrices.
+       * 
+       * This method does nothing. In debug mode it contains assertions checking
+       * that given rowCapacities are compatible with the current matrix dimensions.
        */
-      void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths );
+      template< typename RowCapacitiesVector >
+      void setRowCapacities( const RowCapacitiesVector& rowCapacities );
 
-      template< typename Vector >
-      void getCompressedRowLengths( Vector& rowLengths ) const;
-
-      [[deprecated]]
-      IndexType getRowLength( const IndexType row ) const;
+      /**
+       * \brief Computes number of non-zeros in each row.
+       * 
+       * \param rowLengths is a vector into which the number of non-zeros in each row
+       * will be stored.
+       * 
+       * \par Example
+       * \include Matrices/DenseMatrixExample_getCompressedRowLengths.cpp
+       * \par Output
+       * \include DenseMatrixExample_getCompressedRowLengths.out
+       */
+      template< typename RowLengthsVector >
+      void getCompressedRowLengths( RowLengthsVector& rowLengths ) const;
 
       IndexType getMaxRowLength() const;
 
@@ -293,14 +361,18 @@ class DenseMatrix : public Matrix< Real, Device, Index >
       IndexType getElementIndex( const IndexType row,
                                  const IndexType column ) const;
 
-      //typedef DenseDeviceDependentCode< DeviceType > DeviceDependentCode;
-      //friend class DenseDeviceDependentCode< DeviceType >;
-
       SegmentsType segments;
 
       ViewType view;
 };
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+std::ostream& operator<< ( std::ostream& str, const DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >& matrix );
+
 } // namespace Matrices
 } // namespace TNL
 
diff --git a/src/TNL/Matrices/DenseMatrix.hpp b/src/TNL/Matrices/DenseMatrix.hpp
index f0d1ecc04..64a14afc8 100644
--- a/src/TNL/Matrices/DenseMatrix.hpp
+++ b/src/TNL/Matrices/DenseMatrix.hpp
@@ -175,12 +175,13 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
+   template< typename RowCapacitiesVector >
 void
 DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
-setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths )
+setRowCapacities( const RowCapacitiesVector& rowCapacities )
 {
-   TNL_ASSERT_EQ( rowLengths.getSize(), this->getRows(), "" );
-   TNL_ASSERT_LE( max( rowLengths ), this->getColumns(), "" );
+   TNL_ASSERT_EQ( rowCapacities.getSize(), this->getRows(), "" );
+   TNL_ASSERT_LE( max( rowCapacities ), this->getColumns(), "" );
 }
 
 template< typename Real,
@@ -188,24 +189,14 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-   template< typename Vector >
+   template< typename RowLengthsVector >
 void
 DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
-getCompressedRowLengths( Vector& rowLengths ) const
+getCompressedRowLengths( RowLengthsVector& rowLengths ) const
 {
    this->view.getCompressedRowLengths( rowLengths );
 }
 
-template< typename Real,
-          typename Device,
-          typename Index,
-          bool RowMajorOrder,
-          typename RealAllocator >
-Index DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getRowLength( const IndexType row ) const
-{
-   return this->getColumns();
-}
-
 template< typename Real,
           typename Device,
           typename Index,
@@ -1204,5 +1195,16 @@ getElementIndex( const IndexType row, const IndexType column ) const
    return this->segments.getGlobalIndex( row, column );
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          bool RowMajorOrder,
+          typename RealAllocator >
+std::ostream& operator<< ( std::ostream& str, const DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >& matrix )
+{ 
+   matrix.print( str );
+   return str;
+}
+
 } // namespace Matrices
 } // namespace TNL
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index e4e3b448a..ec2e3f892 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include <iomanip>
 #include <TNL/Assert.h>
 #include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Exceptions/NotImplementedError.h>
@@ -682,7 +683,11 @@ void DenseMatrixView< Real, Device, Index, RowMajorOrder >::print( std::ostream&
    {
       str <<"Row: " << row << " -> ";
       for( IndexType column = 0; column < this->getColumns(); column++ )
-         str << " Col:" << column << "->" << this->getElement( row, column ) << "\t";
+      {
+         std::stringstream str_;
+         str_ << column << ":" << this->getElement( row, column );
+         str << std::setw( 6 ) << str_.str();
+      }
       str << std::endl;
    }
 }
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 12c9de4c9..046b0a6ae 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -111,7 +111,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       virtual String getSerializationTypeVirtual() const;
 
       template< typename RowsCapacitiesVector >
-      void setCompressedRowLengths( const RowsCapacitiesVector& rowCapacities );
+      void setRowCapacities( const RowsCapacitiesVector& rowCapacities );
 
       // TODO: Remove this when possible
       void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) {
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index 156b370cb..ea74a9e9d 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -204,7 +204,7 @@ template< typename Real,
    template< typename RowsCapacitiesVector >
 void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
-setCompressedRowLengths( const RowsCapacitiesVector& rowsCapacities )
+setRowCapacities( const RowsCapacitiesVector& rowsCapacities )
 {
    TNL_ASSERT_EQ( rowsCapacities.getSize(), this->getRows(), "Number of matrix rows does not fit with rowLengths vector size." );
    using RowsCapacitiesVectorDevice = typename RowsCapacitiesVector::DeviceType;
diff --git a/src/UnitTests/Matrices/DenseMatrixCopyTest.h b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
index 5e4d42fe4..d041de1db 100644
--- a/src/UnitTests/Matrices/DenseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
@@ -72,7 +72,7 @@ void setupUnevenRowSizeMatrix( Matrix& m )
    rowLengths.setElement( 6,  1 );
    rowLengths.setElement( 7,  1 );
    rowLengths.setElement( 9,  1 );
-   m.setCompressedRowLengths( rowLengths );
+   m.setRowCapacities( rowLengths );
 
     int value = 1;
     for( int i = 0; i < cols - 4; i++ )  // 0th row
@@ -202,7 +202,7 @@ void setupAntiTriDiagMatrix( Matrix& m )
    rowLengths.setValue( 3 );
    rowLengths.setElement( 0, 4);
    rowLengths.setElement( 1,  4 );
-   m.setCompressedRowLengths( rowLengths );
+   m.setRowCapacities( rowLengths );
 
    int value = 1;
    for( int i = 0; i < rows; i++ )
@@ -289,7 +289,7 @@ void setupTriDiagMatrix( Matrix& m )
    rowLengths.setValue( 3 );
    rowLengths.setElement( 0 , 4 );
    rowLengths.setElement( 1,  4 );
-   m.setCompressedRowLengths( rowLengths );
+   m.setRowCapacities( rowLengths );
 
 
    int value = 1;
diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index cb50738a9..81e2ae682 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -162,30 +162,6 @@ void test_GetCompressedRowLengths()
    EXPECT_EQ( rowLengths, correctRowLengths );
 }
 
-template< typename Matrix >
-void test_GetRowLength()
-{
-    using RealType = typename Matrix::RealType;
-    using DeviceType = typename Matrix::DeviceType;
-    using IndexType = typename Matrix::IndexType;
-
-    const IndexType rows = 8;
-    const IndexType cols = 7;
-
-    Matrix m;
-    m.reset();
-    m.setDimensions( rows, cols );
-
-    EXPECT_EQ( m.getRowLength( 0 ), 7 );
-    EXPECT_EQ( m.getRowLength( 1 ), 7 );
-    EXPECT_EQ( m.getRowLength( 2 ), 7 );
-    EXPECT_EQ( m.getRowLength( 3 ), 7 );
-    EXPECT_EQ( m.getRowLength( 4 ), 7 );
-    EXPECT_EQ( m.getRowLength( 5 ), 7 );
-    EXPECT_EQ( m.getRowLength( 6 ), 7 );
-    EXPECT_EQ( m.getRowLength( 7 ), 7 );
-}
-
 template< typename Matrix >
 void test_GetNumberOfMatrixElements()
 {
@@ -1419,13 +1395,6 @@ TYPED_TEST( MatrixTest, setElementsTest )
     test_SetElements< MatrixType >();
 }
 
-TYPED_TEST( MatrixTest, getRowLengthTest )
-{
-    using MatrixType = typename TestFixture::MatrixType;
-
-    test_GetRowLength< MatrixType >();
-}
-
 TYPED_TEST( MatrixTest, getNumberOfMatrixElementsTest )
 {
     using MatrixType = typename TestFixture::MatrixType;
-- 
GitLab


From e0eb72002bd92638672abe2211b12ded7a3c6724 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 22 Apr 2020 21:36:29 +0200
Subject: [PATCH 19/68] Writting documentation and refactoring DenseMatrix.

---
 .../Examples/Matrices/CMakeLists.txt          |  62 ++++++----
 ...enseMatrixExample_Constructor_init_list.cu |   1 -
 ...seMatrixExample_getCompressedRowLengths.cu |   1 -
 .../DenseMatrixExample_getConstRow.cpp        |  32 +++++
 .../DenseMatrixExample_getElementsCount.cpp   |  17 +++
 .../Matrices/DenseMatrixExample_getRow.cpp    |  21 ++++
 .../DenseMatrixExample_setElements.cu         |   1 -
 src/TNL/Containers/Segments/BiEllpack.hpp     |   3 +-
 .../Containers/Segments/ChunkedEllpack.hpp    |   3 +-
 src/TNL/File.hpp                              |   2 +-
 src/TNL/Matrices/DenseMatrix.h                | 114 +++++++++++++++---
 src/TNL/Matrices/DenseMatrix.hpp              |  16 +--
 src/TNL/Matrices/DenseMatrixRowView.h         |   4 +-
 src/TNL/Matrices/DenseMatrixRowView.hpp       |   4 +-
 src/TNL/Matrices/DenseMatrixView.h            |   4 +-
 src/TNL/Matrices/DenseMatrixView.hpp          |  10 +-
 src/TNL/Matrices/MatrixView.h                 |   2 -
 src/TNL/Matrices/MatrixView.hpp               |  11 --
 src/TNL/Matrices/SparseMatrix.h               |   2 +-
 src/UnitTests/Matrices/DenseMatrixTest.h      |  72 ++---------
 20 files changed, 236 insertions(+), 146 deletions(-)
 delete mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu
 delete mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cpp
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cpp
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixExample_getRow.cpp
 delete mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu

diff --git a/Documentation/Examples/Matrices/CMakeLists.txt b/Documentation/Examples/Matrices/CMakeLists.txt
index 8cb519f7a..f91ae36e0 100644
--- a/Documentation/Examples/Matrices/CMakeLists.txt
+++ b/Documentation/Examples/Matrices/CMakeLists.txt
@@ -1,17 +1,17 @@
-IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list_cuda DenseMatrixExample_Constructor_init_list.cu )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out 
-                       OUTPUT DenseMatrixExample_Constructor_init_list.out )
-
-   CUDA_ADD_EXECUTABLE( DenseMatrixExample_setElements_cuda DenseMatrixExample_setElements.cu )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElements_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElements.out 
-                       OUTPUT DenseMatrixExample_setElements.out )
-
-   CUDA_ADD_EXECUTABLE( DenseMatrixExample_getCompressedRowLengths_cuda DenseMatrixExample_getCompressedRowLengths.cu )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getCompressedRowLengths_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getCompressedRowLengths.out 
-                       OUTPUT DenseMatrixExample_getCompressedRowLengths.out )
-
-ELSE()
+#IF( BUILD_CUDA )
+#   CUDA_ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list_cuda DenseMatrixExample_Constructor_init_list.cu )
+#   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out 
+#                       OUTPUT DenseMatrixExample_Constructor_init_list.out )
+#
+#   CUDA_ADD_EXECUTABLE( DenseMatrixExample_setElements_cuda DenseMatrixExample_setElements.cu )
+#   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElements_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElements.out 
+#                       OUTPUT DenseMatrixExample_setElements.out )
+#
+#   CUDA_ADD_EXECUTABLE( DenseMatrixExample_getCompressedRowLengths_cuda DenseMatrixExample_getCompressedRowLengths.cu )
+#   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getCompressedRowLengths_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getCompressedRowLengths.out 
+#                       OUTPUT DenseMatrixExample_getCompressedRowLengths.out )
+#
+#ELSE()
    ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list DenseMatrixExample_Constructor_init_list.cpp )
    ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out 
                        OUTPUT DenseMatrixExample_Constructor_init_list.out )
@@ -23,18 +23,34 @@ ELSE()
    ADD_EXECUTABLE( DenseMatrixExample_getCompressedRowLengths DenseMatrixExample_getCompressedRowLengths.cpp )
    ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getCompressedRowLengths > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getCompressedRowLengths.out 
                        OUTPUT DenseMatrixExample_getCompressedRowLengths.out )
-ENDIF()
 
-IF( BUILD_CUDA )
-ADD_CUSTOM_TARGET( RunMatricesExamples-cuda ALL DEPENDS
-   DenseMatrixExample_Constructor_init_list.out
-   DenseMatrixExample_setElements.out
-   DenseMatrixExample_getCompressedRowLengths.out
-   )
-ELSE()
+   ADD_EXECUTABLE( DenseMatrixExample_getElementsCount DenseMatrixExample_getElementsCount.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getElementsCount > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getElementsCount.out 
+                       OUTPUT DenseMatrixExample_getElementsCount.out )
+
+   ADD_EXECUTABLE( DenseMatrixExample_getConstRow DenseMatrixExample_getConstRow.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getConstRow > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getConstRow.out 
+                       OUTPUT DenseMatrixExample_getConstRow.out )
+
+   ADD_EXECUTABLE( DenseMatrixExample_getRow DenseMatrixExample_getRow.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getRow > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getRow.out 
+                       OUTPUT DenseMatrixExample_getRow.out )
+
+#ENDIF()
+
+#IF( BUILD_CUDA )
+#ADD_CUSTOM_TARGET( RunMatricesExamples-cuda ALL DEPENDS
+#   DenseMatrixExample_Constructor_init_list.out
+#   DenseMatrixExample_setElements.out
+#   DenseMatrixExample_getCompressedRowLengths.out
+#   )
+#ELSE()
 ADD_CUSTOM_TARGET( RunMatricesExamples ALL DEPENDS
    DenseMatrixExample_Constructor_init_list.out
    DenseMatrixExample_setElements.out
    DenseMatrixExample_getCompressedRowLengths.out
+   DenseMatrixExample_getElementsCount.out
+   DenseMatrixExample_getConstRow.out
+   DenseMatrixExample_getRow.out
    )
-ENDIF()
+#ENDIF()
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu b/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu
deleted file mode 120000
index 91fa4f073..000000000
--- a/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu
+++ /dev/null
@@ -1 +0,0 @@
-DenseMatrixExample_Constructor_init_list.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu b/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu
deleted file mode 120000
index 2b3cd6c13..000000000
--- a/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu
+++ /dev/null
@@ -1 +0,0 @@
-DenseMatrixExample_getCompressedRowLengths.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cpp
new file mode 100644
index 000000000..8e5da1d4b
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cpp
@@ -0,0 +1,32 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+int main( int argc, char* argv[] )
+{
+   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > matrix {
+      { 1, 0, 0, 0, 0 },
+      { 1, 2, 0, 0, 0 },
+      { 1, 2, 3, 0, 0 },
+      { 1, 2, 3, 4, 0 },
+      { 1, 2, 3, 4, 5 }
+   };
+
+   /***
+    * We need a matrix view to pass the matrix to lambda function even on CUDA device.
+    */
+   const auto matrixView = matrix.getConstView();
+
+   /***
+    * Fetch lambda function returns diagonal element in each row.
+    */
+   auto fetch = [=] __cuda_callable__ ( int rowIdx ) mutable -> double {
+      auto row = matrixView.getRow( rowIdx );
+      return row.getElement( rowIdx );
+   };
+
+   int trace = TNL::Algorithms::Reduction< TNL::Devices::Host >::reduce( matrix.getRows(), std::plus<>{}, fetch, 0 );
+   std::cout << "Matrix trace is " << trace << "." << std::endl;
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cpp
new file mode 100644
index 000000000..997dadb7f
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cpp
@@ -0,0 +1,17 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+int main( int argc, char* argv[] )
+{
+   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > triangularMatrix {
+      {  1 },
+      {  2,  3 },
+      {  4,  5,  6 },
+      {  7,  8,  9, 10 },
+      { 11, 12, 13, 14, 15 }
+   };
+
+   std::cout << "Matrix elements count is " << triangularMatrix.getElementsCount() << "." << std::endl;
+   std::cout << "Non-zero matrix elements count is " << triangularMatrix.getNonzeroElementsCount() << "." << std::endl;
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cpp
new file mode 100644
index 000000000..120c934a3
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cpp
@@ -0,0 +1,21 @@
+#include <iostream>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+int main( int argc, char* argv[] )
+{
+   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > matrix( 5, 5 );
+
+   /***
+    * We need a matrix view to pass the matrix to lambda function even on CUDA device.
+    */
+   auto matrixView = matrix.getView();
+   auto f = [=] __cuda_callable__ ( int rowIdx ) mutable {
+      auto row = matrixView.getRow( rowIdx );
+      row.setElement( rowIdx, 10* ( rowIdx + 1 ) );
+   };
+
+   TNL::Algorithms::ParallelFor< TNL::Devices::Host >::exec( 0, matrix.getRows(), f );
+   std::cout << matrix << std::endl;
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu b/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu
deleted file mode 120000
index fa2487e27..000000000
--- a/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu
+++ /dev/null
@@ -1 +0,0 @@
-DenseMatrixExample_setElements.cpp
\ No newline at end of file
diff --git a/src/TNL/Containers/Segments/BiEllpack.hpp b/src/TNL/Containers/Segments/BiEllpack.hpp
index 91ebea207..5203b3b3f 100644
--- a/src/TNL/Containers/Segments/BiEllpack.hpp
+++ b/src/TNL/Containers/Segments/BiEllpack.hpp
@@ -341,7 +341,8 @@ setSegmentsSizes( const SizesHolder& segmentsSizes )
    else
    {
       BiEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< IndexType >, RowMajorOrder > hostSegments;
-      Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes( segmentsSizes );
+      Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes;
+      hostSegmentsSizes = segmentsSizes;
       hostSegments.setSegmentsSizes( hostSegmentsSizes );
       *this = hostSegments;
    }
diff --git a/src/TNL/Containers/Segments/ChunkedEllpack.hpp b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
index 444360d66..83150c766 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
@@ -294,7 +294,8 @@ setSegmentsSizes( const SizesHolder& segmentsSizes )
    else
    {
       ChunkedEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< Index >, RowMajorOrder > hostSegments;
-      Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes( segmentsSizes );
+      Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes;
+      hostSegmentsSizes = segmentsSizes;
       hostSegments.setSegmentsSizes( hostSegmentsSizes );
       *this = hostSegments;
    }
diff --git a/src/TNL/File.hpp b/src/TNL/File.hpp
index af112e992..05e9d9ad7 100644
--- a/src/TNL/File.hpp
+++ b/src/TNL/File.hpp
@@ -176,7 +176,7 @@ template< typename Type,
           typename Allocator >
 void File::save( const Type* buffer, std::streamsize elements )
 {
-   static_assert( std::is_same< Type, typename Allocator::value_type >::value,
+   static_assert( std::is_same< std::remove_cv_t< Type >, typename Allocator::value_type >::value,
                   "Allocator::value_type must be the same as Type." );
    TNL_ASSERT_GE( elements, 0, "Number of elements to save must be non-negative." );
 
diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
index 40fc1302b..8b22930e9 100644
--- a/src/TNL/Matrices/DenseMatrix.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -87,23 +87,17 @@ class DenseMatrix : public Matrix< Real, Device, Index >
       using RowView = DenseMatrixRowView< SegmentViewType, ValuesViewType >;
 
       /**
-       * \brief Helper type for getting self type or its variations.
+       * \brief Helper type for getting self type or its modifications.
        */
       template< typename _Real = Real,
                 typename _Device = Device,
                 typename _Index = Index,
-                bool RowMajorOrder_ = RowMajorOrder,
-                typename RealAllocator_ = RealAllocator >
-      using Self = DenseMatrix< _Real, _Device, _Index, RowMajorOrder_, RealAllocator_ >;
-
-      // TODO: remove this
-      using CompressedRowLengthsVector = typename Matrix< Real, Device, Index >::CompressedRowLengthsVector;
-      using ConstCompressedRowLengthsVectorView = typename Matrix< Real, Device, Index >::ConstCompressedRowLengthsVectorView;
-
-
+                bool _RowMajorOrder = RowMajorOrder,
+                typename _RealAllocator = RealAllocator >
+      using Self = DenseMatrix< _Real, _Device, _Index, _RowMajorOrder, _RealAllocator >;
 
       /**
-       * \brief Constrictor without parameters.
+       * \brief Constructor without parameters.
        */
       DenseMatrix();
 
@@ -228,31 +222,119 @@ class DenseMatrix : public Matrix< Real, Device, Index >
       template< typename RowLengthsVector >
       void getCompressedRowLengths( RowLengthsVector& rowLengths ) const;
 
-      IndexType getMaxRowLength() const;
-
-      IndexType getNumberOfMatrixElements() const;
+      /**
+       * \brief Returns number of all matrix elements.
+       * 
+       * This method is here mainly for compatibility with sparse matrices since
+       * the number of all matrix elements is just number of rows times number of
+       * columns.
+       * 
+       * \return number of all matrix elements.
+       * 
+       * \par Example
+       * \include Matrices/DenseMatrixExample_getElementsCount.cpp
+       * \par Output
+       * \include DenseMatrixExample_getElementsCount.out
+       */
+      IndexType getElementsCount() const;
 
-      IndexType getNumberOfNonzeroMatrixElements() const;
+      /**
+       * \brief Returns number of non-zero matrix elements.
+       * 
+       * \return number of all non-zero matrix elements.
+       * 
+       * \par Example
+       * \include Matrices/DenseMatrixExample_getElementsCount.cpp
+       * \par Output
+       * \include DenseMatrixExample_getElementsCount.out
+       */
+      IndexType getNonzeroElementsCount() const;
 
+      /**
+       * \brief Resets the matrix to zero dimensions.
+       */
       void reset();
 
+      /**
+       * \brief Constant getter of simple structure for accessing given matrix row.
+       * 
+       * \param rowIdx is matrix row index.
+       * 
+       * \return RowView for accessing given matrix row.
+       *
+       * \par Example
+       * \include Matrices/DenseMatrixExample_getConstRow.cpp
+       * \par Output
+       * \include DenseMatrixExample_getConstRow.out
+       */
       __cuda_callable__
       const RowView getRow( const IndexType& rowIdx ) const;
 
+      /**
+       * \brief Non-constant getter of simple structure for accessing given matrix row.
+       * 
+       * \param rowIdx is matrix row index.
+       * 
+       * \return RowView for accessing given matrix row.
+       * 
+       * \par Example
+       * \include Matrices/DenseMatrixExample_getRow.cpp
+       * \par Output
+       * \include DenseMatrixExample_getRow.out
+       */
       __cuda_callable__
       RowView getRow( const IndexType& rowIdx );
 
-
+      /**
+       * \brief Sets all matrix elements to value \e v.
+       * 
+       * \param v is value all matrix elements will be set to.
+       */
       void setValue( const RealType& v );
 
+      /**
+       * \brief Returns non-constant reference to element at row \e row and column column.
+       * 
+       * Since this method returns reference to the element, it cannot be called across
+       * different address spaces. It means that it can be called only form CPU if the matrix
+       * is allocated on CPU or only from GPU kernels if the matrix is allocated on GPU.
+       * 
+       * \param row is a row index of the element.
+       * \param column is a columns index of the element. 
+       * \return reference to given matrix element.
+       */
       __cuda_callable__
       Real& operator()( const IndexType row,
                         const IndexType column );
 
+      /**
+       * \brief Returns constant reference to element at row \e row and column column.
+       * 
+       * Since this method returns reference to the element, it cannot be called across
+       * different address spaces. It means that it can be called only form CPU if the matrix
+       * is allocated on CPU or only from GPU kernels if the matrix is allocated on GPU.
+       * 
+       * \param row is a row index of the element.
+       * \param column is a columns index of the element. 
+       * \return reference to given matrix element.
+       */
       __cuda_callable__
       const Real& operator()( const IndexType row,
                               const IndexType column ) const;
 
+      /**
+       * \brief Sets element at given \e row and \e column to given \e value.
+       * 
+       * This method can be called only from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated in GPU device
+       * this methods transfer values of each matrix element separately and so the
+       * performance is very low. For higher performance see. \ref DenseMatrix::getRow
+       * or \ref DenseMatrix::forRows and \ref DenseMatrix::forAllRows.
+       * 
+       * \param row is row index of the element.
+       * \param column is columns index of the element.
+       * \param value is the value the element will be set to.
+       */
       void setElement( const IndexType row,
                        const IndexType column,
                        const RealType& value );
diff --git a/src/TNL/Matrices/DenseMatrix.hpp b/src/TNL/Matrices/DenseMatrix.hpp
index 64a14afc8..4d8166f64 100644
--- a/src/TNL/Matrices/DenseMatrix.hpp
+++ b/src/TNL/Matrices/DenseMatrix.hpp
@@ -202,17 +202,7 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Index DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getMaxRowLength() const
-{
-   return this->getColumns();
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          bool RowMajorOrder,
-          typename RealAllocator >
-Index DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getNumberOfMatrixElements() const
+Index DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getElementsCount() const
 {
    return this->getRows() * this->getColumns();
 }
@@ -222,9 +212,9 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Index DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getNumberOfNonzeroMatrixElements() const
+Index DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getNonzeroElementsCount() const
 {
-   return this->view.getNumberOfNonzeroMatrixElements();
+   return this->view.getNonzeroElementsCount();
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/DenseMatrixRowView.h b/src/TNL/Matrices/DenseMatrixRowView.h
index 84c6b141c..01fdd9408 100644
--- a/src/TNL/Matrices/DenseMatrixRowView.h
+++ b/src/TNL/Matrices/DenseMatrixRowView.h
@@ -32,10 +32,10 @@ class DenseMatrixRowView
       IndexType getSize() const;
 
       __cuda_callable__
-      const RealType& getValue( const IndexType column ) const;
+      const RealType& getElement( const IndexType column ) const;
 
       __cuda_callable__
-      RealType& getValue( const IndexType column );
+      RealType& getElement( const IndexType column );
 
       __cuda_callable__
       void setElement( const IndexType column,
diff --git a/src/TNL/Matrices/DenseMatrixRowView.hpp b/src/TNL/Matrices/DenseMatrixRowView.hpp
index 1962a4d9a..9ca725396 100644
--- a/src/TNL/Matrices/DenseMatrixRowView.hpp
+++ b/src/TNL/Matrices/DenseMatrixRowView.hpp
@@ -38,7 +38,7 @@ template< typename SegmentView,
           typename ValuesView >
 __cuda_callable__ auto
 DenseMatrixRowView< SegmentView, ValuesView >::
-getValue( const IndexType column ) const -> const RealType&
+getElement( const IndexType column ) const -> const RealType&
 {
    TNL_ASSERT_LT( column, this->getSize(), "Column index exceeds matrix row size." );
    return values[ segmentView.getGlobalIndex( column ) ];
@@ -48,7 +48,7 @@ template< typename SegmentView,
           typename ValuesView >
 __cuda_callable__ auto
 DenseMatrixRowView< SegmentView, ValuesView >::
-getValue( const IndexType column ) -> RealType&
+getElement( const IndexType column ) -> RealType&
 {
    TNL_ASSERT_LT( column, this->getSize(), "Column index exceeds matrix row size." );
    return values[ segmentView.getGlobalIndex( column ) ];
diff --git a/src/TNL/Matrices/DenseMatrixView.h b/src/TNL/Matrices/DenseMatrixView.h
index a7e1a09a7..f0d0b388e 100644
--- a/src/TNL/Matrices/DenseMatrixView.h
+++ b/src/TNL/Matrices/DenseMatrixView.h
@@ -87,9 +87,9 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
 
       IndexType getMaxRowLength() const;
 
-      IndexType getNumberOfMatrixElements() const;
+      IndexType getElementsCount() const;
 
-      IndexType getNumberOfNonzeroMatrixElements() const;
+      IndexType getNonzeroElementsCount() const;
 
       void reset();
 
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index ec2e3f892..bf2ebd4f2 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -144,7 +144,7 @@ template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder >
-Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getNumberOfMatrixElements() const
+Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getElementsCount() const
 {
    return this->getRows() * this->getColumns();
 }
@@ -153,7 +153,7 @@ template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder >
-Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getNumberOfNonzeroMatrixElements() const
+Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getNonzeroElementsCount() const
 {
    const auto values_view = this->values.getConstView();
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
@@ -189,7 +189,7 @@ DenseMatrixView< Real, Device, Index, RowMajorOrder >::
 getRow( const IndexType& rowIdx ) const -> const RowView
 {
    TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." );
-   return RowView( this->segments.getSegmentView( rowIdx ), this->values.getView() );
+   return RowView( this->segments.getSegmentView( rowIdx ), this->values.getConstView() );
 }
 
 template< typename Real,
@@ -685,8 +685,8 @@ void DenseMatrixView< Real, Device, Index, RowMajorOrder >::print( std::ostream&
       for( IndexType column = 0; column < this->getColumns(); column++ )
       {
          std::stringstream str_;
-         str_ << column << ":" << this->getElement( row, column );
-         str << std::setw( 6 ) << str_.str();
+         str_ << std::setw( 4 ) << std::right << column << ":" << std::setw( 4 ) << std::left << this->getElement( row, column );
+         str << std::setw( 10 ) << str_.str();
       }
       str << std::endl;
    }
diff --git a/src/TNL/Matrices/MatrixView.h b/src/TNL/Matrices/MatrixView.h
index 895510181..76a3948a9 100644
--- a/src/TNL/Matrices/MatrixView.h
+++ b/src/TNL/Matrices/MatrixView.h
@@ -83,8 +83,6 @@ public:
 
    virtual void save( File& file ) const;
 
-   virtual void load( File& file );
-
    virtual void print( std::ostream& str ) const;
 
 
diff --git a/src/TNL/Matrices/MatrixView.hpp b/src/TNL/Matrices/MatrixView.hpp
index dfac8f3af..b2b181e4c 100644
--- a/src/TNL/Matrices/MatrixView.hpp
+++ b/src/TNL/Matrices/MatrixView.hpp
@@ -155,17 +155,6 @@ void MatrixView< Real, Device, Index >::save( File& file ) const
    file << this->values;
 }
 
-template< typename Real,
-          typename Device,
-          typename Index >
-void MatrixView< Real, Device, Index >::load( File& file )
-{
-   Object::load( file );
-   file.load( &this->rows );
-   file.load( &this->columns );
-   file >> this->values;
-}
-
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 046b0a6ae..0348a3945 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -115,7 +115,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
 
       // TODO: Remove this when possible
       void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) {
-         this->setCompressedRowLengths( rowLengths );
+         this->setRowCapacities( rowLengths );
       };
 
       void setElements( const std::initializer_list< std::tuple< IndexType, IndexType, RealType > >& data );
diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index 81e2ae682..391043f0f 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -163,7 +163,7 @@ void test_GetCompressedRowLengths()
 }
 
 template< typename Matrix >
-void test_GetNumberOfMatrixElements()
+void test_GetElementsCount()
 {
     using RealType = typename Matrix::RealType;
     using DeviceType = typename Matrix::DeviceType;
@@ -176,11 +176,11 @@ void test_GetNumberOfMatrixElements()
     m.reset();
     m.setDimensions( rows, cols );
 
-    EXPECT_EQ( m.getNumberOfMatrixElements(), 42 );
+    EXPECT_EQ( m.getElementsCount(), 42 );
 }
 
 template< typename Matrix >
-void test_GetNumberOfNonzeroMatrixElements()
+void test_GetNonzeroElementsCount()
 {
     using RealType = typename Matrix::RealType;
     using DeviceType = typename Matrix::DeviceType;
@@ -212,7 +212,7 @@ void test_GetNumberOfNonzeroMatrixElements()
     m.setElement( 0, 0, 0); // Set the first element of the diagonal to 0.
     m.setElement( 6, 5, 0); // Set the last element of the diagonal to 0.
 
-    EXPECT_EQ( m.getNumberOfNonzeroMatrixElements(), 40 );
+    EXPECT_EQ( m.getNonzeroElementsCount(), 40 );
 }
 
 template< typename Matrix >
@@ -706,7 +706,7 @@ void test_AddRow()
       auto row = matrix_view.getRow( rowIdx );
       for( IndexType i = 0; i < 5; i++ )
       {
-         RealType& val = row.getValue( i );
+         RealType& val = row.getElement( i );
          val = rowIdx * val + values[ rowIdx ][ i ];
       }
    };
@@ -1281,53 +1281,6 @@ void test_SaveAndLoad()
     EXPECT_EQ( savedMatrix.getElement( 3, 3 ), 16 );
 }
 
-template< typename Matrix >
-void test_Print()
-{
-    using RealType = typename Matrix::RealType;
-    using DeviceType = typename Matrix::DeviceType;
-    using IndexType = typename Matrix::IndexType;
-/*
- * Sets up the following 5x4 sparse matrix:
- *
- *    /  1  2  3  4 \
- *    |  5  6  7  8 |
- *    |  9 10 11 12 |
- *    | 13 14 15 16 |
- *    \ 17 18 19 20 /
- */
-    const IndexType rows = 5;
-    const IndexType cols = 4;
-
-    Matrix m;
-    m.reset();
-    m.setDimensions( rows, cols );
-
-    RealType value = 1;
-    for( IndexType i = 0; i < rows; i++)
-        for( IndexType j = 0; j < cols; j++)
-            m.setElement( i, j, value++ );
-
-    #include <sstream>
-    std::stringstream printed;
-    std::stringstream couted;
-
-    //change the underlying buffer and save the old buffer
-    auto old_buf = std::cout.rdbuf(printed.rdbuf());
-
-    m.print( std::cout ); //all the std::cout goes to ss
-
-    std::cout.rdbuf(old_buf); //reset
-
-    couted << "Row: 0 ->  Col:0->1	 Col:1->2	 Col:2->3	 Col:3->4\t\n"
-              "Row: 1 ->  Col:0->5	 Col:1->6	 Col:2->7	 Col:3->8\t\n"
-              "Row: 2 ->  Col:0->9	 Col:1->10	 Col:2->11	 Col:3->12\t\n"
-              "Row: 3 ->  Col:0->13	 Col:1->14	 Col:2->15	 Col:3->16\t\n"
-              "Row: 4 ->  Col:0->17	 Col:1->18	 Col:2->19	 Col:3->20\t\n";
-
-    EXPECT_EQ( printed.str(), couted.str() );
-}
-
 // test fixture for typed tests
 template< typename Matrix >
 class MatrixTest : public ::testing::Test
@@ -1395,18 +1348,18 @@ TYPED_TEST( MatrixTest, setElementsTest )
     test_SetElements< MatrixType >();
 }
 
-TYPED_TEST( MatrixTest, getNumberOfMatrixElementsTest )
+TYPED_TEST( MatrixTest, getElementsCountTest )
 {
     using MatrixType = typename TestFixture::MatrixType;
 
-    test_GetNumberOfMatrixElements< MatrixType >();
+    test_GetElementsCount< MatrixType >();
 }
 
-TYPED_TEST( MatrixTest, getNumberOfNonzeroMatrixElementsTest )
+TYPED_TEST( MatrixTest, getNonzeroElementsCountTest )
 {
     using MatrixType = typename TestFixture::MatrixType;
 
-    test_GetNumberOfNonzeroMatrixElements< MatrixType >();
+    test_GetNonzeroElementsCount< MatrixType >();
 }
 
 TYPED_TEST( MatrixTest, resetTest )
@@ -1479,13 +1432,6 @@ TYPED_TEST( MatrixTest, saveAndLoadTest )
     test_SaveAndLoad< MatrixType >();
 }
 
-TYPED_TEST( MatrixTest, printTest )
-{
-    using MatrixType = typename TestFixture::MatrixType;
-
-    test_Print< MatrixType >();
-}
-
 //// test_getType is not general enough yet. DO NOT TEST IT YET.
 
 //TEST( DenseMatrixTest, Dense_GetTypeTest_Host )
-- 
GitLab


From 9f1955d9b51dad38f02730c243ca4ab18d0bd9bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 24 Apr 2020 11:09:23 +0200
Subject: [PATCH 20/68] Writing documentation on DenseMatrix.

---
 .../Examples/Matrices/CMakeLists.txt          | 111 +++++++++++++-----
 .../DenseMatrixExample_addElement.cpp         |  30 +++++
 .../Matrices/DenseMatrixExample_addElement.cu |   1 +
 .../DenseMatrixExample_allRowsReduction.cpp   |  66 +++++++++++
 .../DenseMatrixExample_allRowsReduction.cu    |   1 +
 .../Matrices/DenseMatrixExample_forRows.cpp   |  23 ++++
 .../Matrices/DenseMatrixExample_forRows.cu    |   1 +
 .../DenseMatrixExample_getElement.cpp         |  34 ++++++
 .../Matrices/DenseMatrixExample_getElement.cu |   1 +
 .../DenseMatrixExample_rowsReduction.cpp      |  66 +++++++++++
 .../DenseMatrixExample_rowsReduction.cu       |   1 +
 .../DenseMatrixExample_setElement.cpp         |  24 ++++
 .../Matrices/DenseMatrixExample_setElement.cu |   1 +
 src/TNL/Matrices/DenseMatrix.h                | 100 +++++++++++++++-
 src/TNL/Matrices/DenseMatrix.hpp              |  13 +-
 15 files changed, 436 insertions(+), 37 deletions(-)
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixExample_addElement.cpp
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_addElement.cu
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cu
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixExample_forRows.cpp
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_forRows.cu
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixExample_getElement.cpp
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_getElement.cu
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cpp
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cu
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixExample_setElement.cpp
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_setElement.cu

diff --git a/Documentation/Examples/Matrices/CMakeLists.txt b/Documentation/Examples/Matrices/CMakeLists.txt
index f91ae36e0..748d5dece 100644
--- a/Documentation/Examples/Matrices/CMakeLists.txt
+++ b/Documentation/Examples/Matrices/CMakeLists.txt
@@ -1,50 +1,95 @@
-#IF( BUILD_CUDA )
-#   CUDA_ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list_cuda DenseMatrixExample_Constructor_init_list.cu )
-#   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out 
-#                       OUTPUT DenseMatrixExample_Constructor_init_list.out )
-#
-#   CUDA_ADD_EXECUTABLE( DenseMatrixExample_setElements_cuda DenseMatrixExample_setElements.cu )
-#   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElements_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElements.out 
-#                       OUTPUT DenseMatrixExample_setElements.out )
-#
-#   CUDA_ADD_EXECUTABLE( DenseMatrixExample_getCompressedRowLengths_cuda DenseMatrixExample_getCompressedRowLengths.cu )
-#   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getCompressedRowLengths_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getCompressedRowLengths.out 
-#                       OUTPUT DenseMatrixExample_getCompressedRowLengths.out )
-#
-#ELSE()
+IF( BUILD_CUDA )
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_setElement_cuda DenseMatrixExample_setElement.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElement_cuda >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElement.out
+                       OUTPUT DenseMatrixExample_setElement.out )
+
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_addElement_cuda DenseMatrixExample_addElement.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_addElement_cuda >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_addElement.out
+                       OUTPUT DenseMatrixExample_addElement.out )
+
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_getElement_cuda DenseMatrixExample_getElement.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getElement_cuda >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getElement.out
+                       OUTPUT DenseMatrixExample_getElement.out )
+
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_rowsReduction_cuda DenseMatrixExample_rowsReduction.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_rowsReduction_cuda >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_rowsReduction.out
+                       OUTPUT DenseMatrixExample_rowsReduction.out )
+
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_allRowsReduction_cuda DenseMatrixExample_allRowsReduction.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_allRowsReduction_cuda >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_allRowsReduction.out
+                       OUTPUT DenseMatrixExample_allRowsReduction.out )
+
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_forRows_cuda DenseMatrixExample_forRows.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_forRows_cuda >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_forRows.out
+                       OUTPUT DenseMatrixExample_forRows.out )
+ELSE()
    ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list DenseMatrixExample_Constructor_init_list.cpp )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out 
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out
                        OUTPUT DenseMatrixExample_Constructor_init_list.out )
 
    ADD_EXECUTABLE( DenseMatrixExample_setElements DenseMatrixExample_setElements.cpp )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElements > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElements.out 
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElements > 
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElements.out
                        OUTPUT DenseMatrixExample_setElements.out )
 
    ADD_EXECUTABLE( DenseMatrixExample_getCompressedRowLengths DenseMatrixExample_getCompressedRowLengths.cpp )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getCompressedRowLengths > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getCompressedRowLengths.out 
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getCompressedRowLengths >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getCompressedRowLengths.out
                        OUTPUT DenseMatrixExample_getCompressedRowLengths.out )
 
    ADD_EXECUTABLE( DenseMatrixExample_getElementsCount DenseMatrixExample_getElementsCount.cpp )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getElementsCount > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getElementsCount.out 
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getElementsCount >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getElementsCount.out
                        OUTPUT DenseMatrixExample_getElementsCount.out )
 
    ADD_EXECUTABLE( DenseMatrixExample_getConstRow DenseMatrixExample_getConstRow.cpp )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getConstRow > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getConstRow.out 
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getConstRow >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getConstRow.out
                        OUTPUT DenseMatrixExample_getConstRow.out )
 
    ADD_EXECUTABLE( DenseMatrixExample_getRow DenseMatrixExample_getRow.cpp )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getRow > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getRow.out 
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getRow >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getRow.out
                        OUTPUT DenseMatrixExample_getRow.out )
 
-#ENDIF()
+   ADD_EXECUTABLE( DenseMatrixExample_setElement DenseMatrixExample_setElement.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElement >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElement.out
+                       OUTPUT DenseMatrixExample_setElement.out )
+
+   ADD_EXECUTABLE( DenseMatrixExample_addElement DenseMatrixExample_addElement.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_addElement >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_addElement.out
+                       OUTPUT DenseMatrixExample_addElement.out )
+
+   ADD_EXECUTABLE( DenseMatrixExample_getElement DenseMatrixExample_getElement.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getElement >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getElement.out
+                       OUTPUT DenseMatrixExample_getElement.out )
+
+   ADD_EXECUTABLE( DenseMatrixExample_rowsReduction DenseMatrixExample_rowsReduction.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_rowsReduction >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_rowsReduction.out
+                       OUTPUT DenseMatrixExample_rowsReduction.out )
+
+   ADD_EXECUTABLE( DenseMatrixExample_allRowsReduction DenseMatrixExample_allRowsReduction.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_allRowsReduction >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_allRowsReduction.out
+                       OUTPUT DenseMatrixExample_allRowsReduction.out )
+
+   ADD_EXECUTABLE( DenseMatrixExample_forRows DenseMatrixExample_forRows.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_forRows >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_forRows.out
+                       OUTPUT DenseMatrixExample_forRows.out )
+ENDIF()
 
-#IF( BUILD_CUDA )
-#ADD_CUSTOM_TARGET( RunMatricesExamples-cuda ALL DEPENDS
-#   DenseMatrixExample_Constructor_init_list.out
-#   DenseMatrixExample_setElements.out
-#   DenseMatrixExample_getCompressedRowLengths.out
-#   )
-#ELSE()
 ADD_CUSTOM_TARGET( RunMatricesExamples ALL DEPENDS
    DenseMatrixExample_Constructor_init_list.out
    DenseMatrixExample_setElements.out
@@ -52,5 +97,11 @@ ADD_CUSTOM_TARGET( RunMatricesExamples ALL DEPENDS
    DenseMatrixExample_getElementsCount.out
    DenseMatrixExample_getConstRow.out
    DenseMatrixExample_getRow.out
-   )
-#ENDIF()
+   DenseMatrixExample_setElement.out
+   DenseMatrixExample_addElement.out
+   DenseMatrixExample_getElement.out
+   DenseMatrixExample_rowsReduction.out
+   DenseMatrixExample_allRowsReduction.out
+   DenseMatrixExample_forRows.out
+)
+
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_addElement.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_addElement.cpp
new file mode 100644
index 000000000..32e39e6a3
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_addElement.cpp
@@ -0,0 +1,30 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+template< typename Device >
+void addElements()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
+   for( int i = 0; i < 5; i++ )
+      matrix.setElement( i, i, i );
+
+   std::cout << "Initial matrix is: " << matrix << std::endl;
+
+   for( int i = 0; i < 5; i++ )
+      for( int j = 0; j < 5; j++ )
+         matrix.addElement( i, j, 1.0, 5.0 );
+
+   std::cout << "Matrix after addition is: " << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Add elements on host:" << std::endl;
+   addElements< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Add elements on CUDA device:" << std::endl;
+   addElements< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_addElement.cu b/Documentation/Examples/Matrices/DenseMatrixExample_addElement.cu
new file mode 120000
index 000000000..dd83670e4
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_addElement.cu
@@ -0,0 +1 @@
+DenseMatrixExample_addElement.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp
new file mode 100644
index 000000000..0eada81c9
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp
@@ -0,0 +1,66 @@
+#include <iostream>
+#include <iomanip>
+#include <functional>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+template< typename Device >
+void allRowsReduction()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix {
+      {  1,  0,  0,  0,  0 },
+      {  1,  2,  0,  0,  0 },
+      {  0,  1,  8,  0,  0 },
+      {  0,  0,  1,  9,  0 },
+      {  0,  0,  0,  0,  1 } };
+
+   /***
+    * Find largest element in each row.
+    */
+   TNL::Containers::Vector< double, Device > rowMax( matrix.getRows() );
+
+   /***
+    * Prepare vector view and matrix view for lambdas.
+    */
+   const auto matrixView = matrix.getConstView();
+   auto rowMaxView = rowMax.getView();
+
+   /***
+    * Fetch lambda just returns absolute value of matrix elements.
+    */
+   auto fetch = [=] __cuda_callable__ ( int rowIdx, int columnIdx, const double& value ) -> double {
+      return TNL::abs( value );
+   };
+
+   /***
+    * Reduce lambda return maximum of given values.
+    */
+   auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) {
+      a = TNL::max( a, b );
+   };
+
+   /***
+    * Keep lambda store the largest value in each row to the vector rowMax.
+    */
+   auto keep = [=] __cuda_callable__ ( int rowIdx, const double& value ) mutable {
+      rowMaxView[ rowIdx ] = value;
+   };
+
+   /***
+    * Compute the largest values in each row.
+    */
+   matrix.allRowsReduction( fetch, reduce, keep, std::numeric_limits< double >::lowest() );
+
+   std::cout << "Max. elements in rows are: " << rowMax << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "All rows reduction on host:" << std::endl;
+   allRowsReduction< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "All rows reduction on CUDA device:" << std::endl;
+   allRowsReduction< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cu b/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cu
new file mode 120000
index 000000000..70f517f68
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cu
@@ -0,0 +1 @@
+DenseMatrixExample_allRowsReduction.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cpp
new file mode 100644
index 000000000..5d364f320
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cpp
@@ -0,0 +1,23 @@
+#include <iostream>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+int main( int argc, char* argv[] )
+{
+   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > matrix( 5, 5 );
+
+   /***
+    * We need a matrix view to pass the matrix to lambda function even on CUDA device.
+    */
+   auto matrixView = matrix.getView();
+   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int globalIdx, double& value, bool& compute ) {
+      if( rowIdx < columnIdx )
+         compute = false;
+      else
+         value = rowIdx + columnIdx;
+   };
+
+   matrix.forRows( 0, matrix.getRows(), f );
+   std::cout << matrix << std::endl;
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cu b/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cu
new file mode 120000
index 000000000..f97a66ee3
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cu
@@ -0,0 +1 @@
+DenseMatrixExample_forRows.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getElement.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_getElement.cpp
new file mode 100644
index 000000000..72a5d0af4
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getElement.cpp
@@ -0,0 +1,34 @@
+#include <iostream>
+#include <iomanip>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+template< typename Device >
+void getElements()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix {
+      {  1,  0,  0,  0,  0 },
+      { -1,  2, -1,  0,  0 },
+      {  0, -1,  2, -1,  0 },
+      {  0,  0, -1,  2, -1 },
+      {  0,  0,  0,  0,  1 } };
+
+
+   for( int i = 0; i < 5; i++ )
+   {
+      for( int j = 0; j < 5; j++ )
+         std::cout << std::setw( 5 ) << std::ios::right << matrix.getElement( i, i );
+      std::cout << std::endl;
+   }
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Get elements on host:" << std::endl;
+   getElements< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Get elements on CUDA device:" << std::endl;
+   getElements< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getElement.cu b/Documentation/Examples/Matrices/DenseMatrixExample_getElement.cu
new file mode 120000
index 000000000..bad6f2fab
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getElement.cu
@@ -0,0 +1 @@
+DenseMatrixExample_getElement.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cpp
new file mode 100644
index 000000000..2cda9500a
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cpp
@@ -0,0 +1,66 @@
+#include <iostream>
+#include <iomanip>
+#include <functional>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+template< typename Device >
+void rowsReduction()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix {
+      {  1,  0,  0,  0,  0 },
+      {  1,  2,  0,  0,  0 },
+      {  0,  1,  8,  0,  0 },
+      {  0,  0,  1,  9,  0 },
+      {  0,  0,  0,  0,  1 } };
+
+   /***
+    * Find largest element in each row.
+    */
+   TNL::Containers::Vector< double, Device > rowMax( matrix.getRows() );
+
+   /***
+    * Prepare vector view and matrix view for lambdas.
+    */
+   const auto matrixView = matrix.getConstView();
+   auto rowMaxView = rowMax.getView();
+
+   /***
+    * Fetch lambda just returns absolute value of matrix elements.
+    */
+   auto fetch = [=] __cuda_callable__ ( int rowIdx, int columnIdx, const double& value ) -> double {
+      return TNL::abs( value );
+   };
+
+   /***
+    * Reduce lambda return maximum of given values.
+    */
+   auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) {
+      a = TNL::max( a, b );
+   };
+
+   /***
+    * Keep lambda store the largest value in each row to the vector rowMax.
+    */
+   auto keep = [=] __cuda_callable__ ( int rowIdx, const double& value ) mutable {
+      rowMaxView[ rowIdx ] = value;
+   };
+
+   /***
+    * Compute the largest values in each row.
+    */
+   matrix.rowsReduction( 0, matrix.getRows(), fetch, reduce, keep, std::numeric_limits< double >::lowest() );
+
+   std::cout << "Max. elements in rows are: " << rowMax << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Rows reduction on host:" << std::endl;
+   rowsReduction< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Rows reduction on CUDA device:" << std::endl;
+   rowsReduction< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cu b/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cu
new file mode 120000
index 000000000..41bf46ebc
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cu
@@ -0,0 +1 @@
+DenseMatrixExample_rowsReduction.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_setElement.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_setElement.cpp
new file mode 100644
index 000000000..0b5498adf
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_setElement.cpp
@@ -0,0 +1,24 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+template< typename Device >
+void setElements()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
+   for( int i = 0; i < 5; i++ )
+      matrix.setElement( i, i, i );
+
+   std::cout << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Set elements on host:" << std::endl;
+   setElements< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Set elements on CUDA device:" << std::endl;
+   setElements< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_setElement.cu b/Documentation/Examples/Matrices/DenseMatrixExample_setElement.cu
new file mode 120000
index 000000000..5128052c2
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_setElement.cu
@@ -0,0 +1 @@
+DenseMatrixExample_setElement.cpp
\ No newline at end of file
diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
index 8b22930e9..0fd648b45 100644
--- a/src/TNL/Matrices/DenseMatrix.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -334,25 +334,121 @@ class DenseMatrix : public Matrix< Real, Device, Index >
        * \param row is row index of the element.
        * \param column is columns index of the element.
        * \param value is the value the element will be set to.
+       * 
+       * \par Example
+       * \include Matrices/DenseMatrixExample_setElement.cpp
+       * \par Output
+       * \include DenseMatrixExample_setElement.out
        */
       void setElement( const IndexType row,
                        const IndexType column,
                        const RealType& value );
 
+      /**
+       * \brief Add element at given \e row and \e column to given \e value.
+       * 
+       * This method can be called only from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated in GPU device
+       * this methods transfer values of each matrix element separately and so the
+       * performance is very low. For higher performance see. \ref DenseMatrix::getRow
+       * or \ref DenseMatrix::forRows and \ref DenseMatrix::forAllRows.
+       * 
+       * \param row is row index of the element.
+       * \param column is columns index of the element.
+       * \param value is the value the element will be set to.
+       * \param thisElementMultiplicator is multiplicator the original matrix element
+       *   value is multiplied by before addition of given e value.
+       */
       void addElement( const IndexType row,
                        const IndexType column,
                        const RealType& value,
                        const RealType& thisElementMultiplicator = 1.0 );
 
+      /**
+       * \brief Returns value of matrix element at position given by its row and column index.
+       * 
+       * This method can be called only from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated in GPU device
+       * this methods transfer values of each matrix element separately and so the
+       * performance is very low. For higher performance see. \ref DenseMatrix::getRow
+       * or \ref DenseMatrix::forRows and \ref DenseMatrix::forAllRows.
+       * 
+       * \param row is a row index of the matrix element.
+       * \param column i a column index of the matrix element.
+       * 
+       * \return value of given matrix element.
+       */
       Real getElement( const IndexType row,
                        const IndexType column ) const;
 
-      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      /**
+       * \brief Method for performing general reduction on matrix rows.
+       * 
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchValue is type returned by the Fetch lambda function.
+       * 
+       * \param first is an index of the first row the reduction will be performed on.
+       * \param last is an index of the row  after the last row the reduction will be performed on.
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep in an instance of lambda function for storing results.
+       * \param zero is zero of given reduction operation also known as idempotent element.
+       * 
+       * \par Example
+       * \include Matrices/DenseMatrixExample_rowsReduction.cpp
+       * \par Output
+       * \include DenseMatrixExample_rowsReduction.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
+      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchValue& zero ) const;
 
+      /**
+       * \brief Method for performing general reduction on ALL matrix rows.
+       * 
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchValue is type returned by the Fetch lambda function.
+       * 
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep in an instance of lambda function for storing results.
+       * \param zero is zero of given reduction operation also known as idempotent element.
+       * 
+       * \par Example
+       * \include Matrices/DenseMatrixExample_allRowsReduction.cpp
+       * \par Output
+       * \include DenseMatrixExample_allRowsReduction.out
+       */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
       void allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
+      /**
+       * \brief Method for iteration over all matrix rows.
+       * 
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It is should have form like
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx, const RealType& value, bool compute )`.
+       * 
+       * \param first is index is the first row to be processed.
+       * \param last is index of the row after the last row to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       * 
+       * \par Example
+       * \include Matrices/DenseMatrixExample_forRows.cpp
+       * \par Output
+       * \include DenseMatrixExample_forRows.out
+       */
       template< typename Function >
       void forRows( IndexType first, IndexType last, Function& function ) const;
 
diff --git a/src/TNL/Matrices/DenseMatrix.hpp b/src/TNL/Matrices/DenseMatrix.hpp
index 4d8166f64..df6be9232 100644
--- a/src/TNL/Matrices/DenseMatrix.hpp
+++ b/src/TNL/Matrices/DenseMatrix.hpp
@@ -412,8 +412,9 @@ template< typename Real,
           typename RealAllocator >
    template< typename Vector >
 __cuda_callable__
-typename Vector::RealType DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::rowVectorProduct( const IndexType row,
-                                                                                   const Vector& vector ) const
+typename Vector::RealType 
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+rowVectorProduct( const IndexType row, const Vector& vector ) const
 {
    return this->view.rowVectorProduct( row, vector );
 }
@@ -438,9 +439,11 @@ template< typename Real,
           bool RowMajorOrder,
           typename RealAllocator >
    template< typename Matrix >
-void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::addMatrix( const Matrix& matrix,
-                                              const RealType& matrixMultiplicator,
-                                              const RealType& thisMatrixMultiplicator )
+void
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+addMatrix( const Matrix& matrix,
+           const RealType& matrixMultiplicator,
+           const RealType& thisMatrixMultiplicator )
 {
    TNL_ASSERT( this->getColumns() == matrix.getColumns() &&
               this->getRows() == matrix.getRows(),
-- 
GitLab


From 15d15b0afafe4f1da04add1efd582401e2df7af4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 24 Apr 2020 12:44:12 +0200
Subject: [PATCH 21/68] Changing segments to be compatible with std::plus and
 simillar functionals.

---
 .../Examples/Matrices/CMakeLists.txt          | 61 ++++++++++---------
 .../DenseMatrixExample_allRowsReduction.cpp   |  9 +--
 .../DenseMatrixExample_rowsReduction.cpp      |  4 +-
 src/TNL/Containers/Segments/BiEllpack.h       |  4 +-
 src/TNL/Containers/Segments/BiEllpack.hpp     |  4 +-
 src/TNL/Containers/Segments/BiEllpackView.h   |  4 +-
 src/TNL/Containers/Segments/BiEllpackView.hpp | 18 +++---
 src/TNL/Containers/Segments/CSR.h             |  4 +-
 src/TNL/Containers/Segments/CSR.hpp           |  4 +-
 src/TNL/Containers/Segments/CSRView.h         |  4 +-
 src/TNL/Containers/Segments/CSRView.hpp       |  6 +-
 src/TNL/Containers/Segments/ChunkedEllpack.h  |  4 +-
 .../Containers/Segments/ChunkedEllpack.hpp    |  4 +-
 .../Containers/Segments/ChunkedEllpackView.h  |  4 +-
 .../Segments/ChunkedEllpackView.hpp           | 20 +++---
 src/TNL/Containers/Segments/Ellpack.h         |  4 +-
 src/TNL/Containers/Segments/Ellpack.hpp       |  4 +-
 src/TNL/Containers/Segments/EllpackView.h     |  4 +-
 src/TNL/Containers/Segments/EllpackView.hpp   |  8 +--
 src/TNL/Containers/Segments/SlicedEllpack.h   |  4 +-
 src/TNL/Containers/Segments/SlicedEllpack.hpp |  8 +--
 .../Containers/Segments/SlicedEllpackView.h   |  4 +-
 .../Containers/Segments/SlicedEllpackView.hpp |  8 +--
 src/TNL/File.hpp                              |  2 +-
 src/TNL/Matrices/DenseMatrix.h                |  4 +-
 src/TNL/Matrices/DenseMatrix.hpp              |  4 +-
 src/TNL/Matrices/DenseMatrixView.h            |  4 +-
 src/TNL/Matrices/DenseMatrixView.hpp          | 15 ++---
 src/TNL/Matrices/SparseMatrix.h               |  7 ++-
 src/TNL/Matrices/SparseMatrix.hpp             |  4 +-
 src/TNL/Matrices/SparseMatrixView.h           |  4 +-
 src/TNL/Matrices/SparseMatrixView.hpp         | 30 ++++-----
 .../Containers/Segments/SegmentsTest.hpp      |  4 +-
 .../Matrices/BinarySparseMatrixTest.hpp       | 10 +--
 src/UnitTests/Matrices/SparseMatrixTest.hpp   | 11 +---
 .../Matrices/SymmetricSparseMatrixTest.hpp    |  5 +-
 36 files changed, 139 insertions(+), 163 deletions(-)

diff --git a/Documentation/Examples/Matrices/CMakeLists.txt b/Documentation/Examples/Matrices/CMakeLists.txt
index 748d5dece..b06224620 100644
--- a/Documentation/Examples/Matrices/CMakeLists.txt
+++ b/Documentation/Examples/Matrices/CMakeLists.txt
@@ -29,36 +29,6 @@ IF( BUILD_CUDA )
                         ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_forRows.out
                        OUTPUT DenseMatrixExample_forRows.out )
 ELSE()
-   ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list DenseMatrixExample_Constructor_init_list.cpp )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list >
-                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out
-                       OUTPUT DenseMatrixExample_Constructor_init_list.out )
-
-   ADD_EXECUTABLE( DenseMatrixExample_setElements DenseMatrixExample_setElements.cpp )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElements > 
-                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElements.out
-                       OUTPUT DenseMatrixExample_setElements.out )
-
-   ADD_EXECUTABLE( DenseMatrixExample_getCompressedRowLengths DenseMatrixExample_getCompressedRowLengths.cpp )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getCompressedRowLengths >
-                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getCompressedRowLengths.out
-                       OUTPUT DenseMatrixExample_getCompressedRowLengths.out )
-
-   ADD_EXECUTABLE( DenseMatrixExample_getElementsCount DenseMatrixExample_getElementsCount.cpp )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getElementsCount >
-                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getElementsCount.out
-                       OUTPUT DenseMatrixExample_getElementsCount.out )
-
-   ADD_EXECUTABLE( DenseMatrixExample_getConstRow DenseMatrixExample_getConstRow.cpp )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getConstRow >
-                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getConstRow.out
-                       OUTPUT DenseMatrixExample_getConstRow.out )
-
-   ADD_EXECUTABLE( DenseMatrixExample_getRow DenseMatrixExample_getRow.cpp )
-   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getRow >
-                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getRow.out
-                       OUTPUT DenseMatrixExample_getRow.out )
-
    ADD_EXECUTABLE( DenseMatrixExample_setElement DenseMatrixExample_setElement.cpp )
    ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElement >
                         ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElement.out
@@ -90,6 +60,37 @@ ELSE()
                        OUTPUT DenseMatrixExample_forRows.out )
 ENDIF()
 
+ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list DenseMatrixExample_Constructor_init_list.cpp )
+ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list >
+                     ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out
+                    OUTPUT DenseMatrixExample_Constructor_init_list.out )
+
+ADD_EXECUTABLE( DenseMatrixExample_setElements DenseMatrixExample_setElements.cpp )
+ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElements > 
+                     ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElements.out
+                    OUTPUT DenseMatrixExample_setElements.out )
+
+ADD_EXECUTABLE( DenseMatrixExample_getCompressedRowLengths DenseMatrixExample_getCompressedRowLengths.cpp )
+ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getCompressedRowLengths >
+                     ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getCompressedRowLengths.out
+                    OUTPUT DenseMatrixExample_getCompressedRowLengths.out )
+
+ADD_EXECUTABLE( DenseMatrixExample_getElementsCount DenseMatrixExample_getElementsCount.cpp )
+ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getElementsCount >
+                     ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getElementsCount.out
+                    OUTPUT DenseMatrixExample_getElementsCount.out )
+
+ADD_EXECUTABLE( DenseMatrixExample_getConstRow DenseMatrixExample_getConstRow.cpp )
+ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getConstRow >
+                     ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getConstRow.out
+                    OUTPUT DenseMatrixExample_getConstRow.out )
+
+ADD_EXECUTABLE( DenseMatrixExample_getRow DenseMatrixExample_getRow.cpp )
+ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getRow >
+                     ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getRow.out
+                    OUTPUT DenseMatrixExample_getRow.out )
+
+
 ADD_CUSTOM_TARGET( RunMatricesExamples ALL DEPENDS
    DenseMatrixExample_Constructor_init_list.out
    DenseMatrixExample_setElements.out
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp
index 0eada81c9..e45438a83 100644
--- a/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp
@@ -32,13 +32,6 @@ void allRowsReduction()
       return TNL::abs( value );
    };
 
-   /***
-    * Reduce lambda return maximum of given values.
-    */
-   auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) {
-      a = TNL::max( a, b );
-   };
-
    /***
     * Keep lambda store the largest value in each row to the vector rowMax.
     */
@@ -49,7 +42,7 @@ void allRowsReduction()
    /***
     * Compute the largest values in each row.
     */
-   matrix.allRowsReduction( fetch, reduce, keep, std::numeric_limits< double >::lowest() );
+   matrix.allRowsReduction( fetch, std::plus<>{}, keep, std::numeric_limits< double >::lowest() );
 
    std::cout << "Max. elements in rows are: " << rowMax << std::endl;
 }
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cpp
index 2cda9500a..dbc44f854 100644
--- a/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_rowsReduction.cpp
@@ -35,8 +35,8 @@ void rowsReduction()
    /***
     * Reduce lambda return maximum of given values.
     */
-   auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) {
-      a = TNL::max( a, b );
+   auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double {
+      return TNL::max( a, b );
    };
 
    /***
diff --git a/src/TNL/Containers/Segments/BiEllpack.h b/src/TNL/Containers/Segments/BiEllpack.h
index 06578312e..45c633b8b 100644
--- a/src/TNL/Containers/Segments/BiEllpack.h
+++ b/src/TNL/Containers/Segments/BiEllpack.h
@@ -97,10 +97,10 @@ class BiEllpack
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       BiEllpack& operator=( const BiEllpack& source ) = default;
 
diff --git a/src/TNL/Containers/Segments/BiEllpack.hpp b/src/TNL/Containers/Segments/BiEllpack.hpp
index 5203b3b3f..a1a7419d7 100644
--- a/src/TNL/Containers/Segments/BiEllpack.hpp
+++ b/src/TNL/Containers/Segments/BiEllpack.hpp
@@ -454,7 +454,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
 }
@@ -467,7 +467,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Containers/Segments/BiEllpackView.h b/src/TNL/Containers/Segments/BiEllpackView.h
index 20726c621..2450f18ca 100644
--- a/src/TNL/Containers/Segments/BiEllpackView.h
+++ b/src/TNL/Containers/Segments/BiEllpackView.h
@@ -119,10 +119,10 @@ class BiEllpackView
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       BiEllpackView& operator=( const BiEllpackView& view );
 
diff --git a/src/TNL/Containers/Segments/BiEllpackView.hpp b/src/TNL/Containers/Segments/BiEllpackView.hpp
index abf82313a..1cb5ce7c0 100644
--- a/src/TNL/Containers/Segments/BiEllpackView.hpp
+++ b/src/TNL/Containers/Segments/BiEllpackView.hpp
@@ -320,7 +320,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( std::is_same< DeviceType, Devices::Host >::value )
@@ -350,7 +350,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
                //         << " groupWidth = " << groupWidth << " groupHeight = " << groupHeight
                //          << " localIdx = " << localIdx << " globalIdx = " << globalIdx 
                //          << " fetch = " << details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) << std::endl;
-               reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+               aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
                if( RowMajorOrder )
                   globalIdx ++;
                else
@@ -395,7 +395,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
@@ -502,9 +502,9 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
          for( IndexType i = 0; i < groupWidth; i++ )
          {
             if( RowMajorOrder )
-               reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute ) );
+               result = reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute ) );
             else
-               reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute ) );
+               result = reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute ) );
             localIdx++;
          }
       }
@@ -571,7 +571,7 @@ segmentsReductionKernel( IndexType gridIdx,
                   const IndexType groupWidth = ( groupEnd - groupBegin ) / groupHeight;
                   IndexType globalIdx = groupBegin + inWarpIdx * groupWidth;
                   for( IndexType i = 0; i < groupWidth && compute; i++ )
-                     reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) );
+                     results[ threadIdx.x ] = reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) );
                }
             }
          groupHeight >>= 1;
@@ -590,7 +590,7 @@ segmentsReductionKernel( IndexType gridIdx,
             IndexType globalIdx = groupBegin + inWarpIdx;
             while( globalIdx < groupEnd )
             {
-               reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) );
+               temp[ threadIdx.x ] = reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) );
                globalIdx += getWarpSize();
             }
             // TODO: reduction via templates
@@ -599,10 +599,10 @@ segmentsReductionKernel( IndexType gridIdx,
             {
                bisection2 >>= 1;
                if( inWarpIdx < bisection2 )
-                  reduction( temp[ threadIdx.x ], temp[ threadIdx.x + bisection2 ] );
+                  temp[ threadIdx.x ] = reduction( temp[ threadIdx.x ], temp[ threadIdx.x + bisection2 ] );
             }
             if( inWarpIdx < groupHeight )
-               reduction( results[ threadIdx.x ], temp[ threadIdx.x ] );
+               results[ threadIdx.x ] = reduction( results[ threadIdx.x ], temp[ threadIdx.x ] );
          }
          groupHeight >>= 1;
       }
diff --git a/src/TNL/Containers/Segments/CSR.h b/src/TNL/Containers/Segments/CSR.h
index ac9a7063e..46f9a9013 100644
--- a/src/TNL/Containers/Segments/CSR.h
+++ b/src/TNL/Containers/Segments/CSR.h
@@ -108,10 +108,10 @@ class CSR
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       CSR& operator=( const CSR& rhsSegments ) = default;
 
diff --git a/src/TNL/Containers/Segments/CSR.hpp b/src/TNL/Containers/Segments/CSR.hpp
index d8ba81461..706a2b052 100644
--- a/src/TNL/Containers/Segments/CSR.hpp
+++ b/src/TNL/Containers/Segments/CSR.hpp
@@ -207,7 +207,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 CSR< Device, Index, IndexAllocator >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
 }
@@ -218,7 +218,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 CSR< Device, Index, IndexAllocator >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Containers/Segments/CSRView.h b/src/TNL/Containers/Segments/CSRView.h
index fa091583d..b00b012d4 100644
--- a/src/TNL/Containers/Segments/CSRView.h
+++ b/src/TNL/Containers/Segments/CSRView.h
@@ -110,10 +110,10 @@ class CSRView
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       CSRView& operator=( const CSRView& view );
 
diff --git a/src/TNL/Containers/Segments/CSRView.hpp b/src/TNL/Containers/Segments/CSRView.hpp
index 4dde78a24..7599327d1 100644
--- a/src/TNL/Containers/Segments/CSRView.hpp
+++ b/src/TNL/Containers/Segments/CSRView.hpp
@@ -204,7 +204,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 CSRView< Device, Index >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    const auto offsetsView = this->offsets.getConstView();
@@ -215,7 +215,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
       IndexType localIdx( 0 );
       bool compute( true );
       for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-         reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+         aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
       keeper( segmentIdx, aux );
    };
    Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
@@ -226,7 +226,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 CSRView< Device, Index >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Containers/Segments/ChunkedEllpack.h b/src/TNL/Containers/Segments/ChunkedEllpack.h
index 624caae68..8a1f48e7b 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpack.h
+++ b/src/TNL/Containers/Segments/ChunkedEllpack.h
@@ -100,10 +100,10 @@ class ChunkedEllpack
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       ChunkedEllpack& operator=( const ChunkedEllpack& source ) = default;
 
diff --git a/src/TNL/Containers/Segments/ChunkedEllpack.hpp b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
index 83150c766..5521927af 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
@@ -401,7 +401,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
 }
@@ -413,7 +413,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Containers/Segments/ChunkedEllpackView.h b/src/TNL/Containers/Segments/ChunkedEllpackView.h
index 3661e209e..d8ed4e81f 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Containers/Segments/ChunkedEllpackView.h
@@ -132,10 +132,10 @@ class ChunkedEllpackView
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       ChunkedEllpackView& operator=( const ChunkedEllpackView& view );
 
diff --git a/src/TNL/Containers/Segments/ChunkedEllpackView.hpp b/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
index ce77eccd9..19ae6f672 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Containers/Segments/ChunkedEllpackView.hpp
@@ -364,7 +364,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 ChunkedEllpackView< Device, Index, RowMajorOrder >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( std::is_same< DeviceType, Devices::Host >::value )
@@ -394,7 +394,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
             IndexType end = begin + segmentSize;
             IndexType localIdx( 0 );
             for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ )
-               reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+               aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          }
          else
          {
@@ -404,7 +404,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
                IndexType end = begin + chunksInSlice * chunkSize;
                IndexType localIdx( 0 );
                for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx += chunksInSlice )
-                  reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+                  aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
             }
          }
          keeper( segmentIdx, aux );
@@ -438,7 +438,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 ChunkedEllpackView< Device, Index, RowMajorOrder >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
@@ -558,14 +558,14 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
       IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
       IndexType end = begin + chunkSize;
       for( IndexType j = begin; j < end && compute; j++ )
-         reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
+         chunksResults[ threadIdx.x ] = reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
    }
    else
    {
       const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
       const IndexType end = begin + chunksInSlice * chunkSize;
          for( IndexType j = begin; j < end && compute; j += chunksInSlice )
-            reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
+            chunksResults[ threadIdx.x ] = reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
    }
    __syncthreads();
    if( threadIdx.x < sliceInfo.size )
@@ -577,7 +577,7 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
       const IndexType lastChunk = this->rowToChunkMapping[ row ];
       RealType result( zero );
       while( chunkIndex < lastChunk )
-         reduction( result,  chunksResults[ chunkIndex++ ] );
+         result = reduction( result,  chunksResults[ chunkIndex++ ] );
       if( row >= first && row < last )
          keeper( row, result );
    }
@@ -630,14 +630,14 @@ segmentsReductionKernel( IndexType gridIdx,
       IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
       IndexType end = begin + chunkSize;
       for( IndexType j = begin; j < end && compute; j++ )
-         reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
+         chunksResults[ threadIdx.x ] = reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
    }
    else
    {
       const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
       const IndexType end = begin + chunksInSlice * chunkSize;
          for( IndexType j = begin; j < end && compute; j += chunksInSlice )
-            reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
+            chunksResults[ threadIdx.x ] = reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
    }
    __syncthreads();
 
@@ -650,7 +650,7 @@ segmentsReductionKernel( IndexType gridIdx,
       const IndexType lastChunk = this->rowToChunkMapping[ row ];
       RealType result( zero );
       while( chunkIndex < lastChunk )
-         reduction( result,  chunksResults[ chunkIndex++ ] );
+         result = reduction( result,  chunksResults[ chunkIndex++ ] );
       if( row >= first && row < last )
          keeper( row, result );
    }
diff --git a/src/TNL/Containers/Segments/Ellpack.h b/src/TNL/Containers/Segments/Ellpack.h
index f02ef1523..14e77f89b 100644
--- a/src/TNL/Containers/Segments/Ellpack.h
+++ b/src/TNL/Containers/Segments/Ellpack.h
@@ -105,10 +105,10 @@ class Ellpack
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       Ellpack& operator=( const Ellpack& source ) = default;
 
diff --git a/src/TNL/Containers/Segments/Ellpack.hpp b/src/TNL/Containers/Segments/Ellpack.hpp
index dedc41a41..436f470af 100644
--- a/src/TNL/Containers/Segments/Ellpack.hpp
+++ b/src/TNL/Containers/Segments/Ellpack.hpp
@@ -278,7 +278,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
 }
@@ -291,7 +291,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Containers/Segments/EllpackView.h b/src/TNL/Containers/Segments/EllpackView.h
index c9c52dd5d..846e75cf4 100644
--- a/src/TNL/Containers/Segments/EllpackView.h
+++ b/src/TNL/Containers/Segments/EllpackView.h
@@ -103,10 +103,10 @@ class EllpackView
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       EllpackView& operator=( const EllpackView& view );
 
diff --git a/src/TNL/Containers/Segments/EllpackView.hpp b/src/TNL/Containers/Segments/EllpackView.hpp
index d7654402f..fa40227f8 100644
--- a/src/TNL/Containers/Segments/EllpackView.hpp
+++ b/src/TNL/Containers/Segments/EllpackView.hpp
@@ -240,7 +240,7 @@ template< typename Device,
           int Alignment >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void EllpackView< Device, Index, RowMajorOrder, Alignment >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
    using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
@@ -254,7 +254,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType j = begin; j < end && compute; j++  )
-            reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
+            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
@@ -270,7 +270,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType j = begin; j < end && compute; j += alignedSize  )
-            reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
+            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
@@ -283,7 +283,7 @@ template< typename Device,
           int Alignment >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void EllpackView< Device, Index, RowMajorOrder, Alignment >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Containers/Segments/SlicedEllpack.h b/src/TNL/Containers/Segments/SlicedEllpack.h
index c3967bc6b..f26d791f3 100644
--- a/src/TNL/Containers/Segments/SlicedEllpack.h
+++ b/src/TNL/Containers/Segments/SlicedEllpack.h
@@ -103,10 +103,10 @@ class SlicedEllpack
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       SlicedEllpack& operator=( const SlicedEllpack& source ) = default;
 
diff --git a/src/TNL/Containers/Segments/SlicedEllpack.hpp b/src/TNL/Containers/Segments/SlicedEllpack.hpp
index b01543e61..c540735be 100644
--- a/src/TNL/Containers/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Containers/Segments/SlicedEllpack.hpp
@@ -144,8 +144,8 @@ setSegmentsSizes( const SizesHolder& sizes )
          return sizes_view[ globalIdx ];
       return 0;
    };
-   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType i ) {
-      aux = TNL::max( aux, i );
+   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType i ) -> IndexType {
+      return TNL::max( aux, i );
    };
    auto keep = [=] __cuda_callable__ ( IndexType i, IndexType res ) mutable {
       slices_view[ i ] = res * SliceSize;
@@ -310,7 +310,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
 }
@@ -323,7 +323,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Containers/Segments/SlicedEllpackView.h b/src/TNL/Containers/Segments/SlicedEllpackView.h
index fe1f035d0..4ed62ebef 100644
--- a/src/TNL/Containers/Segments/SlicedEllpackView.h
+++ b/src/TNL/Containers/Segments/SlicedEllpackView.h
@@ -105,10 +105,10 @@ class SlicedEllpackView
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       SlicedEllpackView& operator=( const SlicedEllpackView& view );
 
diff --git a/src/TNL/Containers/Segments/SlicedEllpackView.hpp b/src/TNL/Containers/Segments/SlicedEllpackView.hpp
index dc755bb59..258b87754 100644
--- a/src/TNL/Containers/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Containers/Segments/SlicedEllpackView.hpp
@@ -295,7 +295,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
@@ -313,7 +313,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType globalIdx = begin; globalIdx< end; globalIdx++  )
-            reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
@@ -330,7 +330,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize  )
-            reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
@@ -344,7 +344,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >::
-allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/File.hpp b/src/TNL/File.hpp
index 05e9d9ad7..289dc92c8 100644
--- a/src/TNL/File.hpp
+++ b/src/TNL/File.hpp
@@ -176,7 +176,7 @@ template< typename Type,
           typename Allocator >
 void File::save( const Type* buffer, std::streamsize elements )
 {
-   static_assert( std::is_same< std::remove_cv_t< Type >, typename Allocator::value_type >::value,
+   static_assert( std::is_same< std::remove_cv_t< Type >, std::remove_cv_t< typename Allocator::value_type > >::value,
                   "Allocator::value_type must be the same as Type." );
    TNL_ASSERT_GE( elements, 0, "Number of elements to save must be non-negative." );
 
diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
index 0fd648b45..5e9d9dfe7 100644
--- a/src/TNL/Matrices/DenseMatrix.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -406,7 +406,7 @@ class DenseMatrix : public Matrix< Real, Device, Index >
        * \include DenseMatrixExample_rowsReduction.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
-      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchValue& zero ) const;
+      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const;
 
       /**
        * \brief Method for performing general reduction on ALL matrix rows.
@@ -431,7 +431,7 @@ class DenseMatrix : public Matrix< Real, Device, Index >
        * \include DenseMatrixExample_allRowsReduction.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       /**
        * \brief Method for iteration over all matrix rows.
diff --git a/src/TNL/Matrices/DenseMatrix.hpp b/src/TNL/Matrices/DenseMatrix.hpp
index df6be9232..3f3e0383c 100644
--- a/src/TNL/Matrices/DenseMatrix.hpp
+++ b/src/TNL/Matrices/DenseMatrix.hpp
@@ -335,7 +335,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
-rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
 {
    this->view.rowsReduction( first, last, fetch, reduce, keep, zero );
 }
@@ -348,7 +348,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
-allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
 {
    this->rowsReduction( 0, this->getRows(), fetch, reduce, keep, zero );
 }
diff --git a/src/TNL/Matrices/DenseMatrixView.h b/src/TNL/Matrices/DenseMatrixView.h
index f0d0b388e..8ae12f64e 100644
--- a/src/TNL/Matrices/DenseMatrixView.h
+++ b/src/TNL/Matrices/DenseMatrixView.h
@@ -123,10 +123,10 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
                        const IndexType column ) const;
 
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       template< typename Function >
       void forRows( IndexType first, IndexType last, Function& function ) const;
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index bf2ebd4f2..c50e34547 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -11,6 +11,7 @@
 #pragma once
 
 #include <iomanip>
+#include <functional>
 #include <TNL/Assert.h>
 #include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Exceptions/NotImplementedError.h>
@@ -113,13 +114,10 @@ getCompressedRowLengths( Vector& rowLengths ) const
    auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, const RealType& value ) -> IndexType {
       return ( value != 0.0 );
    };
-   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-      aux += a;
-   };
    auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowLengths_view[ rowIdx ] = value;
    };
-   this->allRowsReduction( fetch, reduce, keep, 0 );
+   this->allRowsReduction( fetch, std::plus<>{}, keep, 0 );
 }
 
 template< typename Real,
@@ -288,7 +286,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 DenseMatrixView< Real, Device, Index, RowMajorOrder >::
-rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
 {
    const auto values_view = this->values.getConstView();
    auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) {
@@ -305,7 +303,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 DenseMatrixView< Real, Device, Index, RowMajorOrder >::
-allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
 {
    this->rowsReduction( 0, this->getRows(), fetch, reduce, keep, zero );
 }
@@ -404,13 +402,10 @@ vectorProduct( const InVector& inVector, OutVector& outVector ) const
    auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType column, IndexType offset, bool& compute ) -> RealType {
       return valuesView[ offset ] * inVectorView[ column ];
    };
-   auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) {
-      sum += value;
-   };
    auto keeper = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
       outVectorView[ row ] = value;
    };
-   this->segments.segmentsReduction( 0, this->getRows(), fetch, reduction, keeper, ( RealType ) 0.0 );
+   this->segments.segmentsReduction( 0, this->getRows(), fetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 0348a3945..91818de82 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -114,7 +114,8 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       void setRowCapacities( const RowsCapacitiesVector& rowCapacities );
 
       // TODO: Remove this when possible
-      void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) {
+      template< typename RowsCapacitiesVector >
+      void setCompressedRowLengths( const RowsCapacitiesVector& rowLengths ) {
          this->setRowCapacities( rowLengths );
       };
 
@@ -181,10 +182,10 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        */
 
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       template< typename Function >
       void forRows( IndexType first, IndexType last, Function& function ) const;
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index ea74a9e9d..f5757de5c 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -499,7 +499,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
-rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
 {
    this->view.rowsReduction( first, last, fetch, reduce, keep, zero );
    /*const auto columns_view = this->columnIndexes.getConstView();
@@ -524,7 +524,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
-allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
 {
    this->rowsReduction( 0, this->getRows(), fetch, reduce, keep, zero );
 }
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index 183d77929..93494aa4f 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -124,10 +124,10 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
                           const RealType outVectorMultiplicator = 0.0 ) const;
 
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       template< typename Function >
       void forRows( IndexType first, IndexType last, Function& function ) const;
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 9f6580a59..e17abf716 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -122,13 +122,13 @@ getCompressedRowLengths( Vector& rowLengths ) const
    auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, IndexType globalIdx, const RealType& value ) -> IndexType {
       return ( value != 0.0 );
    };
-   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-      aux += a;
-   };
+   //auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
+   //   aux += a;
+   //};
    auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowLengths_view[ rowIdx ] = value;
    };
-   this->allRowsReduction( fetch, reduce, keep, 0 );
+   this->allRowsReduction( fetch, std::plus<>{}, keep, 0 );
 }
 
 template< typename Real,
@@ -175,13 +175,13 @@ getNumberOfNonzeroMatrixElements() const
             return 0.0;
          return 1 + ( column != row && column < rows && row < columns ); // the addition is for non-diagonal elements
       };
-      auto reduction = [] __cuda_callable__ ( IndexType& sum, const IndexType& value ) {
-         sum += value;
-      };
+      //auto reduction = [] __cuda_callable__ ( IndexType& sum, const IndexType& value ) {
+      //   sum += value;
+      //};
       auto keeper = [=] __cuda_callable__ ( IndexType row, const IndexType& value ) mutable {
          row_sums_view[ row ] = value;
       };
-      this->segments.segmentsReduction( 0, this->getRows(), fetch, reduction, keeper, ( IndexType ) 0 );
+      this->segments.segmentsReduction( 0, this->getRows(), fetch, std::plus<>{}, keeper, ( IndexType ) 0 );
       return sum( row_sums );
    }
 }
@@ -407,9 +407,9 @@ vectorProduct( const InVector& inVector,
       return valuesView[ globalIdx ] * inVectorView[ column ];
    };
 
-   auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) {
-      sum += value;
-   };
+   //auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) {
+   //   sum += value;
+   //};
    auto keeper = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
       if( isSymmetric() )
          outVectorView[ row ] += matrixMultiplicator * value;
@@ -422,9 +422,9 @@ vectorProduct( const InVector& inVector,
       }
    };
    if( isSymmetric() )
-      this->segments.segmentsReduction( 0, this->getRows(), symmetricFetch, reduction, keeper, ( RealType ) 0.0 );
+      this->segments.segmentsReduction( 0, this->getRows(), symmetricFetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
    else
-      this->segments.segmentsReduction( 0, this->getRows(), fetch, reduction, keeper, ( RealType ) 0.0 );
+      this->segments.segmentsReduction( 0, this->getRows(), fetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
 
    /*const auto inVectorView = inVector.getConstView();
    auto outVectorView = outVector.getView();
@@ -456,7 +456,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
-rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
 {
    const auto columns_view = this->columnIndexes.getConstView();
    const auto values_view = this->values.getConstView();
@@ -483,7 +483,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
-allRowsReduction( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
 {
    this->rowsReduction( 0, this->getRows(), fetch, reduce, keep, zero );
 }
diff --git a/src/UnitTests/Containers/Segments/SegmentsTest.hpp b/src/UnitTests/Containers/Segments/SegmentsTest.hpp
index 59ef44c2c..b520df21a 100644
--- a/src/UnitTests/Containers/Segments/SegmentsTest.hpp
+++ b/src/UnitTests/Containers/Segments/SegmentsTest.hpp
@@ -141,8 +141,8 @@ void test_AllReduction_MaximumInSegments()
    auto fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) -> IndexType {
       return v_view[ globalIdx ];
    };
-   auto reduce = [] __cuda_callable__ ( IndexType& a, const IndexType b ) {
-      a = TNL::max( a, b );
+   auto reduce = [] __cuda_callable__ ( IndexType& a, const IndexType b ) -> IndexType {
+      return TNL::max( a, b );
    };
    auto keep = [=] __cuda_callable__ ( const IndexType i, const IndexType a ) mutable {
       result_view[ i ] = a;
diff --git a/src/UnitTests/Matrices/BinarySparseMatrixTest.hpp b/src/UnitTests/Matrices/BinarySparseMatrixTest.hpp
index 276c432ff..87d5e139b 100644
--- a/src/UnitTests/Matrices/BinarySparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/BinarySparseMatrixTest.hpp
@@ -951,13 +951,10 @@ void test_RowsReduction()
    auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, IndexType globalIdx, const RealType& value ) -> IndexType {
       return ( value != 0.0 );
    };
-   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-      aux += a;
-   };
    auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowLengths_view[ rowIdx ] = value;
    };
-   m.allRowsReduction( fetch, reduce, keep, 0 );
+   m.allRowsReduction( fetch, std::plus<>{}, keep, 0 );
    EXPECT_EQ( rowsCapacities, rowLengths );
    m.getCompressedRowLengths( rowLengths );
    EXPECT_EQ( rowsCapacities, rowLengths );
@@ -969,13 +966,10 @@ void test_RowsReduction()
    auto max_fetch = [] __cuda_callable__ ( IndexType row, IndexType column, IndexType globalIdx, const RealType& value ) -> IndexType {
       return abs( value );
    };
-   auto max_reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-      aux += a;
-   };
    auto max_keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowSums_view[ rowIdx ] = value;
    };
-   m.allRowsReduction( max_fetch, max_reduce, max_keep, 0 );
+   m.allRowsReduction( max_fetch, std::plus<>{}, max_keep, 0 );
    const RealType maxNorm = TNL::max( rowSums );
    EXPECT_EQ( maxNorm, 8 ) ; // 29+30+31+32+33+34+35+36
 }
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 5490f34f8..6d7c64360 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include <functional>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Math.h>
@@ -1376,13 +1377,10 @@ void test_RowsReduction()
    auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, IndexType globalIdx, const RealType& value ) -> IndexType {
       return ( value != 0.0 );
    };
-   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-      aux += a;
-   };
    auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowLengths_view[ rowIdx ] = value;
    };
-   m.allRowsReduction( fetch, reduce, keep, 0 );
+   m.allRowsReduction( fetch, std::plus<>{}, keep, 0 );
    EXPECT_EQ( rowsCapacities, rowLengths );
    m.getCompressedRowLengths( rowLengths );
    EXPECT_EQ( rowsCapacities, rowLengths );
@@ -1394,13 +1392,10 @@ void test_RowsReduction()
    auto max_fetch = [] __cuda_callable__ ( IndexType row, IndexType column, IndexType globalIdx, const RealType& value ) -> IndexType {
       return abs( value );
    };
-   auto max_reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-      aux += a;
-   };
    auto max_keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowSums_view[ rowIdx ] = value;
    };
-   m.allRowsReduction( max_fetch, max_reduce, max_keep, 0 );
+   m.allRowsReduction( max_fetch, std::plus<>{}, max_keep, 0 );
    const RealType maxNorm = TNL::max( rowSums );
    EXPECT_EQ( maxNorm, 260 ) ; // 29+30+31+32+33+34+35+36
 }
diff --git a/src/UnitTests/Matrices/SymmetricSparseMatrixTest.hpp b/src/UnitTests/Matrices/SymmetricSparseMatrixTest.hpp
index 58a4f4fce..4e28842ba 100644
--- a/src/UnitTests/Matrices/SymmetricSparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SymmetricSparseMatrixTest.hpp
@@ -903,13 +903,10 @@ void test_RowsReduction()
          TNL::Algorithms::AtomicOperations< DeviceType >::add( rowLengths_view[ column ], ( IndexType ) 1 );
       return ( value != 0.0 );
    };
-   auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-      aux += a;
-   };
    auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowLengths_view[ rowIdx ] += value;
    };
-   m_5.allRowsReduction( fetch, reduce, keep, 0 );
+   m_5.allRowsReduction( fetch, std::plus<>{}, keep, 0 );
 
    EXPECT_EQ( rowLengths_true, rowLengths );
    m_5.getCompressedRowLengths( rowLengths );
-- 
GitLab


From 8551431afb57efe68b2bd8f8d7772725465f674d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 5 May 2020 14:46:37 +0200
Subject: [PATCH 22/68] Writting documentation on dense matrix.

---
 .../Examples/Matrices/CMakeLists.txt          |   2 +
 .../DenseMatrixExample_allRowsReduction.cpp   |   9 +-
 .../Matrices/DenseMatrixExample_forAllRows.cu |   1 +
 src/TNL/Matrices/DenseMatrix.h                | 124 ++++++++++++++++--
 src/TNL/Matrices/DenseMatrixRowView.h         |  61 ++++++++-
 5 files changed, 187 insertions(+), 10 deletions(-)
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_forAllRows.cu

diff --git a/Documentation/Examples/Matrices/CMakeLists.txt b/Documentation/Examples/Matrices/CMakeLists.txt
index b06224620..aee79523c 100644
--- a/Documentation/Examples/Matrices/CMakeLists.txt
+++ b/Documentation/Examples/Matrices/CMakeLists.txt
@@ -28,6 +28,7 @@ IF( BUILD_CUDA )
    ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_forRows_cuda >
                         ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_forRows.out
                        OUTPUT DenseMatrixExample_forRows.out )
+
 ELSE()
    ADD_EXECUTABLE( DenseMatrixExample_setElement DenseMatrixExample_setElement.cpp )
    ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElement >
@@ -58,6 +59,7 @@ ELSE()
    ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_forRows >
                         ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_forRows.out
                        OUTPUT DenseMatrixExample_forRows.out )
+
 ENDIF()
 
 ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list DenseMatrixExample_Constructor_init_list.cpp )
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp
index e45438a83..aada767b9 100644
--- a/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp
@@ -32,6 +32,13 @@ void allRowsReduction()
       return TNL::abs( value );
    };
 
+   /***
+    * Reduce lambda return maximum of given values.
+    */
+   auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double {
+      return TNL::max( a, b );
+   };
+
    /***
     * Keep lambda store the largest value in each row to the vector rowMax.
     */
@@ -42,7 +49,7 @@ void allRowsReduction()
    /***
     * Compute the largest values in each row.
     */
-   matrix.allRowsReduction( fetch, std::plus<>{}, keep, std::numeric_limits< double >::lowest() );
+   matrix.allRowsReduction( fetch, reduce, keep, std::numeric_limits< double >::lowest() );
 
    std::cout << "Max. elements in rows are: " << rowMax << std::endl;
 }
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_forAllRows.cu b/Documentation/Examples/Matrices/DenseMatrixExample_forAllRows.cu
new file mode 120000
index 000000000..589520f79
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_forAllRows.cu
@@ -0,0 +1 @@
+DenseMatrixExample_forAllRows.cpp
\ No newline at end of file
diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
index 5e9d9dfe7..320032c10 100644
--- a/src/TNL/Matrices/DenseMatrix.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -266,6 +266,8 @@ class DenseMatrix : public Matrix< Real, Device, Index >
        * \include Matrices/DenseMatrixExample_getConstRow.cpp
        * \par Output
        * \include DenseMatrixExample_getConstRow.out
+       * 
+       * See \ref DenseMatrixRowView.
        */
       __cuda_callable__
       const RowView getRow( const IndexType& rowIdx ) const;
@@ -281,6 +283,8 @@ class DenseMatrix : public Matrix< Real, Device, Index >
        * \include Matrices/DenseMatrixExample_getRow.cpp
        * \par Output
        * \include DenseMatrixExample_getRow.out
+       * 
+       * See \ref DenseMatrixRowView.
        */
       __cuda_callable__
       RowView getRow( const IndexType& rowIdx );
@@ -434,11 +438,14 @@ class DenseMatrix : public Matrix< Real, Device, Index >
       void allRowsReduction( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       /**
-       * \brief Method for iteration over all matrix rows.
+       * \brief Method for iteration over all matrix rows for constant instances.
        * 
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx, const RealType& value, bool compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       *  The column index repeats twice only for compatibility with sparse matrices. 
+       *  If the 'compute' variable is set to false the iteration over the row can 
+       *  be interrupted.
        * 
        * \param first is index is the first row to be processed.
        * \param last is index of the row after the last row to be processed.
@@ -452,20 +459,76 @@ class DenseMatrix : public Matrix< Real, Device, Index >
       template< typename Function >
       void forRows( IndexType first, IndexType last, Function& function ) const;
 
+      /**
+       * \brief Method for iteration over all matrix rows for non-constant instances.
+       * 
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It is should have form like
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx, RealType& value, bool& compute )`.
+       *  The column index repeats twice only for compatibility with sparse matrices. 
+       *  If the 'compute' variable is set to false the iteration over the row can 
+       *  be interrupted.
+       * 
+       * \param first is index is the first row to be processed.
+       * \param last is index of the row after the last row to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       * 
+       * \par Example
+       * \include Matrices/DenseMatrixExample_forRows.cpp
+       * \par Output
+       * \include DenseMatrixExample_forRows.out
+       */
       template< typename Function >
       void forRows( IndexType first, IndexType last, Function& function );
 
+      /**
+       * \brief This method calls \e forRows for all matrix rows.
+       * 
+       * See \ref DenseMatrix::forRows.
+       * 
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called in each row.
+       */
       template< typename Function >
       void forAllRows( Function& function ) const;
 
+      /**
+       * \brief This method calls \e forRows for all matrix rows.
+       * 
+       * See \ref DenseMatrix::forRows.
+       * 
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called in each row.
+       */
       template< typename Function >
       void forAllRows( Function& function );
 
+      /**
+       * \brief This method computes scalar product of given vector and one 
+       *  row of the matrix.
+       * 
+       * \tparam Vector is type of input vector. It can be \ref Vector,
+       *     \ref VectorView, \ref Array, \ref ArraView or similar container.
+       * \param row is index of the row used for the scalar product.
+       * \param vector is the input vector.
+       * \return 
+       */
       template< typename Vector >
       __cuda_callable__
       typename Vector::RealType rowVectorProduct( const IndexType row,
                                                   const Vector& vector ) const;
 
+      /**
+       * \brief Computes product of matrix and vector.
+       * 
+       * \tparam InVector is type of input vector.  It can be \ref Vector,
+       *     \ref VectorView, \ref Array, \ref ArraView or similar container.
+       * \tparam OutVector is type of output vector. It can be \ref Vector,
+       *     \ref VectorView, \ref Array, \ref ArraView or similar container.
+       * 
+       * \param inVector is input vector.
+       * \param outVector is output vector.
+       */
       template< typename InVector, typename OutVector >
       void vectorProduct( const InVector& inVector,
                           OutVector& outVector ) const;
@@ -494,16 +557,16 @@ class DenseMatrix : public Matrix< Real, Device, Index >
       /**
        * \brief Assignment operator for exactly the same type of the dense matrix.
        * 
-       * @param matrix
-       * @return 
+       * \param matrix is the right-hand side matrix.
+       * \return reference to this matrix.
        */
       DenseMatrix& operator=( const DenseMatrix& matrix );
 
       /**
        * \brief Assignment operator for other dense matrices.
        * 
-       * @param matrix
-       * @return 
+       * \param matrix is the right-hand side matrix.
+       * \return reference to this matrix.
        */
       template< typename RHSReal, typename RHSDevice, typename RHSIndex,
                  bool RHSRowMajorOrder, typename RHSRealAllocator >
@@ -511,26 +574,64 @@ class DenseMatrix : public Matrix< Real, Device, Index >
 
       /**
        * \brief Assignment operator for other (sparse) types of matrices.
-       * @param matrix
-       * @return 
+       * 
+       * \param matrix is the right-hand side matrix.
+       * \return reference to this matrix.
        */
       template< typename RHSMatrix >
       DenseMatrix& operator=( const RHSMatrix& matrix );
 
+      /**
+       * \brief Comparison operator with another dense matrix.
+       * 
+       * \param matrix is the right-hand side matrix.
+       * \return \e true if the RHS matrix is equal, \e false otherwise.
+       */
       template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
       bool operator==( const DenseMatrix< Real_, Device_, Index_, RowMajorOrder >& matrix ) const;
 
+      /**
+       * \brief Comparison operator with another dense matrix.
+       * 
+       * \param matrix is the right-hand side matrix.
+       * \return \e false if the RHS matrix is equal, \e true otherwise.
+       */
       template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
       bool operator!=( const DenseMatrix< Real_, Device_, Index_, RowMajorOrder >& matrix ) const;
 
+      /**
+       * \brief Method for saving the matrix to the file with given filename.
+       * 
+       * \param fileName is name of the file.
+       */
       void save( const String& fileName ) const;
 
+      /**
+       * \brief Method for loading the matrix from the file with given filename.
+       * 
+       * \param fileName is name of the file.
+       */
       void load( const String& fileName );
 
+      /**
+       * \brief Method for saving the matrix to a file.
+       * 
+       * \param fileName is name of the file.
+       */
       void save( File& file ) const;
 
+      /**
+       * \brief Method for loading the matrix from a file.
+       * 
+       * \param fileName is name of the file.
+       */
       void load( File& file );
 
+      /**
+       * \brief Method for printing the matrix to output stream.
+       * 
+       * \param str is the output stream.
+       */
       void print( std::ostream& str ) const;
 
    protected:
@@ -544,6 +645,13 @@ class DenseMatrix : public Matrix< Real, Device, Index >
       ViewType view;
 };
 
+/**
+ * \brief Insertion operator for dense matrix and output stream.
+ * 
+ * \param str is the output stream.
+ * \param matrix is the dense matrix.
+ * \return  reference to the stream.
+ */
 template< typename Real,
           typename Device,
           typename Index,
diff --git a/src/TNL/Matrices/DenseMatrixRowView.h b/src/TNL/Matrices/DenseMatrixRowView.h
index 01fdd9408..78fecd0f7 100644
--- a/src/TNL/Matrices/DenseMatrixRowView.h
+++ b/src/TNL/Matrices/DenseMatrixRowView.h
@@ -13,30 +13,89 @@
 namespace TNL {
    namespace Matrices {
 
+/**
+ * \brief RowView is a simple structure for accessing rows of dense matrix.
+ * 
+ * \tparam SegmentView is a segment view of segments representing the matrix format.
+ * \tparam ValuesView is a vector view storing the matrix elements values.
+ * 
+ * See \ref DenseMatrix and \ref DenseMatrixView.
+ * 
+ * \par Example
+ * \include Matrices/DenseMatrixExample_getRow.cpp
+ * \par Output
+ * \include DenseMatrixExample_getRow.out
+ */
 template< typename SegmentView,
           typename ValuesView >
 class DenseMatrixRowView
 {
    public:
 
+      /**
+       * \brief The type of matrix elements.
+       */
       using RealType = typename ValuesView::RealType;
+
+      /**
+       * \brief The type used for matrix elements indexing.
+       */
+      using IndexType = typename SegmentView::IndexType;
+
+      /**
+       * \brief Type representing matrix row format.
+       */
       using SegmentViewType = SegmentView;
-      using IndexType = typename SegmentViewType::IndexType;
+
+      /**
+       * \brief Type of container view used for storing matrix elements values.
+       */
       using ValuesViewType = ValuesView;
 
+      /**
+       * \brief Constructor with \e segmentView and \e values
+       * 
+       * \param segmentView instance of SegmentViewType representing matrix row.
+       * \param values is a container view for storing the matrix elements values.
+       */
       __cuda_callable__
       DenseMatrixRowView( const SegmentViewType& segmentView,
                           const ValuesViewType& values );
 
+      /**
+       * \brief Returns size of the matrix row, i.e. number of matrix elements in this row.
+       * 
+       * \return Size of the matrix row.
+       */
       __cuda_callable__
       IndexType getSize() const;
 
+      /**
+       * \brief Returns constants reference to an element with given column index.
+       * 
+       * \param column is column index of the matrix element.
+       * 
+       * \return constant reference to the matrix element.
+       */
       __cuda_callable__
       const RealType& getElement( const IndexType column ) const;
 
+      /**
+       * \brief Returns non-constants reference to an element with given column index.
+       * 
+       * \param column is a column index of the matrix element.
+       * 
+       * \return non-constant reference to the matrix element.
+       */
       __cuda_callable__
       RealType& getElement( const IndexType column );
 
+      /**
+       * \brief Sets value of matrix element with given column index
+       * .
+       * \param column is a column index of the matrix element.
+       * \param value is a value the matrix element will be set to.
+       */
       __cuda_callable__
       void setElement( const IndexType column,
                        const RealType& value );
-- 
GitLab


From 4b2fa277cc64f61e1c246ef6013cb75dc97244ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 5 May 2020 22:08:05 +0200
Subject: [PATCH 23/68] Added SharedPointer constructors with initializer
 lists.

---
 src/TNL/Pointers/SharedPointerCuda.h | 28 +++++++++++++++++++++++
 src/TNL/Pointers/SharedPointerHost.h | 34 +++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Pointers/SharedPointerCuda.h b/src/TNL/Pointers/SharedPointerCuda.h
index f4f73ec39..975904a73 100644
--- a/src/TNL/Pointers/SharedPointerCuda.h
+++ b/src/TNL/Pointers/SharedPointerCuda.h
@@ -90,6 +90,34 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
          this->allocate( args... );
       }
 
+      /**
+       * \brief Constructor with initializer list.
+       *
+       * \tparam Value is type of the initializer list elements.
+       * \param list is the instance of the initializer list..
+       */
+      template< typename Value >
+      explicit  SharedPointer( std::initializer_list< Value > list )
+      : pd( nullptr ),
+        cuda_pointer( nullptr )
+      {
+         this->allocate( list );
+      }
+
+      /**
+       * \brief Constructor with nested initializer lists.
+       *
+       * \tparam Value is type of the nested initializer list elements.
+       * \param list is the instance of the nested initializer list..
+       */
+      template< typename Value >
+      explicit  SharedPointer( std::initializer_list< std::initializer_list< Value > > list )
+      : pd( nullptr ),
+        cuda_pointer( nullptr )
+      {
+         this->allocate( list );
+      }
+
       /**
        * \brief Copy constructor.
        *
diff --git a/src/TNL/Pointers/SharedPointerHost.h b/src/TNL/Pointers/SharedPointerHost.h
index ea8654d16..2ef8d7abd 100644
--- a/src/TNL/Pointers/SharedPointerHost.h
+++ b/src/TNL/Pointers/SharedPointerHost.h
@@ -73,7 +73,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer
        * \brief Constructor with parameters of the Object constructor.
        *
        * \tparam Args is variadic template type of arguments of the Object constructor.
-       * \tparam args are arguments passed to the Object constructor.
+       * \param args are arguments passed to the Object constructor.
        */
       template< typename... Args >
       explicit  SharedPointer( Args... args )
@@ -85,6 +85,38 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer
          this->allocate( args... );
       }
 
+      /**
+       * \brief Constructor with initializer list.
+       *
+       * \tparam Value is type of the initializer list elements.
+       * \param list is the instance of the initializer list..
+       */
+      template< typename Value >
+      explicit  SharedPointer( std::initializer_list< Value > list )
+      : pd( nullptr )
+      {
+#ifdef TNL_DEBUG_SHARED_POINTERS
+         std::cerr << "Creating shared pointer to " << getType< ObjectType >() << std::endl;
+#endif
+         this->allocate( list );
+      }
+
+      /**
+       * \brief Constructor with nested initializer lists.
+       *
+       * \tparam Value is type of the nested initializer list elements.
+       * \param list is the instance of the nested initializer list..
+       */
+      template< typename Value >
+      explicit  SharedPointer( std::initializer_list< std::initializer_list< Value > > list )
+      : pd( nullptr )
+      {
+#ifdef TNL_DEBUG_SHARED_POINTERS
+         std::cerr << "Creating shared pointer to " << getType< ObjectType >() << std::endl;
+#endif
+         this->allocate( list );
+      }
+
       /**
        * \brief Copy constructor.
        *
-- 
GitLab


From e32d923d85e249da07dc865bbd3eba19cc65d68c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 5 May 2020 22:10:07 +0200
Subject: [PATCH 24/68] Writting documentation on DenseMatrix.

---
 .../Examples/Matrices/CMakeLists.txt          | 98 ++++++++++++++-----
 ...nseMatrixExample_Constructor_init_list.cpp | 23 ++++-
 ...enseMatrixExample_Constructor_init_list.cu |  1 +
 .../DenseMatrixExample_allRowsReduction.cpp   |  2 +-
 .../DenseMatrixExample_forAllRows.cpp         | 31 ++++++
 .../Matrices/DenseMatrixExample_forRows.cpp   | 22 +++--
 ...eMatrixExample_getCompressedRowLengths.cpp | 19 +++-
 ...seMatrixExample_getCompressedRowLengths.cu |  1 +
 .../DenseMatrixExample_getConstRow.cpp        | 38 +++++--
 .../DenseMatrixExample_getConstRow.cu         |  1 +
 .../DenseMatrixExample_getElementsCount.cpp   | 17 +++-
 .../DenseMatrixExample_getElementsCount.cu    |  1 +
 .../Matrices/DenseMatrixExample_getRow.cpp    | 37 +++++--
 .../Matrices/DenseMatrixExample_getRow.cu     |  1 +
 .../DenseMatrixExample_setElements.cpp        | 19 +++-
 .../DenseMatrixExample_setElements.cu         |  1 +
 ...enseMatrixViewExample_allRowsReduction.cpp | 66 +++++++++++++
 ...DenseMatrixViewExample_allRowsReduction.cu |  1 +
 .../DenseMatrixViewExample_getConstRow.cpp    | 45 +++++++++
 .../DenseMatrixViewExample_getRow.cpp         | 34 +++++++
 src/TNL/Matrices/DenseMatrix.h                |  6 +-
 src/TNL/Matrices/DenseMatrix.hpp              |  6 +-
 22 files changed, 401 insertions(+), 69 deletions(-)
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixExample_forAllRows.cpp
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cu
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cu
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_getRow.cu
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixViewExample_allRowsReduction.cpp
 create mode 120000 Documentation/Examples/Matrices/DenseMatrixViewExample_allRowsReduction.cu
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixViewExample_getConstRow.cpp
 create mode 100644 Documentation/Examples/Matrices/DenseMatrixViewExample_getRow.cpp

diff --git a/Documentation/Examples/Matrices/CMakeLists.txt b/Documentation/Examples/Matrices/CMakeLists.txt
index aee79523c..e0d7a6f42 100644
--- a/Documentation/Examples/Matrices/CMakeLists.txt
+++ b/Documentation/Examples/Matrices/CMakeLists.txt
@@ -1,4 +1,34 @@
 IF( BUILD_CUDA )
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list_cuda DenseMatrixExample_Constructor_init_list.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list_cuda >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out
+                       OUTPUT DenseMatrixExample_Constructor_init_list.out )
+
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_setElements_cuda DenseMatrixExample_setElements.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElements_cuda > 
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElements.out
+                       OUTPUT DenseMatrixExample_setElements.out )
+
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_getCompressedRowLengths_cuda DenseMatrixExample_getCompressedRowLengths.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getCompressedRowLengths_cuda >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getCompressedRowLengths.out
+                       OUTPUT DenseMatrixExample_getCompressedRowLengths.out )
+
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_getElementsCount_cuda DenseMatrixExample_getElementsCount.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getElementsCount_cuda >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getElementsCount.out
+                       OUTPUT DenseMatrixExample_getElementsCount.out )
+
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_getConstRow_cuda DenseMatrixExample_getConstRow.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getConstRow_cuda >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getConstRow.out
+                       OUTPUT DenseMatrixExample_getConstRow.out )
+
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_getRow_cuda DenseMatrixExample_getRow.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getRow_cuda >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getRow.out
+                       OUTPUT DenseMatrixExample_getRow.out )
+
    CUDA_ADD_EXECUTABLE( DenseMatrixExample_setElement_cuda DenseMatrixExample_setElement.cu )
    ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElement_cuda >
                         ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElement.out
@@ -29,7 +59,42 @@ IF( BUILD_CUDA )
                         ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_forRows.out
                        OUTPUT DenseMatrixExample_forRows.out )
 
+   CUDA_ADD_EXECUTABLE( DenseMatrixExample_forAllRows_cuda DenseMatrixExample_forAllRows.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_forAllRows_cuda >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_forAllRows.out
+                       OUTPUT DenseMatrixExample_forAllRows.out )
+
 ELSE()
+   ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list DenseMatrixExample_Constructor_init_list.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out
+                       OUTPUT DenseMatrixExample_Constructor_init_list.out )
+
+   ADD_EXECUTABLE( DenseMatrixExample_setElements DenseMatrixExample_setElements.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElements > 
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElements.out
+                       OUTPUT DenseMatrixExample_setElements.out )
+
+   ADD_EXECUTABLE( DenseMatrixExample_getCompressedRowLengths DenseMatrixExample_getCompressedRowLengths.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getCompressedRowLengths >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getCompressedRowLengths.out
+                       OUTPUT DenseMatrixExample_getCompressedRowLengths.out )
+
+   ADD_EXECUTABLE( DenseMatrixExample_getElementsCount DenseMatrixExample_getElementsCount.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getElementsCount >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getElementsCount.out
+                       OUTPUT DenseMatrixExample_getElementsCount.out )
+
+   ADD_EXECUTABLE( DenseMatrixExample_getConstRow DenseMatrixExample_getConstRow.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getConstRow >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getConstRow.out
+                       OUTPUT DenseMatrixExample_getConstRow.out )
+
+   ADD_EXECUTABLE( DenseMatrixExample_getRow DenseMatrixExample_getRow.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getRow >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getRow.out
+                       OUTPUT DenseMatrixExample_getRow.out )
+
    ADD_EXECUTABLE( DenseMatrixExample_setElement DenseMatrixExample_setElement.cpp )
    ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElement >
                         ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElement.out
@@ -60,37 +125,15 @@ ELSE()
                         ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_forRows.out
                        OUTPUT DenseMatrixExample_forRows.out )
 
-ENDIF()
+   ADD_EXECUTABLE( DenseMatrixExample_forAllRows DenseMatrixExample_forAllRows.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_forAllRows >
+                        ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_forAllRows.out
+                       OUTPUT DenseMatrixExample_forAllRows.out )
 
-ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list DenseMatrixExample_Constructor_init_list.cpp )
-ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list >
-                     ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out
-                    OUTPUT DenseMatrixExample_Constructor_init_list.out )
-
-ADD_EXECUTABLE( DenseMatrixExample_setElements DenseMatrixExample_setElements.cpp )
-ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElements > 
-                     ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElements.out
-                    OUTPUT DenseMatrixExample_setElements.out )
-
-ADD_EXECUTABLE( DenseMatrixExample_getCompressedRowLengths DenseMatrixExample_getCompressedRowLengths.cpp )
-ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getCompressedRowLengths >
-                     ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getCompressedRowLengths.out
-                    OUTPUT DenseMatrixExample_getCompressedRowLengths.out )
+ENDIF()
 
-ADD_EXECUTABLE( DenseMatrixExample_getElementsCount DenseMatrixExample_getElementsCount.cpp )
-ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getElementsCount >
-                     ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getElementsCount.out
-                    OUTPUT DenseMatrixExample_getElementsCount.out )
 
-ADD_EXECUTABLE( DenseMatrixExample_getConstRow DenseMatrixExample_getConstRow.cpp )
-ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getConstRow >
-                     ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getConstRow.out
-                    OUTPUT DenseMatrixExample_getConstRow.out )
 
-ADD_EXECUTABLE( DenseMatrixExample_getRow DenseMatrixExample_getRow.cpp )
-ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_getRow >
-                     ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_getRow.out
-                    OUTPUT DenseMatrixExample_getRow.out )
 
 
 ADD_CUSTOM_TARGET( RunMatricesExamples ALL DEPENDS
@@ -106,5 +149,6 @@ ADD_CUSTOM_TARGET( RunMatricesExamples ALL DEPENDS
    DenseMatrixExample_rowsReduction.out
    DenseMatrixExample_allRowsReduction.out
    DenseMatrixExample_forRows.out
+   DenseMatrixExample_forAllRows.out
 )
 
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cpp
index c11178c46..91426a6f1 100644
--- a/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cpp
@@ -2,17 +2,19 @@
 #include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Devices/Host.h>
 
-int main( int argc, char* argv[] )
+
+template< typename Device >
+void initializerListExample()
 {
-   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > matrix {
+   TNL::Matrices::DenseMatrix< double, Device > matrix {
       {  1,  2,  3,  4,  5,  6 },
       {  7,  8,  9, 10, 11, 12 },
       { 13, 14, 15, 16, 17, 18 }
    };
 
-   std::cout << matrix << std::endl;
+   std::cout << "General dense matrix: " << std::endl << matrix << std::endl;
 
-   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > triangularMatrix {
+   TNL::Matrices::DenseMatrix< double, Device > triangularMatrix {
       {  1 },
       {  2,  3 },
       {  4,  5,  6 },
@@ -20,5 +22,16 @@ int main( int argc, char* argv[] )
       { 11, 12, 13, 14, 15 }
    };
 
-   std::cout << triangularMatrix << std::endl;
+   std::cout << "Triangular dense matrix: " << std::endl << triangularMatrix << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Creating matrices on CPU ... " << std::endl;
+   initializerListExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Creating matrices on CUDA GPU ... " << std::endl;
+   initializerListExample< TNL::Devices::Cuda >();
+#endif
 }
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu b/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu
new file mode 120000
index 000000000..91fa4f073
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_Constructor_init_list.cu
@@ -0,0 +1 @@
+DenseMatrixExample_Constructor_init_list.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp
index aada767b9..ce323671f 100644
--- a/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_allRowsReduction.cpp
@@ -3,6 +3,7 @@
 #include <functional>
 #include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
 
 template< typename Device >
 void allRowsReduction()
@@ -22,7 +23,6 @@ void allRowsReduction()
    /***
     * Prepare vector view and matrix view for lambdas.
     */
-   const auto matrixView = matrix.getConstView();
    auto rowMaxView = rowMax.getView();
 
    /***
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_forAllRows.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_forAllRows.cpp
new file mode 100644
index 000000000..5fddf0f34
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_forAllRows.cpp
@@ -0,0 +1,31 @@
+#include <iostream>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void forAllRowsExample()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
+
+   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int globalIdx, double& value, bool& compute ) {
+      if( rowIdx < columnIdx )
+         compute = false;
+      else
+         value = rowIdx + columnIdx;
+   };
+
+   matrix.forAllRows( f );
+   std::cout << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Creating matrix on host: " << std::endl;
+   forAllRowsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Creating matrix on CUDA device: " << std::endl;
+   forAllRowsExample< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cpp
index 5d364f320..f3e45a006 100644
--- a/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_forRows.cpp
@@ -1,16 +1,13 @@
 #include <iostream>
-#include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
 
-int main( int argc, char* argv[] )
+template< typename Device >
+void forRowsExample()
 {
-   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > matrix( 5, 5 );
+   TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
 
-   /***
-    * We need a matrix view to pass the matrix to lambda function even on CUDA device.
-    */
-   auto matrixView = matrix.getView();
    auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int globalIdx, double& value, bool& compute ) {
       if( rowIdx < columnIdx )
          compute = false;
@@ -21,3 +18,14 @@ int main( int argc, char* argv[] )
    matrix.forRows( 0, matrix.getRows(), f );
    std::cout << matrix << std::endl;
 }
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Creating matrix on host: " << std::endl;
+   forRowsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Creating matrix on CUDA device: " << std::endl;
+   forRowsExample< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cpp
index cb0abc6fd..e89992d9f 100644
--- a/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cpp
@@ -1,10 +1,12 @@
 #include <iostream>
 #include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
 
-int main( int argc, char* argv[] )
+template< typename Device >
+void getCompressedRowLengthsExample()
 {
-   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > triangularMatrix {
+   TNL::Matrices::DenseMatrix< double, Device > triangularMatrix {
       {  1 },
       {  2,  3 },
       {  4,  5,  6 },
@@ -14,8 +16,19 @@ int main( int argc, char* argv[] )
 
    std::cout << triangularMatrix << std::endl;
 
-   TNL::Containers::Vector< int, TNL::Devices::Host > rowLengths;
+   TNL::Containers::Vector< int, Device > rowLengths;
    triangularMatrix.getCompressedRowLengths( rowLengths );
 
    std::cout << "Compressed row lengths are: " << rowLengths << std::endl;
 }
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Getting compressed row lengths on host: " << std::endl;
+   getCompressedRowLengthsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Getting compressed row lengths on CUDA device: " << std::endl;
+   getCompressedRowLengthsExample< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu b/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu
new file mode 120000
index 000000000..2b3cd6c13
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getCompressedRowLengths.cu
@@ -0,0 +1 @@
+DenseMatrixExample_getCompressedRowLengths.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cpp
index 8e5da1d4b..08b655e55 100644
--- a/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cpp
@@ -3,10 +3,14 @@
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Pointers/SharedPointer.h>
 
-int main( int argc, char* argv[] )
+template< typename Device >
+void getRowExample()
 {
-   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > matrix {
+   using MatrixType = TNL::Matrices::DenseMatrix< double, Device >;
+   TNL::Pointers::SharedPointer< MatrixType > matrix {
       { 1, 0, 0, 0, 0 },
       { 1, 2, 0, 0, 0 },
       { 1, 2, 3, 0, 0 },
@@ -14,19 +18,35 @@ int main( int argc, char* argv[] )
       { 1, 2, 3, 4, 5 }
    };
 
-   /***
-    * We need a matrix view to pass the matrix to lambda function even on CUDA device.
-    */
-   const auto matrixView = matrix.getConstView();
-
    /***
     * Fetch lambda function returns diagonal element in each row.
     */
    auto fetch = [=] __cuda_callable__ ( int rowIdx ) mutable -> double {
-      auto row = matrixView.getRow( rowIdx );
+      auto row = matrix->getRow( rowIdx );
       return row.getElement( rowIdx );
    };
 
-   int trace = TNL::Algorithms::Reduction< TNL::Devices::Host >::reduce( matrix.getRows(), std::plus<>{}, fetch, 0 );
+   /***
+    * For the case when Device is CUDA device we need to synchronize smart
+    * pointers. To avoid this you may use DenseMatrixView. See
+    * DenseMatrixView::getConstRow example for details.
+    */
+   TNL::Pointers::synchronizeSmartPointersOnDevice< Device >();
+
+   /***
+    * Compute the matrix trace.
+    */
+   int trace = TNL::Algorithms::Reduction< Device >::reduce( matrix->getRows(), std::plus<>{}, fetch, 0 );
    std::cout << "Matrix trace is " << trace << "." << std::endl;
 }
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Getting matrix rows on host: " << std::endl;
+   getRowExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Getting matrix rows on CUDA device: " << std::endl;
+   getRowExample< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cu b/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cu
new file mode 120000
index 000000000..c78f9cfdc
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getConstRow.cu
@@ -0,0 +1 @@
+DenseMatrixExample_getConstRow.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cpp
index 997dadb7f..a95fa00e7 100644
--- a/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cpp
@@ -1,10 +1,12 @@
 #include <iostream>
 #include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
 
-int main( int argc, char* argv[] )
+template< typename Device >
+void getElementsCountExample()
 {
-   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > triangularMatrix {
+   TNL::Matrices::DenseMatrix< double, Device > triangularMatrix {
       {  1 },
       {  2,  3 },
       {  4,  5,  6 },
@@ -15,3 +17,14 @@ int main( int argc, char* argv[] )
    std::cout << "Matrix elements count is " << triangularMatrix.getElementsCount() << "." << std::endl;
    std::cout << "Non-zero matrix elements count is " << triangularMatrix.getNonzeroElementsCount() << "." << std::endl;
 }
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Computing matrix elements on host: " << std::endl;
+   getElementsCountExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Computing matrix elements on CUDA device: " << std::endl;
+   getElementsCountExample< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cu b/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cu
new file mode 120000
index 000000000..6e8348f73
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getElementsCount.cu
@@ -0,0 +1 @@
+DenseMatrixExample_getElementsCount.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cpp
index 120c934a3..00a6b1119 100644
--- a/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cpp
@@ -2,20 +2,41 @@
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Pointers/SharedPointer.h>
 
-int main( int argc, char* argv[] )
+template< typename Device >
+void getRowExample()
 {
-   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > matrix( 5, 5 );
+   using MatrixType = TNL::Matrices::DenseMatrix< double, Device >;
+   TNL::Pointers::SharedPointer< MatrixType > matrix( 5, 5 );
 
-   /***
-    * We need a matrix view to pass the matrix to lambda function even on CUDA device.
-    */
-   auto matrixView = matrix.getView();
    auto f = [=] __cuda_callable__ ( int rowIdx ) mutable {
-      auto row = matrixView.getRow( rowIdx );
+      auto row = matrix->getRow( rowIdx );
       row.setElement( rowIdx, 10* ( rowIdx + 1 ) );
    };
 
-   TNL::Algorithms::ParallelFor< TNL::Devices::Host >::exec( 0, matrix.getRows(), f );
+   /***
+    * For the case when Device is CUDA device we need to synchronize smart
+    * pointers. To avoid this you may use DenseMatrixView. See
+    * DenseMatrixView::getRow example for details.
+    */
+   TNL::Pointers::synchronizeSmartPointersOnDevice< Device >();
+
+   /***
+    * Set the matrix elements.
+    */
+   TNL::Algorithms::ParallelFor< Device >::exec( 0, matrix->getRows(), f );
    std::cout << matrix << std::endl;
 }
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Getting matrix rows on host: " << std::endl;
+   getRowExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Getting matrix rows on CUDA device: " << std::endl;
+   getRowExample< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cu b/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cu
new file mode 120000
index 000000000..58a55f2fd
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_getRow.cu
@@ -0,0 +1 @@
+DenseMatrixExample_getRow.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cpp b/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cpp
index bf96abf23..0eb0610a4 100644
--- a/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cpp
@@ -1,10 +1,12 @@
 #include <iostream>
 #include <TNL/Matrices/DenseMatrix.h>
 #include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
 
-int main( int argc, char* argv[] )
+template< typename Device >
+void setElementsExample()
 {
-   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > matrix;
+   TNL::Matrices::DenseMatrix< double, Device > matrix;
    matrix.setElements( {
       {  1,  2,  3,  4,  5,  6 },
       {  7,  8,  9, 10, 11, 12 },
@@ -13,7 +15,7 @@ int main( int argc, char* argv[] )
 
    std::cout << matrix << std::endl;
 
-   TNL::Matrices::DenseMatrix< double, TNL::Devices::Host > triangularMatrix;
+   TNL::Matrices::DenseMatrix< double, Device > triangularMatrix;
    triangularMatrix.setElements( {
       {  1 },
       {  2,  3 },
@@ -24,3 +26,14 @@ int main( int argc, char* argv[] )
 
    std::cout << triangularMatrix << std::endl;
 }
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Setting matrix elements on host: " << std::endl;
+   setElementsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Setting matrix elements on CUDA device: " << std::endl;
+   setElementsExample< TNL::Devices::Cuda >();
+#endif
+}
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu b/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu
new file mode 120000
index 000000000..fa2487e27
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixExample_setElements.cu
@@ -0,0 +1 @@
+DenseMatrixExample_setElements.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixViewExample_allRowsReduction.cpp b/Documentation/Examples/Matrices/DenseMatrixViewExample_allRowsReduction.cpp
new file mode 100644
index 000000000..b65cb3ea9
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixViewExample_allRowsReduction.cpp
@@ -0,0 +1,66 @@
+#include <iostream>
+#include <iomanip>
+#include <functional>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+
+template< typename Device >
+void allRowsReduction()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix {
+      {  1,  0,  0,  0,  0 },
+      {  1,  2,  0,  0,  0 },
+      {  0,  1,  8,  0,  0 },
+      {  0,  0,  1,  9,  0 },
+      {  0,  0,  0,  0,  1 } };
+   auto matrixView = matrix.getView();
+
+   /***
+    * Find largest element in each row.
+    */
+   TNL::Containers::Vector< double, Device > rowMax( matrix.getRows() );
+
+   /***
+    * Prepare vector view and matrix view for lambdas.
+    */
+   auto rowMaxView = rowMax.getView();
+
+   /***
+    * Fetch lambda just returns absolute value of matrix elements.
+    */
+   auto fetch = [=] __cuda_callable__ ( int rowIdx, int columnIdx, const double& value ) -> double {
+      return TNL::abs( value );
+   };
+
+   /***
+    * Reduce lambda return maximum of given values.
+    */
+   auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double {
+      return TNL::max( a, b );
+   };
+
+   /***
+    * Keep lambda store the largest value in each row to the vector rowMax.
+    */
+   auto keep = [=] __cuda_callable__ ( int rowIdx, const double& value ) mutable {
+      rowMaxView[ rowIdx ] = value;
+   };
+
+   /***
+    * Compute the largest values in each row.
+    */
+   matrixView.allRowsReduction( fetch, reduce, keep, std::numeric_limits< double >::lowest() );
+
+   std::cout << "Max. elements in rows are: " << rowMax << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "All rows reduction on host:" << std::endl;
+   allRowsReduction< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "All rows reduction on CUDA device:" << std::endl;
+   allRowsReduction< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixViewExample_allRowsReduction.cu b/Documentation/Examples/Matrices/DenseMatrixViewExample_allRowsReduction.cu
new file mode 120000
index 000000000..61dd89125
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixViewExample_allRowsReduction.cu
@@ -0,0 +1 @@
+DenseMatrixViewExample_allRowsReduction.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/DenseMatrixViewExample_getConstRow.cpp b/Documentation/Examples/Matrices/DenseMatrixViewExample_getConstRow.cpp
new file mode 100644
index 000000000..ab155185e
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixViewExample_getConstRow.cpp
@@ -0,0 +1,45 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void getRowExample()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix {
+      { 1, 0, 0, 0, 0 },
+      { 1, 2, 0, 0, 0 },
+      { 1, 2, 3, 0, 0 },
+      { 1, 2, 3, 4, 0 },
+      { 1, 2, 3, 4, 5 }
+   };
+
+   /***
+    * We need a matrix view to pass the matrix to lambda function even on CUDA device.
+    */
+   const auto matrixView = matrix.getConstView();
+
+   /***
+    * Fetch lambda function returns diagonal element in each row.
+    */
+   auto fetch = [=] __cuda_callable__ ( int rowIdx ) mutable -> double {
+      auto row = matrixView.getRow( rowIdx );
+      return row.getElement( rowIdx );
+   };
+
+   int trace = TNL::Algorithms::Reduction< Device >::reduce( matrix.getRows(), std::plus<>{}, fetch, 0 );
+   std::cout << "Matrix trace is " << trace << "." << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Getting matrix rows on host: " << std::endl;
+   getRowExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Getting matrix rows on CUDA device: " << std::endl;
+   getRowExample< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrixViewExample_getRow.cpp b/Documentation/Examples/Matrices/DenseMatrixViewExample_getRow.cpp
new file mode 100644
index 000000000..30d893bc1
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrixViewExample_getRow.cpp
@@ -0,0 +1,34 @@
+#include <iostream>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void getRowExample()
+{
+   TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
+
+   /***
+    * We need a matrix view to pass the matrix to lambda function even on CUDA device.
+    */
+   auto matrixView = matrix.getView();
+   auto f = [=] __cuda_callable__ ( int rowIdx ) mutable {
+      auto row = matrixView.getRow( rowIdx );
+      row.setElement( rowIdx, 10* ( rowIdx + 1 ) );
+   };
+
+   TNL::Algorithms::ParallelFor< Device >::exec( 0, matrix.getRows(), f );
+   std::cout << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Getting matrix rows on host: " << std::endl;
+   getRowExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Getting matrix rows on CUDA device: " << std::endl;
+   getRowExample< TNL::Devices::Cuda >();
+#endif
+}
\ No newline at end of file
diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
index 320032c10..495277727 100644
--- a/src/TNL/Matrices/DenseMatrix.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -124,7 +124,8 @@ class DenseMatrix : public Matrix< Real, Device, Index >
        * \par Output
        * \include DenseMatrixExample_Constructor_init_list.out
        */
-      DenseMatrix( std::initializer_list< std::initializer_list< RealType > > data );
+      template< typename Value >
+      DenseMatrix( std::initializer_list< std::initializer_list< Value > > data );
 
       /**
        * \brief Returns a modifiable view of the dense matrix.
@@ -197,7 +198,8 @@ class DenseMatrix : public Matrix< Real, Device, Index >
        * \par Output
        * \include DenseMatrixExample_setElements.out
        */
-      void setElements( std::initializer_list< std::initializer_list< RealType > > data );
+      template< typename Value >
+      void setElements( std::initializer_list< std::initializer_list< Value > > data );
 
       /**
        * \brief This method is only for the compatibility with the sparse matrices.
diff --git a/src/TNL/Matrices/DenseMatrix.hpp b/src/TNL/Matrices/DenseMatrix.hpp
index 3f3e0383c..6e64235f0 100644
--- a/src/TNL/Matrices/DenseMatrix.hpp
+++ b/src/TNL/Matrices/DenseMatrix.hpp
@@ -42,8 +42,9 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
+   template< typename Value >
 DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
-DenseMatrix( std::initializer_list< std::initializer_list< RealType > > data )
+DenseMatrix( std::initializer_list< std::initializer_list< Value > > data )
 {
    this->setElements( data );
 }
@@ -53,9 +54,10 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
+   template< typename Value >
 void
 DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
-setElements( std::initializer_list< std::initializer_list< RealType > > data )
+setElements( std::initializer_list< std::initializer_list< Value > > data )
 {
    IndexType rows = data.size();
    IndexType columns = 0;
-- 
GitLab


From 52f6bf8c64b546899957e29ee74cff012fb3ed88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 6 May 2020 11:10:17 +0200
Subject: [PATCH 25/68] Removed legacy matrix from SolverConfig

---
 src/TNL/Solvers/SolverConfig_impl.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Solvers/SolverConfig_impl.h b/src/TNL/Solvers/SolverConfig_impl.h
index 5642995e6..3c21a7b23 100644
--- a/src/TNL/Solvers/SolverConfig_impl.h
+++ b/src/TNL/Solvers/SolverConfig_impl.h
@@ -16,7 +16,7 @@
 #include <TNL/Solvers/PDE/ExplicitTimeStepper.h>
 #include <TNL/Solvers/PDE/TimeDependentPDESolver.h>
 #include <TNL/Solvers/LinearSolverTypeResolver.h>
-#include <TNL/Matrices/Legacy/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
 
 namespace TNL {
 namespace Solvers {
@@ -139,7 +139,7 @@ bool SolverConfig< ConfigTag, ProblemConfig >::configSetup( Config::ConfigDescri
    if( ConfigTagTimeDiscretisation< ConfigTag, SemiImplicitTimeDiscretisationTag >::enabled )
    {
       config.addDelimiter( " === Semi-implicit solvers parameters === " );
-      typedef Matrices::Legacy::CSR< double, Devices::Host, int > MatrixType;
+      using MatrixType = Matrices::SparseMatrix< double >;
       Linear::CG< MatrixType >::configSetup( config );
       Linear::BICGStab< MatrixType >::configSetup( config );
       Linear::BICGStabL< MatrixType >::configSetup( config );
@@ -157,7 +157,6 @@ bool SolverConfig< ConfigTag, ProblemConfig >::configSetup( Config::ConfigDescri
    config.addEntry< String >( "log-file", "Log file for the computation.", "log.txt" );
    config.addEntry< int >( "log-width", "Number of columns of the log table.", 80 );
    return true;
-
 }
 
 } // namespace Solvers
-- 
GitLab


From d5423bf756c6825d7ab6a5ac5174042c32ce757b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 6 May 2020 11:11:04 +0200
Subject: [PATCH 26/68] Simplified MatrixType in HeatEquationProblem

---
 src/TNL/Problems/HeatEquationProblem.h      | 3 +--
 src/TNL/Problems/HeatEquationProblem_impl.h | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/TNL/Problems/HeatEquationProblem.h b/src/TNL/Problems/HeatEquationProblem.h
index 4b2a0d430..76e467380 100644
--- a/src/TNL/Problems/HeatEquationProblem.h
+++ b/src/TNL/Problems/HeatEquationProblem.h
@@ -18,7 +18,6 @@
 
 #include <TNL/Problems/PDEProblem.h>
 #include <TNL/Operators/diffusion/LinearDiffusion.h>
-#include <TNL/Matrices/Legacy/Ellpack.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Timer.h>
 #include <TNL/Solvers/PDE/ExplicitUpdater.h>
@@ -50,7 +49,6 @@ class HeatEquationProblem : public PDEProblem< Mesh,
       typedef Functions::MeshFunction< Mesh > MeshFunctionType;
       typedef Pointers::SharedPointer< MeshFunctionType, DeviceType > MeshFunctionPointer;
       typedef PDEProblem< Mesh, Communicator, RealType, DeviceType, IndexType > BaseType;
-      typedef Matrices::Legacy::SlicedEllpack< RealType, DeviceType, IndexType > MatrixType;
       typedef Pointers::SharedPointer<  DifferentialOperator > DifferentialOperatorPointer;
       typedef Pointers::SharedPointer<  BoundaryCondition > BoundaryConditionPointer;
       typedef Pointers::SharedPointer<  RightHandSide, DeviceType > RightHandSidePointer;
@@ -59,6 +57,7 @@ class HeatEquationProblem : public PDEProblem< Mesh,
       using typename BaseType::MeshPointer;
       using typename BaseType::DofVectorType;
       using typename BaseType::DofVectorPointer;
+      using typename BaseType::MatrixType;
 
       typedef Communicator CommunicatorType;
 
diff --git a/src/TNL/Problems/HeatEquationProblem_impl.h b/src/TNL/Problems/HeatEquationProblem_impl.h
index 98cd6d5e4..fbb7e7d01 100644
--- a/src/TNL/Problems/HeatEquationProblem_impl.h
+++ b/src/TNL/Problems/HeatEquationProblem_impl.h
@@ -18,7 +18,6 @@
 
 #include <TNL/FileName.h>
 #include <TNL/Matrices/MatrixSetter.h>
-#include <TNL/Matrices/Legacy/MultidiagonalMatrixSetter.h>
 #include <TNL/Logger.h>
 #include <TNL/Solvers/PDE/BoundaryConditionsSetter.h>
 
@@ -192,7 +191,6 @@ setupLinearSystem( MatrixPointer& matrixPointer )
    matrixPointer->setDimensions( dofs, dofs );
    matrixPointer->setCompressedRowLengths( *rowLengthsPointer );
    return true;
-   //return MultidiagonalMatrixSetter< Mesh >::setupMatrix( mesh, matrix );
 }
 
 template< typename Mesh,
-- 
GitLab


From de51fb4d78330a844fa1a989bcb98de62391cf33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 6 May 2020 11:13:33 +0200
Subject: [PATCH 27/68] Switched MatrixType in PDEProblem to segments-based
 matrix

---
 src/TNL/Problems/PDEProblem.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Problems/PDEProblem.h b/src/TNL/Problems/PDEProblem.h
index e73bf633c..179255b93 100644
--- a/src/TNL/Problems/PDEProblem.h
+++ b/src/TNL/Problems/PDEProblem.h
@@ -13,7 +13,8 @@
 #include <TNL/Problems/Problem.h>
 #include <TNL/Problems/CommonData.h>
 #include <TNL/Pointers/SharedPointer.h>
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+#include <TNL/Containers/Segments/SlicedEllpack.h>
 #include <TNL/Solvers/PDE/TimeDependentPDESolver.h>
 
 namespace TNL {
@@ -39,7 +40,14 @@ class PDEProblem : public Problem< Real, Device, Index >
       using SubdomainOverlapsType = typename DistributedMeshType::SubdomainOverlapsType;
       using DofVectorType = Containers::Vector< RealType, DeviceType, IndexType>;
       using DofVectorPointer = Pointers::SharedPointer< DofVectorType, DeviceType >;
-      using MatrixType = Matrices::Legacy::SlicedEllpack< RealType, DeviceType, IndexType >;
+      template< typename _Device, typename _Index, typename _IndexAlocator >
+      using SegmentsType = Containers::Segments::SlicedEllpack< _Device, _Index, _IndexAlocator >;
+      using MatrixType = TNL::Matrices::SparseMatrix< Real,
+                                                      Device,
+                                                      Index,
+                                                      TNL::Matrices::GeneralMatrix,
+                                                      SegmentsType
+                                                    >;
       using CommunicatorType = Communicator;
       using CommonDataType = CommonData;
       using CommonDataPointer = Pointers::SharedPointer< CommonDataType, DeviceType >;
-- 
GitLab


From c749c8a94ab1d7bf0df276cb33b13c52817c378a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 6 May 2020 11:14:56 +0200
Subject: [PATCH 28/68] Added operator<< and setValue for SparseMatrixRowView

---
 src/TNL/Matrices/SparseMatrixRowView.h   | 19 ++++++++++++--
 src/TNL/Matrices/SparseMatrixRowView.hpp | 32 ++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/src/TNL/Matrices/SparseMatrixRowView.h b/src/TNL/Matrices/SparseMatrixRowView.h
index 0b89e685c..eda4852e9 100644
--- a/src/TNL/Matrices/SparseMatrixRowView.h
+++ b/src/TNL/Matrices/SparseMatrixRowView.h
@@ -10,8 +10,12 @@
 
 #pragma once
 
+#include <ostream>
+
+#include <TNL/Cuda/CudaCallable.h>
+
 namespace TNL {
-   namespace Matrices {
+namespace Matrices {
 
 template< typename SegmentView,
           typename ValuesView,
@@ -52,6 +56,10 @@ class SparseMatrixRowView
       __cuda_callable__
       RealType& getValue( const IndexType localIdx );
 
+      __cuda_callable__
+      void setValue( const IndexType localIdx,
+                     const RealType& value );
+
       __cuda_callable__
       void setElement( const IndexType localIdx,
                        const IndexType column,
@@ -64,7 +72,14 @@ class SparseMatrixRowView
 
       ColumnsIndexesViewType columnIndexes;
 };
-   } // namespace Matrices
+
+template< typename SegmentView,
+          typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+std::ostream& operator<<( std::ostream& str, const SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >& row );
+
+} // namespace Matrices
 } // namespace TNL
 
 #include <TNL/Matrices/SparseMatrixRowView.hpp>
diff --git a/src/TNL/Matrices/SparseMatrixRowView.hpp b/src/TNL/Matrices/SparseMatrixRowView.hpp
index 67d0845d4..c15b45f34 100644
--- a/src/TNL/Matrices/SparseMatrixRowView.hpp
+++ b/src/TNL/Matrices/SparseMatrixRowView.hpp
@@ -11,9 +11,10 @@
 #pragma once
 
 #include <TNL/Matrices/SparseMatrixRowView.h>
+#include <TNL/Assert.h>
 
 namespace TNL {
-   namespace Matrices {
+namespace Matrices {
 
 template< typename SegmentView,
           typename ValuesView,
@@ -89,6 +90,22 @@ getValue( const IndexType localIdx ) -> RealType&
    return values[ segmentView.getGlobalIndex( localIdx ) ];
 }
 
+template< typename SegmentView,
+          typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+__cuda_callable__ void
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+setValue( const IndexType localIdx,
+          const RealType& value )
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   if( ! isBinary() ) {
+      const IndexType globalIdx = segmentView.getGlobalIndex( localIdx );
+      values[ globalIdx ] = value;
+   }
+}
+
 template< typename SegmentView,
           typename ValuesView,
           typename ColumnsIndexesView,
@@ -106,6 +123,17 @@ setElement( const IndexType localIdx,
       values[ globalIdx ] = value;
 }
 
+template< typename SegmentView,
+          typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+std::ostream& operator<<( std::ostream& str, const SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >& row )
+{
+   using NonConstIndex = std::remove_const_t< typename SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::IndexType >;
+   for( NonConstIndex i = 0; i < row.getSize(); i++ )
+      str << " [ " << row.getColumnIndex( i ) << " ] = " << row.getValue( i ) << ", ";
+   return str;
+}
 
-   } // namespace Matrices
+} // namespace Matrices
 } // namespace TNL
-- 
GitLab


From 3b1f0cee177b318d24657d53552d70a98939d412 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 6 May 2020 11:15:24 +0200
Subject: [PATCH 29/68] Fixed missing return statements in BiEllpack

---
 src/TNL/Containers/Segments/details/BiEllpack.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/TNL/Containers/Segments/details/BiEllpack.h b/src/TNL/Containers/Segments/details/BiEllpack.h
index a9d4eb97a..fe2701f07 100644
--- a/src/TNL/Containers/Segments/details/BiEllpack.h
+++ b/src/TNL/Containers/Segments/details/BiEllpack.h
@@ -59,6 +59,8 @@ class BiEllpack
                return numberOfGroups - i;
             bisection *= 2;
          }
+         TNL_ASSERT_TRUE( false, "segmentIdx was not found" );
+         return -1; // to avoid compiler warning
       }
 
       static IndexType getActiveGroupsCount( const ConstOffsetsHolderView& rowPermArray, const IndexType segmentIdx )
@@ -211,6 +213,7 @@ class BiEllpack
             groupHeight /= 2;
          }
          TNL_ASSERT_TRUE( false, "Segment capacity exceeded, wrong localIdx." );
+         return -1; // to avoid compiler warning
       }
 
       static __cuda_callable__
-- 
GitLab


From 485c5bc747f4d275bc891cc638678724b046a59c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 6 May 2020 12:33:54 +0200
Subject: [PATCH 30/68] Switched LinearSolvers benchmark to segments-based
 matrix

---
 .../tnl-benchmark-linear-solvers.h            | 52 +++++++++++++++----
 1 file changed, 41 insertions(+), 11 deletions(-)

diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index 7e275244e..11fa2b91e 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -30,6 +30,7 @@
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Matrices/DistributedMatrix.h>
+#include <TNL/Matrices/SparseOperations.h>
 #include <TNL/Matrices/MatrixReader.h>
 #include <TNL/Solvers/Linear/Preconditioners/Diagonal.h>
 #include <TNL/Solvers/Linear/Preconditioners/ILU0.h>
@@ -54,7 +55,12 @@
    #define HAVE_CUSOLVER
 #endif
 
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+#include <TNL/Containers/Segments/CSR.h>
+#include <TNL/Containers/Segments/SlicedEllpack.h>
+
+template< typename _Device, typename _Index, typename _IndexAlocator >
+using SegmentsType = TNL::Containers::Segments::SlicedEllpack< _Device, _Index, _IndexAlocator >;
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
@@ -465,11 +471,15 @@ struct LinearSolversBenchmark
          dist_x0[ gi ] = x0[ gi ];
          dist_b[ gi ] = b[ gi ];
 
-         const IndexType rowLength = matrixPointer->getRowLength( i );
-         IndexType columns[ rowLength ];
-         RealType values[ rowLength ];
-         matrixPointer->getRowFast( gi, columns, values );
-         distMatrixPointer->setRowFast( gi, columns, values, rowLength );
+//         const IndexType rowLength = matrixPointer->getRowLength( i );
+//         IndexType columns[ rowLength ];
+//         RealType values[ rowLength ];
+//         matrixPointer->getRowFast( gi, columns, values );
+//         distMatrixPointer->setRowFast( gi, columns, values, rowLength );
+         const auto global_row = matrixPointer->getRow( gi );
+         auto local_row = distMatrixPointer->getRow( gi );
+         for( IndexType j = 0; j < global_row.getSize(); j++ )
+            local_row.setElement( j, global_row.getColumnIndex( j ), global_row.getValue( j ) );
       }
 
       std::cout << "Iterative solvers:" << std::endl;
@@ -488,7 +498,12 @@ struct LinearSolversBenchmark
    {
       // direct solvers
       if( parameters.getParameter< bool >( "with-direct" ) ) {
-         using CSR = Matrices::Legacy::CSR< RealType, DeviceType, IndexType >;
+         using CSR = TNL::Matrices::SparseMatrix< RealType,
+                                                  DeviceType,
+                                                  IndexType,
+                                                  TNL::Matrices::GeneralMatrix,
+                                                  Containers::Segments::CSR
+                                                >;
          SharedPointer< CSR > matrixCopy;
          Matrices::copySparseMatrix( *matrixCopy, *matrixPointer );
 
@@ -511,11 +526,21 @@ struct LinearSolversBenchmark
 #ifdef HAVE_CUSOLVER
       std::cout << "CuSOLVER:" << std::endl;
       {
-         using CSR = Matrices::CSR< RealType, DeviceType, IndexType >;
+         using CSR = TNL::Matrices::SparseMatrix< RealType,
+                                                  DeviceType,
+                                                  IndexType,
+                                                  TNL::Matrices::GeneralMatrix,
+                                                  Containers::Segments::CSR
+                                                >;
          SharedPointer< CSR > matrixCopy;
          Matrices::copySparseMatrix( *matrixCopy, *matrixPointer );
 
-         using CudaCSR = Matrices::CSR< RealType, Devices::Cuda, IndexType >;
+         using CudaCSR = TNL::Matrices::SparseMatrix< RealType,
+                                                      Devices::Cuda,
+                                                      IndexType,
+                                                      TNL::Matrices::GeneralMatrix,
+                                                      Containers::Segments::CSR
+                                                    >;
          using CudaVector = typename VectorType::template Self< RealType, Devices::Cuda >;
          SharedPointer< CudaCSR > cuda_matrixCopy;
          *cuda_matrixCopy = *matrixCopy;
@@ -567,7 +592,7 @@ configSetup( Config::ConfigDescription& config )
 
    config.addDelimiter( "Linear solver settings:" );
    Solvers::IterativeSolver< double, int >::configSetup( config );
-   using Matrix = Matrices::Legacy::SlicedEllpack< double, Devices::Host, int >;
+   using Matrix = Matrices::SparseMatrix< double >;
    using GMRES = Solvers::Linear::GMRES< Matrix >;
    GMRES::configSetup( config );
    using BiCGstabL = Solvers::Linear::BICGStabL< Matrix >;
@@ -621,7 +646,12 @@ main( int argc, char* argv[] )
 //   return ! Matrices::resolveMatrixType< MainConfig,
 //                                         Devices::Host,
 //                                         LinearSolversBenchmark >( benchmark, metadata, parameters );
-   using MatrixType = Matrices::Legacy::SlicedEllpack< double, Devices::Host, int >;
+   using MatrixType = TNL::Matrices::SparseMatrix< double,
+                                                   Devices::Host,
+                                                   int,
+                                                   TNL::Matrices::GeneralMatrix,
+                                                   SegmentsType
+                                                 >;
    const bool status = LinearSolversBenchmark< MatrixType >::run( benchmark, metadata, parameters );
 
    if( rank == 0 )
-- 
GitLab


From 0d3b879801d71d9d7973466240475a025cc6d807 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 6 May 2020 12:34:26 +0200
Subject: [PATCH 31/68] Switched DistSpMV benchmark to segments-based matrix

---
 .../DistSpMV/tnl-benchmark-distributed-spmv.h | 28 ++++++++++++++-----
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index 53dfb07f4..61169d079 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -25,11 +25,16 @@
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Containers/DistributedVector.h>
 #include <TNL/Matrices/DistributedMatrix.h>
+#include <TNL/Matrices/SparseOperations.h>
 
 #include "../Benchmarks.h"
 #include "ordering.h"
 
-#include <TNL/Matrices/Legacy/SlicedEllpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+#include <TNL/Containers/Segments/SlicedEllpack.h>
+
+template< typename _Device, typename _Index, typename _IndexAlocator >
+using SegmentsType = TNL::Containers::Segments::SlicedEllpack< _Device, _Index, _IndexAlocator >;
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
@@ -243,11 +248,15 @@ struct SpmvBenchmark
          const auto gi = distributedMatrix.getLocalRowRange().getGlobalIndex( i );
          distributedVector[ gi ] = vector[ gi ];
 
-         const IndexType rowLength = matrix.getRowLength( i );
-         IndexType columns[ rowLength ];
-         RealType values[ rowLength ];
-         matrix.getRowFast( gi, columns, values );
-         distributedMatrix.setRowFast( gi, columns, values, rowLength );
+//         const IndexType rowLength = matrix.getRowLength( i );
+//         IndexType columns[ rowLength ];
+//         RealType values[ rowLength ];
+//         matrix.getRowFast( gi, columns, values );
+//         distributedMatrix.setRowFast( gi, columns, values, rowLength );
+         const auto global_row = matrix.getRow( gi );
+         auto local_row = distributedMatrix.getRow( gi );
+         for( IndexType j = 0; j < global_row.getSize(); j++ )
+            local_row.setElement( j, global_row.getColumnIndex( j ), global_row.getValue( j ) );
       }
 
       benchmarkDistributedSpmv( benchmark, distributedMatrix, distributedVector );
@@ -339,7 +348,12 @@ main( int argc, char* argv[] )
 //   return ! Matrices::resolveMatrixType< MainConfig,
 //                                         Devices::Host,
 //                                         SpmvBenchmark >( benchmark, metadata, parameters );
-   using MatrixType = Matrices::Legacy::SlicedEllpack< double, Devices::Host, int >;
+   using MatrixType = TNL::Matrices::SparseMatrix< double,
+                                                   Devices::Host,
+                                                   int,
+                                                   TNL::Matrices::GeneralMatrix,
+                                                   SegmentsType
+                                                 >;
    const bool status = SpmvBenchmark< MatrixType >::run( benchmark, metadata, parameters );
 
    if( rank == 0 )
-- 
GitLab


From c6d1173da50b00e23d1eeaaf305ffc22e847816c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 6 May 2020 12:35:54 +0200
Subject: [PATCH 32/68] Ported SparseOperations to segments

---
 src/Python/pytnl/tnl/SparseMatrix.cpp    |  5 +--
 src/TNL/Matrices/SparseOperations_impl.h | 45 ++++++++++--------------
 2 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/src/Python/pytnl/tnl/SparseMatrix.cpp b/src/Python/pytnl/tnl/SparseMatrix.cpp
index 430715597..f4b1772a7 100644
--- a/src/Python/pytnl/tnl/SparseMatrix.cpp
+++ b/src/Python/pytnl/tnl/SparseMatrix.cpp
@@ -21,10 +21,11 @@ void export_SparseMatrices( py::module & m )
     export_Matrix< E_host   >( m, "Ellpack" );
     export_Matrix< SE_host  >( m, "SlicedEllpack" );
 
+    // TODO: copySparseMatrix does not work with Legacy matrices anymore
     //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, E_host >);
     //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, CSR_host >);
     //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< CSR_host, SE_host >);
     //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, CSR_host >);
-    m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, SE_host >);
-    m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, E_host >);
+    //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< E_host, SE_host >);
+    //m.def("copySparseMatrix", &TNL::Matrices::copySparseMatrix< SE_host, E_host >);
 }
diff --git a/src/TNL/Matrices/SparseOperations_impl.h b/src/TNL/Matrices/SparseOperations_impl.h
index ff507c326..97f86c4ee 100644
--- a/src/TNL/Matrices/SparseOperations_impl.h
+++ b/src/TNL/Matrices/SparseOperations_impl.h
@@ -36,11 +36,10 @@ SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
    const IndexType gridSize = blockDim.x * gridDim.x;
 
    while( rowIdx < rows ) {
-      const auto max_length = matrix->getRowLengthFast( rowIdx );
       const auto row = matrix->getRow( rowIdx );
       IndexType length = 0;
-      for( IndexType c_j = 0; c_j < max_length; c_j++ )
-         if( row.getElementColumn( c_j ) < cols )
+      for( IndexType c_j = 0; c_j < row.getSize(); c_j++ )
+         if( row.getColumnIndex( c_j ) < cols )
             length++;
          else
             break;
@@ -66,7 +65,7 @@ SparseMatrixCopyKernel( Matrix1* A,
       const auto rowB = B->getRow( rowIdx );
       auto rowA = A->getRow( rowIdx );
       for( IndexType c = 0; c < length; c++ )
-         rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
+         rowA.setElement( c, rowB.getColumnIndex( c ), rowB.getValue( c ) );
       rowIdx += gridSize;
    }
 }
@@ -102,11 +101,10 @@ copySparseMatrix_impl( Matrix1& A, const Matrix2& B )
 #pragma omp parallel for if( Devices::Host::isOMPEnabled() )
 #endif
       for( IndexType i = 0; i < rows; i++ ) {
-         const auto max_length = B.getRowLength( i );
          const auto row = B.getRow( i );
          IndexType length = 0;
-         for( IndexType c_j = 0; c_j < max_length; c_j++ )
-            if( row.getElementColumn( c_j ) < cols )
+         for( IndexType c_j = 0; c_j < row.getSize(); c_j++ )
+            if( row.getColumnIndex( c_j ) < cols )
                length++;
             else
                break;
@@ -122,7 +120,7 @@ copySparseMatrix_impl( Matrix1& A, const Matrix2& B )
          const auto rowB = B.getRow( i );
          auto rowA = A.getRow( i );
          for( IndexType c = 0; c < length; c++ )
-            rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
+            rowA.setElement( c, rowB.getColumnIndex( c ), rowB.getValue( c ) );
       }
    }
 
@@ -228,11 +226,10 @@ copyAdjacencyStructure( const Matrix& A, AdjacencyMatrix& B,
    rowLengths.setSize( N );
    rowLengths.setValue( 0 );
    for( IndexType i = 0; i < A.getRows(); i++ ) {
-      const int maxLength = A.getRowLength( i );
       const auto row = A.getRow( i );
       IndexType length = 0;
-      for( int c_j = 0; c_j < maxLength; c_j++ ) {
-         const IndexType j = row.getElementColumn( c_j );
+      for( int c_j = 0; c_j < row.getSize(); c_j++ ) {
+         const IndexType j = row.getColumnIndex( c_j );
          if( j >= A.getColumns() )
             break;
          length++;
@@ -248,10 +245,9 @@ copyAdjacencyStructure( const Matrix& A, AdjacencyMatrix& B,
 
    // set non-zeros
    for( IndexType i = 0; i < A.getRows(); i++ ) {
-      const int maxLength = A.getRowLength( i );
       const auto row = A.getRow( i );
-      for( int c_j = 0; c_j < maxLength; c_j++ ) {
-         const IndexType j = row.getElementColumn( c_j );
+      for( int c_j = 0; c_j < row.getSize(); c_j++ ) {
+         const IndexType j = row.getColumnIndex( c_j );
          if( j >= A.getColumns() )
             break;
          if( ! ignore_diagonal || i != j )
@@ -282,11 +278,10 @@ reorderSparseMatrix( const Matrix1& matrix1, Matrix2& matrix2, const Permutation
    typename Matrix2::CompressedRowLengthsVector rowLengths;
    rowLengths.setSize( matrix1.getRows() );
    for( IndexType i = 0; i < matrix1.getRows(); i++ ) {
-      const IndexType maxLength = matrix1.getRowLength( perm[ i ] );
       const auto row = matrix1.getRow( perm[ i ] );
       IndexType length = 0;
-      for( IndexType j = 0; j < maxLength; j++ )
-         if( row.getElementColumn( j ) < matrix1.getColumns() )
+      for( IndexType j = 0; j < row.getSize(); j++ )
+         if( row.getColumnIndex( j ) < matrix1.getColumns() )
             length++;
       rowLengths[ i ] = length;
    }
@@ -303,8 +298,8 @@ reorderSparseMatrix( const Matrix1& matrix1, Matrix2& matrix2, const Permutation
       typename Matrix2::IndexType columns[ rowLength ];
       typename Matrix2::RealType values[ rowLength ];
       for( IndexType j = 0; j < rowLength; j++ ) {
-         columns[ j ] = iperm[ row1.getElementColumn( j ) ];
-         values[ j ] = row1.getElementValue( j );
+         columns[ j ] = iperm[ row1.getColumnIndex( j ) ];
+         values[ j ] = row1.getValue( j );
       }
 
       // sort
@@ -319,14 +314,10 @@ reorderSparseMatrix( const Matrix1& matrix1, Matrix2& matrix2, const Permutation
       };
       std::sort( indices, indices + rowLength, comparator );
 
-      typename Matrix2::IndexType sortedColumns[ rowLength ];
-      typename Matrix2::RealType sortedValues[ rowLength ];
-      for( IndexType j = 0; j < rowLength; j++ ) {
-         sortedColumns[ j ] = columns[ indices[ j ] ];
-         sortedValues[ j ] = values[ indices[ j ] ];
-      }
-
-      matrix2.setRow( i, sortedColumns, sortedValues, rowLength );
+      // set the row
+      auto row2 = matrix2.getRow( i );
+      for( IndexType j = 0; j < rowLength; j++ )
+         row2.setElement( j, columns[ indices[ j ] ], values[ indices[ j ] ] );
    }
 }
 
-- 
GitLab


From 8c17519f248e3db24d89044cad66ba406d367a9e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 6 May 2020 12:36:21 +0200
Subject: [PATCH 33/68] Started porting DistributedMatrix to segments

---
 src/TNL/Matrices/DistributedMatrix.h          | 23 ++++----
 src/TNL/Matrices/DistributedMatrix_impl.h     | 52 +++++++++----------
 src/TNL/Matrices/DistributedSpMV.h            |  4 +-
 .../Matrices/DistributedMatrixTest.h          | 10 ++--
 4 files changed, 44 insertions(+), 45 deletions(-)

diff --git a/src/TNL/Matrices/DistributedMatrix.h b/src/TNL/Matrices/DistributedMatrix.h
index dde7051c0..d0769317f 100644
--- a/src/TNL/Matrices/DistributedMatrix.h
+++ b/src/TNL/Matrices/DistributedMatrix.h
@@ -14,7 +14,6 @@
 
 #include <type_traits>
 
-#include <TNL/Matrices/Legacy/SparseRow.h>
 #include <TNL/Communicators/MpiCommunicator.h>
 #include <TNL/Containers/Subrange.h>
 #include <TNL/Containers/DistributedVector.h>
@@ -56,8 +55,8 @@ public:
 
    using CompressedRowLengthsVector = Containers::DistributedVector< IndexType, DeviceType, IndexType, CommunicatorType >;
 
-   using MatrixRow = Matrices::Legacy::SparseRow< RealType, IndexType >;
-   using ConstMatrixRow = Matrices::Legacy::SparseRow< std::add_const_t< RealType >, std::add_const_t< IndexType > >;
+   using MatrixRow = typename Matrix::RowView;
+   using ConstMatrixRow = typename Matrix::ConstRowView;
 
    template< typename _Real = RealType,
              typename _Device = DeviceType,
@@ -125,16 +124,16 @@ public:
    RealType getElementFast( IndexType row,
                             IndexType column ) const;
 
-   __cuda_callable__
-   bool setRowFast( IndexType row,
-                    const IndexType* columnIndexes,
-                    const RealType* values,
-                    IndexType elements );
+//   __cuda_callable__
+//   bool setRowFast( IndexType row,
+//                    const IndexType* columnIndexes,
+//                    const RealType* values,
+//                    IndexType elements );
 
-   __cuda_callable__
-   void getRowFast( IndexType row,
-                    IndexType* columns,
-                    RealType* values ) const;
+//   __cuda_callable__
+//   void getRowFast( IndexType row,
+//                    IndexType* columns,
+//                    RealType* values ) const;
 
    __cuda_callable__
    MatrixRow getRow( IndexType row );
diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h
index c1a13a713..b05d236b7 100644
--- a/src/TNL/Matrices/DistributedMatrix_impl.h
+++ b/src/TNL/Matrices/DistributedMatrix_impl.h
@@ -234,32 +234,32 @@ getElementFast( IndexType row,
    return localMatrix.getElementFast( localRow, column );
 }
 
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
-bool
-DistributedMatrix< Matrix, Communicator >::
-setRowFast( IndexType row,
-            const IndexType* columnIndexes,
-            const RealType* values,
-            IndexType elements )
-{
-   const IndexType localRow = localRowRange.getLocalIndex( row );
-   return localMatrix.setRowFast( localRow, columnIndexes, values, elements );
-}
-
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
-void
-DistributedMatrix< Matrix, Communicator >::
-getRowFast( IndexType row,
-            IndexType* columns,
-            RealType* values ) const
-{
-   const IndexType localRow = localRowRange.getLocalIndex( row );
-   return localMatrix.getRowFast( localRow, columns, values );
-}
+//template< typename Matrix,
+//          typename Communicator >
+//__cuda_callable__
+//bool
+//DistributedMatrix< Matrix, Communicator >::
+//setRowFast( IndexType row,
+//            const IndexType* columnIndexes,
+//            const RealType* values,
+//            IndexType elements )
+//{
+//   const IndexType localRow = localRowRange.getLocalIndex( row );
+//   return localMatrix.setRowFast( localRow, columnIndexes, values, elements );
+//}
+
+//template< typename Matrix,
+//          typename Communicator >
+//__cuda_callable__
+//void
+//DistributedMatrix< Matrix, Communicator >::
+//getRowFast( IndexType row,
+//            IndexType* columns,
+//            RealType* values ) const
+//{
+//   const IndexType localRow = localRowRange.getLocalIndex( row );
+//   return localMatrix.getRowFast( localRow, columns, values );
+//}
 
 template< typename Matrix,
           typename Communicator >
diff --git a/src/TNL/Matrices/DistributedSpMV.h b/src/TNL/Matrices/DistributedSpMV.h
index 083d7a606..7c4ee3c40 100644
--- a/src/TNL/Matrices/DistributedSpMV.h
+++ b/src/TNL/Matrices/DistributedSpMV.h
@@ -85,8 +85,8 @@ public:
          const auto row = localMatrix->getRow( i );
          bool comm_left = false;
          bool comm_right = false;
-         for( IndexType c = 0; c < row.getLength(); c++ ) {
-            const IndexType j = row.getElementColumn( c );
+         for( IndexType c = 0; c < row.getSize(); c++ ) {
+            const IndexType j = row.getColumnIndex( c );
             if( j < columns ) {
                const int owner = Partitioner::getOwner( j, columns, nproc );
                // atomic assignment
diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h
index ca3f2a100..d432c5da7 100644
--- a/src/UnitTests/Matrices/DistributedMatrixTest.h
+++ b/src/UnitTests/Matrices/DistributedMatrixTest.h
@@ -13,7 +13,7 @@
 #include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Matrices/DistributedMatrix.h>
 #include <TNL/Containers/Partitioner.h>
-#include <TNL/Matrices/Legacy/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
 
 using namespace TNL;
 
@@ -103,12 +103,12 @@ protected:
 
 // types for which DistributedMatrixTest is instantiated
 using DistributedMatrixTypes = ::testing::Types<
-   Matrices::DistributedMatrix< Matrices::Legacy::CSR< double, Devices::Host, int >, Communicators::MpiCommunicator >,
-   Matrices::DistributedMatrix< Matrices::Legacy::CSR< double, Devices::Host, int >, Communicators::NoDistrCommunicator >
+   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::MpiCommunicator >,
+   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::NoDistrCommunicator >
 #ifdef HAVE_CUDA
    ,
-   Matrices::DistributedMatrix< Matrices::Legacy::CSR< double, Devices::Cuda, int >, Communicators::MpiCommunicator >,
-   Matrices::DistributedMatrix< Matrices::Legacy::CSR< double, Devices::Cuda, int >, Communicators::NoDistrCommunicator >
+   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::MpiCommunicator >,
+   Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::NoDistrCommunicator >
 #endif
 >;
 
-- 
GitLab


From 3045be6d201367cbb41e4697f1a427c24cc22c42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 6 May 2020 12:58:10 +0200
Subject: [PATCH 34/68] Rewritten preconditioners for segment-based matrices

---
 .../Linear/Preconditioners/Diagonal_impl.h    |  5 +-
 src/TNL/Solvers/Linear/Preconditioners/ILU0.h |  4 +-
 .../Linear/Preconditioners/ILU0_impl.h        | 48 ++++++++++---------
 src/TNL/Solvers/Linear/Preconditioners/ILUT.h |  4 +-
 .../Linear/Preconditioners/ILUT_impl.h        | 18 ++++---
 .../Linear/Preconditioners/TriangularSolve.h  | 10 ++--
 6 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
index ae404321d..104768ef7 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h
@@ -32,7 +32,8 @@ update( const MatrixPointer& matrixPointer )
    diagonal.setSize( matrixPointer->getRows() );
 
    VectorViewType diag_view( diagonal );
-   const auto kernel_matrix = matrixPointer->getView(); //.template getData< DeviceType >();
+
+   const auto kernel_matrix = matrixPointer->getView();
 
    // TODO: Rewrite this with SparseMatrix::forAllRows
    auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
@@ -75,7 +76,7 @@ update( const MatrixPointer& matrixPointer )
    auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
    {
       const IndexType gi = kernel_matrix->getLocalRowRange().getGlobalIndex( i );
-      diag_view[ i ] = kernel_matrix->getLocalMatrix().getElementFast( i, gi );
+      diag_view[ i ] = kernel_matrix->getLocalMatrix().getElement( i, gi );
    };
 
    Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
index 8e9b49cd0..cc55c153c 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
@@ -15,7 +15,7 @@
 #include "Preconditioner.h"
 
 #include <TNL/Containers/Vector.h>
-#include <TNL/Matrices/Legacy/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Pointers/UniquePointer.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 
@@ -76,7 +76,7 @@ public:
 
 protected:
    // The factors L and U are stored separately and the rows of U are reversed.
-   Matrices::Legacy::CSR< RealType, DeviceType, IndexType > L, U;
+   Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Containers::Segments::CSR > L, U;
 
    // Specialized methods to distinguish between normal and distributed matrices
    // in the implementation.
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
index 5ae255304..1f4aab02d 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
@@ -43,11 +43,10 @@ update( const MatrixPointer& matrixPointer )
    typename decltype(U)::CompressedRowLengthsVector U_rowLengths( N );
    for( IndexType i = 0; i < N; i++ ) {
       const auto row = localMatrix.getRow( i );
-      const auto max_length = row.getLength();
       IndexType L_entries = 0;
       IndexType U_entries = 0;
-      for( IndexType j = 0; j < max_length; j++ ) {
-         const auto column = row.getElementColumn( j );
+      for( IndexType j = 0; j < row.getSize(); j++ ) {
+         const auto column = row.getColumnIndex( j );
          if( column < minColumn )
             continue;
          if( column < i + minColumn )
@@ -70,7 +69,11 @@ update( const MatrixPointer& matrixPointer )
       const auto max_length = localMatrix.getRowLength( i );
       IndexType all_columns[ max_length ];
       RealType all_values[ max_length ];
-      localMatrix.getRowFast( i, all_columns, all_values );
+      const auto row = localMatrix.getRow( i );
+      for( IndexType j = 0; j < row.getSize(); j++ ) {
+         all_columns[ j ] = row.getColumnIndex( j );
+         all_values[ j ] = row.getValue( j );
+      }
 
       // skip non-local elements
       IndexType* columns = all_columns;
@@ -92,27 +95,28 @@ update( const MatrixPointer& matrixPointer )
 
       // this condition is to avoid segfaults on empty L.getRow( i )
       if( L_entries > 0 ) {
-         const auto L_i = L.getRow( i );
-         const auto U_i = U.getRow( N - 1 - i );
+         auto L_i = L.getRow( i );
+         auto U_i = U.getRow( N - 1 - i );
 
          // loop for k = 0, ..., i - 1; but only over the non-zero entries
          for( IndexType c_k = 0; c_k < L_entries; c_k++ ) {
-            const auto k = L_i.getElementColumn( c_k );
+            const auto k = L_i.getColumnIndex( c_k );
+            const auto U_k = U.getRow( N - 1 - k );
 
-            auto L_ik = L.getElementFast( i, k ) / U.getElementFast( N - 1 - k, k );
-            L.setElement( i, k, L_ik );
+            auto L_ik = L_i.getValue( c_k ) / U_k.getValue( c_k );
+            L_i.setElement( c_k, k, L_ik );
 
             // loop for j = k+1, ..., N-1; but only over the non-zero entries
             // and split into two loops over L and U separately
             for( IndexType c_j = c_k + 1; c_j < L_entries; c_j++ ) {
-               const auto j = L_i.getElementColumn( c_j );
-               const auto L_ij = L.getElementFast( i, j ) - L_ik * U.getElementFast( N - 1 - k, j );
-               L.setElement( i, j, L_ij );
+               const auto L_ij = L_i.getValue( c_j ) - L_ik * U_k.getValue( c_j );
+               const auto j = L_i.getColumnIndex( c_j );
+               L_i.setElement( c_j, j, L_ij );
             }
             for( IndexType c_j = 0; c_j < U_entries; c_j++ ) {
-               const auto j = U_i.getElementColumn( c_j );
-               const auto U_ij = U.getElementFast( N - 1 - i, j ) - L_ik * U.getElementFast( N - 1 - k, j );
-               U.setElement( N - 1 - i, j, U_ij );
+               const auto U_ij = U_i.getValue( c_j ) - L_ik * U_k.getValue( c_j );
+               const auto j = U_i.getColumnIndex( c_j );
+               U_i.setElement( c_j, j, U_ij );
             }
          }
       }
@@ -293,11 +297,10 @@ allocate_LU()
    auto kernel_copy_row_lengths = [=] __cuda_callable__ ( IndexType i ) mutable
    {
       const auto row = kernel_A->getRow( i );
-      const int max_length = row.getLength();
       int L_entries = 0;
       int U_entries = 0;
-      for( int c_j = 0; c_j < max_length; c_j++ ) {
-         const IndexType j = row.getElementColumn( c_j );
+      for( int c_j = 0; c_j < row.getSize(); c_j++ ) {
+         const IndexType j = row.getColumnIndex( c_j );
          if( j < i )
             L_entries++;
          else if( j < N )
@@ -338,13 +341,12 @@ copy_triangular_factors()
    auto kernel_copy_values = [=] __cuda_callable__ ( IndexType i ) mutable
    {
       const auto row = kernel_A->getRow( i );
-      const int max_length = row.getLength();
-      for( int c_j = 0; c_j < max_length; c_j++ ) {
-         const IndexType j = row.getElementColumn( c_j );
+      for( int c_j = 0; c_j < row.getSize(); c_j++ ) {
+         const IndexType j = row.getColumnIndex( c_j );
          if( j < i )
-            kernel_L->setElementFast( i, j, row.getElementValue( c_j ) );
+            kernel_L->setElementFast( i, j, row.getValue( c_j ) );
          else if( j < N )
-            kernel_U->setElementFast( i, j, row.getElementValue( c_j ) );
+            kernel_U->setElementFast( i, j, row.getValue( c_j ) );
          else
             break;
       }
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
index 99ac7fe52..6edf6e376 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h
@@ -15,7 +15,7 @@
 #include "Preconditioner.h"
 
 #include <TNL/Containers/Vector.h>
-#include <TNL/Matrices/Legacy/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
 
 namespace TNL {
 namespace Solvers {
@@ -66,7 +66,7 @@ protected:
    Real tau = 1e-4;
 
    // The factors L and U are stored separately and the rows of U are reversed.
-   Matrices::Legacy::CSR< RealType, DeviceType, IndexType > L, U;
+   Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Containers::Segments::CSR > L, U;
 
    // Specialized methods to distinguish between normal and distributed matrices
    // in the implementation.
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
index 858f037fe..674ae2e9d 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
@@ -61,11 +61,10 @@ update( const MatrixPointer& matrixPointer )
    typename decltype(U)::CompressedRowLengthsVector U_rowLengths( N );
    for( IndexType i = 0; i < N; i++ ) {
       const auto row = localMatrix.getRow( i );
-      const auto max_length = localMatrix.getRowLength( i );
       IndexType L_entries = 0;
       IndexType U_entries = 0;
-      for( IndexType j = 0; j < max_length; j++ ) {
-         const auto column = row.getElementColumn( j );
+      for( IndexType j = 0; j < row.getSize(); j++ ) {
+         const auto column = row.getColumnIndex( j );
          if( column < minColumn )
             continue;
          if( column < i + minColumn )
@@ -103,7 +102,6 @@ update( const MatrixPointer& matrixPointer )
    // Incomplete LU factorization with threshold
    // (see Saad - Iterative methods for sparse linear systems, section 10.4)
    for( IndexType i = 0; i < N; i++ ) {
-      const auto max_length = localMatrix.getRowLength( i );
       const auto A_i = localMatrix.getRow( i );
 
       RealType A_i_norm = 0.0;
@@ -113,8 +111,8 @@ update( const MatrixPointer& matrixPointer )
 
       // copy A_i into the full vector w
 //      timer_copy_into_w.start();
-      for( IndexType c_j = 0; c_j < max_length; c_j++ ) {
-         auto j = A_i.getElementColumn( c_j );
+      for( IndexType c_j = 0; c_j < A_i.getSize(); c_j++ ) {
+         auto j = A_i.getColumnIndex( c_j );
          if( minColumn > 0 ) {
             // skip non-local elements
             if( j < minColumn ) continue;
@@ -122,7 +120,7 @@ update( const MatrixPointer& matrixPointer )
          }
          // handle ellpack dummy entries
          if( j >= N ) break;
-         w[ j ] = A_i.getElementValue( c_j );
+         w[ j ] = A_i.getValue( c_j );
 
          // running computation of norm
          A_i_norm += w[ j ] * w[ j ];
@@ -141,7 +139,7 @@ update( const MatrixPointer& matrixPointer )
          if( k >= i )
             break;
 
-         RealType w_k = w[ k ] / localMatrix.getElementFast( k, k + minColumn );
+         RealType w_k = w[ k ] / localMatrix.getElement( k, k + minColumn );
 
          // apply dropping rule to w_k
          if( std::abs( w_k ) < tau_i )
@@ -154,11 +152,11 @@ update( const MatrixPointer& matrixPointer )
             const auto U_k = U.getRow( N - 1 - k );
             // loop for j = 0, ..., N-1; but only over the non-zero entries
             for( Index c_j = 0; c_j < U_rowLengths[ N - 1 - k ]; c_j++ ) {
-               const auto j = U_k.getElementColumn( c_j );
+               const auto j = U_k.getColumnIndex( c_j );
 
                // skip dropped entries
                if( j >= N ) break;
-               w[ j ] -= w_k * U_k.getElementValue( c_j );
+               w[ j ] -= w_k * U_k.getValue( c_j );
 
                // add non-zero to the w_k_set
                w_k_set.insert( j );
diff --git a/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h b/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h
index 71f51a6eb..4ce7e34c5 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h
@@ -51,11 +51,11 @@ void triangularSolveLower( const Matrix& L, Vector1& x, const Vector2& b )
 
          // loop for j = 0, ..., i - 1; but only over the non-zero entries
          for( IndexType c_j = 0; c_j < L_entries; c_j++ ) {
-            const auto j = L_i.getElementColumn( c_j );
+            const auto j = L_i.getColumnIndex( c_j );
             // skip padding zeros
             if( fullStorage == false && j >= N )
                break;
-            x_i -= L_i.getElementValue( c_j ) * x[ j ];
+            x_i -= L_i.getValue( c_j ) * x[ j ];
          }
       }
 
@@ -96,15 +96,15 @@ void triangularSolveUpper( const Matrix& U, Vector1& x, const Vector2& b )
       const auto U_entries = U.getRowLength( U_idx );
       const auto U_i = U.getRow( U_idx );
 
-      const auto U_ii = U_i.getElementValue( 0 );
+      const auto U_ii = U_i.getValue( 0 );
 
       // loop for j = i+1, ..., N-1; but only over the non-zero entries
       for( IndexType c_j = 1; c_j < U_entries ; c_j++ ) {
-         const auto j = U_i.getElementColumn( c_j );
+         const auto j = U_i.getColumnIndex( c_j );
          // skip padding zeros
          if( fullStorage == false && j >= N )
             break;
-         x_i -= U_i.getElementValue( c_j ) * x[ j ];
+         x_i -= U_i.getValue( c_j ) * x[ j ];
       }
 
       x[ i ] = x_i / U_ii;
-- 
GitLab


From 0e06bdbd5b810ff3def5d6bed6a07fc52553b4b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 6 May 2020 13:16:55 +0200
Subject: [PATCH 35/68] Fixed constructor of BiEllpackSegmentView

---
 src/TNL/Containers/Segments/BiEllpackSegmentView.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Containers/Segments/BiEllpackSegmentView.h b/src/TNL/Containers/Segments/BiEllpackSegmentView.h
index b716fe4c0..5f5e72049 100644
--- a/src/TNL/Containers/Segments/BiEllpackSegmentView.h
+++ b/src/TNL/Containers/Segments/BiEllpackSegmentView.h
@@ -46,7 +46,7 @@ class BiEllpackSegmentView
       BiEllpackSegmentView( const IndexType offset,
                             const IndexType inStripIdx,
                             const GroupsWidthType& groupsWidth )
-      : groupOffset( offset ), segmentSize( TNL::sum( groupsWidth ) ), inStripIdx( inStripIdx ), groupsWidth( groupsWidth ){};
+      : groupOffset( offset ), inStripIdx( inStripIdx ), segmentSize( TNL::sum( groupsWidth ) ), groupsWidth( groupsWidth ){};
 
       __cuda_callable__
       IndexType getSize() const
-- 
GitLab


From c3f5e63f96ccea472b0eacd0140df2f0ff2ca4be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 6 May 2020 13:25:08 +0200
Subject: [PATCH 36/68] Simplified ILU0

---
 src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
index 1f4aab02d..50f4be555 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
@@ -83,7 +83,7 @@ update( const MatrixPointer& matrixPointer )
          values++;
       }
 
-      // update column column indices
+      // update column indices
       if( minColumn > 0 )
          for( IndexType c_j = 0; c_j < max_length; c_j++ )
             all_columns[ c_j ] -= minColumn;
@@ -110,13 +110,11 @@ update( const MatrixPointer& matrixPointer )
             // and split into two loops over L and U separately
             for( IndexType c_j = c_k + 1; c_j < L_entries; c_j++ ) {
                const auto L_ij = L_i.getValue( c_j ) - L_ik * U_k.getValue( c_j );
-               const auto j = L_i.getColumnIndex( c_j );
-               L_i.setElement( c_j, j, L_ij );
+               L_i.setValue( c_j, L_ij );
             }
             for( IndexType c_j = 0; c_j < U_entries; c_j++ ) {
                const auto U_ij = U_i.getValue( c_j ) - L_ik * U_k.getValue( c_j );
-               const auto j = U_i.getColumnIndex( c_j );
-               U_i.setElement( c_j, j, U_ij );
+               U_i.setValue( c_j, U_ij );
             }
          }
       }
-- 
GitLab


From bb5ba33d6ba5e584b62d2e7795f183b85e535e4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 6 May 2020 20:22:36 +0200
Subject: [PATCH 37/68] Code reformatting in DenseMatrix and DenseMatrixView.

---
 src/TNL/Matrices/DenseMatrix.h       | 12 +++++++++++-
 src/TNL/Matrices/DenseMatrix.hpp     | 16 ++++++++++++----
 src/TNL/Matrices/DenseMatrixView.hpp | 24 ++++++++++++++++++------
 3 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
index 495277727..aea7a33d6 100644
--- a/src/TNL/Matrices/DenseMatrix.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -490,6 +490,11 @@ class DenseMatrix : public Matrix< Real, Device, Index >
        * 
        * \tparam Function is a type of lambda function that will operate on matrix elements.
        * \param function  is an instance of the lambda function to be called in each row.
+       * 
+       * \par Example
+       * \include Matrices/DenseMatrixExample_forAllRows.cpp
+       * \par Output
+       * \include DenseMatrixExample_forAllRows.out
        */
       template< typename Function >
       void forAllRows( Function& function ) const;
@@ -501,6 +506,11 @@ class DenseMatrix : public Matrix< Real, Device, Index >
        * 
        * \tparam Function is a type of lambda function that will operate on matrix elements.
        * \param function  is an instance of the lambda function to be called in each row.
+       * 
+       * \par Example
+       * \include Matrices/DenseMatrixExample_forAllRows.cpp
+       * \par Output
+       * \include DenseMatrixExample_forAllRows.out
        */
       template< typename Function >
       void forAllRows( Function& function );
@@ -513,7 +523,7 @@ class DenseMatrix : public Matrix< Real, Device, Index >
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
        * \param row is index of the row used for the scalar product.
        * \param vector is the input vector.
-       * \return 
+       * \return result of the matrix row and vector product.
        */
       template< typename Vector >
       __cuda_callable__
diff --git a/src/TNL/Matrices/DenseMatrix.hpp b/src/TNL/Matrices/DenseMatrix.hpp
index 6e64235f0..bd2ea6212 100644
--- a/src/TNL/Matrices/DenseMatrix.hpp
+++ b/src/TNL/Matrices/DenseMatrix.hpp
@@ -204,7 +204,9 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Index DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getElementsCount() const
+Index
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+getElementsCount() const
 {
    return this->getRows() * this->getColumns();
 }
@@ -214,7 +216,9 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-Index DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::getNonzeroElementsCount() const
+Index
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+getNonzeroElementsCount() const
 {
    return this->view.getNonzeroElementsCount();
 }
@@ -224,7 +228,9 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::reset()
+void
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+reset()
 {
    Matrix< Real, Device, Index >::reset();
 }
@@ -234,7 +240,9 @@ template< typename Real,
           typename Index,
           bool RowMajorOrder,
           typename RealAllocator >
-void DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::setValue( const Real& value )
+void
+DenseMatrix< Real, Device, Index, RowMajorOrder, RealAllocator >::
+setValue( const Real& value )
 {
    this->view.setValue( value );
 }
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index c50e34547..917fb596b 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -124,7 +124,9 @@ template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder >
-Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getRowLength( const IndexType row ) const
+Index
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+getRowLength( const IndexType row ) const
 {
    return this->getColumns();
 }
@@ -133,7 +135,9 @@ template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder >
-Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getMaxRowLength() const
+Index
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+getMaxRowLength() const
 {
    return this->getColumns();
 }
@@ -142,7 +146,9 @@ template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder >
-Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getElementsCount() const
+Index
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+getElementsCount() const
 {
    return this->getRows() * this->getColumns();
 }
@@ -151,7 +157,9 @@ template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder >
-Index DenseMatrixView< Real, Device, Index, RowMajorOrder >::getNonzeroElementsCount() const
+Index
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+getNonzeroElementsCount() const
 {
    const auto values_view = this->values.getConstView();
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
@@ -164,7 +172,9 @@ template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder >
-void DenseMatrixView< Real, Device, Index, RowMajorOrder >::reset()
+void
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+reset()
 {
    Matrix< Real, Device, Index >::reset();
 }
@@ -173,7 +183,9 @@ template< typename Real,
           typename Device,
           typename Index,
           bool RowMajorOrder >
-void DenseMatrixView< Real, Device, Index, RowMajorOrder >::setValue( const Real& value )
+void
+DenseMatrixView< Real, Device, Index, RowMajorOrder >::
+setValue( const Real& value )
 {
    this->values = value;
 }
-- 
GitLab


From 1fd30e62f941018b27dc5dcb7f9e703684751caa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 7 May 2020 06:46:06 +0200
Subject: [PATCH 38/68] Methods Array/ArrayView::get/setElement works even in
 CUDA kernels.

---
 src/TNL/Algorithms/MemoryOperationsCuda.hpp | 18 +++++++++--
 src/TNL/Containers/Array.h                  | 10 +++---
 src/TNL/Containers/Array.hpp                | 14 +++++++--
 src/TNL/Containers/ArrayView.h              | 10 +++---
 src/TNL/Containers/ArrayView.hpp            | 13 ++++++--
 src/UnitTests/Containers/ArrayTest.h        | 34 +++++++++++++++++++++
 src/UnitTests/Containers/ArrayViewTest.h    | 34 +++++++++++++++++++++
 7 files changed, 118 insertions(+), 15 deletions(-)

diff --git a/src/TNL/Algorithms/MemoryOperationsCuda.hpp b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
index ea4b92b61..49522a42b 100644
--- a/src/TNL/Algorithms/MemoryOperationsCuda.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
@@ -30,7 +30,14 @@ setElement( Element* data,
             const Element& value )
 {
    TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
-   MemoryOperations< Devices::Cuda >::set( data, value, 1 );
+#ifdef HAVE_CUDA
+   cudaMemcpy( ( void* ) data, ( void* ) &value, sizeof( Element ), cudaMemcpyHostToDevice );
+#endif
+   // TODO: For some reason the following does not work after adding
+   // #ifdef __CUDA_ARCH__ to Array::setElement and ArrayView::setElement.
+   // Probably it might be a problem with lambda function 'kernel' which
+   // nvcc probably does not handle properly.
+   //MemoryOperations< Devices::Cuda >::set( data, value, 1 );
 }
 
 template< typename Element >
@@ -40,7 +47,14 @@ getElement( const Element* data )
 {
    TNL_ASSERT_TRUE( data, "Attempted to get data through a nullptr." );
    Element result;
-   MultiDeviceMemoryOperations< void, Devices::Cuda >::template copy< Element, Element, int >( &result, data, 1 );
+#ifdef HAVE_CUDA
+   cudaMemcpy( ( void* ) &result, ( void* ) data, sizeof( Element ), cudaMemcpyDeviceToHost );
+#endif
+   // TODO: For some reason the following does not work after adding
+   // #ifdef __CUDA_ARCH__ to Array::getElement and ArrayView::getElement 
+   // Probably it might be a problem with lambda function 'kernel' which
+   // nvcc probably does not handle properly.
+   //MultiDeviceMemoryOperations< void, Devices::Cuda >::template copy< Element, Element, int >( &result, data, 1 );
    return result;
 }
 
diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index 25f8048d8..0888cdf9b 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -446,22 +446,24 @@ class Array
       /**
        * \brief Sets the value of the \e i-th element to \e v.
        *
-       * This method can be called only from the host, but even for arrays
-       * allocated in a different memory space (e.g. GPU global memory).
+       * This method can be called from both the host system and the device
+       * where the array is allocated.
        *
        * \param i The index of the element to be set.
        * \param v The new value of the element.
        */
+      __cuda_callable__
       void setElement( const Index& i, const Value& v );
 
       /**
        * \brief Returns the value of the \e i-th element.
        *
-       * This method can be called only from the host, but even for arrays
-       * allocated in a different memory space (e.g. GPU global memory).
+       * This method can be called from both the host system and the device
+       * where the array is allocated.
        *
        * \param i The index of the element to be returned.
        */
+      __cuda_callable__
       Value getElement( const Index& i ) const;
 
       /**
diff --git a/src/TNL/Containers/Array.hpp b/src/TNL/Containers/Array.hpp
index f45b7370d..77903e60a 100644
--- a/src/TNL/Containers/Array.hpp
+++ b/src/TNL/Containers/Array.hpp
@@ -504,26 +504,34 @@ template< typename Value,
           typename Device,
           typename Index,
           typename Allocator >
-void
+__cuda_callable__ void
 Array< Value, Device, Index, Allocator >::
 setElement( const Index& i, const Value& x )
 {
    TNL_ASSERT_GE( i, (Index) 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
-   return Algorithms::MemoryOperations< Device >::setElement( &( this->data[ i ] ), x );
+#ifdef __CUDA_ARCH__
+   this->data[ i ] = x;
+#else
+   Algorithms::MemoryOperations< Device >::setElement( &( this->data[ i ] ), x );
+#endif
 }
 
 template< typename Value,
           typename Device,
           typename Index,
           typename Allocator >
-Value
+__cuda_callable__ Value
 Array< Value, Device, Index, Allocator >::
 getElement( const Index& i ) const
 {
    TNL_ASSERT_GE( i, (Index) 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
+#ifdef __CUDA_ARCH__
+   return this->data[ i ];
+#else
    return Algorithms::MemoryOperations< Device >::getElement( & ( this->data[ i ] ) );
+#endif
 }
 
 template< typename Value,
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index 5b9766ffd..d1d1c1177 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -313,22 +313,24 @@ public:
    /**
     * \brief Sets the value of the \e i-th element to \e v.
     *
-    * This method can be called only from the host, but even for array views
-    * allocated in a different memory space (e.g. GPU global memory).
+    * This method can be called from both the host system and the device
+    * where the array is allocated.
     *
     * \param i The index of the element to be set.
     * \param v The new value of the element.
     */
+   __cuda_callable__
    void setElement( Index i, Value value );
 
    /**
     * \brief Returns the value of the \e i-th element.
     *
-    * This method can be called only from the host, but even for array views
-    * allocated in a different memory space (e.g. GPU global memory).
+    * This method can be called from both the host system and the device
+    * where the array is allocated.
     *
     * \param i The index of the element to be returned.
     */
+   __cuda_callable__
    Value getElement( Index i ) const;
 
    /**
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index e36182cd5..d545c6048 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -211,25 +211,34 @@ getSize() const
 template< typename Value,
           typename Device,
           typename Index >
+__cuda_callable__
 void
 ArrayView< Value, Device, Index >::
 setElement( Index i, Value value )
 {
    TNL_ASSERT_GE( i, 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
-   return Algorithms::MemoryOperations< Device >::setElement( &data[ i ], value );
+#ifdef __CUDA_ARCH__
+   data[ i ] = value;
+#else
+   Algorithms::MemoryOperations< Device >::setElement( &this->data[ i ], value );
+#endif
 }
 
 template< typename Value,
           typename Device,
           typename Index >
-Value
+__cuda_callable__ Value
 ArrayView< Value, Device, Index >::
 getElement( Index i ) const
 {
    TNL_ASSERT_GE( i, 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
+#ifdef __CUDA_ARCH__
+   return data[ i ];
+#else
    return Algorithms::MemoryOperations< Device >::getElement( &data[ i ] );
+#endif
 }
 
 template< typename Value,
diff --git a/src/UnitTests/Containers/ArrayTest.h b/src/UnitTests/Containers/ArrayTest.h
index 4f6fd7c92..c7d9a3740 100644
--- a/src/UnitTests/Containers/ArrayTest.h
+++ b/src/UnitTests/Containers/ArrayTest.h
@@ -16,6 +16,9 @@
 #include <TNL/Containers/Array.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Pointers/DevicePointer.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/Pointers/SmartPointersRegister.h>
+#include <TNL/Algorithms/ParallelFor.h>
 
 #include "gtest/gtest.h"
 
@@ -340,6 +343,37 @@ TYPED_TEST( ArrayTest, elementwiseAccess )
    testArrayElementwiseAccess( ArrayType() );
 }
 
+template< typename ArrayType >
+void test_setElement()
+{
+   Pointers::SharedPointer< ArrayType > a( 10, 0 ), b( 10, 0 );
+   auto set = [=] __cuda_callable__ ( int i ) mutable {
+      a->setElement( i, i );
+      b->setElement( i, a->getElement( i ) );
+   };
+   Pointers::synchronizeSmartPointersOnDevice< typename ArrayType::DeviceType >();
+   Algorithms::ParallelFor< typename ArrayType::DeviceType >::exec( 0, 10, set );
+   for( int i = 0; i < 10; i++ )
+   {
+      EXPECT_EQ( a->getElement( i ), i );
+      EXPECT_EQ( b->getElement( i ), i );
+   }
+}
+
+TYPED_TEST( ArrayTest, setElement )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+
+   ArrayType a( 10 );
+   for( int i = 0; i < 10; i++ )
+      a.setElement( i, i );
+
+   for( int i = 0; i < 10; i++ )
+      EXPECT_EQ( a.getElement( i ), i );
+
+   test_setElement< ArrayType >();
+}
+
 TYPED_TEST( ArrayTest, containsValue )
 {
    using ArrayType = typename TestFixture::ArrayType;
diff --git a/src/UnitTests/Containers/ArrayViewTest.h b/src/UnitTests/Containers/ArrayViewTest.h
index e5a9d5a20..b6f152c54 100644
--- a/src/UnitTests/Containers/ArrayViewTest.h
+++ b/src/UnitTests/Containers/ArrayViewTest.h
@@ -287,6 +287,40 @@ void ArrayViewEvaluateTest( ArrayType& u )
    }
 }
 
+template< typename ArrayType >
+void test_setElement()
+{
+   ArrayType a( 10, 0 ), b( 10, 0 );
+   auto a_view = a.getView();
+   auto b_view = b.getView();
+   auto set = [=] __cuda_callable__ ( int i ) mutable {
+      a_view.setElement( i, i );
+      b_view.setElement( i, a_view.getElement( i ) );
+   };
+   Algorithms::ParallelFor< typename ArrayType::DeviceType >::exec( 0, 10, set );
+   for( int i = 0; i < 10; i++ )
+   {
+      EXPECT_EQ( a.getElement( i ), i );
+      EXPECT_EQ( b.getElement( i ), i );
+   }
+}
+
+TYPED_TEST( ArrayViewTest, setElement )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+
+   ArrayType a( 10 );
+   auto a_view = a.getView();
+   for( int i = 0; i < 10; i++ )
+      a_view.setElement( i, i );
+
+   for( int i = 0; i < 10; i++ )
+      EXPECT_EQ( a_view.getElement( i ), i );
+
+   test_setElement< ArrayType >();
+}
+
+
 TYPED_TEST( ArrayViewTest, evaluate )
 {
    using ArrayType = typename TestFixture::ArrayType;
-- 
GitLab


From cc6d77e2a5f017843870e85c91437cad300f37ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 09:38:13 +0200
Subject: [PATCH 39/68] Added missing TNL_CHECK_CUDA_DEVICE to
 MemoryOperationsCuda

---
 src/TNL/Algorithms/MemoryOperationsCuda.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/TNL/Algorithms/MemoryOperationsCuda.hpp b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
index 49522a42b..f3481f76f 100644
--- a/src/TNL/Algorithms/MemoryOperationsCuda.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
@@ -32,6 +32,7 @@ setElement( Element* data,
    TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
 #ifdef HAVE_CUDA
    cudaMemcpy( ( void* ) data, ( void* ) &value, sizeof( Element ), cudaMemcpyHostToDevice );
+   TNL_CHECK_CUDA_DEVICE;
 #endif
    // TODO: For some reason the following does not work after adding
    // #ifdef __CUDA_ARCH__ to Array::setElement and ArrayView::setElement.
@@ -49,6 +50,7 @@ getElement( const Element* data )
    Element result;
 #ifdef HAVE_CUDA
    cudaMemcpy( ( void* ) &result, ( void* ) data, sizeof( Element ), cudaMemcpyDeviceToHost );
+   TNL_CHECK_CUDA_DEVICE;
 #endif
    // TODO: For some reason the following does not work after adding
    // #ifdef __CUDA_ARCH__ to Array::getElement and ArrayView::getElement 
-- 
GitLab


From 0b810c0ff39d889ce1e64abdbf8fe7e28e9e5af3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 10:06:05 +0200
Subject: [PATCH 40/68] Fixed setRow in ILU0

---
 src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
index 50f4be555..38f1ab004 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
@@ -90,13 +90,20 @@ update( const MatrixPointer& matrixPointer )
 
       const auto L_entries = L_rowLengths[ i ];
       const auto U_entries = U_rowLengths[ N - 1 - i ];
-      L.setRow( i, columns, values, L_entries );
-      U.setRow( N - 1 - i, &columns[ L_entries ], &values[ L_entries ], U_entries );
+//      L.setRow( i, columns, values, L_entries );
+//      U.setRow( N - 1 - i, &columns[ L_entries ], &values[ L_entries ], U_entries );
+
+      // copy values into U
+      auto U_i = U.getRow( N - 1 - i );
+      for( IndexType c_j = 0; c_j < U_entries; c_j++ )
+         U_i.setElement( c_j, columns[ L_entries + c_j ], values[ L_entries + c_j ] );
 
       // this condition is to avoid segfaults on empty L.getRow( i )
       if( L_entries > 0 ) {
+         // copy values into L
          auto L_i = L.getRow( i );
-         auto U_i = U.getRow( N - 1 - i );
+         for( IndexType c_j = 0; c_j < U_entries; c_j++ )
+            L_i.setElement( c_j, columns[ c_j ], values[ c_j ] );
 
          // loop for k = 0, ..., i - 1; but only over the non-zero entries
          for( IndexType c_k = 0; c_k < L_entries; c_k++ ) {
-- 
GitLab


From 8aa647ff4d9b2872c8fc98659807c488c155f11b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 11:11:19 +0200
Subject: [PATCH 41/68] Fixed ILU0

---
 .../Linear/Preconditioners/ILU0_impl.h        | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
index 38f1ab004..bd4abfb04 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
@@ -66,11 +66,11 @@ update( const MatrixPointer& matrixPointer )
    // The factors L and U are stored separately and the rows of U are reversed.
    for( IndexType i = 0; i < N; i++ ) {
       // copy all non-zero entries from A into L and U
-      const auto max_length = localMatrix.getRowLength( i );
+      const auto row = localMatrix.getRow( i );
+      const auto max_length = row.getSize();
       IndexType all_columns[ max_length ];
       RealType all_values[ max_length ];
-      const auto row = localMatrix.getRow( i );
-      for( IndexType j = 0; j < row.getSize(); j++ ) {
+      for( IndexType j = 0; j < max_length; j++ ) {
          all_columns[ j ] = row.getColumnIndex( j );
          all_values[ j ] = row.getValue( j );
       }
@@ -102,25 +102,26 @@ update( const MatrixPointer& matrixPointer )
       if( L_entries > 0 ) {
          // copy values into L
          auto L_i = L.getRow( i );
-         for( IndexType c_j = 0; c_j < U_entries; c_j++ )
+         for( IndexType c_j = 0; c_j < L_entries; c_j++ )
             L_i.setElement( c_j, columns[ c_j ], values[ c_j ] );
 
          // loop for k = 0, ..., i - 1; but only over the non-zero entries
          for( IndexType c_k = 0; c_k < L_entries; c_k++ ) {
             const auto k = L_i.getColumnIndex( c_k );
-            const auto U_k = U.getRow( N - 1 - k );
 
-            auto L_ik = L_i.getValue( c_k ) / U_k.getValue( c_k );
-            L_i.setElement( c_k, k, L_ik );
+            auto L_ik = L_i.getValue( c_k ) / U.getElement( N - 1 - k, k );
+            L_i.setValue( c_k, L_ik );
 
             // loop for j = k+1, ..., N-1; but only over the non-zero entries
             // and split into two loops over L and U separately
             for( IndexType c_j = c_k + 1; c_j < L_entries; c_j++ ) {
-               const auto L_ij = L_i.getValue( c_j ) - L_ik * U_k.getValue( c_j );
+               const auto j = L_i.getColumnIndex( c_j );
+               const auto L_ij = L_i.getValue( c_j ) - L_ik * U.getElement( N - 1 - k, j );
                L_i.setValue( c_j, L_ij );
             }
             for( IndexType c_j = 0; c_j < U_entries; c_j++ ) {
-               const auto U_ij = U_i.getValue( c_j ) - L_ik * U_k.getValue( c_j );
+               const auto j = U_i.getColumnIndex( c_j );
+               const auto U_ij = U_i.getValue( c_j ) - L_ik * U.getElement( N - 1 - k, j );
                U_i.setValue( c_j, U_ij );
             }
          }
-- 
GitLab


From ec1ab1d6c3922848b4cd8da2f7751c3659acc5e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 11:24:12 +0200
Subject: [PATCH 42/68] Fixed padding index in ILUT

---
 src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
index 674ae2e9d..29b173b25 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h
@@ -119,7 +119,7 @@ update( const MatrixPointer& matrixPointer )
             j -= minColumn;
          }
          // handle ellpack dummy entries
-         if( j >= N ) break;
+         if( j == localMatrix.getPaddingIndex() ) break;
          w[ j ] = A_i.getValue( c_j );
 
          // running computation of norm
@@ -155,7 +155,7 @@ update( const MatrixPointer& matrixPointer )
                const auto j = U_k.getColumnIndex( c_j );
 
                // skip dropped entries
-               if( j >= N ) break;
+               if( j == localMatrix.getPaddingIndex() ) break;
                w[ j ] -= w_k * U_k.getValue( c_j );
 
                // add non-zero to the w_k_set
-- 
GitLab


From 514f4d15c10dda6026278ab356244cc6551e0b66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 14:26:46 +0200
Subject: [PATCH 43/68] Added missing __cuda_callable__ to
 FetchLambdaAdapter::call

---
 src/TNL/Containers/Segments/details/LambdaAdapter.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Containers/Segments/details/LambdaAdapter.h b/src/TNL/Containers/Segments/details/LambdaAdapter.h
index a2d118fdd..a87915ced 100644
--- a/src/TNL/Containers/Segments/details/LambdaAdapter.h
+++ b/src/TNL/Containers/Segments/details/LambdaAdapter.h
@@ -9,11 +9,9 @@
 /* See Copyright Notice in tnl/Copyright */
 
 #pragma once
-#include<TNL/Containers/Segments/details/CheckLambdas.h>
 
 #include "CheckLambdas.h"
 
-
 namespace TNL {
    namespace Containers {
       namespace Segments {
@@ -31,7 +29,8 @@ template< typename Index,
 struct FetchLambdaAdapter< Index, Lambda, true >
 {
    using ReturnType = decltype( std::declval< Lambda >()( Index(), Index(), Index(), std::declval< bool& >() ) );
-   
+
+   __cuda_callable__
    static ReturnType call( Lambda& f, Index segmentIdx, Index localIdx, Index globalIdx, bool& compute )
    {
       return f( segmentIdx, localIdx, globalIdx, compute );
@@ -43,6 +42,8 @@ template< typename Index,
 struct FetchLambdaAdapter< Index, Lambda, false >
 {
    using ReturnType = decltype( std::declval< Lambda >()( Index(), std::declval< bool& >() ) );
+
+   __cuda_callable__
    static ReturnType call( Lambda& f, Index segmentIdx, Index localIdx, Index globalIdx, bool& compute )
    {
       return f( globalIdx, compute );
-- 
GitLab


From 0cd05a1756ba80077f1284ff904e5547d41fc837 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 15:50:07 +0200
Subject: [PATCH 44/68] Removed useless 'const' in front of ConstRowView return
 type

---
 src/TNL/Matrices/SparseMatrixView.h   | 2 +-
 src/TNL/Matrices/SparseMatrixView.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index 93494aa4f..fe7df5495 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -91,7 +91,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       void reset();
 
       __cuda_callable__
-      const ConstRowView getRow( const IndexType& rowIdx ) const;
+      ConstRowView getRow( const IndexType& rowIdx ) const;
 
       __cuda_callable__
       RowView getRow( const IndexType& rowIdx );
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index e17abf716..a095be8fc 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -193,7 +193,7 @@ template< typename Real,
           template< typename, typename > class SegmentsView >
 __cuda_callable__ auto
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
-getRow( const IndexType& rowIdx ) const -> const ConstRowView
+getRow( const IndexType& rowIdx ) const -> ConstRowView
 {
    TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." );
    return ConstRowView( this->segments.getSegmentView( rowIdx ), this->values.getConstView(), this->columnIndexes.getConstView() );
-- 
GitLab


From 8940fe40215fe515e04c73d9d01c5388066d2c0f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 15:51:39 +0200
Subject: [PATCH 45/68] Removed useless getView() call - values and
 columnIndexes in SparseMatrixView are already views

Clang is KOKOT and did not like it...
---
 src/TNL/Matrices/SparseMatrixView.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index a095be8fc..aaa3b8d59 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -196,7 +196,7 @@ SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 getRow( const IndexType& rowIdx ) const -> ConstRowView
 {
    TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." );
-   return ConstRowView( this->segments.getSegmentView( rowIdx ), this->values.getConstView(), this->columnIndexes.getConstView() );
+   return ConstRowView( this->segments.getSegmentView( rowIdx ), this->values, this->columnIndexes );
 }
 
 template< typename Real,
@@ -209,7 +209,7 @@ SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 getRow( const IndexType& rowIdx ) -> RowView
 {
    TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." );
-   return RowView( this->segments.getSegmentView( rowIdx ), this->values.getView(), this->columnIndexes.getView() );
+   return RowView( this->segments.getSegmentView( rowIdx ), this->values, this->columnIndexes );
 }
 
 template< typename Real,
-- 
GitLab


From 0809ff44abe49a8bb5f15e4fc8c88c99c0e49c8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 10:27:39 +0200
Subject: [PATCH 46/68] Adding __cuda_callable__ to getElement in SparseMatrix
 and SparseMatrixView fixes the diagonal preconditioner

---
 src/TNL/Matrices/SparseMatrix.h       | 1 +
 src/TNL/Matrices/SparseMatrix.hpp     | 1 +
 src/TNL/Matrices/SparseMatrixView.h   | 1 +
 src/TNL/Matrices/SparseMatrixView.hpp | 1 +
 4 files changed, 4 insertions(+)

diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 91818de82..fb48e97bb 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -153,6 +153,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
                        const RealType& value,
                        const RealType& thisElementMultiplicator );
 
+      __cuda_callable__
       RealType getElement( const IndexType row,
                            const IndexType column ) const;
 
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index f5757de5c..696d49a21 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -423,6 +423,7 @@ template< typename Real,
           template< typename, typename, typename > class Segments,
           typename RealAllocator,
           typename IndexAllocator >
+__cuda_callable__
 Real
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
 getElement( const IndexType row,
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index fe7df5495..132fd589f 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -105,6 +105,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
                        const RealType& value,
                        const RealType& thisElementMultiplicator = 1.0 );
 
+      __cuda_callable__
       RealType getElement( IndexType row,
                            IndexType column ) const;
 
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index aaa3b8d59..0dc55e462 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -308,6 +308,7 @@ template< typename Real,
           typename Index,
           typename MatrixType,
           template< typename, typename > class SegmentsView >
+__cuda_callable__
 Real
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 getElement( IndexType row,
-- 
GitLab


From 307e0e72fe0122f2a420651ff77f1f172a9ef679 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 16:25:12 +0200
Subject: [PATCH 47/68] Migrating stuff for segment-based matrices

---
 src/TNL/Operators/DirichletBoundaryConditions.h    | 2 +-
 src/TNL/Operators/NeumannBoundaryConditions.h      | 6 +++---
 src/TNL/Operators/diffusion/LinearDiffusion_impl.h | 6 +++---
 src/TNL/Solvers/PDE/BackwardTimeDiscretisation.h   | 2 +-
 src/TNL/Solvers/PDE/LinearSystemAssembler.h        | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/TNL/Operators/DirichletBoundaryConditions.h b/src/TNL/Operators/DirichletBoundaryConditions.h
index 313894072..ddc9b08ff 100644
--- a/src/TNL/Operators/DirichletBoundaryConditions.h
+++ b/src/TNL/Operators/DirichletBoundaryConditions.h
@@ -105,7 +105,7 @@ class DirichletBoundaryConditions
                               Matrix& matrix,
                               Vector& b ) const
       {
-         typename Matrix::MatrixRow matrixRow = matrix.getRow( entity.getIndex() );
+         auto matrixRow = matrix.getRow( entity.getIndex() );
          const IndexType& index = entity.getIndex();
          matrixRow.setElement( 0, index, 1.0 );
          b[ index ] = Functions::FunctionAdapter< MeshType, Function >::getValue( this->function, entity, time );
diff --git a/src/TNL/Operators/NeumannBoundaryConditions.h b/src/TNL/Operators/NeumannBoundaryConditions.h
index a46545cd1..3b6d48fa1 100644
--- a/src/TNL/Operators/NeumannBoundaryConditions.h
+++ b/src/TNL/Operators/NeumannBoundaryConditions.h
@@ -155,7 +155,7 @@ class NeumannBoundaryConditions< Meshes::Grid< 1, MeshReal, Device, MeshIndex >,
       {
          const auto& neighborEntities = entity.getNeighborEntities();
          const IndexType& index = entity.getIndex();
-         typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
+         auto matrixRow = matrix.getRow( index );
          if( entity.getCoordinates().x() == 0 )
          {
             matrixRow.setElement( 0, index, 1.0 );
@@ -261,7 +261,7 @@ class NeumannBoundaryConditions< Meshes::Grid< 2, MeshReal, Device, MeshIndex >,
       {
          const auto& neighborEntities = entity.getNeighborEntities();
          const IndexType& index = entity.getIndex();
-         typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
+         auto matrixRow = matrix.getRow( index );
          if( entity.getCoordinates().x() == 0 )
          {
             matrixRow.setElement( 0, index,                                                1.0 );
@@ -390,7 +390,7 @@ class NeumannBoundaryConditions< Meshes::Grid< 3, MeshReal, Device, MeshIndex >,
       {
          const auto& neighborEntities = entity.getNeighborEntities();
          const IndexType& index = entity.getIndex();
-         typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
+         auto matrixRow = matrix.getRow( index );
          if( entity.getCoordinates().x() == 0 )
          {
             matrixRow.setElement( 0, index,                                                   1.0 );
diff --git a/src/TNL/Operators/diffusion/LinearDiffusion_impl.h b/src/TNL/Operators/diffusion/LinearDiffusion_impl.h
index 51bdf8a62..bbdfb4db1 100644
--- a/src/TNL/Operators/diffusion/LinearDiffusion_impl.h
+++ b/src/TNL/Operators/diffusion/LinearDiffusion_impl.h
@@ -87,7 +87,7 @@ setMatrixElements( const PreimageFunction& u,
    static_assert( PreimageFunction::getEntitiesDimension() == 1, "Wrong preimage function" );
    const typename MeshEntity::template NeighborEntities< 1 >& neighborEntities = entity.getNeighborEntities();
    const IndexType& index = entity.getIndex();
-   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
+   auto matrixRow = matrix.getRow( index );
    const RealType lambdaX = tau * entity.getMesh().template getSpaceStepsProducts< -2 >();
    matrixRow.setElement( 0, neighborEntities.template getEntityIndex< -1 >(),      - lambdaX );
    matrixRow.setElement( 1, index,                                              2.0 * lambdaX );
@@ -162,7 +162,7 @@ setMatrixElements( const PreimageFunction& u,
    static_assert( MeshEntity::getEntityDimension() == 2, "Wrong mesh entity dimensions." );
    static_assert( PreimageFunction::getEntitiesDimension() == 2, "Wrong preimage function" );
    const IndexType& index = entity.getIndex();
-   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
+   auto matrixRow = matrix.getRow( index );
    const RealType lambdaX = tau * entity.getMesh().template getSpaceStepsProducts< -2, 0 >();
    const RealType lambdaY = tau * entity.getMesh().template getSpaceStepsProducts< 0, -2 >();
    const typename MeshEntity::template NeighborEntities< 2 >& neighborEntities = entity.getNeighborEntities();
@@ -244,7 +244,7 @@ setMatrixElements( const PreimageFunction& u,
    static_assert( PreimageFunction::getEntitiesDimension() == 3, "Wrong preimage function" );
    const typename MeshEntity::template NeighborEntities< 3 >& neighborEntities = entity.getNeighborEntities();
    const IndexType& index = entity.getIndex();
-   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
+   auto matrixRow = matrix.getRow( index );
    const RealType lambdaX = tau * entity.getMesh().template getSpaceStepsProducts< -2, 0, 0 >();
    const RealType lambdaY = tau * entity.getMesh().template getSpaceStepsProducts< 0, -2, 0 >();
    const RealType lambdaZ = tau * entity.getMesh().template getSpaceStepsProducts< 0, 0, -2 >();
diff --git a/src/TNL/Solvers/PDE/BackwardTimeDiscretisation.h b/src/TNL/Solvers/PDE/BackwardTimeDiscretisation.h
index 7172e08e8..2050fb0a7 100644
--- a/src/TNL/Solvers/PDE/BackwardTimeDiscretisation.h
+++ b/src/TNL/Solvers/PDE/BackwardTimeDiscretisation.h
@@ -32,7 +32,7 @@ class BackwardTimeDiscretisation
                                                                const RealType& rhs )
         {
             b += u + tau * rhs;
-            matrix.addElementFast( index, index, 1.0, 1.0 );
+            matrix.addElement( index, index, 1.0, 1.0 );
         }
 };
 
diff --git a/src/TNL/Solvers/PDE/LinearSystemAssembler.h b/src/TNL/Solvers/PDE/LinearSystemAssembler.h
index b74cb2660..abc80f9b7 100644
--- a/src/TNL/Solvers/PDE/LinearSystemAssembler.h
+++ b/src/TNL/Solvers/PDE/LinearSystemAssembler.h
@@ -114,8 +114,8 @@ class LinearSystemAssembler
                                            typename MeshFunction::IndexType > >::value != true,
       "Error: I am getting Vector instead of MeshFunction or similar object. You might forget to bind DofVector into MeshFunction in you method getExplicitUpdate."  );
 
-      const IndexType maxRowLength = matrixPointer.template getData< Devices::Host >().getMaxRowLength();
-      TNL_ASSERT_GT( maxRowLength, 0, "maximum row length must be positive" );
+      //const IndexType maxRowLength = matrixPointer.template getData< Devices::Host >().getMaxRowLength();
+      //TNL_ASSERT_GT( maxRowLength, 0, "maximum row length must be positive" );
       this->userData.time = time;
       this->userData.tau = tau;
       this->userData.u = &uPointer.template getData< DeviceType >();
-- 
GitLab


From 5c693b33e75337b73407282acfa7b1761e4a227e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 16:35:37 +0200
Subject: [PATCH 48/68] Fixing compilation of rowVectorProduct

---
 src/TNL/Matrices/SparseMatrix.hpp     | 2 +-
 src/TNL/Matrices/SparseMatrixView.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index 696d49a21..d0621b0d0 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -446,7 +446,7 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAll
 rowVectorProduct( const IndexType row,
                   const Vector& vector ) const
 {
-   this->view.rowVectorProduct( row, vector );
+   return this->view.rowVectorProduct( row, vector );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 0dc55e462..da5fbab9f 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -355,7 +355,7 @@ SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 rowVectorProduct( const IndexType row,
                   const Vector& vector ) const
 {
-
+   throw Exceptions::NotImplementedError("TODO: rowVectorProduct is not implemented yet.");
 }
 
 template< typename Real,
-- 
GitLab


From ca86abf2e38bd676a6d860075605e6ce2ee27c9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 16:56:40 +0200
Subject: [PATCH 49/68] Added missing CUDA support checks

---
 src/TNL/Algorithms/MemoryOperationsCuda.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/TNL/Algorithms/MemoryOperationsCuda.hpp b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
index f3481f76f..70a31cf26 100644
--- a/src/TNL/Algorithms/MemoryOperationsCuda.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
@@ -33,6 +33,8 @@ setElement( Element* data,
 #ifdef HAVE_CUDA
    cudaMemcpy( ( void* ) data, ( void* ) &value, sizeof( Element ), cudaMemcpyHostToDevice );
    TNL_CHECK_CUDA_DEVICE;
+#else
+   throw Exceptions::CudaSupportMissing();
 #endif
    // TODO: For some reason the following does not work after adding
    // #ifdef __CUDA_ARCH__ to Array::setElement and ArrayView::setElement.
@@ -51,6 +53,8 @@ getElement( const Element* data )
 #ifdef HAVE_CUDA
    cudaMemcpy( ( void* ) &result, ( void* ) data, sizeof( Element ), cudaMemcpyDeviceToHost );
    TNL_CHECK_CUDA_DEVICE;
+#else
+   throw Exceptions::CudaSupportMissing();
 #endif
    // TODO: For some reason the following does not work after adding
    // #ifdef __CUDA_ARCH__ to Array::getElement and ArrayView::getElement 
-- 
GitLab


From de38db4ab659818d7e0e1c1f9a5f4ca30d733b59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 16:57:01 +0200
Subject: [PATCH 50/68] Added 'Self' template into SparseMatrix

---
 src/TNL/Matrices/SparseMatrix.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index fb48e97bb..4a41f9643 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -68,6 +68,15 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       using RowView = SparseMatrixRowView< SegmentViewType, ValuesViewType, ColumnsIndexesViewType, isBinary() >;
       using ConstRowView = typename RowView::ConstViewType;
 
+      template< typename _Real = Real,
+                typename _Device = Device,
+                typename _Index = Index,
+                typename _MatrixType = MatrixType,
+                template< typename, typename, typename > class _Segments = Segments,
+                typename _RealAllocator = typename Allocators::Default< _Device >::template Allocator< _Real >,
+                typename _IndexAllocator = typename Allocators::Default< _Device >::template Allocator< _Index > >
+      using Self = SparseMatrix< _Real, _Device, _Index, _MatrixType, _Segments, _RealAllocator, _IndexAllocator >;
+
       // TODO: remove this - it is here only for compatibility with original matrix implementation
       typedef Containers::Vector< IndexType, DeviceType, IndexType > CompressedRowLengthsVector;
       typedef Containers::VectorView< IndexType, DeviceType, IndexType > CompressedRowLengthsVectorView;
-- 
GitLab


From 92539874a0ecc785b88cbfcb3a2cb9b861f3197d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 17:11:55 +0200
Subject: [PATCH 51/68] Updating DistributedMatrix for segments

---
 src/TNL/Matrices/DistributedMatrix.h      | 18 +--------
 src/TNL/Matrices/DistributedMatrix_impl.h | 47 ++---------------------
 2 files changed, 5 insertions(+), 60 deletions(-)

diff --git a/src/TNL/Matrices/DistributedMatrix.h b/src/TNL/Matrices/DistributedMatrix.h
index d0769317f..08ab33a4f 100644
--- a/src/TNL/Matrices/DistributedMatrix.h
+++ b/src/TNL/Matrices/DistributedMatrix.h
@@ -108,15 +108,10 @@ public:
 
    IndexType getRowLength( IndexType row ) const;
 
-   bool setElement( IndexType row,
+   void setElement( IndexType row,
                     IndexType column,
                     RealType value );
 
-   __cuda_callable__
-   bool setElementFast( IndexType row,
-                        IndexType column,
-                        RealType value );
-
    RealType getElement( IndexType row,
                         IndexType column ) const;
 
@@ -124,17 +119,6 @@ public:
    RealType getElementFast( IndexType row,
                             IndexType column ) const;
 
-//   __cuda_callable__
-//   bool setRowFast( IndexType row,
-//                    const IndexType* columnIndexes,
-//                    const RealType* values,
-//                    IndexType elements );
-
-//   __cuda_callable__
-//   void getRowFast( IndexType row,
-//                    IndexType* columns,
-//                    RealType* values ) const;
-
    __cuda_callable__
    MatrixRow getRow( IndexType row );
 
diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h
index b05d236b7..dfef7b7c9 100644
--- a/src/TNL/Matrices/DistributedMatrix_impl.h
+++ b/src/TNL/Matrices/DistributedMatrix_impl.h
@@ -172,7 +172,8 @@ getCompressedRowLengths( CompressedRowLengthsVector& rowLengths ) const
 {
    if( getCommunicationGroup() != CommunicatorType::NullGroup ) {
       rowLengths.setDistribution( getLocalRowRange(), getRows(), getCommunicationGroup() );
-      localMatrix.getCompressedRowLengths( rowLengths.getLocalView() );
+      auto localRowLengths = rowLengths.getView();
+      localMatrix.getCompressedRowLengths( localRowLengths );
    }
 }
 
@@ -188,27 +189,14 @@ getRowLength( IndexType row ) const
 
 template< typename Matrix,
           typename Communicator >
-bool
+void
 DistributedMatrix< Matrix, Communicator >::
 setElement( IndexType row,
             IndexType column,
             RealType value )
 {
    const IndexType localRow = localRowRange.getLocalIndex( row );
-   return localMatrix.setElement( localRow, column, value );
-}
-
-template< typename Matrix,
-          typename Communicator >
-__cuda_callable__
-bool
-DistributedMatrix< Matrix, Communicator >::
-setElementFast( IndexType row,
-                IndexType column,
-                RealType value )
-{
-   const IndexType localRow = localRowRange.getLocalIndex( row );
-   return localMatrix.setElementFast( localRow, column, value );
+   localMatrix.setElement( localRow, column, value );
 }
 
 template< typename Matrix,
@@ -234,33 +222,6 @@ getElementFast( IndexType row,
    return localMatrix.getElementFast( localRow, column );
 }
 
-//template< typename Matrix,
-//          typename Communicator >
-//__cuda_callable__
-//bool
-//DistributedMatrix< Matrix, Communicator >::
-//setRowFast( IndexType row,
-//            const IndexType* columnIndexes,
-//            const RealType* values,
-//            IndexType elements )
-//{
-//   const IndexType localRow = localRowRange.getLocalIndex( row );
-//   return localMatrix.setRowFast( localRow, columnIndexes, values, elements );
-//}
-
-//template< typename Matrix,
-//          typename Communicator >
-//__cuda_callable__
-//void
-//DistributedMatrix< Matrix, Communicator >::
-//getRowFast( IndexType row,
-//            IndexType* columns,
-//            RealType* values ) const
-//{
-//   const IndexType localRow = localRowRange.getLocalIndex( row );
-//   return localMatrix.getRowFast( localRow, columns, values );
-//}
-
 template< typename Matrix,
           typename Communicator >
 __cuda_callable__
-- 
GitLab


From 3554295bdda38e5f9dceb51dc38057e966d06c56 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 7 May 2020 17:24:40 +0200
Subject: [PATCH 52/68] Refactoring get/setElement in Array/ArrayView.

---
 src/TNL/Algorithms/MemoryOperationsCuda.hpp | 8 ++++++++
 src/TNL/Containers/Array.hpp                | 8 --------
 src/TNL/Containers/ArrayView.hpp            | 8 --------
 3 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/src/TNL/Algorithms/MemoryOperationsCuda.hpp b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
index 70a31cf26..d03fa4b12 100644
--- a/src/TNL/Algorithms/MemoryOperationsCuda.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
@@ -30,6 +30,9 @@ setElement( Element* data,
             const Element& value )
 {
    TNL_ASSERT_TRUE( data, "Attempted to set data through a nullptr." );
+#ifdef __CUDA_ARCH__
+   *data = value;
+#else
 #ifdef HAVE_CUDA
    cudaMemcpy( ( void* ) data, ( void* ) &value, sizeof( Element ), cudaMemcpyHostToDevice );
    TNL_CHECK_CUDA_DEVICE;
@@ -41,6 +44,7 @@ setElement( Element* data,
    // Probably it might be a problem with lambda function 'kernel' which
    // nvcc probably does not handle properly.
    //MemoryOperations< Devices::Cuda >::set( data, value, 1 );
+#endif
 }
 
 template< typename Element >
@@ -49,6 +53,9 @@ MemoryOperations< Devices::Cuda >::
 getElement( const Element* data )
 {
    TNL_ASSERT_TRUE( data, "Attempted to get data through a nullptr." );
+#ifdef __CUDA_ARCH__
+   return *data;
+#else
    Element result;
 #ifdef HAVE_CUDA
    cudaMemcpy( ( void* ) &result, ( void* ) data, sizeof( Element ), cudaMemcpyDeviceToHost );
@@ -62,6 +69,7 @@ getElement( const Element* data )
    // nvcc probably does not handle properly.
    //MultiDeviceMemoryOperations< void, Devices::Cuda >::template copy< Element, Element, int >( &result, data, 1 );
    return result;
+#endif
 }
 
 template< typename Element, typename Index >
diff --git a/src/TNL/Containers/Array.hpp b/src/TNL/Containers/Array.hpp
index 77903e60a..b2f377ff4 100644
--- a/src/TNL/Containers/Array.hpp
+++ b/src/TNL/Containers/Array.hpp
@@ -510,11 +510,7 @@ setElement( const Index& i, const Value& x )
 {
    TNL_ASSERT_GE( i, (Index) 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
-#ifdef __CUDA_ARCH__
-   this->data[ i ] = x;
-#else
    Algorithms::MemoryOperations< Device >::setElement( &( this->data[ i ] ), x );
-#endif
 }
 
 template< typename Value,
@@ -527,11 +523,7 @@ getElement( const Index& i ) const
 {
    TNL_ASSERT_GE( i, (Index) 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
-#ifdef __CUDA_ARCH__
-   return this->data[ i ];
-#else
    return Algorithms::MemoryOperations< Device >::getElement( & ( this->data[ i ] ) );
-#endif
 }
 
 template< typename Value,
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index d545c6048..7ab7915e6 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -218,11 +218,7 @@ setElement( Index i, Value value )
 {
    TNL_ASSERT_GE( i, 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
-#ifdef __CUDA_ARCH__
-   data[ i ] = value;
-#else
    Algorithms::MemoryOperations< Device >::setElement( &this->data[ i ], value );
-#endif
 }
 
 template< typename Value,
@@ -234,11 +230,7 @@ getElement( Index i ) const
 {
    TNL_ASSERT_GE( i, 0, "Element index must be non-negative." );
    TNL_ASSERT_LT( i, this->getSize(), "Element index is out of bounds." );
-#ifdef __CUDA_ARCH__
-   return data[ i ];
-#else
    return Algorithms::MemoryOperations< Device >::getElement( &data[ i ] );
-#endif
 }
 
 template< typename Value,
-- 
GitLab


From acc96f1013286429e19d24bd61f2f3f7cbc0f2bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 17:43:20 +0200
Subject: [PATCH 53/68] Updating DistributedMatrix for segments

---
 src/TNL/Matrices/DistributedMatrix_impl.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h
index dfef7b7c9..5be4bbaba 100644
--- a/src/TNL/Matrices/DistributedMatrix_impl.h
+++ b/src/TNL/Matrices/DistributedMatrix_impl.h
@@ -172,7 +172,7 @@ getCompressedRowLengths( CompressedRowLengthsVector& rowLengths ) const
 {
    if( getCommunicationGroup() != CommunicatorType::NullGroup ) {
       rowLengths.setDistribution( getLocalRowRange(), getRows(), getCommunicationGroup() );
-      auto localRowLengths = rowLengths.getView();
+      auto localRowLengths = rowLengths.getLocalView();
       localMatrix.getCompressedRowLengths( localRowLengths );
    }
 }
@@ -183,8 +183,7 @@ typename Matrix::IndexType
 DistributedMatrix< Matrix, Communicator >::
 getRowLength( IndexType row ) const
 {
-   const IndexType localRow = localRowRange.getLocalIndex( row );
-   return localMatrix.getRowLength( localRow );
+   return getRow( row ).getSize();
 }
 
 template< typename Matrix,
-- 
GitLab


From 8b89e1a8357ac94651a903e6fdef08afe7a38be6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 7 May 2020 17:43:39 +0200
Subject: [PATCH 54/68] SparseMatrixView::getCompressedRowLengths works even
 for views

---
 src/TNL/Matrices/SparseMatrixView.h   | 16 ++++++++++++++++
 src/TNL/Matrices/SparseMatrixView.hpp |  2 +-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index 132fd589f..b2e7f48c6 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -164,6 +164,22 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       ColumnsIndexesViewType columnIndexes;
 
       SegmentsViewType segments;
+
+   private:
+      // TODO: this should be probably moved into a detail namespace
+      template< typename VectorOrView,
+                std::enable_if_t< HasSetSizeMethod< VectorOrView >::value, bool > = true >
+      static void set_size_if_resizable( VectorOrView& v, IndexType size )
+      {
+         v.setSize( size );
+      }
+
+      template< typename VectorOrView,
+                std::enable_if_t< ! HasSetSizeMethod< VectorOrView >::value, bool > = true >
+      static void set_size_if_resizable( VectorOrView& v, IndexType size )
+      {
+         TNL_ASSERT_EQ( v.getSize(), size, "view has wrong size" );
+      }
 };
 
 }  // namespace Conatiners
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index da5fbab9f..745ee76ef 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -116,7 +116,7 @@ void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 getCompressedRowLengths( Vector& rowLengths ) const
 {
-   rowLengths.setSize( this->getRows() );
+   set_size_if_resizable( rowLengths, this->getRows() );
    rowLengths = 0;
    auto rowLengths_view = rowLengths.getView();
    auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, IndexType globalIdx, const RealType& value ) -> IndexType {
-- 
GitLab


From 88b1c3bef8ea7bbd7c6ced32ce20a5ec349cf649 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 8 May 2020 08:08:46 +0200
Subject: [PATCH 55/68] Fixing distributed matrix.

---
 .../DistSpMV/tnl-benchmark-distributed-spmv.h |  2 +-
 .../tnl-benchmark-linear-solvers.h            |  2 +-
 src/TNL/Algorithms/MemoryOperations.h         |  2 +
 src/TNL/Algorithms/MemoryOperationsCuda.hpp   |  4 +-
 src/TNL/Matrices/DistributedMatrix.h          |  3 +-
 src/TNL/Matrices/DistributedMatrix_impl.h     |  6 +-
 src/TNL/Matrices/Legacy/SparseRow.h           | 18 +++++
 src/TNL/Matrices/SparseMatrix.h               |  5 +-
 src/TNL/Matrices/SparseMatrix.hpp             | 31 +++++--
 src/TNL/Matrices/SparseMatrixView.h           |  5 +-
 src/TNL/Matrices/SparseMatrixView.hpp         | 20 +++--
 src/TNL/Matrices/details/SparseMatrix.h       | 80 +++++++++++++++++++
 src/TNL/Solvers/Linear/Preconditioners/ILU0.h |  1 +
 .../Linear/Preconditioners/ILU0_impl.h        |  4 +-
 .../Linear/Preconditioners/TriangularSolve.h  |  4 +-
 .../Matrices/DistributedMatrixTest.h          |  4 +-
 16 files changed, 161 insertions(+), 30 deletions(-)
 create mode 100644 src/TNL/Matrices/details/SparseMatrix.h

diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index 61169d079..4f7b07dee 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -239,7 +239,7 @@ struct SpmvBenchmark
       DistributedRowLengths distributedRowLengths( localRange, matrix.getRows(), group );
       for( IndexType i = 0; i < distributedMatrix.getLocalMatrix().getRows(); i++ ) {
          const auto gi = distributedMatrix.getLocalRowRange().getGlobalIndex( i );
-         distributedRowLengths[ gi ] = matrix.getRowLength( gi );
+         distributedRowLengths[ gi ] = matrix.getRowCapacity( gi );
       }
       distributedMatrix.setCompressedRowLengths( distributedRowLengths );
 
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index 11fa2b91e..36ca471e1 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -461,7 +461,7 @@ struct LinearSolversBenchmark
       DistributedRowLengths distributedRowLengths( localRange, matrixPointer->getRows(), group );
       for( IndexType i = 0; i < distMatrixPointer->getLocalMatrix().getRows(); i++ ) {
          const auto gi = distMatrixPointer->getLocalRowRange().getGlobalIndex( i );
-         distributedRowLengths[ gi ] = matrixPointer->getRowLength( gi );
+         distributedRowLengths[ gi ] = matrixPointer->getRowCapacity( gi );
       }
       distMatrixPointer->setCompressedRowLengths( distributedRowLengths );
 
diff --git a/src/TNL/Algorithms/MemoryOperations.h b/src/TNL/Algorithms/MemoryOperations.h
index 59da32402..85b44a465 100644
--- a/src/TNL/Algorithms/MemoryOperations.h
+++ b/src/TNL/Algorithms/MemoryOperations.h
@@ -132,10 +132,12 @@ template<>
 struct MemoryOperations< Devices::Cuda >
 {
    template< typename Element >
+   __cuda_callable__
    static void setElement( Element* data,
                            const Element& value );
 
    template< typename Element >
+   __cuda_callable__
    static Element getElement( const Element* data );
 
    template< typename Element, typename Index >
diff --git a/src/TNL/Algorithms/MemoryOperationsCuda.hpp b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
index d03fa4b12..31e442a3f 100644
--- a/src/TNL/Algorithms/MemoryOperationsCuda.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
@@ -24,7 +24,7 @@ namespace TNL {
 namespace Algorithms {
 
 template< typename Element >
-void
+__cuda_callable__ void
 MemoryOperations< Devices::Cuda >::
 setElement( Element* data,
             const Element& value )
@@ -48,7 +48,7 @@ setElement( Element* data,
 }
 
 template< typename Element >
-Element
+__cuda_callable__ Element
 MemoryOperations< Devices::Cuda >::
 getElement( const Element* data )
 {
diff --git a/src/TNL/Matrices/DistributedMatrix.h b/src/TNL/Matrices/DistributedMatrix.h
index 08ab33a4f..c6a50a633 100644
--- a/src/TNL/Matrices/DistributedMatrix.h
+++ b/src/TNL/Matrices/DistributedMatrix.h
@@ -104,7 +104,8 @@ public:
 
    void setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths );
 
-   void getCompressedRowLengths( CompressedRowLengthsVector& rowLengths ) const;
+   template< typename Vector >
+   void getCompressedRowLengths( Vector& rowLengths ) const;
 
    IndexType getRowLength( IndexType row ) const;
 
diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h
index 5be4bbaba..280c1583a 100644
--- a/src/TNL/Matrices/DistributedMatrix_impl.h
+++ b/src/TNL/Matrices/DistributedMatrix_impl.h
@@ -166,9 +166,10 @@ setCompressedRowLengths( const CompressedRowLengthsVector& rowLengths )
 
 template< typename Matrix,
           typename Communicator >
+   template< typename Vector >
 void
 DistributedMatrix< Matrix, Communicator >::
-getCompressedRowLengths( CompressedRowLengthsVector& rowLengths ) const
+getCompressedRowLengths( Vector& rowLengths ) const
 {
    if( getCommunicationGroup() != CommunicatorType::NullGroup ) {
       rowLengths.setDistribution( getLocalRowRange(), getRows(), getCommunicationGroup() );
@@ -183,7 +184,8 @@ typename Matrix::IndexType
 DistributedMatrix< Matrix, Communicator >::
 getRowLength( IndexType row ) const
 {
-   return getRow( row ).getSize();
+   const IndexType localRow = localRowRange.getLocalIndex( row );
+   return localMatrix.getRowCapacity( localRow );
 }
 
 template< typename Matrix,
diff --git a/src/TNL/Matrices/Legacy/SparseRow.h b/src/TNL/Matrices/Legacy/SparseRow.h
index d0008c93f..eb7a461fb 100644
--- a/src/TNL/Matrices/Legacy/SparseRow.h
+++ b/src/TNL/Matrices/Legacy/SparseRow.h
@@ -51,12 +51,30 @@ class SparseRow
       __cuda_callable__
       const Index& getElementColumn( const Index& elementIndex ) const;
 
+      __cuda_callable__
+      const Index& getColumnIndex( const Index& elementIndex ) const
+      {
+         return getElementColumn( elementIndex );
+      };
+
+      
       __cuda_callable__
       const Real& getElementValue( const Index& elementIndex ) const;
 
+      __cuda_callable__
+      const Real& getValue( const Index& elementIndex ) const
+      {
+         return getElementValue( elementIndex );
+      };
+
+
       __cuda_callable__
       Index getLength() const;
 
+      __cuda_callable__
+      Index getSize() const { return length; };
+
+
       __cuda_callable__
       Index getNonZeroElementsCount() const;
 
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 4a41f9643..bbc3efe4a 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -137,8 +137,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       template< typename Vector >
       void getCompressedRowLengths( Vector& rowLengths ) const;
 
-      [[deprecated]]
-      virtual IndexType getRowLength( const IndexType row ) const { return 0;};
+      IndexType getRowCapacity( const IndexType row ) const;
 
       template< typename Matrix >
       void setLike( const Matrix& matrix );
@@ -153,10 +152,12 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       __cuda_callable__
       RowView getRow( const IndexType& rowIdx );
 
+      __cuda_callable__
       void setElement( const IndexType row,
                        const IndexType column,
                        const RealType& value );
 
+      __cuda_callable__
       void addElement( const IndexType row,
                        const IndexType column,
                        const RealType& value,
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index d0621b0d0..308bf349f 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -28,7 +28,7 @@ template< typename Real,
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
 SparseMatrix( const RealAllocatorType& realAllocator,
               const IndexAllocatorType& indexAllocator )
-   : BaseType( realAllocator ), columnIndexes( indexAllocator )
+   : BaseType( realAllocator ), columnIndexes( indexAllocator ), view( this->getView() )
 {
 }
 
@@ -41,7 +41,8 @@ template< typename Real,
           typename IndexAllocator >
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
 SparseMatrix( const SparseMatrix& m )
-   : Matrix< Real, Device, Index, RealAllocator >( m ), columnIndexes( m.columnIndexes )
+   : Matrix< Real, Device, Index, RealAllocator >( m ), columnIndexes( m.columnIndexes ),
+   segments( m.segments ), view( this->getView() )
 {
 }
 
@@ -54,7 +55,9 @@ template< typename Real,
           typename IndexAllocator >
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
 SparseMatrix( const SparseMatrix&& m )
-   : Matrix< Real, Device, Index, RealAllocator >( std::move( m ) ), columnIndexes( std::move( m.columnIndexes ) )
+   : Matrix< Real, Device, Index, RealAllocator >( std::move( m ) ),
+   columnIndexes( std::move( m.columnIndexes ) ), segments( std::move( m.segments ) ),
+   view( this->getView() )
 {
 }
 
@@ -70,7 +73,9 @@ SparseMatrix( const IndexType rows,
               const IndexType columns,
               const RealAllocatorType& realAllocator,
               const IndexAllocatorType& indexAllocator )
-: BaseType( rows, columns, realAllocator ), columnIndexes( indexAllocator )
+: BaseType( rows, columns, realAllocator ), columnIndexes( indexAllocator ),
+   segments( Containers::Vector< IndexType, DeviceType, IndexType >( rows, 0 ) ),
+   view( this->getView() )
 {
 }
 
@@ -312,6 +317,20 @@ getCompressedRowLengths( Vector& rowLengths ) const
    this->view.getCompressedRowLengths( rowLengths );
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename, typename > class Segments,
+          typename RealAllocator,
+          typename IndexAllocator >
+Index
+SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
+getRowCapacity( const IndexType row ) const
+{
+   return this->view.getRowCapacity( row );
+}
+
 template< typename Real,
           typename Device,
           typename Index,
@@ -390,7 +409,7 @@ template< typename Real,
           template< typename, typename, typename > class Segments,
           typename RealAllocator,
           typename IndexAllocator >
-void
+__cuda_callable__ void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
 setElement( const IndexType row,
             const IndexType column,
@@ -406,7 +425,7 @@ template< typename Real,
           template< typename, typename, typename > class Segments,
           typename RealAllocator,
           typename IndexAllocator >
-void
+__cuda_callable__ void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
 addElement( const IndexType row,
             const IndexType column,
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index b2e7f48c6..35d509bdf 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -83,8 +83,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       template< typename Vector >
       void getCompressedRowLengths( Vector& rowLengths ) const;
 
-      [[deprecated]]
-      IndexType getRowLength( const IndexType row ) const;
+      IndexType getRowCapacity( const IndexType row ) const;
 
       IndexType getNumberOfNonzeroMatrixElements() const;
 
@@ -96,10 +95,12 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       __cuda_callable__
       RowView getRow( const IndexType& rowIdx );
 
+      __cuda_callable__
       void setElement( const IndexType row,
                        const IndexType column,
                        const RealType& value );
 
+      __cuda_callable__
       void addElement( IndexType row,
                        IndexType column,
                        const RealType& value,
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 745ee76ef..bc97cbc75 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -14,6 +14,7 @@
 #include <TNL/Matrices/SparseMatrixView.h>
 #include <TNL/Algorithms/Reduction.h>
 #include <TNL/Algorithms/AtomicOperations.h>
+#include <TNL/Matrices/details/SparseMatrix.h>
 
 namespace TNL {
 namespace Matrices {
@@ -116,7 +117,7 @@ void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 getCompressedRowLengths( Vector& rowLengths ) const
 {
-   set_size_if_resizable( rowLengths, this->getRows() );
+   details::CompressedRowLengthVectorSizeSetter< Vector >::setSize( rowLengths, this->getRows() );
    rowLengths = 0;
    auto rowLengths_view = rowLengths.getView();
    auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, IndexType globalIdx, const RealType& value ) -> IndexType {
@@ -138,9 +139,9 @@ template< typename Real,
           template< typename, typename > class SegmentsView >
 Index
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
-getRowLength( const IndexType row ) const
+getRowCapacity( const IndexType row ) const
 {
-   return 0;
+   return this->segments.getSegmentSize( row );
 }
 
 template< typename Real,
@@ -217,7 +218,7 @@ template< typename Real,
           typename Index,
           typename MatrixType,
           template< typename, typename > class SegmentsView >
-void
+__cuda_callable__ void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 setElement( const IndexType row,
             const IndexType column,
@@ -231,7 +232,7 @@ template< typename Real,
           typename Index,
           typename MatrixType,
           template< typename, typename > class SegmentsView >
-void
+__cuda_callable__ void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 addElement( IndexType row,
             IndexType column,
@@ -270,9 +271,14 @@ addElement( IndexType row,
    }
    if( i == rowSize )
    {
+#ifndef __CUDA_ARCH__
       std::stringstream msg;
       msg << "The capacity of the sparse matrix row number "  << row << " was exceeded.";
       throw std::logic_error( msg.str() );
+#else
+      TNL_ASSERT_TRUE( false, "");
+      return;
+#endif
    }
    if( col == this->getPaddingIndex() )
    {
@@ -308,7 +314,7 @@ template< typename Real,
           typename Index,
           typename MatrixType,
           template< typename, typename > class SegmentsView >
-__cuda_callable__
+__cuda_callable__ 
 Real
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 getElement( IndexType row,
@@ -355,7 +361,7 @@ SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 rowVectorProduct( const IndexType row,
                   const Vector& vector ) const
 {
-   throw Exceptions::NotImplementedError("TODO: rowVectorProduct is not implemented yet.");
+   TNL_ASSERT_TRUE( false, "TODO: rowVectorProduct is not implemented yet.");
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/details/SparseMatrix.h b/src/TNL/Matrices/details/SparseMatrix.h
new file mode 100644
index 000000000..522594806
--- /dev/null
+++ b/src/TNL/Matrices/details/SparseMatrix.h
@@ -0,0 +1,80 @@
+/***************************************************************************
+                          SparseMatrix.h  -  description
+                             -------------------
+    begin                : Jan 5, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Containers/ArrayView.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Containers/DistributedArray.h>
+#include <TNL/Containers/DistributedVector.h>
+
+
+namespace TNL {
+   namespace Matrices {
+      namespace details {
+
+template< typename Vector >
+struct CompressedRowLengthVectorSizeSetter
+{
+   static void setSize( Vector& v, typename Vector::IndexType size )
+   {
+      v.setSize( size );
+   }
+};
+
+template< typename Value,
+   typename Device,
+   typename Index >
+struct CompressedRowLengthVectorSizeSetter< Containers::ArrayView< Value, Device, Index > >
+{
+   static void setSize( Containers::ArrayView< Value, Device, Index >& v, Index size )
+   {
+      TNL_ASSERT_EQ( v.getSize(), size, "ArrayView has wrong size, different from number of matrix rows." );
+   }
+};
+
+template< typename Value,
+   typename Device,
+   typename Index >
+struct CompressedRowLengthVectorSizeSetter< Containers::VectorView< Value, Device, Index > >
+{
+   static void setSize( Containers::VectorView< Value, Device, Index >& v, Index size )
+   {
+      TNL_ASSERT_EQ( v.getSize(), size, "VectorView has wrong size, different from number of matrix rows." );
+   }
+};
+
+template< typename Value,
+   typename Device,
+   typename Index,
+   typename Communicator >
+struct CompressedRowLengthVectorSizeSetter< Containers::DistributedArray< Value, Device, Index, Communicator > >
+{
+   static void setSize( Containers::DistributedArray< Value, Device, Index, Communicator >& v, Index size )
+   {
+      TNL_ASSERT_EQ( v.getSize(), size, "DistributedArray has wrong size, different from number of matrix rows." );
+   }
+};
+
+template< typename Value,
+   typename Device,
+   typename Index,
+   typename Communicator >
+struct CompressedRowLengthVectorSizeSetter< Containers::DistributedVector< Value, Device, Index, Communicator > >
+{
+   static void setSize( Containers::DistributedVector< Value, Device, Index, Communicator >& v, Index size )
+   {
+      TNL_ASSERT_EQ( v.getSize(), size, "DistributedVector has wrong size, different from number of matrix rows." );
+   }
+};
+
+      } //namespace details
+   } //namepsace Matrices
+} //namespace TNL
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
index cc55c153c..07f0aea4e 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h
@@ -20,6 +20,7 @@
 #include <TNL/Exceptions/NotImplementedError.h>
 
 #if defined(HAVE_CUDA) && defined(HAVE_CUSPARSE)
+#include <TNL/Matrices/Legacy/CSR.h>
 #include <cusparse.h>
 #endif
 
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
index bd4abfb04..acf2ce129 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
@@ -350,9 +350,9 @@ copy_triangular_factors()
       for( int c_j = 0; c_j < row.getSize(); c_j++ ) {
          const IndexType j = row.getColumnIndex( c_j );
          if( j < i )
-            kernel_L->setElementFast( i, j, row.getValue( c_j ) );
+            kernel_L->setElement( i, j, row.getValue( c_j ) );
          else if( j < N )
-            kernel_U->setElementFast( i, j, row.getValue( c_j ) );
+            kernel_U->setElement( i, j, row.getValue( c_j ) );
          else
             break;
       }
diff --git a/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h b/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h
index 4ce7e34c5..f47eba8bf 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/TriangularSolve.h
@@ -43,7 +43,7 @@ void triangularSolveLower( const Matrix& L, Vector1& x, const Vector2& b )
    for( IndexType i = 0; i < N; i++ ) {
       RealType x_i = b[ i ];
 
-      const auto L_entries = L.getRowLength( i );
+      const auto L_entries = L.getRowCapacity( i );
 
       // this condition is to avoid segfaults on empty L.getRow( i )
       if( L_entries > 0 ) {
@@ -93,7 +93,7 @@ void triangularSolveUpper( const Matrix& U, Vector1& x, const Vector2& b )
 
       const IndexType U_idx = (reversedRows) ? N - 1 - i : i;
 
-      const auto U_entries = U.getRowLength( U_idx );
+      const auto U_entries = U.getRowCapacity( U_idx );
       const auto U_i = U.getRow( U_idx );
 
       const auto U_ii = U_i.getValue( 0 );
diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h
index d432c5da7..16ec045c4 100644
--- a/src/UnitTests/Matrices/DistributedMatrixTest.h
+++ b/src/UnitTests/Matrices/DistributedMatrixTest.h
@@ -155,13 +155,13 @@ TYPED_TEST( DistributedMatrixTest, setCompressedRowLengths )
    for( int i = 0; i < this->matrix.getLocalMatrix().getRows(); i++ ) {
       const auto gi = this->matrix.getLocalRowRange().getGlobalIndex( i );
       EXPECT_EQ( this->matrix.getRowLength( gi ), 0 );
-      EXPECT_EQ( this->matrix.getLocalMatrix().getRowLength( i ), 0 );
+      EXPECT_EQ( this->matrix.getLocalMatrix().getRowCapacity( i ), 0 );
    }
    this->matrix.setCompressedRowLengths( this->rowLengths );
    for( int i = 0; i < this->matrix.getLocalMatrix().getRows(); i++ ) {
       const auto gi = this->matrix.getLocalRowRange().getGlobalIndex( i );
       EXPECT_EQ( this->matrix.getRowLength( gi ), gi + 1 );
-      EXPECT_EQ( this->matrix.getLocalMatrix().getRowLength( i ), gi + 1 );
+      EXPECT_EQ( this->matrix.getLocalMatrix().getRowCapacity( i ), gi + 1 );
    }
 }
 
-- 
GitLab


From 9d2970b89125843a0607adb97f34a1c0e59cf47d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 8 May 2020 12:16:55 +0200
Subject: [PATCH 56/68] Fixed SparseMatrix getting into inconsistent state

- BiEllpack and ChunkedEllpack are broken because the "segmentsCount"
  attribute is missing, see issue #67
---
 src/TNL/Containers/Segments/BiEllpack.hpp     |  7 ++-
 .../Containers/Segments/ChunkedEllpack.hpp    |  4 +-
 src/TNL/Containers/Segments/details/CSR.h     |  7 ++-
 src/TNL/Matrices/Matrix.h                     |  4 +-
 src/TNL/Matrices/SparseMatrix.h               |  9 ++-
 src/TNL/Matrices/SparseMatrix.hpp             | 58 +++++++++----------
 src/TNL/Matrices/SparseMatrixView.h           |  2 +-
 src/TNL/Matrices/SparseMatrixView.hpp         |  2 +-
 8 files changed, 50 insertions(+), 43 deletions(-)

diff --git a/src/TNL/Containers/Segments/BiEllpack.hpp b/src/TNL/Containers/Segments/BiEllpack.hpp
index a1a7419d7..52ade2178 100644
--- a/src/TNL/Containers/Segments/BiEllpack.hpp
+++ b/src/TNL/Containers/Segments/BiEllpack.hpp
@@ -117,6 +117,9 @@ template< typename Device,
 void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
 performRowBubbleSort( const SizesHolder& segmentsSizes )
 {
+   if( segmentsSizes.getSize() == 0 )
+      return;
+
    this->rowPermArray.evaluate( [] __cuda_callable__ ( const IndexType i ) -> IndexType { return i; } );
 
    //if( std::is_same< DeviceType, Devices::Host >::value )
@@ -356,7 +359,9 @@ template< typename Device,
 __cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
 getSegmentsCount() const -> IndexType
 {
-   return this->segmentsCount;
+   // FIXME
+//   return this->segmentsCount;
+   return 0;
 }
 
 template< typename Device,
diff --git a/src/TNL/Containers/Segments/ChunkedEllpack.hpp b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
index 5521927af..0ae4c7763 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
@@ -308,7 +308,9 @@ template< typename Device,
 __cuda_callable__ auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
 getSegmentsCount() const -> IndexType
 {
-   return this->segmentsCount;
+   // FIXME
+//   return this->segmentsCount;
+   return 0;
 }
 
 template< typename Device,
diff --git a/src/TNL/Containers/Segments/details/CSR.h b/src/TNL/Containers/Segments/details/CSR.h
index 38f097669..637ebac36 100644
--- a/src/TNL/Containers/Segments/details/CSR.h
+++ b/src/TNL/Containers/Segments/details/CSR.h
@@ -29,8 +29,11 @@ class CSR
       static void setSegmentsSizes( const SizesHolder& sizes, CSROffsets& offsets )
       {
          offsets.setSize( sizes.getSize() + 1 );
-         auto view = offsets.getView( 0, sizes.getSize() );
-         view = sizes;
+         // GOTCHA: when sizes.getSize() == 0, getView returns a full view with size == 1
+         if( sizes.getSize() > 0 ) {
+            auto view = offsets.getView( 0, sizes.getSize() );
+            view = sizes;
+         }
          offsets.setElement( sizes.getSize(), 0 );
          offsets.template scan< Algorithms::ScanType::Exclusive >();
       }
diff --git a/src/TNL/Matrices/Matrix.h b/src/TNL/Matrices/Matrix.h
index 129a54cbe..ba2172168 100644
--- a/src/TNL/Matrices/Matrix.h
+++ b/src/TNL/Matrices/Matrix.h
@@ -47,8 +47,8 @@ public:
            const IndexType columns,
            const RealAllocatorType& allocator = RealAllocatorType() );
 
-   void setDimensions( const IndexType rows,
-                       const IndexType columns );
+   virtual void setDimensions( const IndexType rows,
+                               const IndexType columns );
 
    template< typename Matrix_ >
    void setLike( const Matrix_& matrix );
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index bbc3efe4a..9ba50f6a8 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -85,9 +85,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       SparseMatrix( const RealAllocatorType& realAllocator = RealAllocatorType(),
                     const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
 
-      SparseMatrix( const SparseMatrix& m );
+      SparseMatrix( const SparseMatrix& m ) = default;
 
-      SparseMatrix( const SparseMatrix&& m );
+      SparseMatrix( SparseMatrix&& m ) = default;
 
       SparseMatrix( const IndexType rows,
                     const IndexType columns,
@@ -111,6 +111,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
                              const IndexType columns,
                              const std::map< std::pair< MapIndex, MapIndex > , MapValue >& map );
 
+      virtual void setDimensions( const IndexType rows,
+                                  const IndexType columns ) override;
+
       ViewType getView() const; // TODO: remove const
 
       ConstViewType getConstView() const;
@@ -267,7 +270,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       ViewType view;
 };
 
-} // namespace Matrices
+   } // namespace Matrices
 } // namespace TNL
 
 #include <TNL/Matrices/SparseMatrix.hpp>
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index 308bf349f..de862697b 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -28,36 +28,7 @@ template< typename Real,
 SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
 SparseMatrix( const RealAllocatorType& realAllocator,
               const IndexAllocatorType& indexAllocator )
-   : BaseType( realAllocator ), columnIndexes( indexAllocator ), view( this->getView() )
-{
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          typename MatrixType,
-          template< typename, typename, typename > class Segments,
-          typename RealAllocator,
-          typename IndexAllocator >
-SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
-SparseMatrix( const SparseMatrix& m )
-   : Matrix< Real, Device, Index, RealAllocator >( m ), columnIndexes( m.columnIndexes ),
-   segments( m.segments ), view( this->getView() )
-{
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          typename MatrixType,
-          template< typename, typename, typename > class Segments,
-          typename RealAllocator,
-          typename IndexAllocator >
-SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
-SparseMatrix( const SparseMatrix&& m )
-   : Matrix< Real, Device, Index, RealAllocator >( std::move( m ) ),
-   columnIndexes( std::move( m.columnIndexes ) ), segments( std::move( m.segments ) ),
-   view( this->getView() )
+: BaseType( realAllocator ), columnIndexes( indexAllocator ), view( this->getView() )
 {
 }
 
@@ -74,8 +45,8 @@ SparseMatrix( const IndexType rows,
               const RealAllocatorType& realAllocator,
               const IndexAllocatorType& indexAllocator )
 : BaseType( rows, columns, realAllocator ), columnIndexes( indexAllocator ),
-   segments( Containers::Vector< IndexType, DeviceType, IndexType >( rows, 0 ) ),
-   view( this->getView() )
+  segments( Containers::Vector< IndexType, DeviceType, IndexType >( rows, 0 ) ),
+  view( this->getView() )
 {
 }
 
@@ -132,6 +103,23 @@ SparseMatrix( const IndexType rows,
    this->setElements( map );
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename, typename > class Segments,
+          typename RealAllocator,
+          typename IndexAllocator >
+void
+SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
+setDimensions( const IndexType rows,
+               const IndexType columns )
+{
+   BaseType::setDimensions( rows, columns );
+   segments.setSegmentsSizes( Containers::Vector< IndexType, DeviceType, IndexType >( rows, 0 ) );
+   this->view = this->getView();
+}
+
 template< typename Real,
           typename Device,
           typename Index,
@@ -344,6 +332,9 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAll
 setLike( const Matrix_& matrix )
 {
    BaseType::setLike( matrix );
+   this->segments.setSegmentsSizes( Containers::Vector< IndexType, DeviceType, IndexType >( matrix.getRows(), 0 ) ),
+   this->view = this->getView();
+   TNL_ASSERT_EQ( this->getRows(), segments.getSegmentsCount(), "mismatched segments count" );
 }
 
 template< typename Real,
@@ -372,6 +363,9 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAll
 reset()
 {
    BaseType::reset();
+   this->segments = SegmentsType( Containers::Vector< IndexType, DeviceType, IndexType >() ),
+   this->view = this->getView();
+   TNL_ASSERT_EQ( this->getRows(), segments.getSegmentsCount(), "mismatched segments count" );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index 35d509bdf..beb35eb7e 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -183,7 +183,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       }
 };
 
-}  // namespace Conatiners
+} // namespace Conatiners
 } // namespace TNL
 
 #include <TNL/Matrices/SparseMatrixView.hpp>
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index bc97cbc75..909351ecf 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -42,7 +42,7 @@ SparseMatrixView( const IndexType rows,
                   const ValuesViewType& values,
                   const ColumnsIndexesViewType& columnIndexes,
                   const SegmentsViewType& segments )
- : MatrixView< Real, Device, Index >( rows, columns, values ), columnIndexes( columnIndexes ), segments( segments )
+: MatrixView< Real, Device, Index >( rows, columns, values ), columnIndexes( columnIndexes ), segments( segments )
 {
 }
 
-- 
GitLab


From ed9176ddd6dd9f0cb4b8acdc18a27d9374571096 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 8 May 2020 12:19:15 +0200
Subject: [PATCH 57/68] Fixed DistributedMatrix

- DistributedMatrixTest is still broken because the rowVectorProduct
  method is not implemented, see issue #65
---
 src/TNL/Matrices/DistributedMatrix.h           |  2 +-
 src/TNL/Matrices/DistributedMatrix_impl.h      |  2 +-
 src/UnitTests/Matrices/DistributedMatrixTest.h | 16 ++++++++++------
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/TNL/Matrices/DistributedMatrix.h b/src/TNL/Matrices/DistributedMatrix.h
index c6a50a633..98cade7ce 100644
--- a/src/TNL/Matrices/DistributedMatrix.h
+++ b/src/TNL/Matrices/DistributedMatrix.h
@@ -107,7 +107,7 @@ public:
    template< typename Vector >
    void getCompressedRowLengths( Vector& rowLengths ) const;
 
-   IndexType getRowLength( IndexType row ) const;
+   IndexType getRowCapacity( IndexType row ) const;
 
    void setElement( IndexType row,
                     IndexType column,
diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h
index 280c1583a..38b7f3af0 100644
--- a/src/TNL/Matrices/DistributedMatrix_impl.h
+++ b/src/TNL/Matrices/DistributedMatrix_impl.h
@@ -182,7 +182,7 @@ template< typename Matrix,
           typename Communicator >
 typename Matrix::IndexType
 DistributedMatrix< Matrix, Communicator >::
-getRowLength( IndexType row ) const
+getRowCapacity( IndexType row ) const
 {
    const IndexType localRow = localRowRange.getLocalIndex( row );
    return localMatrix.getRowCapacity( localRow );
diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h
index 16ec045c4..18d4ef973 100644
--- a/src/UnitTests/Matrices/DistributedMatrixTest.h
+++ b/src/UnitTests/Matrices/DistributedMatrixTest.h
@@ -57,9 +57,7 @@ void setMatrix( Matrix& matrix, const RowLengths& rowLengths )
  * - Number of processes is not limited.
  * - Global size is hardcoded as 97 to force non-uniform distribution.
  * - Communication group is hardcoded as AllGroup -- it may be changed as needed.
- * - Matrix format is hardcoded as CSR -- it should be possible to change it to
- *   any other format which does not include padding zeros in the getRowLength()
- *   result.
+ * - Matrix format is hardcoded as CSR.
  */
 template< typename DistributedMatrix >
 class DistributedMatrixTest
@@ -154,13 +152,13 @@ TYPED_TEST( DistributedMatrixTest, setCompressedRowLengths )
 {
    for( int i = 0; i < this->matrix.getLocalMatrix().getRows(); i++ ) {
       const auto gi = this->matrix.getLocalRowRange().getGlobalIndex( i );
-      EXPECT_EQ( this->matrix.getRowLength( gi ), 0 );
+      EXPECT_EQ( this->matrix.getRowCapacity( gi ), 0 );
       EXPECT_EQ( this->matrix.getLocalMatrix().getRowCapacity( i ), 0 );
    }
    this->matrix.setCompressedRowLengths( this->rowLengths );
    for( int i = 0; i < this->matrix.getLocalMatrix().getRows(); i++ ) {
       const auto gi = this->matrix.getLocalRowRange().getGlobalIndex( i );
-      EXPECT_EQ( this->matrix.getRowLength( gi ), gi + 1 );
+      EXPECT_EQ( this->matrix.getRowCapacity( gi ), gi + 1 );
       EXPECT_EQ( this->matrix.getLocalMatrix().getRowCapacity( i ), gi + 1 );
    }
 }
@@ -171,7 +169,13 @@ TYPED_TEST( DistributedMatrixTest, getCompressedRowLengths )
 
    this->matrix.setCompressedRowLengths( this->rowLengths );
    RowLengthsVector output;
-   this->matrix.getCompressedRowLengths( output ); // TODO: replace this with getRowCapacities
+   this->matrix.getCompressedRowLengths( output );
+   // zero row lengths because the matrix is empty
+   EXPECT_EQ( output, 0 );
+   for( int i = 0; i < this->matrix.getLocalMatrix().getRows(); i++ ) {
+      const auto gi = this->matrix.getLocalRowRange().getGlobalIndex( i );
+      output[ gi ] = this->matrix.getRowCapacity( gi );
+   }
    EXPECT_EQ( output, this->rowLengths );
 }
 
-- 
GitLab


From 676a3d82f3fd40bb3f64b280eb0d210d4db1aa9f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 8 May 2020 12:37:12 +0200
Subject: [PATCH 58/68] Avoid build error in CI

---
 src/TNL/Matrices/SparseMatrixView.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 909351ecf..8cf222f21 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -362,6 +362,7 @@ rowVectorProduct( const IndexType row,
                   const Vector& vector ) const
 {
    TNL_ASSERT_TRUE( false, "TODO: rowVectorProduct is not implemented yet.");
+   return 0;
 }
 
 template< typename Real,
-- 
GitLab


From 5814345d1f53b10b149702e7ea584c1401829c39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 8 May 2020 22:31:08 +0200
Subject: [PATCH 59/68] Debuging distributed matrix.

---
 src/TNL/Containers/Segments/BiEllpack.h       |  9 ++-
 src/TNL/Containers/Segments/BiEllpack.hpp     | 24 ++++----
 src/TNL/Containers/Segments/BiEllpackView.h   |  2 +-
 src/TNL/Containers/Segments/CSR.h             |  2 +-
 src/TNL/Containers/Segments/ChunkedEllpack.h  |  9 ++-
 .../Containers/Segments/ChunkedEllpack.hpp    |  5 +-
 .../Containers/Segments/ChunkedEllpackView.h  |  2 +-
 src/TNL/Matrices/DistributedSpMV.h            | 20 ++++---
 src/TNL/Matrices/SparseMatrix.h               |  8 ++-
 src/TNL/Matrices/SparseMatrix.hpp             | 59 ++----------------
 src/TNL/Matrices/SparseMatrixView.h           |  8 ++-
 src/TNL/Matrices/SparseMatrixView.hpp         | 49 ++++-----------
 src/TNL/Matrices/ThreePartVector.h            | 18 +++++-
 src/TNL/Matrices/details/SparseMatrix.h       | 60 ++++---------------
 .../Solvers/Linear/LinearResidueGetter_impl.h |  8 ++-
 .../Matrices/DistributedMatrixTest.h          |  6 +-
 16 files changed, 106 insertions(+), 183 deletions(-)

diff --git a/src/TNL/Containers/Segments/BiEllpack.h b/src/TNL/Containers/Segments/BiEllpack.h
index 45c633b8b..b365266f4 100644
--- a/src/TNL/Containers/Segments/BiEllpack.h
+++ b/src/TNL/Containers/Segments/BiEllpack.h
@@ -54,15 +54,18 @@ class BiEllpack
 
       const ConstViewType getConstView() const;
 
+      /**
+       * \brief Number of segments.
+       */
+      __cuda_callable__
+      IndexType getSegmentsCount() const;
+
       /**
        * \brief Set sizes of particular segments.
        */
       template< typename SizesHolder = OffsetsHolder >
       void setSegmentsSizes( const SizesHolder& sizes );
 
-      __cuda_callable__
-      IndexType getSegmentsCount() const;
-
       IndexType getSegmentSize( const IndexType segmentIdx ) const;
 
       /**
diff --git a/src/TNL/Containers/Segments/BiEllpack.hpp b/src/TNL/Containers/Segments/BiEllpack.hpp
index 52ade2178..f3ac41906 100644
--- a/src/TNL/Containers/Segments/BiEllpack.hpp
+++ b/src/TNL/Containers/Segments/BiEllpack.hpp
@@ -108,6 +108,17 @@ getConstView() const -> const ConstViewType
    return ConstViewType( size, storageSize, virtualRows, rowPermArray.getConstView(), groupPointers.getConstView() );
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+getSegmentsCount() const -> IndexType
+{
+   return this->size;
+}
+
 template< typename Device,
           typename Index,
           typename IndexAllocator,
@@ -351,19 +362,6 @@ setSegmentsSizes( const SizesHolder& segmentsSizes )
    }
 }
 
-template< typename Device,
-          typename Index,
-          typename IndexAllocator,
-          bool RowMajorOrder,
-          int WarpSize >
-__cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
-getSegmentsCount() const -> IndexType
-{
-   // FIXME
-//   return this->segmentsCount;
-   return 0;
-}
-
 template< typename Device,
           typename Index,
           typename IndexAllocator,
diff --git a/src/TNL/Containers/Segments/BiEllpackView.h b/src/TNL/Containers/Segments/BiEllpackView.h
index 2450f18ca..e4807bef8 100644
--- a/src/TNL/Containers/Segments/BiEllpackView.h
+++ b/src/TNL/Containers/Segments/BiEllpackView.h
@@ -73,7 +73,7 @@ class BiEllpackView
       const ConstViewType getConstView() const;
 
       /**
-       * \brief Number segments.
+       * \brief Number of segments.
        */
       __cuda_callable__
       IndexType getSegmentsCount() const;
diff --git a/src/TNL/Containers/Segments/CSR.h b/src/TNL/Containers/Segments/CSR.h
index 46f9a9013..cfb75d43f 100644
--- a/src/TNL/Containers/Segments/CSR.h
+++ b/src/TNL/Containers/Segments/CSR.h
@@ -60,7 +60,7 @@ class CSR
       const ConstViewType getConstView() const;
 
       /**
-       * \brief Number segments.
+       * \brief Number of segments.
        */
       __cuda_callable__
       IndexType getSegmentsCount() const;
diff --git a/src/TNL/Containers/Segments/ChunkedEllpack.h b/src/TNL/Containers/Segments/ChunkedEllpack.h
index 8a1f48e7b..c78fea11e 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpack.h
+++ b/src/TNL/Containers/Segments/ChunkedEllpack.h
@@ -57,15 +57,18 @@ class ChunkedEllpack
 
       const ConstViewType getConstView() const;
 
+      /**
+       * \brief Number of segments.
+       */
+      __cuda_callable__
+      IndexType getSegmentsCount() const;
+
       /**
        * \brief Set sizes of particular segments.
        */
       template< typename SizesHolder = OffsetsHolder >
       void setSegmentsSizes( const SizesHolder& sizes );
 
-      __cuda_callable__
-      IndexType getSegmentsCount() const;
-
       IndexType getSegmentSize( const IndexType segmentIdx ) const;
 
       /**
diff --git a/src/TNL/Containers/Segments/ChunkedEllpack.hpp b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
index 0ae4c7763..38f46c629 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
@@ -222,7 +222,6 @@ setSlice( SegmentsSizes& rowLengths,
       maxChunkInSlice = TNL::max( maxChunkInSlice,
                               roundUpDivision( rowLengths[ i ], this->rowToChunkMapping[ i ] ) );
    }
-   TNL_ASSERT_GT( maxChunkInSlice, 0, "" );
 
    /****
     * Set-up the slice info.
@@ -308,9 +307,7 @@ template< typename Device,
 __cuda_callable__ auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
 getSegmentsCount() const -> IndexType
 {
-   // FIXME
-//   return this->segmentsCount;
-   return 0;
+   return this->size;
 }
 
 template< typename Device,
diff --git a/src/TNL/Containers/Segments/ChunkedEllpackView.h b/src/TNL/Containers/Segments/ChunkedEllpackView.h
index d8ed4e81f..a840447b9 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Containers/Segments/ChunkedEllpackView.h
@@ -86,7 +86,7 @@ class ChunkedEllpackView
       const ConstViewType getConstView() const;
 
       /**
-       * \brief Number segments.
+       * \brief Number of segments.
        */
       __cuda_callable__
       IndexType getSegmentsCount() const;
diff --git a/src/TNL/Matrices/DistributedSpMV.h b/src/TNL/Matrices/DistributedSpMV.h
index 7c4ee3c40..457cc0ecf 100644
--- a/src/TNL/Matrices/DistributedSpMV.h
+++ b/src/TNL/Matrices/DistributedSpMV.h
@@ -189,41 +189,47 @@ public:
          CommunicatorType::WaitAll( &commRequests[0], commRequests.size() );
 
          // perform matrix-vector multiplication
-         auto outVectorView = outVector.getLocalView();
+         localMatrix.vectorProduct( globalBuffer, outVector );
+         /*auto outVectorView = outVector.getLocalView();
          const Pointers::DevicePointer< const MatrixType > localMatrixPointer( localMatrix );
          auto kernel = [=] __cuda_callable__ ( IndexType i, const MatrixType* localMatrix ) mutable
          {
             outVectorView[ i ] = localMatrix->rowVectorProduct( i, globalBufferView );
          };
          Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, localMatrix.getRows(), kernel,
-                                                      &localMatrixPointer.template getData< DeviceType >() );
+                                                      &localMatrixPointer.template getData< DeviceType >() );*/
       }
       // optimization for banded matrices
       else {
+         return;
          auto outVectorView = outVector.getLocalView();
          const Pointers::DevicePointer< const MatrixType > localMatrixPointer( localMatrix );
-         const auto inView = inVector.getConstView();
+         //const auto inView = inVector.getConstView();
 
          // matrix-vector multiplication using local-only rows
-         auto kernel1 = [=] __cuda_callable__ ( IndexType i, const MatrixType* localMatrix ) mutable
+         localMatrix.vectorProduct( inVector, outVector, 1.0, 0.0, localOnlySpan.first, localOnlySpan.second );
+         /*auto kernel1 = [=] __cuda_callable__ ( IndexType i, const MatrixType* localMatrix ) mutable
          {
             outVectorView[ i ] = localMatrix->rowVectorProduct( i, inView );
          };
          Algorithms::ParallelFor< DeviceType >::exec( localOnlySpan.first, localOnlySpan.second, kernel1,
-                                                      &localMatrixPointer.template getData< DeviceType >() );
+                                                      &localMatrixPointer.template getData< DeviceType >() );*/
+
 
          // wait for all communications to finish
          CommunicatorType::WaitAll( &commRequests[0], commRequests.size() );
 
          // finish the multiplication by adding the non-local entries
-         auto kernel2 = [=] __cuda_callable__ ( IndexType i, const MatrixType* localMatrix ) mutable
+         localMatrix.vectorProduct( globalBufferView, outVector, 1.0, 0.0, 0, localOnlySpan.first );
+         localMatrix.vectorProduct( globalBufferView, outVector, 1.0, 0.0, localOnlySpan.second, localMatrix.getRows() );
+         /*auto kernel2 = [=] __cuda_callable__ ( IndexType i, const MatrixType* localMatrix ) mutable
          {
             outVectorView[ i ] = localMatrix->rowVectorProduct( i, globalBufferView );
          };
          Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, localOnlySpan.first, kernel2,
                                                       &localMatrixPointer.template getData< DeviceType >() );
          Algorithms::ParallelFor< DeviceType >::exec( localOnlySpan.second, localMatrix.getRows(), kernel2,
-                                                      &localMatrixPointer.template getData< DeviceType >() );
+                                                      &localMatrixPointer.template getData< DeviceType >() );*/
       }
    }
 
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 9ba50f6a8..1aeab54cd 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -170,10 +170,10 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       RealType getElement( const IndexType row,
                            const IndexType column ) const;
 
-      template< typename Vector >
+      /*template< typename Vector >
       __cuda_callable__
       typename Vector::RealType rowVectorProduct( const IndexType row,
-                                                  const Vector& vector ) const;
+                                                  const Vector& vector ) const;*/
 
       /***
        * \brief This method computes outVector = matrixMultiplicator * ( *this ) * inVector + inVectorAddition * inVector
@@ -183,7 +183,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       void vectorProduct( const InVector& inVector,
                           OutVector& outVector,
                           const RealType& matrixMultiplicator = 1.0,
-                          const RealType& outVectorMultiplicator = 0.0 ) const;
+                          const RealType& outVectorMultiplicator = 0.0,
+                          const IndexType firstRow = 0,
+                          const IndexType lastRow = -1 ) const;
 
       /*template< typename Real2, typename Index2 >
       void addMatrix( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index de862697b..8fa5cf3b0 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -445,7 +445,7 @@ getElement( const IndexType row,
    return this->view.getElement( row, column );
 }
 
-template< typename Real,
+/*template< typename Real,
           typename Device,
           typename Index,
           typename MatrixType,
@@ -460,7 +460,7 @@ rowVectorProduct( const IndexType row,
                   const Vector& vector ) const
 {
    return this->view.rowVectorProduct( row, vector );
-}
+}*/
 
 template< typename Real,
           typename Device,
@@ -476,31 +476,11 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAll
 vectorProduct( const InVector& inVector,
                OutVector& outVector,
                const RealType& matrixMultiplicator,
-               const RealType& outVectorMultiplicator ) const
+               const RealType& outVectorMultiplicator,
+               const IndexType firstRow,
+               const IndexType lastRow ) const
 {
-   this->view.vectorProduct( inVector, outVector, matrixMultiplicator, outVectorMultiplicator );
-   /*TNL_ASSERT_EQ( this->getColumns(), inVector.getSize(), "Matrix columns do not fit with input vector." );
-   TNL_ASSERT_EQ( this->getRows(), outVector.getSize(), "Matrix rows do not fit with output vector." );
-
-   const auto inVectorView = inVector.getConstView();
-   auto outVectorView = outVector.getView();
-   const auto valuesView = this->values.getConstView();
-   const auto columnIndexesView = this->columnIndexes.getConstView();
-   const IndexType paddingIndex = this->getPaddingIndex();
-   auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType globalIdx, bool& compute ) -> RealType {
-      const IndexType column = columnIndexesView[ globalIdx ];
-      compute = ( column != paddingIndex );
-      if( ! compute )
-         return 0.0;
-      return valuesView[ globalIdx ] * inVectorView[ column ];
-   };
-   auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) {
-      sum += value;
-   };
-   auto keeper = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
-      outVectorView[ row ] = value;
-   };
-   this->segments.segmentsReduction( 0, this->getRows(), fetch, reduction, keeper, ( RealType ) 0.0 );*/
+   this->view.vectorProduct( inVector, outVector, matrixMultiplicator, outVectorMultiplicator, firstRow, lastRow );
 }
 
 template< typename Real,
@@ -516,16 +496,6 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAll
 rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
 {
    this->view.rowsReduction( first, last, fetch, reduce, keep, zero );
-   /*const auto columns_view = this->columnIndexes.getConstView();
-   const auto values_view = this->values.getConstView();
-   const IndexType paddingIndex_ = this->getPaddingIndex();
-   auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), IndexType(), RealType() ) ) {
-      IndexType columnIdx = columns_view[ globalIdx ];
-      if( columnIdx != paddingIndex_ )
-         return fetch( rowIdx, columnIdx, globalIdx, values_view[ globalIdx ] );
-      return zero;
-   };
-   this->segments.segmentsReduction( first, last, fetch_, reduce, keep, zero );*/
 }
 
 template< typename Real,
@@ -556,15 +526,6 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAll
 forRows( IndexType first, IndexType last, Function& function ) const
 {
    this->view.forRows( first, last, function );
-   /*const auto columns_view = this->columnIndexes.getConstView();
-   const auto values_view = this->values.getConstView();
-   const IndexType paddingIndex_ = this->getPaddingIndex();
-   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx ) mutable -> bool {
-      function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ] );
-      return true;
-   };
-   this->segments.forSegments( first, last, f );
-    */
 }
 
 template< typename Real,
@@ -580,14 +541,6 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAll
 forRows( IndexType first, IndexType last, Function& function )
 {
    this->view.forRows( first, last, function );
-   /*auto columns_view = this->columnIndexes.getView();
-   auto values_view = this->values.getView();
-   const IndexType paddingIndex_ = this->getPaddingIndex();
-   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx ) mutable -> bool {
-      function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ] );
-      return true;
-   };
-   this->segments.forSegments( first, last, f );*/
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index beb35eb7e..4482ff173 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -110,10 +110,10 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       RealType getElement( IndexType row,
                            IndexType column ) const;
 
-      template< typename Vector >
+      /*template< typename Vector >
       __cuda_callable__
       typename Vector::RealType rowVectorProduct( const IndexType row,
-                                                  const Vector& vector ) const;
+                                                  const Vector& vector ) const;*/
 
       /***
        * \brief This method computes outVector = matrixMultiplicator * ( *this ) * inVector + inVectorAddition * inVector
@@ -123,7 +123,9 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       void vectorProduct( const InVector& inVector,
                           OutVector& outVector,
                           const RealType matrixMultiplicator = 1.0,
-                          const RealType outVectorMultiplicator = 0.0 ) const;
+                          const RealType outVectorMultiplicator = 0.0,
+                          const IndexType firstRow = 0,
+                          IndexType lastRow = -1 ) const;
 
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
       void rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 8cf222f21..0673f50ff 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -117,15 +117,12 @@ void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 getCompressedRowLengths( Vector& rowLengths ) const
 {
-   details::CompressedRowLengthVectorSizeSetter< Vector >::setSize( rowLengths, this->getRows() );
+   details::set_size_if_resizable( rowLengths, this->getRows() );
    rowLengths = 0;
    auto rowLengths_view = rowLengths.getView();
    auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, IndexType globalIdx, const RealType& value ) -> IndexType {
       return ( value != 0.0 );
    };
-   //auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) {
-   //   aux += a;
-   //};
    auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
       rowLengths_view[ rowIdx ] = value;
    };
@@ -349,7 +346,7 @@ getElement( IndexType row,
    return 0.0;
 }
 
-template< typename Real,
+/*template< typename Real,
           typename Device,
           typename Index,
           typename MatrixType,
@@ -363,7 +360,7 @@ rowVectorProduct( const IndexType row,
 {
    TNL_ASSERT_TRUE( false, "TODO: rowVectorProduct is not implemented yet.");
    return 0;
-}
+}*/
 
 template< typename Real,
           typename Device,
@@ -377,10 +374,12 @@ SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
 vectorProduct( const InVector& inVector,
                OutVector& outVector,
                const RealType matrixMultiplicator,
-               const RealType outVectorMultiplicator ) const
+               const RealType outVectorMultiplicator,
+               const IndexType firstRow,
+               IndexType lastRow ) const
 {
    TNL_ASSERT_EQ( this->getColumns(), inVector.getSize(), "Matrix columns do not fit with input vector." );
-   TNL_ASSERT_EQ( this->getRows(), outVector.getSize(), "Matrix rows do not fit with output vector." );
+   //TNL_ASSERT_EQ( this->getRows(), outVector.getSize(), "Matrix rows do not fit with output vector." );
 
    const auto inVectorView = inVector.getConstView();
    auto outVectorView = outVector.getView();
@@ -415,45 +414,23 @@ vectorProduct( const InVector& inVector,
       return valuesView[ globalIdx ] * inVectorView[ column ];
    };
 
-   //auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) {
-   //   sum += value;
-   //};
    auto keeper = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
       if( isSymmetric() )
          outVectorView[ row ] += matrixMultiplicator * value;
       else
       {
-         if( outVectorMultiplicator == 0.0 )
+         /*if( outVectorMultiplicator == 0.0 )
             outVectorView[ row ] = matrixMultiplicator * value;
          else
-            outVectorView[ row ] = outVectorMultiplicator * outVectorView[ row ] + matrixMultiplicator * value;
+            outVectorView[ row ] = outVectorMultiplicator * outVectorView[ row ] + matrixMultiplicator * value;*/
       }
    };
+   if( lastRow == -1 )
+      lastRow = this->getRows();
    if( isSymmetric() )
-      this->segments.segmentsReduction( 0, this->getRows(), symmetricFetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
+      this->segments.segmentsReduction( firstRow, lastRow, symmetricFetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
    else
-      this->segments.segmentsReduction( 0, this->getRows(), fetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
-
-   /*const auto inVectorView = inVector.getConstView();
-   auto outVectorView = outVector.getView();
-   const auto valuesView = this->values.getConstView();
-   const auto columnIndexesView = this->columnIndexes.getConstView();
-   const IndexType paddingIndex = this->getPaddingIndex();
-   auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType offset, bool& compute ) -> RealType {
-      const IndexType column = columnIndexesView[ offset ];
-      compute = ( column != paddingIndex );
-      if( ! compute )
-         return 0.0;
-      return valuesView[ offset ] * inVectorView[ column ];
-   };
-   auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) {
-      sum += value;
-   };
-   auto keeper = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
-      outVectorView[ row ] = value;
-   };
-   this->segments.segmentsReduction( 0, this->getRows(), fetch, reduction, keeper, ( RealType ) 0.0 );
-   */
+      this->segments.segmentsReduction( firstRow, lastRow, fetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/ThreePartVector.h b/src/TNL/Matrices/ThreePartVector.h
index f28f544f5..a583c5eab 100644
--- a/src/TNL/Matrices/ThreePartVector.h
+++ b/src/TNL/Matrices/ThreePartVector.h
@@ -24,6 +24,7 @@ template< typename Real,
           typename Index = int >
 class ThreePartVectorView
 {
+   using ConstReal = std::add_const_t< Real >;
 public:
    using RealType = Real;
    using DeviceType = Device;
@@ -53,6 +54,16 @@ public:
       right.reset();
    }
 
+   IndexType getSize() const
+   {
+      return left.getSize() + middle.getSize() + right.getSize();
+   }
+
+   ThreePartVectorView< ConstReal, Device, Index > getConstView() const
+   {
+      return *this; //{left.getConstView(), middle, right.getConstView()};
+   }
+
 //   __cuda_callable__
 //   Real& operator[]( Index i )
 //   {
@@ -127,7 +138,12 @@ public:
       right.reset();
    }
 
-   ThreePartVectorView< ConstReal, Device, Index > getConstView()
+   IndexType getSize() const
+   {
+      return left.getSize() + middle.getSize() + right.getSize();
+   }
+
+   ThreePartVectorView< ConstReal, Device, Index > getConstView() const
    {
       return {left.getConstView(), middle, right.getConstView()};
    }
diff --git a/src/TNL/Matrices/details/SparseMatrix.h b/src/TNL/Matrices/details/SparseMatrix.h
index 522594806..9eeac7614 100644
--- a/src/TNL/Matrices/details/SparseMatrix.h
+++ b/src/TNL/Matrices/details/SparseMatrix.h
@@ -20,60 +20,20 @@ namespace TNL {
    namespace Matrices {
       namespace details {
 
-template< typename Vector >
-struct CompressedRowLengthVectorSizeSetter
-{
-   static void setSize( Vector& v, typename Vector::IndexType size )
-   {
-      v.setSize( size );
-   }
-};
-
-template< typename Value,
-   typename Device,
-   typename Index >
-struct CompressedRowLengthVectorSizeSetter< Containers::ArrayView< Value, Device, Index > >
-{
-   static void setSize( Containers::ArrayView< Value, Device, Index >& v, Index size )
-   {
-      TNL_ASSERT_EQ( v.getSize(), size, "ArrayView has wrong size, different from number of matrix rows." );
-   }
-};
-
-template< typename Value,
-   typename Device,
-   typename Index >
-struct CompressedRowLengthVectorSizeSetter< Containers::VectorView< Value, Device, Index > >
-{
-   static void setSize( Containers::VectorView< Value, Device, Index >& v, Index size )
-   {
-      TNL_ASSERT_EQ( v.getSize(), size, "VectorView has wrong size, different from number of matrix rows." );
-   }
-};
 
-template< typename Value,
-   typename Device,
-   typename Index,
-   typename Communicator >
-struct CompressedRowLengthVectorSizeSetter< Containers::DistributedArray< Value, Device, Index, Communicator > >
+template< typename VectorOrView,
+          std::enable_if_t< HasSetSizeMethod< VectorOrView >::value, bool > = true >
+static void set_size_if_resizable( VectorOrView& v, typename VectorOrView::IndexType size )
 {
-   static void setSize( Containers::DistributedArray< Value, Device, Index, Communicator >& v, Index size )
-   {
-      TNL_ASSERT_EQ( v.getSize(), size, "DistributedArray has wrong size, different from number of matrix rows." );
-   }
-};
+   v.setSize( size );
+}
 
-template< typename Value,
-   typename Device,
-   typename Index,
-   typename Communicator >
-struct CompressedRowLengthVectorSizeSetter< Containers::DistributedVector< Value, Device, Index, Communicator > >
+template< typename VectorOrView,
+          std::enable_if_t< ! HasSetSizeMethod< VectorOrView >::value, bool > = true >
+static void set_size_if_resizable( VectorOrView& v, typename VectorOrView::IndexType size )
 {
-   static void setSize( Containers::DistributedVector< Value, Device, Index, Communicator >& v, Index size )
-   {
-      TNL_ASSERT_EQ( v.getSize(), size, "DistributedVector has wrong size, different from number of matrix rows." );
-   }
-};
+   TNL_ASSERT_EQ( v.getSize(), size, "view has wrong size" );
+}
 
       } //namespace details
    } //namepsace Matrices
diff --git a/src/TNL/Solvers/Linear/LinearResidueGetter_impl.h b/src/TNL/Solvers/Linear/LinearResidueGetter_impl.h
index 21fc726aa..7165cdde7 100644
--- a/src/TNL/Solvers/Linear/LinearResidueGetter_impl.h
+++ b/src/TNL/Solvers/Linear/LinearResidueGetter_impl.h
@@ -27,18 +27,22 @@ getResidue( const Matrix& matrix,
             typename Matrix::RealType bNorm )
 {
    using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
    using IndexType = typename Matrix::IndexType;
 
    const IndexType size = matrix.getRows();
    RealType res( 0.0 );
    if( bNorm == 0.0 )
       bNorm = lpNorm( b, 2.0 );
-   for( IndexType i = 0; i < size; i ++ )
+   Containers::Vector< RealType, DeviceType, IndexType > v( b.getSize() );
+   matrix.vectorProduct( x, v );
+   return l2Norm( v - b );
+   /*for( IndexType i = 0; i < size; i ++ )
    {
       RealType err = abs( matrix.rowVectorProduct( i, x ) - b[ i ] );
       res += err * err;
    }
-   return std::sqrt( res ) / bNorm;
+   return std::sqrt( res ) / bNorm;*/
 }
 
 } // namespace Linear
diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h
index 18d4ef973..ce593b504 100644
--- a/src/UnitTests/Matrices/DistributedMatrixTest.h
+++ b/src/UnitTests/Matrices/DistributedMatrixTest.h
@@ -111,7 +111,7 @@ using DistributedMatrixTypes = ::testing::Types<
 >;
 
 TYPED_TEST_SUITE( DistributedMatrixTest, DistributedMatrixTypes );
-
+/*
 TYPED_TEST( DistributedMatrixTest, checkSumOfLocalSizes )
 {
    using CommunicatorType = typename TestFixture::CommunicatorType;
@@ -207,6 +207,7 @@ TYPED_TEST( DistributedMatrixTest, setGetElement )
 
 // TODO: getRow (const and non-const)
 
+
 TYPED_TEST( DistributedMatrixTest, vectorProduct_globalInput )
 {
    using GlobalVector = typename TestFixture::GlobalVector;
@@ -224,7 +225,7 @@ TYPED_TEST( DistributedMatrixTest, vectorProduct_globalInput )
       << "outVector.getLocalView() = " << outVector.getLocalView()
       << ",\nthis->rowLengths.getLocalView() = " << this->rowLengths.getLocalView();
 }
-
+*/
 TYPED_TEST( DistributedMatrixTest, vectorProduct_distributedInput )
 {
    using DistributedVector = typename TestFixture::DistributedVector;
@@ -242,6 +243,7 @@ TYPED_TEST( DistributedMatrixTest, vectorProduct_distributedInput )
       << ",\nthis->rowLengths.getLocalView() = " << this->rowLengths.getLocalView();
 }
 
+
 #endif  // HAVE_GTEST
 
 #include "../main_mpi.h"
-- 
GitLab


From 53fc98a99b5796b86fc6f84c7817567a67a5f2e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 8 May 2020 22:44:34 +0200
Subject: [PATCH 60/68] Debuging distributed matrix.

---
 src/TNL/Matrices/DistributedSpMV.h    | 1 -
 src/TNL/Matrices/SparseMatrixView.hpp | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Matrices/DistributedSpMV.h b/src/TNL/Matrices/DistributedSpMV.h
index 457cc0ecf..2a37236e6 100644
--- a/src/TNL/Matrices/DistributedSpMV.h
+++ b/src/TNL/Matrices/DistributedSpMV.h
@@ -201,7 +201,6 @@ public:
       }
       // optimization for banded matrices
       else {
-         return;
          auto outVectorView = outVector.getLocalView();
          const Pointers::DevicePointer< const MatrixType > localMatrixPointer( localMatrix );
          //const auto inView = inVector.getConstView();
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 0673f50ff..974c2580f 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -419,10 +419,10 @@ vectorProduct( const InVector& inVector,
          outVectorView[ row ] += matrixMultiplicator * value;
       else
       {
-         /*if( outVectorMultiplicator == 0.0 )
+         if( outVectorMultiplicator == 0.0 )
             outVectorView[ row ] = matrixMultiplicator * value;
          else
-            outVectorView[ row ] = outVectorMultiplicator * outVectorView[ row ] + matrixMultiplicator * value;*/
+            outVectorView[ row ] = outVectorMultiplicator * outVectorView[ row ] + matrixMultiplicator * value;
       }
    };
    if( lastRow == -1 )
-- 
GitLab


From 7676df5522bf9629c958a89ab8d80e3f5a9761e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 8 May 2020 23:20:57 +0200
Subject: [PATCH 61/68] Fixed DistributedSpMV

---
 src/TNL/Matrices/DistributedSpMV.h            | 34 +++----------------
 src/TNL/Matrices/ThreePartVector.h            |  2 +-
 .../Matrices/DistributedMatrixTest.h          |  5 ++-
 3 files changed, 8 insertions(+), 33 deletions(-)

diff --git a/src/TNL/Matrices/DistributedSpMV.h b/src/TNL/Matrices/DistributedSpMV.h
index 2a37236e6..55527834c 100644
--- a/src/TNL/Matrices/DistributedSpMV.h
+++ b/src/TNL/Matrices/DistributedSpMV.h
@@ -189,46 +189,22 @@ public:
          CommunicatorType::WaitAll( &commRequests[0], commRequests.size() );
 
          // perform matrix-vector multiplication
-         localMatrix.vectorProduct( globalBuffer, outVector );
-         /*auto outVectorView = outVector.getLocalView();
-         const Pointers::DevicePointer< const MatrixType > localMatrixPointer( localMatrix );
-         auto kernel = [=] __cuda_callable__ ( IndexType i, const MatrixType* localMatrix ) mutable
-         {
-            outVectorView[ i ] = localMatrix->rowVectorProduct( i, globalBufferView );
-         };
-         Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, localMatrix.getRows(), kernel,
-                                                      &localMatrixPointer.template getData< DeviceType >() );*/
+         auto outVectorView = outVector.getLocalView();
+         localMatrix.vectorProduct( globalBuffer, outVectorView );
       }
       // optimization for banded matrices
       else {
          auto outVectorView = outVector.getLocalView();
-         const Pointers::DevicePointer< const MatrixType > localMatrixPointer( localMatrix );
-         //const auto inView = inVector.getConstView();
 
          // matrix-vector multiplication using local-only rows
-         localMatrix.vectorProduct( inVector, outVector, 1.0, 0.0, localOnlySpan.first, localOnlySpan.second );
-         /*auto kernel1 = [=] __cuda_callable__ ( IndexType i, const MatrixType* localMatrix ) mutable
-         {
-            outVectorView[ i ] = localMatrix->rowVectorProduct( i, inView );
-         };
-         Algorithms::ParallelFor< DeviceType >::exec( localOnlySpan.first, localOnlySpan.second, kernel1,
-                                                      &localMatrixPointer.template getData< DeviceType >() );*/
-
+         localMatrix.vectorProduct( inVector, outVectorView, 1.0, 0.0, localOnlySpan.first, localOnlySpan.second );
 
          // wait for all communications to finish
          CommunicatorType::WaitAll( &commRequests[0], commRequests.size() );
 
          // finish the multiplication by adding the non-local entries
-         localMatrix.vectorProduct( globalBufferView, outVector, 1.0, 0.0, 0, localOnlySpan.first );
-         localMatrix.vectorProduct( globalBufferView, outVector, 1.0, 0.0, localOnlySpan.second, localMatrix.getRows() );
-         /*auto kernel2 = [=] __cuda_callable__ ( IndexType i, const MatrixType* localMatrix ) mutable
-         {
-            outVectorView[ i ] = localMatrix->rowVectorProduct( i, globalBufferView );
-         };
-         Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, localOnlySpan.first, kernel2,
-                                                      &localMatrixPointer.template getData< DeviceType >() );
-         Algorithms::ParallelFor< DeviceType >::exec( localOnlySpan.second, localMatrix.getRows(), kernel2,
-                                                      &localMatrixPointer.template getData< DeviceType >() );*/
+         localMatrix.vectorProduct( globalBufferView, outVectorView, 1.0, 0.0, 0, localOnlySpan.first );
+         localMatrix.vectorProduct( globalBufferView, outVectorView, 1.0, 0.0, localOnlySpan.second, localMatrix.getRows() );
       }
    }
 
diff --git a/src/TNL/Matrices/ThreePartVector.h b/src/TNL/Matrices/ThreePartVector.h
index a583c5eab..01caaae52 100644
--- a/src/TNL/Matrices/ThreePartVector.h
+++ b/src/TNL/Matrices/ThreePartVector.h
@@ -61,7 +61,7 @@ public:
 
    ThreePartVectorView< ConstReal, Device, Index > getConstView() const
    {
-      return *this; //{left.getConstView(), middle, right.getConstView()};
+      return {left.getConstView(), middle, right.getConstView()};
    }
 
 //   __cuda_callable__
diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h
index ce593b504..fad3a6420 100644
--- a/src/UnitTests/Matrices/DistributedMatrixTest.h
+++ b/src/UnitTests/Matrices/DistributedMatrixTest.h
@@ -111,7 +111,7 @@ using DistributedMatrixTypes = ::testing::Types<
 >;
 
 TYPED_TEST_SUITE( DistributedMatrixTest, DistributedMatrixTypes );
-/*
+
 TYPED_TEST( DistributedMatrixTest, checkSumOfLocalSizes )
 {
    using CommunicatorType = typename TestFixture::CommunicatorType;
@@ -225,7 +225,7 @@ TYPED_TEST( DistributedMatrixTest, vectorProduct_globalInput )
       << "outVector.getLocalView() = " << outVector.getLocalView()
       << ",\nthis->rowLengths.getLocalView() = " << this->rowLengths.getLocalView();
 }
-*/
+
 TYPED_TEST( DistributedMatrixTest, vectorProduct_distributedInput )
 {
    using DistributedVector = typename TestFixture::DistributedVector;
@@ -243,7 +243,6 @@ TYPED_TEST( DistributedMatrixTest, vectorProduct_distributedInput )
       << ",\nthis->rowLengths.getLocalView() = " << this->rowLengths.getLocalView();
 }
 
-
 #endif  // HAVE_GTEST
 
 #include "../main_mpi.h"
-- 
GitLab


From 950bc455cd8a09dc23b29ed250e45f7c343e4f06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 8 May 2020 23:23:28 +0200
Subject: [PATCH 62/68] Changed default value for lastRow in vectorProduct from
 -1 to 0

Otherwise the compiler would complain for unsigned types, see also Array
and Vector methods.
---
 src/TNL/Matrices/SparseMatrix.h       | 2 +-
 src/TNL/Matrices/SparseMatrixView.h   | 2 +-
 src/TNL/Matrices/SparseMatrixView.hpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 1aeab54cd..29804e656 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -185,7 +185,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
                           const RealType& matrixMultiplicator = 1.0,
                           const RealType& outVectorMultiplicator = 0.0,
                           const IndexType firstRow = 0,
-                          const IndexType lastRow = -1 ) const;
+                          const IndexType lastRow = 0 ) const;
 
       /*template< typename Real2, typename Index2 >
       void addMatrix( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index 4482ff173..a600f1bb8 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -125,7 +125,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
                           const RealType matrixMultiplicator = 1.0,
                           const RealType outVectorMultiplicator = 0.0,
                           const IndexType firstRow = 0,
-                          IndexType lastRow = -1 ) const;
+                          IndexType lastRow = 0 ) const;
 
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
       void rowsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 974c2580f..13ca5fcfa 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -425,7 +425,7 @@ vectorProduct( const InVector& inVector,
             outVectorView[ row ] = outVectorMultiplicator * outVectorView[ row ] + matrixMultiplicator * value;
       }
    };
-   if( lastRow == -1 )
+   if( lastRow == 0 )
       lastRow = this->getRows();
    if( isSymmetric() )
       this->segments.segmentsReduction( firstRow, lastRow, symmetricFetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
-- 
GitLab


From b761bc3fefe2475fd20c9fab6210102363861405 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 8 May 2020 23:39:22 +0200
Subject: [PATCH 63/68] Uncommented assert in vectorProduct

---
 src/TNL/Matrices/SparseMatrixView.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 13ca5fcfa..8d82ab292 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -379,7 +379,7 @@ vectorProduct( const InVector& inVector,
                IndexType lastRow ) const
 {
    TNL_ASSERT_EQ( this->getColumns(), inVector.getSize(), "Matrix columns do not fit with input vector." );
-   //TNL_ASSERT_EQ( this->getRows(), outVector.getSize(), "Matrix rows do not fit with output vector." );
+   TNL_ASSERT_EQ( this->getRows(), outVector.getSize(), "Matrix rows do not fit with output vector." );
 
    const auto inVectorView = inVector.getConstView();
    auto outVectorView = outVector.getView();
-- 
GitLab


From fd9ccbd5091fbaa4ce0ba1b0175c144a7aaecbd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 8 May 2020 23:54:46 +0200
Subject: [PATCH 64/68] Added operator== and operator!= for SparseMatrix

---
 src/TNL/Matrices/SparseMatrix.h          |  6 +++++
 src/TNL/Matrices/SparseMatrix.hpp        | 30 +++++++++++++++++++++
 src/TNL/Matrices/SparseMatrixRowView.h   |  8 ++++++
 src/TNL/Matrices/SparseMatrixRowView.hpp | 32 ++++++++++++++++++++++
 src/TNL/Matrices/SparseMatrixView.h      |  6 +++++
 src/TNL/Matrices/SparseMatrixView.hpp    | 34 ++++++++++++++++++++++++
 6 files changed, 116 insertions(+)

diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 29804e656..ab5caacfc 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -243,6 +243,12 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       template< typename RHSMatrix >
       SparseMatrix& operator=( const RHSMatrix& matrix );
 
+      template< typename Matrix >
+      bool operator==( const Matrix& m ) const;
+
+      template< typename Matrix >
+      bool operator!=( const Matrix& m ) const;
+
       void save( File& file ) const;
 
       void load( File& file );
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index 8fa5cf3b0..9a709e479 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -880,6 +880,36 @@ operator=( const RHSMatrix& matrix )
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename, typename > class Segments,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename Matrix >
+bool
+SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
+operator==( const Matrix& m ) const
+{
+   return view == m;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename, typename > class Segments,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename Matrix >
+bool
+SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAllocator >::
+operator!=( const Matrix& m ) const
+{
+   return view != m;
+}
+
 template< typename Real,
           typename Device,
           typename Index,
diff --git a/src/TNL/Matrices/SparseMatrixRowView.h b/src/TNL/Matrices/SparseMatrixRowView.h
index eda4852e9..c859655ef 100644
--- a/src/TNL/Matrices/SparseMatrixRowView.h
+++ b/src/TNL/Matrices/SparseMatrixRowView.h
@@ -64,6 +64,14 @@ class SparseMatrixRowView
       void setElement( const IndexType localIdx,
                        const IndexType column,
                        const RealType& value );
+
+      template< typename _SegmentView,
+                typename _ValuesView,
+                typename _ColumnsIndexesView,
+                bool _isBinary >
+      __cuda_callable__
+      bool operator==( const SparseMatrixRowView< _SegmentView, _ValuesView, _ColumnsIndexesView, _isBinary >& other ) const;
+
    protected:
 
       SegmentViewType segmentView;
diff --git a/src/TNL/Matrices/SparseMatrixRowView.hpp b/src/TNL/Matrices/SparseMatrixRowView.hpp
index c15b45f34..545e395fc 100644
--- a/src/TNL/Matrices/SparseMatrixRowView.hpp
+++ b/src/TNL/Matrices/SparseMatrixRowView.hpp
@@ -123,6 +123,38 @@ setElement( const IndexType localIdx,
       values[ globalIdx ] = value;
 }
 
+template< typename SegmentView,
+          typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+   template< typename _SegmentView,
+             typename _ValuesView,
+             typename _ColumnsIndexesView,
+             bool _isBinary >
+__cuda_callable__
+bool
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+operator==( const SparseMatrixRowView< _SegmentView, _ValuesView, _ColumnsIndexesView, _isBinary >& other ) const
+{
+   IndexType i = 0;
+   while( i < getSize() && i < other.getSize() ) {
+      if( getColumnIndex( i ) != other.getColumnIndex( i ) )
+         return false;
+      if( getValue( i ) != other.getValue( i ) )
+         return false;
+      ++i;
+   }
+   for( IndexType j = i; j < getSize(); j++ )
+      // TODO: use ... != getPaddingIndex()
+      if( getColumnIndex( j ) >= 0 )
+         return false;
+   for( IndexType j = i; j < other.getSize(); j++ )
+      // TODO: use ... != getPaddingIndex()
+      if( other.getColumnIndex( j ) >= 0 )
+         return false;
+   return true;
+}
+
 template< typename SegmentView,
           typename ValuesView,
           typename ColumnsIndexesView,
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index a600f1bb8..4dc3413b8 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -153,6 +153,12 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
 
       SparseMatrixView& operator=( const SparseMatrixView& matrix );
 
+      template< typename Matrix >
+      bool operator==( const Matrix& m ) const;
+
+      template< typename Matrix >
+      bool operator!=( const Matrix& m ) const;
+
       void save( File& file ) const;
 
       void save( const String& fileName ) const;
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 8d82ab292..26de19dcb 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -609,6 +609,40 @@ operator=( const SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename > class SegmentsView >
+   template< typename Matrix >
+bool
+SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
+operator==( const Matrix& m ) const
+{
+   const auto& view1 = *this;
+   // FIXME: getConstView does not work
+   //const auto view2 = m.getConstView();
+   const auto view2 = m.getView();
+   auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> bool
+   {
+      return view1.getRow( i ) == view2.getRow( i );
+   };
+   return Algorithms::Reduction< DeviceType >::reduce( this->getRows(), std::logical_and<>{}, fetch, true );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          template< typename, typename > class SegmentsView >
+   template< typename Matrix >
+bool
+SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView >::
+operator!=( const Matrix& m ) const
+{
+   return ! operator==( m );
+}
+
 template< typename Real,
           typename Device,
           typename Index,
-- 
GitLab


From 8298c25e79ee46045bf624ecd45523d3d5f7c94b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 9 May 2020 11:20:39 +0200
Subject: [PATCH 65/68] Add reset method to segments.

---
 src/TNL/Containers/Segments/BiEllpack.h        |  4 ++--
 src/TNL/Containers/Segments/BiEllpack.hpp      | 16 ++++++++++++++++
 src/TNL/Containers/Segments/CSR.h              |  2 ++
 src/TNL/Containers/Segments/CSR.hpp            | 12 ++++++++++++
 src/TNL/Containers/Segments/ChunkedEllpack.h   |  2 ++
 src/TNL/Containers/Segments/ChunkedEllpack.hpp | 18 ++++++++++++++++++
 src/TNL/Containers/Segments/Ellpack.h          |  3 +++
 src/TNL/Containers/Segments/Ellpack.hpp        | 14 ++++++++++++++
 src/TNL/Containers/Segments/SlicedEllpack.h    |  2 ++
 src/TNL/Containers/Segments/SlicedEllpack.hpp  | 16 ++++++++++++++++
 src/TNL/Matrices/SparseMatrix.hpp              |  2 +-
 11 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Containers/Segments/BiEllpack.h b/src/TNL/Containers/Segments/BiEllpack.h
index b365266f4..5f16011c5 100644
--- a/src/TNL/Containers/Segments/BiEllpack.h
+++ b/src/TNL/Containers/Segments/BiEllpack.h
@@ -66,6 +66,8 @@ class BiEllpack
       template< typename SizesHolder = OffsetsHolder >
       void setSegmentsSizes( const SizesHolder& sizes );
 
+      void reset();
+
       IndexType getSegmentSize( const IndexType segmentIdx ) const;
 
       /**
@@ -148,8 +150,6 @@ class BiEllpack
 
       OffsetsHolder groupPointers;
 
-
-
       // TODO: Replace later
       __cuda_callable__ Index power( const IndexType number, const IndexType exponent ) const
       {
diff --git a/src/TNL/Containers/Segments/BiEllpack.hpp b/src/TNL/Containers/Segments/BiEllpack.hpp
index f3ac41906..032543d1a 100644
--- a/src/TNL/Containers/Segments/BiEllpack.hpp
+++ b/src/TNL/Containers/Segments/BiEllpack.hpp
@@ -362,6 +362,22 @@ setSegmentsSizes( const SizesHolder& segmentsSizes )
    }
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int WarpSize >
+void
+BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
+reset()
+{
+   this->size = 0;
+   this->storageSize = 0;
+   this->virtualRows = 0;
+   rowPermArray.reset();
+   groupPointers.reset();
+}
+
 template< typename Device,
           typename Index,
           typename IndexAllocator,
diff --git a/src/TNL/Containers/Segments/CSR.h b/src/TNL/Containers/Segments/CSR.h
index cfb75d43f..c5c0ce68f 100644
--- a/src/TNL/Containers/Segments/CSR.h
+++ b/src/TNL/Containers/Segments/CSR.h
@@ -55,6 +55,8 @@ class CSR
       template< typename SizesHolder = OffsetsHolder >
       void setSegmentsSizes( const SizesHolder& sizes );
 
+      void reset();
+
       ViewType getView();
 
       const ConstViewType getConstView() const;
diff --git a/src/TNL/Containers/Segments/CSR.hpp b/src/TNL/Containers/Segments/CSR.hpp
index 706a2b052..685f6ef54 100644
--- a/src/TNL/Containers/Segments/CSR.hpp
+++ b/src/TNL/Containers/Segments/CSR.hpp
@@ -85,6 +85,18 @@ setSegmentsSizes( const SizesHolder& sizes )
    details::CSR< Device, Index >::setSegmentsSizes( sizes, this->offsets );
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator >
+void
+CSR< Device, Index, IndexAllocator >::
+reset()
+{
+   this->offsets.setSize( 1 );
+   this->offsets = 0;
+}
+
+
 template< typename Device,
           typename Index,
           typename IndexAllocator >
diff --git a/src/TNL/Containers/Segments/ChunkedEllpack.h b/src/TNL/Containers/Segments/ChunkedEllpack.h
index c78fea11e..dd4805887 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpack.h
+++ b/src/TNL/Containers/Segments/ChunkedEllpack.h
@@ -69,6 +69,8 @@ class ChunkedEllpack
       template< typename SizesHolder = OffsetsHolder >
       void setSegmentsSizes( const SizesHolder& sizes );
 
+      void reset();
+
       IndexType getSegmentSize( const IndexType segmentIdx ) const;
 
       /**
diff --git a/src/TNL/Containers/Segments/ChunkedEllpack.hpp b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
index 38f46c629..005b22a78 100644
--- a/src/TNL/Containers/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Containers/Segments/ChunkedEllpack.hpp
@@ -300,6 +300,24 @@ setSegmentsSizes( const SizesHolder& segmentsSizes )
    }
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder >
+void
+ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
+reset()
+{
+   this->size = 0;
+   this->storageSize = 0;
+   this->rowToSliceMapping.reset();
+   this->rowToChunkMapping.reset();
+   this->chunksToSegmentsMapping.reset();
+   this->rowPointers.reset();
+   this->slices.reset();
+   this->numberOfSlices = 0;
+}
+
 template< typename Device,
           typename Index,
           typename IndexAllocator,
diff --git a/src/TNL/Containers/Segments/Ellpack.h b/src/TNL/Containers/Segments/Ellpack.h
index 14e77f89b..63ca556a4 100644
--- a/src/TNL/Containers/Segments/Ellpack.h
+++ b/src/TNL/Containers/Segments/Ellpack.h
@@ -64,6 +64,9 @@ class Ellpack
       void setSegmentsSizes( const SizesHolder& sizes );
 
       void setSegmentsSizes( const IndexType segmentsCount, const IndexType segmentSize );
+
+      void reset();
+
       /**
        * \brief Number segments.
        */
diff --git a/src/TNL/Containers/Segments/Ellpack.hpp b/src/TNL/Containers/Segments/Ellpack.hpp
index 436f470af..e4e2180ad 100644
--- a/src/TNL/Containers/Segments/Ellpack.hpp
+++ b/src/TNL/Containers/Segments/Ellpack.hpp
@@ -142,6 +142,20 @@ setSegmentsSizes( const SizesHolder& sizes )
       this->alignedSize = roundUpDivision( size, this->getAlignment() ) * this->getAlignment();
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int Alignment >
+void
+Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >::
+reset()
+{
+   this->segmentSize = 0;
+   this->size = 0;
+   this->alignedSize = 0;
+}
+
 template< typename Device,
           typename Index,
           typename IndexAllocator,
diff --git a/src/TNL/Containers/Segments/SlicedEllpack.h b/src/TNL/Containers/Segments/SlicedEllpack.h
index f26d791f3..19c1b8eb4 100644
--- a/src/TNL/Containers/Segments/SlicedEllpack.h
+++ b/src/TNL/Containers/Segments/SlicedEllpack.h
@@ -61,6 +61,8 @@ class SlicedEllpack
       template< typename SizesHolder = OffsetsHolder >
       void setSegmentsSizes( const SizesHolder& sizes );
 
+      void reset();
+
       __cuda_callable__
       IndexType getSegmentsCount() const;
 
diff --git a/src/TNL/Containers/Segments/SlicedEllpack.hpp b/src/TNL/Containers/Segments/SlicedEllpack.hpp
index c540735be..e76e6d430 100644
--- a/src/TNL/Containers/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Containers/Segments/SlicedEllpack.hpp
@@ -157,6 +157,22 @@ setSegmentsSizes( const SizesHolder& sizes )
    this->alignedSize = this->sliceOffsets.getElement( slicesCount );
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          bool RowMajorOrder,
+          int SliceSize >
+void
+SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >::
+reset()
+{
+   this->size = 0;
+   this->alignedSize = 0;
+   this->segmentsCount = 0;
+   this->sliceOffsets.reset();
+   this->sliceSegmentSizes.reset();
+}
+
 template< typename Device,
           typename Index,
           typename IndexAllocator,
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index 9a709e479..7ce144e34 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -363,7 +363,7 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, RealAllocator, IndexAll
 reset()
 {
    BaseType::reset();
-   this->segments = SegmentsType( Containers::Vector< IndexType, DeviceType, IndexType >() ),
+   this->segments.reset();
    this->view = this->getView();
    TNL_ASSERT_EQ( this->getRows(), segments.getSegmentsCount(), "mismatched segments count" );
 }
-- 
GitLab


From 0a511aac3bb8bf96cfe9bf25ff83a7d0850e3584 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 9 May 2020 11:21:14 +0200
Subject: [PATCH 66/68] Fixed DistributedMatrixTest to work with CUDA.

---
 src/UnitTests/Matrices/DistributedMatrixTest.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h
index fad3a6420..d030777a6 100644
--- a/src/UnitTests/Matrices/DistributedMatrixTest.h
+++ b/src/UnitTests/Matrices/DistributedMatrixTest.h
@@ -174,7 +174,7 @@ TYPED_TEST( DistributedMatrixTest, getCompressedRowLengths )
    EXPECT_EQ( output, 0 );
    for( int i = 0; i < this->matrix.getLocalMatrix().getRows(); i++ ) {
       const auto gi = this->matrix.getLocalRowRange().getGlobalIndex( i );
-      output[ gi ] = this->matrix.getRowCapacity( gi );
+      output.setElement( gi, this->matrix.getRowCapacity( gi ) );
    }
    EXPECT_EQ( output, this->rowLengths );
 }
-- 
GitLab


From d7d8c80829f293adec1df4c682a2d8c151a4c2ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 9 May 2020 11:21:46 +0200
Subject: [PATCH 67/68] Add initializer list constructors to UniquePointer.

---
 src/TNL/Pointers/UniquePointer.h | 52 ++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/src/TNL/Pointers/UniquePointer.h b/src/TNL/Pointers/UniquePointer.h
index 66bc4a33c..8683dbcbd 100644
--- a/src/TNL/Pointers/UniquePointer.h
+++ b/src/TNL/Pointers/UniquePointer.h
@@ -96,6 +96,30 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer
          this->pointer = new Object( args... );
       }
 
+      /**
+       * \brief Constructor with initializer list.
+       *
+       * \tparam Value is type of the initializer list elements.
+       * \param list is the instance of the initializer list..
+       */
+      template< typename Value >
+      explicit  UniquePointer( std::initializer_list< Value > list )
+      {
+         this->pointer = new Object( list );
+      }
+
+      /**
+       * \brief Constructor with nested initializer lists.
+       *
+       * \tparam Value is type of the nested initializer list elements.
+       * \param list is the instance of the nested initializer list..
+       */
+      template< typename Value >
+      explicit  UniquePointer( std::initializer_list< std::initializer_list< Value > > list )
+      {
+         this->pointer = new Object( list );
+      }
+
       /**
        * \brief Arrow operator for accessing the object owned by constant smart pointer.
        *
@@ -300,6 +324,34 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
          this->allocate( args... );
       }
 
+      /**
+       * \brief Constructor with initializer list.
+       *
+       * \tparam Value is type of the initializer list elements.
+       * \param list is the instance of the initializer list..
+       */
+      template< typename Value >
+      explicit  UniquePointer( std::initializer_list< Value > list )
+      : pd( nullptr ),
+        cuda_pointer( nullptr )
+      {
+         this->allocate( list );
+      }
+
+      /**
+       * \brief Constructor with nested initializer lists.
+       *
+       * \tparam Value is type of the nested initializer list elements.
+       * \param list is the instance of the nested initializer list..
+       */
+      template< typename Value >
+      explicit  UniquePointer( std::initializer_list< std::initializer_list< Value > > list )
+      : pd( nullptr ),
+        cuda_pointer( nullptr )
+      {
+         this->allocate( list );
+      }
+
       /**
        * \brief Arrow operator for accessing the object owned by constant smart pointer.
        *
-- 
GitLab


From 43a56838339b4e0e3225950b43ad13c06658e9a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 9 May 2020 11:52:25 +0200
Subject: [PATCH 68/68] Fixed arrays tutorial.

---
 Documentation/Tutorials/Arrays/tutorial_Arrays.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/Tutorials/Arrays/tutorial_Arrays.md b/Documentation/Tutorials/Arrays/tutorial_Arrays.md
index 0d728935e..8050c1867 100644
--- a/Documentation/Tutorials/Arrays/tutorial_Arrays.md
+++ b/Documentation/Tutorials/Arrays/tutorial_Arrays.md
@@ -104,7 +104,7 @@ In general in TNL, each method defined as `__cuda_callable__` can be called from
 
 #### Accessing the array element with `setElement` and `getElement` <a name="accessing_the_array_elements_with_set_get_element"></a>
 
-On the other hand, the methods `setElement` and `getElement` can be called **from the host only** no matter where the array is allocated. None of the methods can be used in CUDA kernels. `getElement` returns copy of an element rather than a reference. Therefore it is slightly slower. If the array is on GPU, the array element is copied from the device on the host (or vice versa) which is significantly slower. In the parts of code where the performance matters, these methods shall not be called. Their use is, however, much easier and they allow to write one simple code for both CPU and GPU. Both methods are good candidates for:
+On the other hand, the methods `setElement` and `getElement` can be called from the host **no matter where the array is allocated**. In addition they can be called from kernels on device where the array is allocated. `getElement` returns copy of an element rather than a reference. Therefore it is slightly slower. If the array is on GPU and the methods are called from the host, the array element is copied from the device on the host (or vice versa) which is significantly slower. In the parts of code where the performance matters, these methods shall not be called from the host when the array is allocated on the device. In this way, their use is, however, easier compared to `operator[]` and they allow to write one simple code for both CPU and GPU. Both methods are good candidates for:
 
 * reading/writing of only few elements in the array
 * arrays initiation which is done only once and it is not time critical part of a code
-- 
GitLab