From 9336e8ba6368cd4493a710788f1d87505385209b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 30 Mar 2021 13:53:21 +0200
Subject: [PATCH 001/117] Added multivector kernel to CSRHybrid kernel.

---
 src/TNL/Algorithms/Segments/CSRHybridKernel.h |   9 +-
 .../Algorithms/Segments/CSRHybridKernel.hpp   | 181 ++++++++++++++----
 2 files changed, 145 insertions(+), 45 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSRHybridKernel.h b/src/TNL/Algorithms/Segments/CSRHybridKernel.h
index d3e48be1e..316d277bd 100644
--- a/src/TNL/Algorithms/Segments/CSRHybridKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRHybridKernel.h
@@ -21,7 +21,8 @@ namespace TNL {
       namespace Segments {
 
 template< typename Index,
-          typename Device >
+          typename Device,
+          int ThreadsInBlock = 256 >
 struct CSRHybridKernel
 {
    using IndexType = Index;
@@ -44,16 +45,14 @@ struct CSRHybridKernel
              typename Fetch,
              typename Reduction,
              typename ResultKeeper,
-             typename Real,
-             typename... Args >
+             typename Real >
    void segmentsReduction( const OffsetsView& offsets,
                                   Index first,
                                   Index last,
                                   Fetch& fetch,
                                   const Reduction& reduction,
                                   ResultKeeper& keeper,
-                                  const Real& zero,
-                                  Args... args ) const;
+                                  const Real& zero ) const;
 
    protected:
       int threadsPerSegment;
diff --git a/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
index 90505358e..03a8608a0 100644
--- a/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
@@ -28,10 +28,9 @@ template< int ThreadsPerSegment,
           typename Fetch,
           typename Reduction,
           typename ResultKeeper,
-          typename Real,
-          typename... Args >
+          typename Real >
 __global__
-void segmentsReductionCSRHybridKernel(
+void segmentsReductionCSRHybridVectorKernel(
     int gridIdx,
     const Offsets offsets,
     Index first,
@@ -39,12 +38,8 @@ void segmentsReductionCSRHybridKernel(
     Fetch fetch,
     const Reduction reduce,
     ResultKeeper keep,
-    const Real zero,
-    Args... args )
+    const Real zero )
 {
-    /***
-     * We map one warp to each segment
-     */
     const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first;
     if( segmentIdx >= last )
         return;
@@ -78,55 +73,150 @@ void segmentsReductionCSRHybridKernel(
     if( laneIdx == 0 )
         keep( segmentIdx, aux );
 }
+
+/**
+ * CSR Hybrid kernel for long segments: one segment is reduced by ThreadsPerSegment
+ * threads, i.e. by ThreadsPerSegment / warp-size whole warps of one block.
+ *
+ * Launch contract: blockDim.x == BlockSize, ThreadsPerSegment is a power of two and
+ * a multiple of the warp size, and BlockSize is a multiple of ThreadsPerSegment.
+ * The kernel uses one statically allocated shared Real per warp of the block.
+ *
+ * \param gridIdx index of the launched grid, passed to TNL::Cuda::getGlobalThreadIdx
+ * \param offsets segment offsets; segment i spans [ offsets[ i ], offsets[ i + 1 ] )
+ * \param first   index of the first segment to be reduced
+ * \param last    index one past the last segment to be reduced
+ * \param fetch   functor supplying the value of one element of a segment
+ * \param reduce  functor combining two partial results
+ * \param keep    functor called as keep( segmentIdx, result ) once per segment
+ * \param zero    initial value of each partial result; must be neutral for reduce
+ */
+template< int BlockSize,
+          int ThreadsPerSegment,
+          typename Offsets,
+          typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real >
+__global__
+void segmentsReductionCSRHybridMultivectorKernel(
+    int gridIdx,
+    const Offsets offsets,
+    Index first,
+    Index last,
+    Fetch fetch,
+    const Reduction reduce,
+    ResultKeeper keep,
+    const Real zero )
+{
+    constexpr int WarpSize = TNL::Cuda::getWarpSize();
+    constexpr int warpsPerSegment = ThreadsPerSegment / WarpSize;
+
+    // One partial result per warp. Every slot is written by lane 0 of its warp before
+    // the barrier below, so no (racy) zero-initialization of the array is needed.
+    __shared__ Real shared[ BlockSize / WarpSize ];
+
+    // Threads behind the last segment must not return early - all threads of the block
+    // have to reach __syncthreads() and take part in the full-mask warp shuffles.
+    const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first;
+    const bool active = segmentIdx < last;
+    const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
+    const int inWarpLaneIdx = threadIdx.x & ( WarpSize - 1 ); // & is cheaper than %
+    const int warpIdx = threadIdx.x / WarpSize;
+
+    // Each thread reduces its strided part of the segment; idle threads keep `zero`.
+    Real result = zero;
+    if( active )
+    {
+        const Index beginIdx = offsets[ segmentIdx ];
+        const Index endIdx   = offsets[ segmentIdx + 1 ];
+        bool compute( true );
+        Index localIdx = laneIdx;
+        for( Index globalIdx = beginIdx + laneIdx; globalIdx < endIdx && compute; globalIdx += ThreadsPerSegment )
+        {
+            result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+            localIdx += ThreadsPerSegment;
+        }
+    }
+
+    // Reduction within each warp - with the user supplied operation, not operator+.
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  8 ) );
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  4 ) );
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  2 ) );
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  1 ) );
+
+    // Lane 0 of each warp publishes the partial result of its warp.
+    if( inWarpLaneIdx == 0 )
+        shared[ warpIdx ] = result;
+    __syncthreads();
+
+    // The first thread of each segment combines the partial results of the
+    // warpsPerSegment consecutive warps mapped to its segment and stores the result.
+    if( active && laneIdx == 0 )
+    {
+        // laneIdx == 0 implies that warpIdx is the first warp of this segment.
+        Real aux = shared[ warpIdx ];
+        for( int i = 1; i < warpsPerSegment; i++ )
+            aux = reduce( aux, shared[ warpIdx + i ] );
+        keep( segmentIdx, aux );
+    }
+}
 #endif
 
 
 template< typename Index,
-          typename Device >
+          typename Device,
+          int ThreadsInBlock >
     template< typename Offsets >
 void
-CSRHybridKernel< Index, Device >::
+CSRHybridKernel< Index, Device, ThreadsInBlock >::
 init( const Offsets& offsets )
 {
     const Index segmentsCount = offsets.getSize() - 1;
     const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount );
-    this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() );
+    this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), ThreadsInBlock ); //TNL::Cuda::getWarpSize() );
     TNL_ASSERT_GE( threadsPerSegment, 0, "" );
-    TNL_ASSERT_LE( threadsPerSegment, 32, "" );
+    TNL_ASSERT_LE( threadsPerSegment, ThreadsInBlock, "" );
 }
 
 template< typename Index,
-          typename Device >
+          typename Device,
+          int ThreadsInBlock >
 void
-CSRHybridKernel< Index, Device >::
+CSRHybridKernel< Index, Device, ThreadsInBlock >::
 reset()
 {
     this->threadsPerSegment = 0;
 }
 
 template< typename Index,
-          typename Device >
+          typename Device,
+          int ThreadsInBlock >
 auto
-CSRHybridKernel< Index, Device >::
+CSRHybridKernel< Index, Device, ThreadsInBlock >::
 getView() -> ViewType
 {
     return *this;
 }
 
 template< typename Index,
-          typename Device >
+          typename Device,
+          int ThreadsInBlock >
 TNL::String
-CSRHybridKernel< Index, Device >::
+CSRHybridKernel< Index, Device, ThreadsInBlock >::
 getKernelType()
 {
     return "Hybrid";
 }
 
 template< typename Index,
-          typename Device >
+          typename Device,
+          int ThreadsInBlock >
 auto
-CSRHybridKernel< Index, Device >::
+CSRHybridKernel< Index, Device, ThreadsInBlock >::
 getConstView() const -> ConstViewType
 {
     return *this;
@@ -134,30 +224,29 @@ getConstView() const -> ConstViewType
 
 
 template< typename Index,
-          typename Device >
+          typename Device,
+          int ThreadsInBlock >
     template< typename OffsetsView,
               typename Fetch,
               typename Reduction,
               typename ResultKeeper,
-              typename Real,
-              typename... Args >
+              typename Real >
 void
-CSRHybridKernel< Index, Device >::
+CSRHybridKernel< Index, Device, ThreadsInBlock >::
 segmentsReduction( const OffsetsView& offsets,
                          Index first,
                          Index last,
                          Fetch& fetch,
                          const Reduction& reduction,
                          ResultKeeper& keeper,
-                         const Real& zero,
-                         Args... args ) const
+                         const Real& zero ) const
 {
     TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
-    TNL_ASSERT_LE( this->threadsPerSegment, 32, "" );
+    TNL_ASSERT_LE( this->threadsPerSegment, ThreadsInBlock, "" );
 
 #ifdef HAVE_CUDA
     const size_t threadsCount = this->threadsPerSegment * ( last - first );
-    dim3 blocksCount, gridsCount, blockSize( 256 );
+    dim3 blocksCount, gridsCount, blockSize( ThreadsInBlock );
     TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
     //std::cerr << " this->threadsPerSegment = " << this->threadsPerSegment << " offsets = " << offsets << std::endl;
     for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
@@ -169,28 +258,40 @@ segmentsReduction( const OffsetsView& offsets,
             case 0:      // this means zero/empty matrix
                 break;
             case 1:
-                segmentsReductionCSRHybridKernel<  1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                segmentsReductionCSRHybridVectorKernel<  1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             case 2:
-                segmentsReductionCSRHybridKernel<  2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                segmentsReductionCSRHybridVectorKernel<  2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             case 4:
-                segmentsReductionCSRHybridKernel<  4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                segmentsReductionCSRHybridVectorKernel<  4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             case 8:
-                segmentsReductionCSRHybridKernel<  8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                segmentsReductionCSRHybridVectorKernel<  8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             case 16:
-                segmentsReductionCSRHybridKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                segmentsReductionCSRHybridVectorKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             case 32:
-                segmentsReductionCSRHybridKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>(
-                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... );
+                segmentsReductionCSRHybridVectorKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
+                    break;
+            case 64:
+                segmentsReductionCSRHybridMultivectorKernel< ThreadsInBlock,  64, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
+                    break;
+            case 128:
+                segmentsReductionCSRHybridMultivectorKernel< ThreadsInBlock, 128, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
+                    break;
+            case 256:
+                segmentsReductionCSRHybridMultivectorKernel< ThreadsInBlock, 256, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                    gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             default:
                 throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) );
-- 
GitLab


From d3c015971c51a9da43c2d9f27742b62678cd29c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 30 Mar 2021 16:19:39 +0200
Subject: [PATCH 002/117] Extracting vector product test from sparse matrix
 unit tests.

---
 src/UnitTests/Matrices/CMakeLists.txt         |   8 +
 src/UnitTests/Matrices/SparseMatrixTest.h     |   7 -
 src/UnitTests/Matrices/SparseMatrixTest.hpp   | 365 ----------------
 .../Matrices/SparseMatrixTest_CSRHybrid.cpp   |   4 +-
 .../Matrices/SparseMatrixVectorProductTest.h  |  42 ++
 .../SparseMatrixVectorProductTest.hpp         | 393 ++++++++++++++++++
 ...parseMatrixVectorProductTest_BiEllpack.cpp |  11 +
 ...SparseMatrixVectorProductTest_BiEllpack.cu |   1 +
 .../SparseMatrixVectorProductTest_BiEllpack.h |  58 +++
 ...rseMatrixVectorProductTest_CSRAdaptive.cpp |  11 +
 ...arseMatrixVectorProductTest_CSRAdaptive.cu |   1 +
 ...parseMatrixVectorProductTest_CSRAdaptive.h |  46 ++
 ...parseMatrixVectorProductTest_CSRHybrid.cpp |  11 +
 ...SparseMatrixVectorProductTest_CSRHybrid.cu |   1 +
 .../SparseMatrixVectorProductTest_CSRHybrid.h |  46 ++
 ...parseMatrixVectorProductTest_CSRScalar.cpp |  11 +
 ...SparseMatrixVectorProductTest_CSRScalar.cu |   1 +
 .../SparseMatrixVectorProductTest_CSRScalar.h |  46 ++
 ...parseMatrixVectorProductTest_CSRVector.cpp |  11 +
 ...SparseMatrixVectorProductTest_CSRVector.cu |   1 +
 .../SparseMatrixVectorProductTest_CSRVector.h |  46 ++
 ...MatrixVectorProductTest_ChunkedEllpack.cpp |  11 +
 ...eMatrixVectorProductTest_ChunkedEllpack.cu |   1 +
 ...seMatrixVectorProductTest_ChunkedEllpack.h |  57 +++
 .../SparseMatrixVectorProductTest_Ellpack.cpp |  11 +
 .../SparseMatrixVectorProductTest_Ellpack.cu  |   1 +
 .../SparseMatrixVectorProductTest_Ellpack.h   |  56 +++
 ...eMatrixVectorProductTest_SlicedEllpack.cpp |  11 +
 ...seMatrixVectorProductTest_SlicedEllpack.cu |   1 +
 ...rseMatrixVectorProductTest_SlicedEllpack.h |  57 +++
 30 files changed, 953 insertions(+), 374 deletions(-)
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest.h
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.cpp
 create mode 120000 src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.h
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.cpp
 create mode 120000 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.h
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.cpp
 create mode 120000 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.h
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.cpp
 create mode 120000 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.h
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.cpp
 create mode 120000 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.h
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.cpp
 create mode 120000 src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.h
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.cpp
 create mode 120000 src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.cpp
 create mode 120000 src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.h

diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index ef639fc44..a4b06708e 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -13,6 +13,14 @@ set( COMMON_TESTS
             SparseMatrixTest_SlicedEllpack
             SparseMatrixTest_ChunkedEllpack
             SparseMatrixTest_BiEllpack
+            SparseMatrixVectorProductTest_CSRScalar
+            SparseMatrixVectorProductTest_CSRVector
+            SparseMatrixVectorProductTest_CSRHybrid
+            SparseMatrixVectorProductTest_CSRAdaptive
+            SparseMatrixVectorProductTest_Ellpack
+            SparseMatrixVectorProductTest_SlicedEllpack
+            SparseMatrixVectorProductTest_ChunkedEllpack
+            SparseMatrixVectorProductTest_BiEllpack
             SparseMatrixCopyTest
             BinarySparseMatrixTest_CSR
             BinarySparseMatrixTest_Ellpack
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.h b/src/UnitTests/Matrices/SparseMatrixTest.h
index 1ae0fda8a..68d7bedb0 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest.h
@@ -88,13 +88,6 @@ TYPED_TEST( MatrixTest, addElementTest )
     test_AddElement< MatrixType >();
 }
 
-TYPED_TEST( MatrixTest, vectorProductTest )
-{
-    using MatrixType = typename TestFixture::MatrixType;
-
-    test_VectorProduct< MatrixType >();
-}
-
 TYPED_TEST( MatrixTest, forElements )
 {
     using MatrixType = typename TestFixture::MatrixType;
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index f906adfbf..cca22d857 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -1026,371 +1026,6 @@ void test_AddElement()
    EXPECT_EQ( m.getElement( 5, 4 ), 20 );
 }
 
-template< typename Matrix >
-void test_VectorProduct()
-{
-   using RealType = typename Matrix::RealType;
-   using DeviceType = typename Matrix::DeviceType;
-   using IndexType = typename Matrix::IndexType;
-   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
-
-   /*
-    * Sets up the following 4x4 sparse matrix:
-    *
-    *    /  1  0  0  0 \
-    *    |  0  2  0  3 |
-    *    |  0  4  0  0 |
-    *    \  0  0  5  0 /
-    */
-
-   const IndexType m_rows_1 = 4;
-   const IndexType m_cols_1 = 4;
-
-   Matrix m_1;
-   m_1.reset();
-   m_1.setDimensions( m_rows_1, m_cols_1 );
-   typename Matrix::RowsCapacitiesType rowLengths_1{ 1, 2, 1, 1 };
-   m_1.setRowCapacities( rowLengths_1 );
-
-   RealType value_1 = 1;
-   m_1.setElement( 0, 0, value_1++ );      // 0th row
-
-   m_1.setElement( 1, 1, value_1++ );      // 1st row
-   m_1.setElement( 1, 3, value_1++ );
-
-   m_1.setElement( 2, 1, value_1++ );      // 2nd row
-
-   m_1.setElement( 3, 2, value_1++ );      // 3rd row
-
-   VectorType inVector_1;
-   inVector_1.setSize( m_cols_1 );
-   for( IndexType i = 0; i < inVector_1.getSize(); i++ )
-       inVector_1.setElement( i, 2 );
-
-   VectorType outVector_1;
-   outVector_1.setSize( m_rows_1 );
-   for( IndexType j = 0; j < outVector_1.getSize(); j++ )
-       outVector_1.setElement( j, 0 );
-
-   m_1.vectorProduct( inVector_1, outVector_1 );
-   EXPECT_EQ( outVector_1.getElement( 0 ),  2 );
-   EXPECT_EQ( outVector_1.getElement( 1 ), 10 );
-   EXPECT_EQ( outVector_1.getElement( 2 ),  8 );
-   EXPECT_EQ( outVector_1.getElement( 3 ), 10 );
-
-   /*
-    * Sets up the following 4x4 sparse matrix:
-    *
-    *    /  1  2  3  0 \
-    *    |  0  0  0  4 |
-    *    |  5  6  7  0 |
-    *    \  0  8  0  0 /
-    */
-
-   const IndexType m_rows_2 = 4;
-   const IndexType m_cols_2 = 4;
-
-   Matrix m_2( m_rows_2, m_cols_2 );
-   typename Matrix::RowsCapacitiesType rowLengths_2{ 3, 1, 3, 1 };
-   m_2.setRowCapacities( rowLengths_2 );
-
-   RealType value_2 = 1;
-   for( IndexType i = 0; i < 3; i++ )      // 0th row
-      m_2.setElement( 0, i, value_2++ );
-
-   m_2.setElement( 1, 3, value_2++ );      // 1st row
-
-   for( IndexType i = 0; i < 3; i++ )      // 2nd row
-      m_2.setElement( 2, i, value_2++ );
-
-   for( IndexType i = 1; i < 2; i++ )      // 3rd row
-      m_2.setElement( 3, i, value_2++ );
-
-   VectorType inVector_2;
-   inVector_2.setSize( m_cols_2 );
-   for( IndexType i = 0; i < inVector_2.getSize(); i++ )
-      inVector_2.setElement( i, 2 );
-
-   VectorType outVector_2;
-   outVector_2.setSize( m_rows_2 );
-   for( IndexType j = 0; j < outVector_2.getSize(); j++ )
-      outVector_2.setElement( j, 0 );
-
-   m_2.vectorProduct( inVector_2, outVector_2 );
-
-   EXPECT_EQ( outVector_2.getElement( 0 ), 12 );
-   EXPECT_EQ( outVector_2.getElement( 1 ),  8 );
-   EXPECT_EQ( outVector_2.getElement( 2 ), 36 );
-   EXPECT_EQ( outVector_2.getElement( 3 ), 16 );
-
-   /*
-    * Sets up the following 4x4 sparse matrix:
-    *
-    *    /  1  2  3  0 \
-    *    |  0  4  5  6 |
-    *    |  7  8  9  0 |
-    *    \  0 10 11 12 /
-    */
-
-   const IndexType m_rows_3 = 4;
-   const IndexType m_cols_3 = 4;
-
-   Matrix m_3( m_rows_3, m_cols_3 );
-   typename Matrix::RowsCapacitiesType rowLengths_3{ 3, 3, 3, 3 };
-   m_3.setRowCapacities( rowLengths_3 );
-
-   RealType value_3 = 1;
-   for( IndexType i = 0; i < 3; i++ )          // 0th row
-      m_3.setElement( 0, i, value_3++ );
-
-   for( IndexType i = 1; i < 4; i++ )
-      m_3.setElement( 1, i, value_3++ );      // 1st row
-
-   for( IndexType i = 0; i < 3; i++ )          // 2nd row
-      m_3.setElement( 2, i, value_3++ );
-
-   for( IndexType i = 1; i < 4; i++ )          // 3rd row
-      m_3.setElement( 3, i, value_3++ );
-
-   VectorType inVector_3;
-   inVector_3.setSize( m_cols_3 );
-   for( IndexType i = 0; i < inVector_3.getSize(); i++ )
-      inVector_3.setElement( i, 2 );
-
-   VectorType outVector_3;
-   outVector_3.setSize( m_rows_3 );
-   for( IndexType j = 0; j < outVector_3.getSize(); j++ )
-      outVector_3.setElement( j, 0 );
-
-   m_3.vectorProduct( inVector_3, outVector_3 );
-
-   EXPECT_EQ( outVector_3.getElement( 0 ), 12 );
-   EXPECT_EQ( outVector_3.getElement( 1 ), 30 );
-   EXPECT_EQ( outVector_3.getElement( 2 ), 48 );
-   EXPECT_EQ( outVector_3.getElement( 3 ), 66 );
-
-   /*
-    * Sets up the following 8x8 sparse matrix:
-    *
-    *    /  1  2  3  0  0  4  0  0 \
-    *    |  0  5  6  7  8  0  0  0 |
-    *    |  9 10 11 12 13  0  0  0 |
-    *    |  0 14 15 16 17  0  0  0 |
-    *    |  0  0 18 19 20 21  0  0 |
-    *    |  0  0  0 22 23 24 25  0 |
-    *    | 26 27 28 29 30  0  0  0 |
-    *    \ 31 32 33 34 35  0  0  0 /
-    */
-
-   const IndexType m_rows_4 = 8;
-   const IndexType m_cols_4 = 8;
-
-   Matrix m_4( m_rows_4, m_cols_4 );
-   typename Matrix::RowsCapacitiesType rowLengths_4{ 4, 4, 5, 4, 4, 4, 5, 5 };
-   m_4.setRowCapacities( rowLengths_4 );
-
-   RealType value_4 = 1;
-   for( IndexType i = 0; i < 3; i++ )       // 0th row
-      m_4.setElement( 0, i, value_4++ );
-
-   m_4.setElement( 0, 5, value_4++ );
-
-   for( IndexType i = 1; i < 5; i++ )       // 1st row
-      m_4.setElement( 1, i, value_4++ );
-
-   for( IndexType i = 0; i < 5; i++ )       // 2nd row
-      m_4.setElement( 2, i, value_4++ );
-
-   for( IndexType i = 1; i < 5; i++ )       // 3rd row
-      m_4.setElement( 3, i, value_4++ );
-
-   for( IndexType i = 2; i < 6; i++ )       // 4th row
-      m_4.setElement( 4, i, value_4++ );
-
-   for( IndexType i = 3; i < 7; i++ )       // 5th row
-      m_4.setElement( 5, i, value_4++ );
-
-   for( IndexType i = 0; i < 5; i++ )       // 6th row
-      m_4.setElement( 6, i, value_4++ );
-
-   for( IndexType i = 0; i < 5; i++ )       // 7th row
-      m_4.setElement( 7, i, value_4++ );
-
-   VectorType inVector_4;
-   inVector_4.setSize( m_cols_4 );
-   for( IndexType i = 0; i < inVector_4.getSize(); i++ )
-      inVector_4.setElement( i, 2 );
-
-   VectorType outVector_4;
-   outVector_4.setSize( m_rows_4 );
-   for( IndexType j = 0; j < outVector_4.getSize(); j++ )
-      outVector_4.setElement( j, 0 );
-
-   m_4.vectorProduct( inVector_4, outVector_4 );
-
-   EXPECT_EQ( outVector_4.getElement( 0 ),  20 );
-   EXPECT_EQ( outVector_4.getElement( 1 ),  52 );
-   EXPECT_EQ( outVector_4.getElement( 2 ), 110 );
-   EXPECT_EQ( outVector_4.getElement( 3 ), 124 );
-   EXPECT_EQ( outVector_4.getElement( 4 ), 156 );
-   EXPECT_EQ( outVector_4.getElement( 5 ), 188 );
-   EXPECT_EQ( outVector_4.getElement( 6 ), 280 );
-   EXPECT_EQ( outVector_4.getElement( 7 ), 330 );
-
-   /*
-    * Sets up the following 8x8 sparse matrix:
-    *
-    *    /  1  2  3  0  4  5  0  1 \   6
-    *    |  0  6  0  7  0  0  0  1 |   3
-    *    |  0  8  9  0 10  0  0  1 |   4
-    *    |  0 11 12 13 14  0  0  1 |   5
-    *    |  0 15  0  0  0  0  0  1 |   2
-    *    |  0 16 17 18 19 20 21  1 |   7
-    *    | 22 23 24 25 26 27 28  1 |   8
-    *    \ 29 30 31 32 33 34 35 36 /   8
-    */
-
-   const IndexType m_rows_5 = 8;
-   const IndexType m_cols_5 = 8;
-
-   Matrix m_5( m_rows_5, m_cols_5 );
-   typename Matrix::RowsCapacitiesType rowLengths_5{ 6, 3, 4, 5, 2, 7, 8, 8 };
-   m_5.setRowCapacities( rowLengths_5 );
-
-   RealType value_5 = 1;
-   for( IndexType i = 0; i < 3; i++ )   // 0th row
-      m_5.setElement( 0, i, value_5++ );
-
-   m_5.setElement( 0, 4, value_5++ );           // 0th row
-   m_5.setElement( 0, 5, value_5++ );
-
-   m_5.setElement( 1, 1, value_5++ );           // 1st row
-   m_5.setElement( 1, 3, value_5++ );
-
-   for( IndexType i = 1; i < 3; i++ )            // 2nd row
-      m_5.setElement( 2, i, value_5++ );
-
-   m_5.setElement( 2, 4, value_5++ );           // 2nd row
-
-   for( IndexType i = 1; i < 5; i++ )            // 3rd row
-      m_5.setElement( 3, i, value_5++ );
-
-   m_5.setElement( 4, 1, value_5++ );           // 4th row
-
-   for( IndexType i = 1; i < 7; i++ )            // 5th row
-      m_5.setElement( 5, i, value_5++ );
-
-   for( IndexType i = 0; i < 7; i++ )            // 6th row
-      m_5.setElement( 6, i, value_5++ );
-
-   for( IndexType i = 0; i < 8; i++ )            // 7th row
-      m_5.setElement( 7, i, value_5++ );
-
-   for( IndexType i = 0; i < 7; i++ )            // 1s at the end of rows
-      m_5.setElement( i, 7, 1);
-
-   VectorType inVector_5;
-   inVector_5.setSize( m_cols_5 );
-   for( IndexType i = 0; i < inVector_5.getSize(); i++ )
-       inVector_5.setElement( i, 2 );
-
-   VectorType outVector_5;
-   outVector_5.setSize( m_rows_5 );
-   for( IndexType j = 0; j < outVector_5.getSize(); j++ )
-       outVector_5.setElement( j, 0 );
-
-   m_5.vectorProduct( inVector_5, outVector_5 );
-
-   EXPECT_EQ( outVector_5.getElement( 0 ),  32 );
-   EXPECT_EQ( outVector_5.getElement( 1 ),  28 );
-   EXPECT_EQ( outVector_5.getElement( 2 ),  56 );
-   EXPECT_EQ( outVector_5.getElement( 3 ), 102 );
-   EXPECT_EQ( outVector_5.getElement( 4 ),  32 );
-   EXPECT_EQ( outVector_5.getElement( 5 ), 224 );
-   EXPECT_EQ( outVector_5.getElement( 6 ), 352 );
-   EXPECT_EQ( outVector_5.getElement( 7 ), 520 );
-
-   /////
-   // Large test
-   const IndexType size( 1051 );
-   //for( int size = 1; size < 1000; size++ )
-   {
-      //std::cerr << " size = " << size << std::endl;
-      // Test with large diagonal matrix
-      Matrix m1( size, size );
-      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowCapacities( size );
-      rowCapacities.forAllElements( [] __cuda_callable__ ( IndexType i, IndexType& value ) { value = 1; } );
-      m1.setRowCapacities( rowCapacities );
-      auto f1 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
-         if( localIdx == 0  )
-         {
-            value = row + 1;
-            column = row;
-         }
-      };
-      m1.forAllElements( f1 );
-      // check that the matrix was initialized
-      m1.getCompressedRowLengths( rowCapacities );
-      EXPECT_EQ( rowCapacities, 1 );
-
-      TNL::Containers::Vector< double, DeviceType, IndexType > in( size, 1.0 ), out( size, 0.0 );
-      m1.vectorProduct( in, out );
-      //std::cerr << out << std::endl;
-      for( IndexType i = 0; i < size; i++ )
-         EXPECT_EQ( out.getElement( i ), i + 1 );
-
-      // Test with large triangular matrix
-      const int rows( size ), columns( size );
-      Matrix m2( rows, columns );
-      rowCapacities.setSize( rows );
-      rowCapacities.forAllElements( [=] __cuda_callable__ ( IndexType i, IndexType& value ) { value = i + 1; } );
-      m2.setRowCapacities( rowCapacities );
-      auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
-         if( localIdx <= row )
-         {
-            value = localIdx + 1;
-            column = localIdx;
-         }
-      };
-      m2.forAllElements( f2 );
-      // check that the matrix was initialized
-      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( rows );
-      m2.getCompressedRowLengths( rowLengths );
-      EXPECT_EQ( rowLengths, rowCapacities );
-
-      out.setSize( rows );
-      out = 0.0;
-      m2.vectorProduct( in, out );
-      for( IndexType i = 0; i < rows; i++ )
-         EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 );
-   }
-
-   /**
-    * Long row test
-    */
-   using MatrixSegmentsType = typename Matrix::SegmentsType;
-   constexpr TNL::Algorithms::Segments::ElementsOrganization organization = MatrixSegmentsType::getOrganization();
-   using ChunkedEllpackView_ = TNL::Algorithms::Segments::ChunkedEllpackView< DeviceType, IndexType, organization >;
-   if( ! std::is_same< typename Matrix::SegmentsViewType, ChunkedEllpackView_ >::value )
-   {
-      // TODO: Fix ChunkedEllpack for this test - seems that it allocates too much memory
-      const int columns = 3000;
-      const int rows = 1;
-      Matrix m3( rows, columns );
-      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowsCapacities( rows );
-      rowsCapacities = columns;
-      m3.setRowCapacities( rowsCapacities );
-      auto f = [] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
-         column = localIdx;
-         value = localIdx + 1;
-      };
-      m3.forAllElements( f );
-      TNL::Containers::Vector< double, DeviceType, IndexType > in( columns, 1.0 ), out( rows, 0.0 );
-      m3.vectorProduct( in, out );
-      EXPECT_EQ( out.getElement( 0 ), ( double ) columns * ( double ) (columns + 1 ) / 2.0 );
-   }
-}
-
 template< typename Matrix >
 void test_ForElements()
 {
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp
index 214ed2ca7..5aef16abb 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          SparseMatrixTest_CSRHybrid.cpp -  description
+                          SparseMatrixVectorProductTest_CSRHybrid.cpp -  description
                              -------------------
     begin                : Jan 23, 2021
     copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -8,4 +8,4 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include "SparseMatrixTest_CSRHybrid.h"
+#include "SparseMatrixVectorProductTest_CSRHybrid.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.h
new file mode 100644
index 000000000..5e3ac36da
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.h
@@ -0,0 +1,42 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Math.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <iostream>
+#include <sstream>
+
+#include "SparseMatrixVectorProductTest.hpp"
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+// test fixture for typed tests
+template< typename Matrix >
+class MatrixTest : public ::testing::Test
+{
+protected:
+   using MatrixType = Matrix;
+};
+
+TYPED_TEST_SUITE( MatrixTest, MatrixTypes);
+
+TYPED_TEST( MatrixTest, vectorProductTest )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_VectorProduct< MatrixType >();
+}
+
+#endif
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp
new file mode 100644
index 000000000..d49356042
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp
@@ -0,0 +1,393 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest.hpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Math.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <iostream>
+#include <sstream>
+
+// Needed only to exclude ChunkedEllpack from the long-row vectorProduct test below
+#include <TNL/Algorithms/Segments/ChunkedEllpackView.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+template< typename Matrix >
+void test_VectorProduct()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+
+   /*
+    * Sets up the following 4x4 sparse matrix:
+    *
+    *    /  1  0  0  0 \
+    *    |  0  2  0  3 |
+    *    |  0  4  0  0 |
+    *    \  0  0  5  0 /
+    */
+
+   const IndexType m_rows_1 = 4;
+   const IndexType m_cols_1 = 4;
+
+   Matrix m_1;
+   m_1.reset();
+   m_1.setDimensions( m_rows_1, m_cols_1 );
+   typename Matrix::RowsCapacitiesType rowLengths_1{ 1, 2, 1, 1 };
+   m_1.setRowCapacities( rowLengths_1 );
+
+   RealType value_1 = 1;
+   m_1.setElement( 0, 0, value_1++ );      // 0th row
+
+   m_1.setElement( 1, 1, value_1++ );      // 1st row
+   m_1.setElement( 1, 3, value_1++ );
+
+   m_1.setElement( 2, 1, value_1++ );      // 2nd row
+
+   m_1.setElement( 3, 2, value_1++ );      // 3rd row
+
+   VectorType inVector_1;
+   inVector_1.setSize( m_cols_1 );
+   for( IndexType i = 0; i < inVector_1.getSize(); i++ )
+       inVector_1.setElement( i, 2 );
+
+   VectorType outVector_1;
+   outVector_1.setSize( m_rows_1 );
+   for( IndexType j = 0; j < outVector_1.getSize(); j++ )
+       outVector_1.setElement( j, 0 );
+
+   m_1.vectorProduct( inVector_1, outVector_1 );
+   EXPECT_EQ( outVector_1.getElement( 0 ),  2 );
+   EXPECT_EQ( outVector_1.getElement( 1 ), 10 );
+   EXPECT_EQ( outVector_1.getElement( 2 ),  8 );
+   EXPECT_EQ( outVector_1.getElement( 3 ), 10 );
+
+   /*
+    * Sets up the following 4x4 sparse matrix:
+    *
+    *    /  1  2  3  0 \
+    *    |  0  0  0  4 |
+    *    |  5  6  7  0 |
+    *    \  0  8  0  0 /
+    */
+
+   const IndexType m_rows_2 = 4;
+   const IndexType m_cols_2 = 4;
+
+   Matrix m_2( m_rows_2, m_cols_2 );
+   typename Matrix::RowsCapacitiesType rowLengths_2{ 3, 1, 3, 1 };
+   m_2.setRowCapacities( rowLengths_2 );
+
+   RealType value_2 = 1;
+   for( IndexType i = 0; i < 3; i++ )      // 0th row
+      m_2.setElement( 0, i, value_2++ );
+
+   m_2.setElement( 1, 3, value_2++ );      // 1st row
+
+   for( IndexType i = 0; i < 3; i++ )      // 2nd row
+      m_2.setElement( 2, i, value_2++ );
+
+   for( IndexType i = 1; i < 2; i++ )      // 3rd row
+      m_2.setElement( 3, i, value_2++ );
+
+   VectorType inVector_2;
+   inVector_2.setSize( m_cols_2 );
+   for( IndexType i = 0; i < inVector_2.getSize(); i++ )
+      inVector_2.setElement( i, 2 );
+
+   VectorType outVector_2;
+   outVector_2.setSize( m_rows_2 );
+   for( IndexType j = 0; j < outVector_2.getSize(); j++ )
+      outVector_2.setElement( j, 0 );
+
+   m_2.vectorProduct( inVector_2, outVector_2 );
+
+   EXPECT_EQ( outVector_2.getElement( 0 ), 12 );
+   EXPECT_EQ( outVector_2.getElement( 1 ),  8 );
+   EXPECT_EQ( outVector_2.getElement( 2 ), 36 );
+   EXPECT_EQ( outVector_2.getElement( 3 ), 16 );
+
+   /*
+    * Sets up the following 4x4 sparse matrix:
+    *
+    *    /  1  2  3  0 \
+    *    |  0  4  5  6 |
+    *    |  7  8  9  0 |
+    *    \  0 10 11 12 /
+    */
+
+   const IndexType m_rows_3 = 4;
+   const IndexType m_cols_3 = 4;
+
+   Matrix m_3( m_rows_3, m_cols_3 );
+   typename Matrix::RowsCapacitiesType rowLengths_3{ 3, 3, 3, 3 };
+   m_3.setRowCapacities( rowLengths_3 );
+
+   RealType value_3 = 1;
+   for( IndexType i = 0; i < 3; i++ )          // 0th row
+      m_3.setElement( 0, i, value_3++ );
+
+   for( IndexType i = 1; i < 4; i++ )
+      m_3.setElement( 1, i, value_3++ );      // 1st row
+
+   for( IndexType i = 0; i < 3; i++ )          // 2nd row
+      m_3.setElement( 2, i, value_3++ );
+
+   for( IndexType i = 1; i < 4; i++ )          // 3rd row
+      m_3.setElement( 3, i, value_3++ );
+
+   VectorType inVector_3;
+   inVector_3.setSize( m_cols_3 );
+   for( IndexType i = 0; i < inVector_3.getSize(); i++ )
+      inVector_3.setElement( i, 2 );
+
+   VectorType outVector_3;
+   outVector_3.setSize( m_rows_3 );
+   for( IndexType j = 0; j < outVector_3.getSize(); j++ )
+      outVector_3.setElement( j, 0 );
+
+   m_3.vectorProduct( inVector_3, outVector_3 );
+
+   EXPECT_EQ( outVector_3.getElement( 0 ), 12 );
+   EXPECT_EQ( outVector_3.getElement( 1 ), 30 );
+   EXPECT_EQ( outVector_3.getElement( 2 ), 48 );
+   EXPECT_EQ( outVector_3.getElement( 3 ), 66 );
+
+   /*
+    * Sets up the following 8x8 sparse matrix:
+    *
+    *    /  1  2  3  0  0  4  0  0 \
+    *    |  0  5  6  7  8  0  0  0 |
+    *    |  9 10 11 12 13  0  0  0 |
+    *    |  0 14 15 16 17  0  0  0 |
+    *    |  0  0 18 19 20 21  0  0 |
+    *    |  0  0  0 22 23 24 25  0 |
+    *    | 26 27 28 29 30  0  0  0 |
+    *    \ 31 32 33 34 35  0  0  0 /
+    */
+
+   const IndexType m_rows_4 = 8;
+   const IndexType m_cols_4 = 8;
+
+   Matrix m_4( m_rows_4, m_cols_4 );
+   typename Matrix::RowsCapacitiesType rowLengths_4{ 4, 4, 5, 4, 4, 4, 5, 5 };
+   m_4.setRowCapacities( rowLengths_4 );
+
+   RealType value_4 = 1;
+   for( IndexType i = 0; i < 3; i++ )       // 0th row
+      m_4.setElement( 0, i, value_4++ );
+
+   m_4.setElement( 0, 5, value_4++ );
+
+   for( IndexType i = 1; i < 5; i++ )       // 1st row
+      m_4.setElement( 1, i, value_4++ );
+
+   for( IndexType i = 0; i < 5; i++ )       // 2nd row
+      m_4.setElement( 2, i, value_4++ );
+
+   for( IndexType i = 1; i < 5; i++ )       // 3rd row
+      m_4.setElement( 3, i, value_4++ );
+
+   for( IndexType i = 2; i < 6; i++ )       // 4th row
+      m_4.setElement( 4, i, value_4++ );
+
+   for( IndexType i = 3; i < 7; i++ )       // 5th row
+      m_4.setElement( 5, i, value_4++ );
+
+   for( IndexType i = 0; i < 5; i++ )       // 6th row
+      m_4.setElement( 6, i, value_4++ );
+
+   for( IndexType i = 0; i < 5; i++ )       // 7th row
+      m_4.setElement( 7, i, value_4++ );
+
+   VectorType inVector_4;
+   inVector_4.setSize( m_cols_4 );
+   for( IndexType i = 0; i < inVector_4.getSize(); i++ )
+      inVector_4.setElement( i, 2 );
+
+   VectorType outVector_4;
+   outVector_4.setSize( m_rows_4 );
+   for( IndexType j = 0; j < outVector_4.getSize(); j++ )
+      outVector_4.setElement( j, 0 );
+
+   m_4.vectorProduct( inVector_4, outVector_4 );
+
+   EXPECT_EQ( outVector_4.getElement( 0 ),  20 );
+   EXPECT_EQ( outVector_4.getElement( 1 ),  52 );
+   EXPECT_EQ( outVector_4.getElement( 2 ), 110 );
+   EXPECT_EQ( outVector_4.getElement( 3 ), 124 );
+   EXPECT_EQ( outVector_4.getElement( 4 ), 156 );
+   EXPECT_EQ( outVector_4.getElement( 5 ), 188 );
+   EXPECT_EQ( outVector_4.getElement( 6 ), 280 );
+   EXPECT_EQ( outVector_4.getElement( 7 ), 330 );
+
+   /*
+    * Sets up the following 8x8 sparse matrix (the number right of each row is its capacity):
+    *
+    *    /  1  2  3  0  4  5  0  1 \   6
+    *    |  0  6  0  7  0  0  0  1 |   3
+    *    |  0  8  9  0 10  0  0  1 |   4
+    *    |  0 11 12 13 14  0  0  1 |   5
+    *    |  0 15  0  0  0  0  0  1 |   2
+    *    |  0 16 17 18 19 20 21  1 |   7
+    *    | 22 23 24 25 26 27 28  1 |   8
+    *    \ 29 30 31 32 33 34 35 36 /   8
+    */
+
+   const IndexType m_rows_5 = 8;
+   const IndexType m_cols_5 = 8;
+
+   Matrix m_5( m_rows_5, m_cols_5 );
+   typename Matrix::RowsCapacitiesType rowLengths_5{ 6, 3, 4, 5, 2, 7, 8, 8 };
+   m_5.setRowCapacities( rowLengths_5 );
+
+   RealType value_5 = 1;
+   for( IndexType i = 0; i < 3; i++ )   // 0th row
+      m_5.setElement( 0, i, value_5++ );
+
+   m_5.setElement( 0, 4, value_5++ );           // 0th row
+   m_5.setElement( 0, 5, value_5++ );
+
+   m_5.setElement( 1, 1, value_5++ );           // 1st row
+   m_5.setElement( 1, 3, value_5++ );
+
+   for( IndexType i = 1; i < 3; i++ )            // 2nd row
+      m_5.setElement( 2, i, value_5++ );
+
+   m_5.setElement( 2, 4, value_5++ );           // 2nd row
+
+   for( IndexType i = 1; i < 5; i++ )            // 3rd row
+      m_5.setElement( 3, i, value_5++ );
+
+   m_5.setElement( 4, 1, value_5++ );           // 4th row
+
+   for( IndexType i = 1; i < 7; i++ )            // 5th row
+      m_5.setElement( 5, i, value_5++ );
+
+   for( IndexType i = 0; i < 7; i++ )            // 6th row
+      m_5.setElement( 6, i, value_5++ );
+
+   for( IndexType i = 0; i < 8; i++ )            // 7th row
+      m_5.setElement( 7, i, value_5++ );
+
+   for( IndexType i = 0; i < 7; i++ )            // 1s at the end of rows
+      m_5.setElement( i, 7, 1);
+
+   VectorType inVector_5;
+   inVector_5.setSize( m_cols_5 );
+   for( IndexType i = 0; i < inVector_5.getSize(); i++ )
+       inVector_5.setElement( i, 2 );
+
+   VectorType outVector_5;
+   outVector_5.setSize( m_rows_5 );
+   for( IndexType j = 0; j < outVector_5.getSize(); j++ )
+       outVector_5.setElement( j, 0 );
+
+   m_5.vectorProduct( inVector_5, outVector_5 );
+
+   EXPECT_EQ( outVector_5.getElement( 0 ),  32 );
+   EXPECT_EQ( outVector_5.getElement( 1 ),  28 );
+   EXPECT_EQ( outVector_5.getElement( 2 ),  56 );
+   EXPECT_EQ( outVector_5.getElement( 3 ), 102 );
+   EXPECT_EQ( outVector_5.getElement( 4 ),  32 );
+   EXPECT_EQ( outVector_5.getElement( 5 ), 224 );
+   EXPECT_EQ( outVector_5.getElement( 6 ), 352 );
+   EXPECT_EQ( outVector_5.getElement( 7 ), 520 );
+
+   /////
+   // Large test (1051 x 1051 diagonal and lower-triangular matrices)
+   const IndexType size( 1051 );
+   //for( int size = 1; size < 1000; size++ )
+   {
+      //std::cerr << " size = " << size << std::endl;
+      // Test with large diagonal matrix
+      Matrix m1( size, size );
+      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowCapacities( size );
+      rowCapacities.forAllElements( [] __cuda_callable__ ( IndexType i, IndexType& value ) { value = 1; } );
+      m1.setRowCapacities( rowCapacities );
+      auto f1 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
+         if( localIdx == 0  )
+         {
+            value = row + 1;
+            column = row;
+         }
+      };
+      m1.forAllElements( f1 );
+      // check that the matrix was initialized: every row must hold exactly one element
+      m1.getCompressedRowLengths( rowCapacities );
+      EXPECT_EQ( rowCapacities, 1 );
+
+      TNL::Containers::Vector< double, DeviceType, IndexType > in( size, 1.0 ), out( size, 0.0 );
+      m1.vectorProduct( in, out );
+      //std::cerr << out << std::endl;
+      for( IndexType i = 0; i < size; i++ )
+         EXPECT_EQ( out.getElement( i ), i + 1 );
+
+      // Test with large triangular matrix
+      const int rows( size ), columns( size );
+      Matrix m2( rows, columns );
+      rowCapacities.setSize( rows );
+      rowCapacities.forAllElements( [=] __cuda_callable__ ( IndexType i, IndexType& value ) { value = i + 1; } );
+      m2.setRowCapacities( rowCapacities );
+      auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
+         if( localIdx <= row )
+         {
+            value = localIdx + 1;
+            column = localIdx;
+         }
+      };
+      m2.forAllElements( f2 );
+      // check that the matrix was initialized: row lengths must equal the requested capacities (i + 1)
+      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( rows );
+      m2.getCompressedRowLengths( rowLengths );
+      EXPECT_EQ( rowLengths, rowCapacities );
+
+      out.setSize( rows );
+      out = 0.0;
+      m2.vectorProduct( in, out );
+      for( IndexType i = 0; i < rows; i++ )
+         EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 );
+   }
+
+   /**
+    * Long row test: a single row with 3000 nonzero elements
+    */
+   using MatrixSegmentsType = typename Matrix::SegmentsType;
+   constexpr TNL::Algorithms::Segments::ElementsOrganization organization = MatrixSegmentsType::getOrganization();
+   using ChunkedEllpackView_ = TNL::Algorithms::Segments::ChunkedEllpackView< DeviceType, IndexType, organization >;
+   if( ! std::is_same< typename Matrix::SegmentsViewType, ChunkedEllpackView_ >::value )
+   {
+      // TODO: Fix ChunkedEllpack for this test - seems that it allocates too much memory
+      const int columns = 3000;
+      const int rows = 1;
+      Matrix m3( rows, columns );
+      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowsCapacities( rows );
+      rowsCapacities = columns;
+      m3.setRowCapacities( rowsCapacities );
+      auto f = [] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
+         column = localIdx;
+         value = localIdx + 1;
+      };
+      m3.forAllElements( f );
+      TNL::Containers::Vector< double, DeviceType, IndexType > in( columns, 1.0 ), out( rows, 0.0 );
+      m3.vectorProduct( in, out );
+      EXPECT_EQ( out.getElement( 0 ), ( double ) columns * ( double ) (columns + 1 ) / 2.0 );
+   }
+}
+
+
+#endif
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.cpp
new file mode 100644
index 000000000..319ed6605
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_BiEllpack.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_BiEllpack.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.cu
new file mode 120000
index 000000000..f34bb20d5
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_BiEllpack.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.h
new file mode 100644
index 000000000..abdc11ca2
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_BiEllpack.h
@@ -0,0 +1,58 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_BiEllpack.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/BiEllpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_BiEllpack_segments";
+
+////
+// Row-major format is used for most of the host instantiations below
+template< typename Device, typename Index, typename IndexAllocator >
+using RowMajorBiEllpack = TNL::Algorithms::Segments::BiEllpack< Device, Index, IndexAllocator, TNL::Algorithms::Segments::RowMajorOrder >;
+
+////
+// Column-major format is used for most of the GPU instantiations below
+template< typename Device, typename Index, typename IndexAllocator >
+using ColumnMajorBiEllpack = TNL::Algorithms::Segments::BiEllpack< Device, Index, IndexAllocator, TNL::Algorithms::Segments::ColumnMajorOrder >;
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorBiEllpack >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.cpp
new file mode 100644
index 000000000..bafa050d0
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRAdaptive.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_CSRAdaptive.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.cu
new file mode 120000
index 000000000..28919f745
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_CSRAdaptive.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.h
new file mode 100644
index 000000000..93a0d79fb
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRAdaptive.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRAdaptive.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRAdaptive_segments";
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.cpp
new file mode 100644
index 000000000..a6795b4e1
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRHybrid.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_CSRHybrid.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.cu
new file mode 120000
index 000000000..4c81adef3
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_CSRHybrid.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.h
new file mode 100644
index 000000000..99b5e4403
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRHybrid.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRHybrid.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRHybrid_segments";
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.cpp
new file mode 100644
index 000000000..bfa16c02b
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRScalar.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_CSRScalar.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.cu
new file mode 120000
index 000000000..024a31f15
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_CSRScalar.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.h
new file mode 100644
index 000000000..b9586f66e
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRScalar.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRScalar.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRScalar_segments";
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.cpp
new file mode 100644
index 000000000..68075da02
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRVector.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_CSRVector.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.cu
new file mode 120000
index 000000000..91409a4b4
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_CSRVector.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.h
new file mode 100644
index 000000000..0afe07e82
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRVector.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRVector.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRVector_segments";
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.cpp
new file mode 100644
index 000000000..1586d8191
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_ChunkedEllpack.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_ChunkedEllpack.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.cu
new file mode 120000
index 000000000..dea4491d6
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_ChunkedEllpack.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.h
new file mode 100644
index 000000000..d2cb049f6
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_ChunkedEllpack.h
@@ -0,0 +1,57 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_ChunkedEllpack.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/ChunkedEllpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_ChunkedEllpack_segments";
+
+////
+// Row-major format is used for the host system
+template< typename Device, typename Index, typename IndexAllocator >
+using RowMajorChunkedEllpack = TNL::Algorithms::Segments::ChunkedEllpack< Device, Index, IndexAllocator, TNL::Algorithms::Segments::RowMajorOrder >;
+
+////
+// Column-major format is used for GPUs
+template< typename Device, typename Index, typename IndexAllocator >
+using ColumnMajorChunkedEllpack = TNL::Algorithms::Segments::ChunkedEllpack< Device, Index, IndexAllocator, TNL::Algorithms::Segments::ColumnMajorOrder >;
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+     TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorChunkedEllpack >
+#ifdef HAVE_CUDA
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+    ,TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorChunkedEllpack >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.cpp
new file mode 100644
index 000000000..9e2446c38
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_Ellpack.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_Ellpack.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.cu
new file mode 120000
index 000000000..d30bd03b8
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_Ellpack.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h
new file mode 100644
index 000000000..abb4213ca
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h
@@ -0,0 +1,56 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_Ellpack.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_Ellpack_segments";
+
+////
+// Row-major format is used for the host system
+template< typename Device, typename Index, typename IndexAllocator >
+using RowMajorEllpack = TNL::Algorithms::Segments::Ellpack< Device, Index, IndexAllocator, TNL::Algorithms::Segments::RowMajorOrder, 32 >;
+
+////
+// Column-major format is used for GPUs
+template< typename Device, typename Index, typename IndexAllocator >
+using ColumnMajorEllpack = TNL::Algorithms::Segments::Ellpack< Device, Index, IndexAllocator, TNL::Algorithms::Segments::ColumnMajorOrder, 32 >;
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.cpp
new file mode 100644
index 000000000..0afb094fd
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_SlicedEllpack.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_SlicedEllpack.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.cu
new file mode 120000
index 000000000..6c3448930
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_SlicedEllpack.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.h
new file mode 100644
index 000000000..5efa70d45
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SlicedEllpack.h
@@ -0,0 +1,57 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_SlicedEllpack.h -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/SlicedEllpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+#include <TNL/Matrices/MatrixType.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_SlicedEllpack_segments";
+
+////
+// Row-major format is used for the host system
+template< typename Device, typename Index, typename IndexAllocator >
+using RowMajorSlicedEllpack = TNL::Algorithms::Segments::SlicedEllpack< Device, Index, IndexAllocator, TNL::Algorithms::Segments::RowMajorOrder, 32 >;
+
+////
+// Column-major format is used for GPUs
+template< typename Device, typename Index, typename IndexAllocator >
+using ColumnMajorSlicedEllpack = TNL::Algorithms::Segments::SlicedEllpack< Device, Index, IndexAllocator, TNL::Algorithms::Segments::ColumnMajorOrder, 32 >;
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, RowMajorSlicedEllpack >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorSlicedEllpack >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
-- 
GitLab


From e9715dd4badccd3a77be2351abb03ae0bfaf7d37 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 31 Mar 2021 15:57:53 +0200
Subject: [PATCH 003/117] Fix indexing in CSR hybrid multivector kernel.

---
 src/TNL/Algorithms/Segments/CSRHybridKernel.hpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
index 03a8608a0..7d752396b 100644
--- a/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
@@ -132,34 +132,34 @@ void segmentsReductionCSRHybridMultivectorKernel(
         constexpr int warpsPerSegment = ThreadsPerSegment / TNL::Cuda::getWarpSize();
         if( warpsPerSegment >= 32 )
         {
-            shared[ laneIdx ] =  reduce( shared[ laneIdx ], shared[ laneIdx + 16 ] );
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx + 16 ] );
             __syncwarp();
         }
         if( warpsPerSegment >= 16 )
         {
-            shared[ laneIdx ] =  reduce( shared[ laneIdx ], shared[ laneIdx +  8 ] );
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx +  8 ] );
             __syncwarp();
         }
         if( warpsPerSegment >= 8 )
         {
-            shared[ laneIdx ] =  reduce( shared[ laneIdx ], shared[ laneIdx +  4 ] );
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx +  4 ] );
             __syncwarp();
         }
         if( warpsPerSegment >= 4 )
         {
-            shared[ laneIdx ] =  reduce( shared[ laneIdx ], shared[ laneIdx +  2 ] );
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx +  2 ] );
             __syncwarp();
         }
         if( warpsPerSegment >= 2 )
         {
-            shared[ laneIdx ] =  reduce( shared[ laneIdx ], shared[ laneIdx +  1 ] );
+            shared[ inWarpLaneIdx ] =  reduce( shared[ inWarpLaneIdx ], shared[ inWarpLaneIdx +  1 ] );
             __syncwarp();
         }
         constexpr int segmentsCount = BlockSize / ThreadsPerSegment;
-        if( inWarpLaneIdx < segmentsCount )
+        if( inWarpLaneIdx < segmentsCount && segmentIdx + inWarpLaneIdx < last )
         {
-            //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, shared[ inWarpLaneIdx ] );
-            keep( segmentIdx + inWarpLaneIdx, shared[ inWarpLaneIdx ] );
+            //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, aux );
+            keep( segmentIdx + inWarpLaneIdx, shared[ inWarpLaneIdx * ThreadsPerSegment / 32 ] );
         }
     }
 }
-- 
GitLab


From 364fb421b702abaebb1088ae30797ac2941d6c71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 31 Mar 2021 15:58:45 +0200
Subject: [PATCH 004/117] Splitting sparse-matrix vector product unit test into
 separate functions.

---
 .../Matrices/SparseMatrixVectorProductTest.h  | 46 ++++++++-
 .../SparseMatrixVectorProductTest.hpp         | 94 +++++++++++++++----
 2 files changed, 122 insertions(+), 18 deletions(-)

diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.h
index 5e3ac36da..3a77cbe1a 100644
--- a/src/UnitTests/Matrices/SparseMatrixVectorProductTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.h
@@ -32,11 +32,53 @@ protected:
 
 TYPED_TEST_SUITE( MatrixTest, MatrixTypes);
 
-TYPED_TEST( MatrixTest, vectorProductTest )
+TYPED_TEST( MatrixTest, vectorProductTest_smallMatrix1 )
 {
     using MatrixType = typename TestFixture::MatrixType;
 
-    test_VectorProduct< MatrixType >();
+    test_VectorProduct_smallMatrix1< MatrixType >();
+}
+
+TYPED_TEST( MatrixTest, vectorProductTest_smallMatrix2 )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_VectorProduct_smallMatrix2< MatrixType >();
+}
+
+TYPED_TEST( MatrixTest, vectorProductTest_smallMatrix3 )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_VectorProduct_smallMatrix3< MatrixType >();
+}
+
+TYPED_TEST( MatrixTest, vectorProductTest_mediumSizeMatrix1 )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_VectorProduct_mediumSizeMatrix1< MatrixType >();
+}
+
+TYPED_TEST( MatrixTest, vectorProductTest_mediumSizeMatrix2 )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_VectorProduct_mediumSizeMatrix2< MatrixType >();
+}
+
+TYPED_TEST( MatrixTest, vectorProductTest_largeMatrix )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_VectorProduct_largeMatrix< MatrixType >();
+}
+
+TYPED_TEST( MatrixTest, vectorProductTest_longRowsMatrix )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_VectorProduct_longRowsMatrix< MatrixType >();
 }
 
 #endif
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp
index d49356042..c3f8a39db 100644
--- a/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp
@@ -25,7 +25,7 @@
 #include <gtest/gtest.h>
 
 template< typename Matrix >
-void test_VectorProduct()
+void test_VectorProduct_smallMatrix1()
 {
    using RealType = typename Matrix::RealType;
    using DeviceType = typename Matrix::DeviceType;
@@ -75,6 +75,15 @@ void test_VectorProduct()
    EXPECT_EQ( outVector_1.getElement( 1 ), 10 );
    EXPECT_EQ( outVector_1.getElement( 2 ),  8 );
    EXPECT_EQ( outVector_1.getElement( 3 ), 10 );
+}
+
+template< typename Matrix >
+void test_VectorProduct_smallMatrix2()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
 
    /*
     * Sets up the following 4x4 sparse matrix:
@@ -120,6 +129,15 @@ void test_VectorProduct()
    EXPECT_EQ( outVector_2.getElement( 1 ),  8 );
    EXPECT_EQ( outVector_2.getElement( 2 ), 36 );
    EXPECT_EQ( outVector_2.getElement( 3 ), 16 );
+}
+
+template< typename Matrix >
+void test_VectorProduct_smallMatrix3()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
 
    /*
     * Sets up the following 4x4 sparse matrix:
@@ -166,6 +184,15 @@ void test_VectorProduct()
    EXPECT_EQ( outVector_3.getElement( 1 ), 30 );
    EXPECT_EQ( outVector_3.getElement( 2 ), 48 );
    EXPECT_EQ( outVector_3.getElement( 3 ), 66 );
+}
+
+template< typename Matrix >
+void test_VectorProduct_mediumSizeMatrix1()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
 
    /*
     * Sets up the following 8x8 sparse matrix:
@@ -234,6 +261,15 @@ void test_VectorProduct()
    EXPECT_EQ( outVector_4.getElement( 5 ), 188 );
    EXPECT_EQ( outVector_4.getElement( 6 ), 280 );
    EXPECT_EQ( outVector_4.getElement( 7 ), 330 );
+}
+
+template< typename Matrix >
+void test_VectorProduct_mediumSizeMatrix2()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
 
    /*
     * Sets up the following 8x8 sparse matrix:
@@ -307,6 +343,16 @@ void test_VectorProduct()
    EXPECT_EQ( outVector_5.getElement( 5 ), 224 );
    EXPECT_EQ( outVector_5.getElement( 6 ), 352 );
    EXPECT_EQ( outVector_5.getElement( 7 ), 520 );
+}
+
+
+template< typename Matrix >
+void test_VectorProduct_largeMatrix()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
 
    /////
    // Large test
@@ -362,6 +408,15 @@ void test_VectorProduct()
       for( IndexType i = 0; i < rows; i++ )
          EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 );
    }
+}
+
+template< typename Matrix >
+void test_VectorProduct_longRowsMatrix()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
 
    /**
     * Long row test
@@ -372,22 +427,29 @@ void test_VectorProduct()
    if( ! std::is_same< typename Matrix::SegmentsViewType, ChunkedEllpackView_ >::value )
    {
       // TODO: Fix ChunkedEllpack for this test - seems that it allocates too much memory
-      const int columns = 3000;
-      const int rows = 1;
-      Matrix m3( rows, columns );
-      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowsCapacities( rows );
-      rowsCapacities = columns;
-      m3.setRowCapacities( rowsCapacities );
-      auto f = [] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
-         column = localIdx;
-         value = localIdx + 1;
-      };
-      m3.forAllElements( f );
-      TNL::Containers::Vector< double, DeviceType, IndexType > in( columns, 1.0 ), out( rows, 0.0 );
-      m3.vectorProduct( in, out );
-      EXPECT_EQ( out.getElement( 0 ), ( double ) columns * ( double ) (columns + 1 ) / 2.0 );
+      for( auto columns : { 64, 65, 128, 129, 256, 257, 512, 513, 1024, 1025, 2048, 2049, 3000 } )
+      {
+         //std::cerr << "Long-rows-matrix-test: columns = " << columns << std::endl;
+         //const int columns = 3000;
+         const int rows = 33;
+         Matrix m3( rows, columns );
+         TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowsCapacities( rows );
+         rowsCapacities = columns;
+         m3.setRowCapacities( rowsCapacities );
+         auto f = [] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
+            column = localIdx;
+            value = localIdx + row;
+         };
+         m3.forAllElements( f );
+         TNL::Containers::Vector< double, DeviceType, IndexType > in( columns, 1.0 ), out( rows, 0.0 );
+         m3.vectorProduct( in, out );
+         for( IndexType rowIdx = 0; rowIdx < rows; rowIdx++ )
+         {
+            //std::cerr << "Long-rows-matrix-test: rowIndex = " << rowIdx << std::endl;
+            EXPECT_EQ( out.getElement( rowIdx ), ( double ) columns * ( double ) (columns - 1 ) / 2.0 + columns * rowIdx );
+         }
+      }
    }
 }
 
-
 #endif
-- 
GitLab


From 185599c98566ddc6e32605728a22f336d9bdf087 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 31 Mar 2021 20:16:02 +0200
Subject: [PATCH 005/117] Fixing CUDA max grid size.

---
 .../Algorithms/Segments/CSRAdaptiveKernelView.hpp |  6 +++---
 src/TNL/Cuda/LaunchHelpers.h                      | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
index 4f1560857..bc8801eea 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -56,7 +56,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
    //__shared__ BlockType sharedBlocks[ WarpsCount ];
 
-   const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
+   const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
    const Index blockIdx = index / WarpSize;
    if( blockIdx >= blocks.getSize() - 1 )
       return;
@@ -237,8 +237,8 @@ struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduc
 
       Index blocksCount;
 
-      const Index threads = detail::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
-      constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize();
+      const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
+      constexpr size_t maxGridSize = TNL::Cuda::getMaxGridXSize();
 
       // Fill blocks
       size_t neededThreads = blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block
diff --git a/src/TNL/Cuda/LaunchHelpers.h b/src/TNL/Cuda/LaunchHelpers.h
index a9e8bc168..278f23da5 100644
--- a/src/TNL/Cuda/LaunchHelpers.h
+++ b/src/TNL/Cuda/LaunchHelpers.h
@@ -22,6 +22,21 @@ inline constexpr std::size_t getMaxGridSize()
    return 65535;
 }
 
+inline constexpr size_t getMaxGridXSize()
+{
+   return 2147483647;  // 2^31 - 1; presumably the gridDim.x limit (getMaxGridSize() returns 65535) - confirm
+}
+
+inline constexpr size_t getMaxGridYSize()
+{
+   return 65535;  // same value as getMaxGridSize(); presumably the gridDim.y limit - confirm
+}
+
+inline constexpr size_t getMaxGridZSize()
+{
+   return 65535;  // same value as getMaxGridSize(); presumably the gridDim.z limit - confirm
+}
+
 inline constexpr int getMaxBlockSize()
 {
    return 1024;
-- 
GitLab


From 9bb000f4892b9bff5eee2ac4f3e067dab84abca1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 31 Mar 2021 20:37:02 +0200
Subject: [PATCH 006/117] Renaming forEachSegment to forAllSegments.

---
 src/TNL/Algorithms/Segments/BiEllpack.h            | 2 +-
 src/TNL/Algorithms/Segments/BiEllpack.hpp          | 4 ++--
 src/TNL/Algorithms/Segments/BiEllpackView.h        | 2 +-
 src/TNL/Algorithms/Segments/BiEllpackView.hpp      | 2 +-
 src/TNL/Algorithms/Segments/CSR.h                  | 2 +-
 src/TNL/Algorithms/Segments/CSR.hpp                | 4 ++--
 src/TNL/Algorithms/Segments/CSRView.h              | 2 +-
 src/TNL/Algorithms/Segments/CSRView.hpp            | 2 +-
 src/TNL/Algorithms/Segments/ChunkedEllpack.h       | 2 +-
 src/TNL/Algorithms/Segments/ChunkedEllpack.hpp     | 4 ++--
 src/TNL/Algorithms/Segments/ChunkedEllpackView.h   | 2 +-
 src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp | 2 +-
 src/TNL/Algorithms/Segments/Ellpack.h              | 2 +-
 src/TNL/Algorithms/Segments/Ellpack.hpp            | 4 ++--
 src/TNL/Algorithms/Segments/EllpackView.h          | 2 +-
 src/TNL/Algorithms/Segments/EllpackView.hpp        | 2 +-
 src/TNL/Algorithms/Segments/SlicedEllpack.h        | 2 +-
 src/TNL/Algorithms/Segments/SlicedEllpack.hpp      | 4 ++--
 src/TNL/Algorithms/Segments/SlicedEllpackView.h    | 2 +-
 src/TNL/Algorithms/Segments/SlicedEllpackView.hpp  | 2 +-
 20 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/BiEllpack.h b/src/TNL/Algorithms/Segments/BiEllpack.h
index 3a5a7c202..f49786308 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/BiEllpack.h
@@ -110,7 +110,7 @@ namespace TNL
             void forSegments(IndexType begin, IndexType end, Function &&f) const;
 
             template <typename Function>
-            void forEachSegment(Function &&f) const;
+            void forAllSegments(Function &&f) const;
 
             /***
        * \brief Go over all segments and perform a reduction in each of them.
diff --git a/src/TNL/Algorithms/Segments/BiEllpack.hpp b/src/TNL/Algorithms/Segments/BiEllpack.hpp
index 53a3eb905..a455662f7 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpack.hpp
@@ -486,9 +486,9 @@ template< typename Device,
    template< typename Function >
 void
 BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
-   this->getConstView().forEachSegment( f );
+   this->getConstView().forAllSegments( f );
 }
 
 
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h
index 50f69e3aa..b349b292c 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.h
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.h
@@ -121,7 +121,7 @@ class BiEllpackView
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
index 03131a0de..39f25c1f3 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
@@ -343,7 +343,7 @@ template< typename Device,
    template< typename Function >
 void
 BiEllpackView< Device, Index, Organization, WarpSize >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f );
 }
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index e63b4c8da..1db5f839b 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -118,7 +118,7 @@ class CSR
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index 44f9aa799..f2827a60c 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -266,9 +266,9 @@ template< typename Device,
    template< typename Function >
 void
 CSR< Device, Index, Kernel, IndexAllocator >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
-   this->getConstView().forEachSegment( f );
+   this->getConstView().forAllSegments( f );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index cd9e44a2a..64646f761 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -115,7 +115,7 @@ class CSRView
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index 8c9f1e789..4aa83d8e6 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -233,7 +233,7 @@ template< typename Device,
    template< typename Function >
 void
 CSRView< Device, Index, Kernel >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f );
 }
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
index 5abb93b5a..68bcc9c7f 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
@@ -106,7 +106,7 @@ class ChunkedEllpack
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
index b4f60047b..83a778924 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
@@ -431,9 +431,9 @@ template< typename Device,
    template< typename Function >
 void
 ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
-   this->getConstView().forEachSegment( f );
+   this->getConstView().forAllSegments( f );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
index f7211c216..e19e1578e 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
@@ -135,7 +135,7 @@ class ChunkedEllpackView
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
index 26e8fd0f7..61b206104 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
@@ -384,7 +384,7 @@ template< typename Device,
    template< typename Function >
 void
 ChunkedEllpackView< Device, Index, Organization >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f );
 }
diff --git a/src/TNL/Algorithms/Segments/Ellpack.h b/src/TNL/Algorithms/Segments/Ellpack.h
index c88ba6a1d..74145fe1a 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.h
+++ b/src/TNL/Algorithms/Segments/Ellpack.h
@@ -106,7 +106,7 @@ class Ellpack
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
diff --git a/src/TNL/Algorithms/Segments/Ellpack.hpp b/src/TNL/Algorithms/Segments/Ellpack.hpp
index 124e3dfc2..c6cff3dc1 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.hpp
+++ b/src/TNL/Algorithms/Segments/Ellpack.hpp
@@ -294,9 +294,9 @@ template< typename Device,
    template< typename Function >
 void
 Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
-   this->getConstView().forEachSegment( f );
+   this->getConstView().forAllSegments( f );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h
index 77d0d8b7b..cfe9dd238 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.h
+++ b/src/TNL/Algorithms/Segments/EllpackView.h
@@ -105,7 +105,7 @@ class EllpackView
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index 6215f4ef9..c9b94c420 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -262,7 +262,7 @@ template< typename Device,
           int Alignment >
    template< typename Function >
 void EllpackView< Device, Index, Organization, Alignment >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f );
 }
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.h b/src/TNL/Algorithms/Segments/SlicedEllpack.h
index 942306c75..a5e6683dd 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.h
@@ -103,7 +103,7 @@ class SlicedEllpack
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
index 4482cd567..b7089e6f1 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
@@ -328,9 +328,9 @@ template< typename Device,
    template< typename Function >
 void
 SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
-   this->getConstView().forEachSegment( f );
+   this->getConstView().forAllSegments( f );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.h b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
index 2955ee351..a1447c025 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
@@ -104,7 +104,7 @@ class SlicedEllpackView
       void forSegments( IndexType begin, IndexType end, Function&& f ) const;
 
       template< typename Function >
-      void forEachSegment( Function&& f ) const;
+      void forAllSegments( Function&& f ) const;
 
       /***
        * \brief Go over all segments and perform a reduction in each of them.
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
index 94bebca13..f639973f2 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
@@ -317,7 +317,7 @@ template< typename Device,
    template< typename Function >
 void
 SlicedEllpackView< Device, Index, Organization, SliceSize >::
-forEachSegment( Function&& f ) const
+forAllSegments( Function&& f ) const
 {
    this->forSegments( 0, this->getSegmentsCount(), f );
 }
-- 
GitLab


From b936f2a5a7ffdfbbc68f19ece6e3b310fd99dfd3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 31 Mar 2021 20:42:12 +0200
Subject: [PATCH 007/117] Renaming segmentsReduction to reduceSegments.

---
 src/TNL/Algorithms/Segments/BiEllpack.h       |  2 +-
 src/TNL/Algorithms/Segments/BiEllpack.hpp     |  6 ++---
 src/TNL/Algorithms/Segments/BiEllpackView.h   | 10 ++++----
 src/TNL/Algorithms/Segments/BiEllpackView.hpp | 10 ++++----
 src/TNL/Algorithms/Segments/CSR.h             |  2 +-
 src/TNL/Algorithms/Segments/CSR.hpp           |  6 ++---
 .../Algorithms/Segments/CSRAdaptiveKernel.h   |  4 ++--
 .../Algorithms/Segments/CSRAdaptiveKernel.hpp |  4 ++--
 .../Segments/CSRAdaptiveKernelView.h          |  2 +-
 .../Segments/CSRAdaptiveKernelView.hpp        | 18 +++++++-------
 src/TNL/Algorithms/Segments/CSRHybridKernel.h |  2 +-
 .../Algorithms/Segments/CSRHybridKernel.hpp   | 24 +++++++++----------
 src/TNL/Algorithms/Segments/CSRScalarKernel.h |  2 +-
 .../Algorithms/Segments/CSRScalarKernel.hpp   |  2 +-
 src/TNL/Algorithms/Segments/CSRVectorKernel.h |  2 +-
 .../Algorithms/Segments/CSRVectorKernel.hpp   |  6 ++---
 src/TNL/Algorithms/Segments/CSRView.h         |  2 +-
 src/TNL/Algorithms/Segments/CSRView.hpp       |  8 +++----
 src/TNL/Algorithms/Segments/ChunkedEllpack.h  |  2 +-
 .../Algorithms/Segments/ChunkedEllpack.hpp    |  6 ++---
 .../Algorithms/Segments/ChunkedEllpackView.h  | 10 ++++----
 .../Segments/ChunkedEllpackView.hpp           | 12 +++++-----
 src/TNL/Algorithms/Segments/Ellpack.h         |  2 +-
 src/TNL/Algorithms/Segments/Ellpack.hpp       |  6 ++---
 src/TNL/Algorithms/Segments/EllpackView.h     |  2 +-
 src/TNL/Algorithms/Segments/EllpackView.hpp   |  4 ++--
 src/TNL/Algorithms/Segments/SlicedEllpack.h   |  2 +-
 src/TNL/Algorithms/Segments/SlicedEllpack.hpp |  6 ++---
 .../Algorithms/Segments/SlicedEllpackView.h   |  2 +-
 .../Algorithms/Segments/SlicedEllpackView.hpp |  4 ++--
 .../Algorithms/Segments/detail/BiEllpack.h    | 16 ++++++-------
 src/TNL/Algorithms/Segments/detail/CSR.h      |  2 +-
 .../detail/CSRAdaptiveKernelParameters.h      |  2 +-
 .../Segments/detail/ChunkedEllpack.h          | 16 ++++++-------
 src/TNL/Matrices/DenseMatrixView.hpp          |  8 +++++--
 src/TNL/Matrices/SparseMatrixView.hpp         | 16 ++++++-------
 36 files changed, 117 insertions(+), 113 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/BiEllpack.h b/src/TNL/Algorithms/Segments/BiEllpack.h
index f49786308..b580206bc 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/BiEllpack.h
@@ -116,7 +116,7 @@ namespace TNL
        * \brief Go over all segments and perform a reduction in each of them.
        */
             template <typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args>
-            void segmentsReduction(IndexType first, IndexType last, Fetch &fetch, const Reduction &reduction, ResultKeeper &keeper, const Real &zero, Args... args) const;
+            void reduceSegments(IndexType first, IndexType last, Fetch &fetch, const Reduction &reduction, ResultKeeper &keeper, const Real &zero, Args... args) const;
 
             template <typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args>
             void allReduction(Fetch &fetch, const Reduction &reduction, ResultKeeper &keeper, const Real &zero, Args... args) const;
diff --git a/src/TNL/Algorithms/Segments/BiEllpack.hpp b/src/TNL/Algorithms/Segments/BiEllpack.hpp
index a455662f7..5bb5a38bb 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpack.hpp
@@ -500,9 +500,9 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
@@ -515,7 +515,7 @@ void
 BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
 allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h
index b349b292c..5a7d25310 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.h
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.h
@@ -127,7 +127,7 @@ class BiEllpackView
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
       void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
@@ -162,7 +162,7 @@ class BiEllpackView
                 int BlockDim,
                 typename... Args >
       __device__
-      void segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+      void reduceSegmentsKernelWithAllParameters( IndexType gridIdx,
                                                      IndexType first,
                                                      IndexType last,
                                                      Fetch fetch,
@@ -178,7 +178,7 @@ class BiEllpackView
                 int BlockDim,
                 typename... Args >
       __device__
-      void segmentsReductionKernel( IndexType gridIdx,
+      void reduceSegmentsKernel( IndexType gridIdx,
                                     IndexType first,
                                     IndexType last,
                                     Fetch fetch,
@@ -196,7 +196,7 @@ class BiEllpackView
                 int BlockDim,
                 typename... Args_ >
       friend __global__
-      void BiEllpackSegmentsReductionKernel( View_ chunkedEllpack,
+      void BiEllpackreduceSegmentsKernel( View_ chunkedEllpack,
                                              Index_ gridIdx,
                                              Index_ first,
                                              Index_ last,
@@ -207,7 +207,7 @@ class BiEllpackView
                                              Args_... args );
 
       template< typename Index_, typename Fetch_, int BlockDim_, int WarpSize_, bool B_ >
-      friend struct detail::BiEllpackSegmentsReductionDispatcher;
+      friend struct details::BiEllpackreduceSegmentsDispatcher;
 #endif
 };
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
index 39f25c1f3..88347e834 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
@@ -355,7 +355,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 BiEllpackView< Device, Index, Organization, WarpSize >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( this->getStorageSize() == 0 )
@@ -425,7 +425,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          dim3 cudaGridSize = Cuda::getMaxGridSize();
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         detail::BiEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim, Args...  >
+         details::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim, Args...  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
             ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
          cudaThreadSynchronize();
@@ -444,7 +444,7 @@ void
 BiEllpackView< Device, Index, Organization, WarpSize >::
 allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
@@ -518,7 +518,7 @@ template< typename Device,
 __device__
 void
 BiEllpackView< Device, Index, Organization, WarpSize >::
-segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+reduceSegmentsKernelWithAllParameters( IndexType gridIdx,
                                           IndexType first,
                                           IndexType last,
                                           Fetch fetch,
@@ -574,7 +574,7 @@ template< typename Device,
 __device__
 void
 BiEllpackView< Device, Index, Organization, WarpSize >::
-segmentsReductionKernel( IndexType gridIdx,
+reduceSegmentsKernel( IndexType gridIdx,
                          IndexType first,
                          IndexType last,
                          Fetch fetch,
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index 1db5f839b..a06c287f1 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -124,7 +124,7 @@ class CSR
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
       void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index f2827a60c..8560accfe 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -278,9 +278,9 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 CSR< Device, Index, Kernel, IndexAllocator >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
@@ -292,7 +292,7 @@ void
 CSR< Device, Index, Kernel, IndexAllocator >::
 allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
index 640120f86..40f06e1f9 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
@@ -39,7 +39,7 @@ template< int CudaBlockSize,
           typename Real,
           typename... Args >
 __global__ void
-segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
+reduceSegmentsCSRAdaptiveKernel( BlocksView blocks,
                                     int gridIdx,
                                     Offsets offsets,
                                     Index first,
@@ -84,7 +84,7 @@ struct CSRAdaptiveKernel
               typename ResultKeeper,
               typename Real,
               typename... Args >
-   void segmentsReduction( const OffsetsView& offsets,
+   void reduceSegments( const OffsetsView& offsets,
                         Index first,
                         Index last,
                         Fetch& fetch,
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
index a510ac395..c5d809920 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
@@ -100,7 +100,7 @@ template< typename Index,
                typename... Args >
 void
 CSRAdaptiveKernel< Index, Device >::
-segmentsReduction( const OffsetsView& offsets,
+reduceSegments( const OffsetsView& offsets,
                    Index first,
                    Index last,
                    Fetch& fetch,
@@ -109,7 +109,7 @@ segmentsReduction( const OffsetsView& offsets,
                    const Real& zero,
                    Args... args ) const
 {
-   view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+   view.reduceSegments( offsets, first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Index,
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
index 9de407051..a48d2aa34 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
@@ -49,7 +49,7 @@ struct CSRAdaptiveKernelView
              typename ResultKeeper,
              typename Real,
              typename... Args >
-   void segmentsReduction( const OffsetsView& offsets,
+   void reduceSegments( const OffsetsView& offsets,
                         Index first,
                         Index last,
                         Fetch& fetch,
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
index bc8801eea..fb1ab36c1 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -35,7 +35,7 @@ template< typename BlocksView,
           typename Real,
           typename... Args >
 __global__ void
-segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
+reduceSegmentsCSRAdaptiveKernel( BlocksView blocks,
                                     int gridIdx,
                                     Offsets offsets,
                                     Index first,
@@ -183,14 +183,14 @@ template< typename Index,
           bool DispatchScalarCSR =
             detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() ||
             std::is_same< Device, Devices::Host >::value >
-struct CSRAdaptiveKernelSegmentsReductionDispatcher;
+struct CSRAdaptiveKernelreduceSegmentsDispatcher;
 
 template< typename Index,
           typename Device,
           typename Fetch,
           typename Reduction,
           typename ResultKeeper >
-struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true >
+struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true >
 {
 
    template< typename BlocksView,
@@ -208,7 +208,7 @@ struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduc
                        Args... args)
    {
       TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
-         segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+         reduceSegments( offsets, first, last, fetch, reduction, keeper, zero, args... );
    }
 };
 
@@ -217,7 +217,7 @@ template< typename Index,
           typename Fetch,
           typename Reduction,
           typename ResultKeeper >
-struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, false >
+struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, false >
 {
    template< typename BlocksView,
              typename Offsets,
@@ -256,7 +256,7 @@ struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduc
             neededThreads -= maxGridSize * threads;
          }
 
-         segmentsReductionCSRAdaptiveKernel<
+         reduceSegmentsCSRAdaptiveKernel<
                BlocksView,
                Offsets,
                Index, Fetch, Reduction, ResultKeeper, Real, Args... >
@@ -322,7 +322,7 @@ template< typename Index,
                typename... Args >
 void
 CSRAdaptiveKernelView< Index, Device >::
-segmentsReduction( const OffsetsView& offsets,
+reduceSegments( const OffsetsView& offsets,
                    Index first,
                    Index last,
                    Fetch& fetch,
@@ -336,11 +336,11 @@ segmentsReduction( const OffsetsView& offsets,
    if( detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog >= MaxValueSizeLog )
    {
       TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
-         segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+         reduceSegments( offsets, first, last, fetch, reduction, keeper, zero, args... );
       return;
    }
 
-   CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper  >::template
+   CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper  >::template
       reduce< BlocksView, OffsetsView, Real, Args... >( offsets, this->blocksArray[ valueSizeLog ], first, last, fetch, reduction, keeper, zero, args... );
 }
 
diff --git a/src/TNL/Algorithms/Segments/CSRHybridKernel.h b/src/TNL/Algorithms/Segments/CSRHybridKernel.h
index 316d277bd..479b2b287 100644
--- a/src/TNL/Algorithms/Segments/CSRHybridKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRHybridKernel.h
@@ -46,7 +46,7 @@ struct CSRHybridKernel
              typename Reduction,
              typename ResultKeeper,
              typename Real >
-   void segmentsReduction( const OffsetsView& offsets,
+   void reduceSegments( const OffsetsView& offsets,
                                   Index first,
                                   Index last,
                                   Fetch& fetch,
diff --git a/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
index 7d752396b..ad431050f 100644
--- a/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
@@ -30,7 +30,7 @@ template< int ThreadsPerSegment,
           typename ResultKeeper,
           typename Real >
 __global__
-void segmentsReductionCSRHybridVectorKernel(
+void reduceSegmentsCSRHybridVectorKernel(
     int gridIdx,
     const Offsets offsets,
     Index first,
@@ -83,7 +83,7 @@ template< int BlockSize,
           typename ResultKeeper,
           typename Real >
 __global__
-void segmentsReductionCSRHybridMultivectorKernel(
+void reduceSegmentsCSRHybridMultivectorKernel(
     int gridIdx,
     const Offsets offsets,
     Index first,
@@ -233,7 +233,7 @@ template< typename Index,
               typename Real >
 void
 CSRHybridKernel< Index, Device, ThreadsInBlock >::
-segmentsReduction( const OffsetsView& offsets,
+reduceSegments( const OffsetsView& offsets,
                          Index first,
                          Index last,
                          Fetch& fetch,
@@ -258,39 +258,39 @@ segmentsReduction( const OffsetsView& offsets,
             case 0:      // this means zero/empty matrix
                 break;
             case 1:
-                segmentsReductionCSRHybridVectorKernel<  1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                reduceSegmentsCSRHybridVectorKernel<  1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
                     gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             case 2:
-                segmentsReductionCSRHybridVectorKernel<  2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                reduceSegmentsCSRHybridVectorKernel<  2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
                     gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             case 4:
-                segmentsReductionCSRHybridVectorKernel<  4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                reduceSegmentsCSRHybridVectorKernel<  4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
                     gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             case 8:
-                segmentsReductionCSRHybridVectorKernel<  8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                reduceSegmentsCSRHybridVectorKernel<  8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
                     gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             case 16:
-                segmentsReductionCSRHybridVectorKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                reduceSegmentsCSRHybridVectorKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
                     gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             case 32:
-                segmentsReductionCSRHybridVectorKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                reduceSegmentsCSRHybridVectorKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
                     gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             case 64:
-                segmentsReductionCSRHybridMultivectorKernel< ThreadsInBlock,  64, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                reduceSegmentsCSRHybridMultivectorKernel< ThreadsInBlock,  64, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
                     gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             case 128:
-                segmentsReductionCSRHybridMultivectorKernel< ThreadsInBlock, 128, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                reduceSegmentsCSRHybridMultivectorKernel< ThreadsInBlock, 128, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
                     gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             case 256:
-                segmentsReductionCSRHybridMultivectorKernel< ThreadsInBlock, 256, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
+                reduceSegmentsCSRHybridMultivectorKernel< ThreadsInBlock, 256, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real ><<< gridSize, blockSize >>>(
                     gridIdx, offsets, first, last, fetch, reduction, keeper, zero );
                     break;
             default:
diff --git a/src/TNL/Algorithms/Segments/CSRScalarKernel.h b/src/TNL/Algorithms/Segments/CSRScalarKernel.h
index c76708319..bd04670d7 100644
--- a/src/TNL/Algorithms/Segments/CSRScalarKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRScalarKernel.h
@@ -46,7 +46,7 @@ struct CSRScalarKernel
               typename ResultKeeper,
               typename Real,
               typename... Args >
-    static void segmentsReduction( const OffsetsView& offsets,
+    static void reduceSegments( const OffsetsView& offsets,
                                Index first,
                                Index last,
                                Fetch& fetch,
diff --git a/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp b/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp
index dd05fee20..4bed6934c 100644
--- a/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp
@@ -75,7 +75,7 @@ template< typename Index,
               typename... Args >
 void
 CSRScalarKernel< Index, Device >::
-segmentsReduction( const OffsetsView& offsets,
+reduceSegments( const OffsetsView& offsets,
                    Index first,
                    Index last,
                    Fetch& fetch,
diff --git a/src/TNL/Algorithms/Segments/CSRVectorKernel.h b/src/TNL/Algorithms/Segments/CSRVectorKernel.h
index 074f15c5a..fccc1022a 100644
--- a/src/TNL/Algorithms/Segments/CSRVectorKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRVectorKernel.h
@@ -46,7 +46,7 @@ struct CSRVectorKernel
              typename ResultKeeper,
              typename Real,
              typename... Args >
-   static void segmentsReduction( const OffsetsView& offsets,
+   static void reduceSegments( const OffsetsView& offsets,
                                   Index first,
                                   Index last,
                                   Fetch& fetch,
diff --git a/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp b/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp
index 847d1c355..3bf799288 100644
--- a/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp
@@ -30,7 +30,7 @@ template< typename Offsets,
           typename Real,
           typename... Args >
 __global__
-void segmentsReductionCSRKernelVector(
+void reduceSegmentsCSRKernelVector(
     int gridIdx,
     const Offsets offsets,
     Index first,
@@ -130,7 +130,7 @@ template< typename Index,
               typename... Args >
 void
 CSRVectorKernel< Index, Device >::
-segmentsReduction( const OffsetsView& offsets,
+reduceSegments( const OffsetsView& offsets,
                          Index first,
                          Index last,
                          Fetch& fetch,
@@ -149,7 +149,7 @@ segmentsReduction( const OffsetsView& offsets,
     {
         dim3 gridSize;
         TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
-        segmentsReductionCSRKernelVector< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... >
+        reduceSegmentsCSRKernelVector< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... >
         <<< gridSize, blockSize >>>(
             gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... );
     };
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 64646f761..53b441c77 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -121,7 +121,7 @@ class CSRView
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
       void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index 4aa83d8e6..ad1d95653 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -244,12 +244,12 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 CSRView< Device, Index, Kernel >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    if( std::is_same< DeviceType, TNL::Devices::Host >::value )
-      TNL::Algorithms::Segments::CSRScalarKernel< IndexType, DeviceType >::segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+      TNL::Algorithms::Segments::CSRScalarKernel< IndexType, DeviceType >::reduceSegments( offsets, first, last, fetch, reduction, keeper, zero, args... );
    else
-      kernel.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+      kernel.reduceSegments( offsets, first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
@@ -260,7 +260,7 @@ void
 CSRView< Device, Index, Kernel >::
 allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
index 68bcc9c7f..c24b0843c 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
@@ -112,7 +112,7 @@ class ChunkedEllpack
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
       void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
index 83a778924..04343ce5d 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
@@ -443,9 +443,9 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
@@ -457,7 +457,7 @@ void
 ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
 allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
index e19e1578e..134433b04 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
@@ -141,7 +141,7 @@ class ChunkedEllpackView
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
       void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
@@ -161,7 +161,7 @@ class ChunkedEllpackView
                 typename Real,
                 typename... Args >
       __device__
-      void segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+      void reduceSegmentsKernelWithAllParameters( IndexType gridIdx,
                                                      IndexType first,
                                                      IndexType last,
                                                      Fetch fetch,
@@ -176,7 +176,7 @@ class ChunkedEllpackView
                 typename Real,
                 typename... Args >
       __device__
-      void segmentsReductionKernel( IndexType gridIdx,
+      void reduceSegmentsKernel( IndexType gridIdx,
                                     IndexType first,
                                     IndexType last,
                                     Fetch fetch,
@@ -219,7 +219,7 @@ class ChunkedEllpackView
                 typename Real_,
                 typename... Args_ >
       friend __global__
-      void ChunkedEllpackSegmentsReductionKernel( View_ chunkedEllpack,
+      void ChunkedEllpackreduceSegmentsKernel( View_ chunkedEllpack,
                                                   Index_ gridIdx,
                                                   Index_ first,
                                                   Index_ last,
@@ -230,7 +230,7 @@ class ChunkedEllpackView
                                                   Args_... args );
 
       template< typename Index_, typename Fetch_, bool B_ >
-      friend struct detail::ChunkedEllpackSegmentsReductionDispatcher;
+      friend struct details::ChunkedEllpackreduceSegmentsDispatcher;
 #endif
 };
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
index 61b206104..453ac9cdc 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
@@ -396,12 +396,12 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 ChunkedEllpackView< Device, Index, Organization >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( std::is_same< DeviceType, Devices::Host >::value )
    {
-      //segmentsReductionKernel( 0, first, last, fetch, reduction, keeper, zero, args... );
+      //reduceSegmentsKernel( 0, first, last, fetch, reduction, keeper, zero, args... );
       //return;
 
       for( IndexType segmentIdx = first; segmentIdx < last; segmentIdx++ )
@@ -456,7 +456,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
       {
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         detail::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
+         details::ChunkedEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
             ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
       }
@@ -472,7 +472,7 @@ void
 ChunkedEllpackView< Device, Index, Organization >::
 allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
@@ -548,7 +548,7 @@ template< typename Device,
 __device__
 void
 ChunkedEllpackView< Device, Index, Organization >::
-segmentsReductionKernelWithAllParameters( IndexType gridIdx,
+reduceSegmentsKernelWithAllParameters( IndexType gridIdx,
                                           IndexType first,
                                           IndexType last,
                                           Fetch fetch,
@@ -626,7 +626,7 @@ template< typename Device,
 __device__
 void
 ChunkedEllpackView< Device, Index, Organization >::
-segmentsReductionKernel( IndexType gridIdx,
+reduceSegmentsKernel( IndexType gridIdx,
                          IndexType first,
                          IndexType last,
                          Fetch fetch,
diff --git a/src/TNL/Algorithms/Segments/Ellpack.h b/src/TNL/Algorithms/Segments/Ellpack.h
index 74145fe1a..6a1c2b150 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.h
+++ b/src/TNL/Algorithms/Segments/Ellpack.h
@@ -112,7 +112,7 @@ class Ellpack
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
       void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
diff --git a/src/TNL/Algorithms/Segments/Ellpack.hpp b/src/TNL/Algorithms/Segments/Ellpack.hpp
index c6cff3dc1..52ca1ef33 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.hpp
+++ b/src/TNL/Algorithms/Segments/Ellpack.hpp
@@ -307,9 +307,9 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
@@ -322,7 +322,7 @@ void
 Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
 allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h
index cfe9dd238..e32173060 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.h
+++ b/src/TNL/Algorithms/Segments/EllpackView.h
@@ -111,7 +111,7 @@ class EllpackView
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
       void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index c9b94c420..675468ea0 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -273,7 +273,7 @@ template< typename Device,
           int Alignment >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void EllpackView< Device, Index, Organization, Alignment >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
    using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
@@ -318,7 +318,7 @@ template< typename Device,
 void EllpackView< Device, Index, Organization, Alignment >::
 allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.h b/src/TNL/Algorithms/Segments/SlicedEllpack.h
index a5e6683dd..4807c8670 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.h
@@ -109,7 +109,7 @@ class SlicedEllpack
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
       void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
index b7089e6f1..4aa2383e9 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
@@ -341,9 +341,9 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
@@ -356,7 +356,7 @@ void
 SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
 allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.h b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
index a1447c025..dccb55fae 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
@@ -110,7 +110,7 @@ class SlicedEllpackView
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
       void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
index f639973f2..8282ea46c 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
@@ -329,7 +329,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 SlicedEllpackView< Device, Index, Organization, SliceSize >::
-segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
@@ -380,7 +380,7 @@ void
 SlicedEllpackView< Device, Index, Organization, SliceSize >::
 allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/detail/BiEllpack.h b/src/TNL/Algorithms/Segments/detail/BiEllpack.h
index a45e16d77..5605c8fb6 100644
--- a/src/TNL/Algorithms/Segments/detail/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/detail/BiEllpack.h
@@ -292,11 +292,11 @@ template< typename Index,
           typename Fetch,
           int BlockDim = 256,
           int WarpSize = 32,
-          bool HasAllParameters = detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
-struct BiEllpackSegmentsReductionDispatcher{};
+          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+struct BiEllpackreduceSegmentsDispatcher{};
 
 template< typename Index, typename Fetch, int BlockDim, int WarpSize >
-struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim, WarpSize, true >
+struct BiEllpackreduceSegmentsDispatcher< Index, Fetch, BlockDim, WarpSize, true >
 {
    template< typename View,
              typename Reduction,
@@ -314,12 +314,12 @@ struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim, WarpSize, t
                      Real zero,
                      Args... args )
    {
-      biEllpack.template segmentsReductionKernelWithAllParameters< Fetch, Reduction, ResultKeeper, Real, BlockDim, Args... >( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+      biEllpack.template reduceSegmentsKernelWithAllParameters< Fetch, Reduction, ResultKeeper, Real, BlockDim, Args... >( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
    }
 };
 
 template< typename Index, typename Fetch, int BlockDim, int WarpSize >
-struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim, WarpSize, false >
+struct BiEllpackreduceSegmentsDispatcher< Index, Fetch, BlockDim, WarpSize, false >
 {
    template< typename View,
              typename Reduction,
@@ -337,7 +337,7 @@ struct BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim, WarpSize, f
                      Real zero,
                      Args... args )
    {
-      biEllpack.template segmentsReductionKernel< Fetch, Reduction, ResultKeeper, Real, BlockDim, Args... >( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+      biEllpack.template reduceSegmentsKernel< Fetch, Reduction, ResultKeeper, Real, BlockDim, Args... >( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
    }
 };
 
@@ -350,7 +350,7 @@ template< typename View,
           int BlockDim,
           typename... Args >
 __global__
-void BiEllpackSegmentsReductionKernel( View biEllpack,
+void BiEllpackreduceSegmentsKernel( View biEllpack,
                                        Index gridIdx,
                                        Index first,
                                        Index last,
@@ -360,7 +360,7 @@ void BiEllpackSegmentsReductionKernel( View biEllpack,
                                        Real zero,
                                        Args... args )
 {
-   BiEllpackSegmentsReductionDispatcher< Index, Fetch, BlockDim >::exec( biEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+   BiEllpackreduceSegmentsDispatcher< Index, Fetch, BlockDim >::exec( biEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
 }
 #endif
 
diff --git a/src/TNL/Algorithms/Segments/detail/CSR.h b/src/TNL/Algorithms/Segments/detail/CSR.h
index e43a97b67..5b17e5373 100644
--- a/src/TNL/Algorithms/Segments/detail/CSR.h
+++ b/src/TNL/Algorithms/Segments/detail/CSR.h
@@ -105,7 +105,7 @@ class CSR
        * \brief Go over all segments and perform a reduction in each of them.
        */
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
       void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
diff --git a/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h
index f11668c2d..4af0197d2 100644
--- a/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h
+++ b/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h
@@ -93,7 +93,7 @@ struct CSRAdaptiveKernelParameters
 
 template< int SizeOfValue,
           int StreamedSharedMemory_ >
-constexpr int 
+constexpr int
 CSRAdaptiveKernelParameters< SizeOfValue, StreamedSharedMemory_ >::
 getSizeValueLogConstexpr( const int i )
 {
diff --git a/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
index 5f47b0caf..19169b558 100644
--- a/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
@@ -233,11 +233,11 @@ class ChunkedEllpack
 #ifdef HAVE_CUDA
 template< typename Index,
           typename Fetch,
-          bool HasAllParameters = detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
-struct ChunkedEllpackSegmentsReductionDispatcher{};
+          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+struct ChunkedEllpackreduceSegmentsDispatcher{};
 
 template< typename Index, typename Fetch >
-struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, true >
+struct ChunkedEllpackreduceSegmentsDispatcher< Index, Fetch, true >
 {
    template< typename View,
              typename Reduction,
@@ -255,12 +255,12 @@ struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, true >
                      Real zero,
                      Args... args )
    {
-      chunkedEllpack.segmentsReductionKernelWithAllParameters( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+      chunkedEllpack.reduceSegmentsKernelWithAllParameters( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
    }
 };
 
 template< typename Index, typename Fetch >
-struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, false >
+struct ChunkedEllpackreduceSegmentsDispatcher< Index, Fetch, false >
 {
    template< typename View,
              typename Reduction,
@@ -278,7 +278,7 @@ struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, false >
                      Real zero,
                      Args... args )
    {
-      chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+      chunkedEllpack.reduceSegmentsKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
    }
 };
 
@@ -290,7 +290,7 @@ template< typename View,
           typename Real,
           typename... Args >
 __global__
-void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack,
+void ChunkedEllpackreduceSegmentsKernel( View chunkedEllpack,
                                             Index gridIdx,
                                             Index first,
                                             Index last,
@@ -300,7 +300,7 @@ void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack,
                                             Real zero,
                                             Args... args )
 {
-   ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch >::exec( chunkedEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+   ChunkedEllpackreduceSegmentsDispatcher< Index, Fetch >::exec( chunkedEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
 }
 #endif
 
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 7dd5428b5..68b2de7ee 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -297,7 +297,7 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
          return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
       return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
+   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -314,7 +314,7 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
          return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
       return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
+   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -537,7 +537,7 @@ vectorProduct( const InVector& inVector,
    auto keeper = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
       outVectorView[ row ] = matrixMultiplicator * value + outVectorMultiplicator * outVectorView[ row ];
    };
-   this->segments.segmentsReduction( begin, end, fetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
+   this->segments.reduceSegments( begin, end, fetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 94a8fc572..5df969867 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -210,7 +210,7 @@ getNonzeroElementsCount() const
       auto keeper = [=] __cuda_callable__ ( IndexType row, const IndexType& value ) mutable {
          row_sums_view[ row ] = value;
       };
-      this->segments.segmentsReduction( (IndexType) 0, this->getRows(), fetch, std::plus<>{}, keeper, ( IndexType ) 0 );
+      this->segments.reduceSegments( 0, this->getRows(), fetch, std::plus<>{}, keeper, ( IndexType ) 0 );
       return sum( row_sums );
    }
 }
@@ -475,22 +475,22 @@ vectorProduct( const InVector& inVector,
    if( lastRow == 0 )
       lastRow = this->getRows();
    if( isSymmetric() )
-      this->segments.segmentsReduction( firstRow, lastRow, symmetricFetch, std::plus<>{}, keeperGeneral, ( ComputeRealType ) 0.0 );
+      this->segments.reduceSegments( firstRow, lastRow, symmetricFetch, std::plus<>{}, keeperGeneral, ( ComputeRealType ) 0.0 );
    else
    {
       if( outVectorMultiplicator == 0.0 )
       {
          if( matrixMultiplicator == 1.0 )
-            this->segments.segmentsReduction( firstRow, lastRow, fetch, std::plus<>{}, keeperDirect, ( ComputeRealType ) 0.0 );
+            this->segments.reduceSegments( firstRow, lastRow, fetch, std::plus<>{}, keeperDirect, ( ComputeRealType ) 0.0 );
          else
-            this->segments.segmentsReduction( firstRow, lastRow, fetch, std::plus<>{}, keeperMatrixMult, ( ComputeRealType ) 0.0 );
+            this->segments.reduceSegments( firstRow, lastRow, fetch, std::plus<>{}, keeperMatrixMult, ( ComputeRealType ) 0.0 );
       }
       else
       {
          if( matrixMultiplicator == 1.0 )
-            this->segments.segmentsReduction( firstRow, lastRow, fetch, std::plus<>{}, keeperVectorMult, ( ComputeRealType ) 0.0 );
+            this->segments.reduceSegments( firstRow, lastRow, fetch, std::plus<>{}, keeperVectorMult, ( ComputeRealType ) 0.0 );
          else
-            this->segments.segmentsReduction( firstRow, lastRow, fetch, std::plus<>{}, keeperGeneral, ( ComputeRealType ) 0.0 );
+            this->segments.reduceSegments( firstRow, lastRow, fetch, std::plus<>{}, keeperGeneral, ( ComputeRealType ) 0.0 );
       }
    }
 }
@@ -520,7 +520,7 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
       }
       return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
+   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -549,7 +549,7 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
       }
       return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
+   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
-- 
GitLab


From a006571b9ffe370009171d6b5fcd8bdc3b3154a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 31 Mar 2021 20:47:24 +0200
Subject: [PATCH 008/117] Renaming allReduction to reduceAllSegments.

---
 src/TNL/Algorithms/Segments/BiEllpack.h                     | 2 +-
 src/TNL/Algorithms/Segments/BiEllpack.hpp                   | 2 +-
 src/TNL/Algorithms/Segments/BiEllpackView.h                 | 2 +-
 src/TNL/Algorithms/Segments/BiEllpackView.hpp               | 2 +-
 src/TNL/Algorithms/Segments/CSR.h                           | 2 +-
 src/TNL/Algorithms/Segments/CSR.hpp                         | 2 +-
 src/TNL/Algorithms/Segments/CSRView.h                       | 2 +-
 src/TNL/Algorithms/Segments/CSRView.hpp                     | 2 +-
 src/TNL/Algorithms/Segments/ChunkedEllpack.h                | 2 +-
 src/TNL/Algorithms/Segments/ChunkedEllpack.hpp              | 2 +-
 src/TNL/Algorithms/Segments/ChunkedEllpackView.h            | 2 +-
 src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp          | 2 +-
 src/TNL/Algorithms/Segments/Ellpack.h                       | 2 +-
 src/TNL/Algorithms/Segments/Ellpack.hpp                     | 2 +-
 src/TNL/Algorithms/Segments/EllpackView.h                   | 2 +-
 src/TNL/Algorithms/Segments/EllpackView.hpp                 | 2 +-
 src/TNL/Algorithms/Segments/SlicedEllpack.h                 | 2 +-
 src/TNL/Algorithms/Segments/SlicedEllpack.hpp               | 6 +++---
 src/TNL/Algorithms/Segments/SlicedEllpackView.h             | 2 +-
 src/TNL/Algorithms/Segments/SlicedEllpackView.hpp           | 2 +-
 src/TNL/Algorithms/Segments/detail/CSR.h                    | 2 +-
 src/UnitTests/Algorithms/Segments/SegmentsTest.hpp          | 6 +++---
 src/UnitTests/Algorithms/Segments/SegmentsTest_CSR.h        | 4 ++--
 src/UnitTests/Algorithms/Segments/SegmentsTest_Ellpack.h    | 4 ++--
 .../Algorithms/Segments/SegmentsTest_SlicedEllpack.h        | 4 ++--
 25 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/BiEllpack.h b/src/TNL/Algorithms/Segments/BiEllpack.h
index b580206bc..347bfe628 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/BiEllpack.h
@@ -119,7 +119,7 @@ namespace TNL
             void reduceSegments(IndexType first, IndexType last, Fetch &fetch, const Reduction &reduction, ResultKeeper &keeper, const Real &zero, Args... args) const;
 
             template <typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args>
-            void allReduction(Fetch &fetch, const Reduction &reduction, ResultKeeper &keeper, const Real &zero, Args... args) const;
+            void reduceAllSegments(Fetch &fetch, const Reduction &reduction, ResultKeeper &keeper, const Real &zero, Args... args) const;
 
             BiEllpack &operator=(const BiEllpack &source) = default;
 
diff --git a/src/TNL/Algorithms/Segments/BiEllpack.hpp b/src/TNL/Algorithms/Segments/BiEllpack.hpp
index 5bb5a38bb..6dfaa0ab1 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpack.hpp
@@ -513,7 +513,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h
index 5a7d25310..344f96c00 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.h
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.h
@@ -130,7 +130,7 @@ class BiEllpackView
       void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       BiEllpackView& operator=( const BiEllpackView& view );
 
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
index 88347e834..29d554e0a 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
@@ -442,7 +442,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 BiEllpackView< Device, Index, Organization, WarpSize >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index a06c287f1..aee35bbde 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -127,7 +127,7 @@ class CSR
       void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       CSR& operator=( const CSR& rhsSegments ) = default;
 
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index 8560accfe..f0e1d7881 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -290,7 +290,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 CSR< Device, Index, Kernel, IndexAllocator >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 53b441c77..6b47a4a3a 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -124,7 +124,7 @@ class CSRView
       void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       CSRView& operator=( const CSRView& view );
 
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index ad1d95653..e6cbcfff4 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -258,7 +258,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 CSRView< Device, Index, Kernel >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
index c24b0843c..52ab9cd21 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
@@ -115,7 +115,7 @@ class ChunkedEllpack
       void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       ChunkedEllpack& operator=( const ChunkedEllpack& source ) = default;
 
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
index 04343ce5d..3031c7baa 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
@@ -455,7 +455,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
index 134433b04..77352f081 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
@@ -144,7 +144,7 @@ class ChunkedEllpackView
       void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       ChunkedEllpackView& operator=( const ChunkedEllpackView& view );
 
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
index 453ac9cdc..92bc43b77 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
@@ -470,7 +470,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 ChunkedEllpackView< Device, Index, Organization >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Algorithms/Segments/Ellpack.h b/src/TNL/Algorithms/Segments/Ellpack.h
index 6a1c2b150..368d435eb 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.h
+++ b/src/TNL/Algorithms/Segments/Ellpack.h
@@ -115,7 +115,7 @@ class Ellpack
       void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       Ellpack& operator=( const Ellpack& source ) = default;
 
diff --git a/src/TNL/Algorithms/Segments/Ellpack.hpp b/src/TNL/Algorithms/Segments/Ellpack.hpp
index 52ca1ef33..0355ee62b 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.hpp
+++ b/src/TNL/Algorithms/Segments/Ellpack.hpp
@@ -320,7 +320,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h
index e32173060..37e6290db 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.h
+++ b/src/TNL/Algorithms/Segments/EllpackView.h
@@ -114,7 +114,7 @@ class EllpackView
       void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       EllpackView& operator=( const EllpackView& view );
 
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index 675468ea0..b39915185 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -316,7 +316,7 @@ template< typename Device,
           int Alignment >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void EllpackView< Device, Index, Organization, Alignment >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.h b/src/TNL/Algorithms/Segments/SlicedEllpack.h
index 4807c8670..edb35fac3 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.h
@@ -112,7 +112,7 @@ class SlicedEllpack
       void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       SlicedEllpack& operator=( const SlicedEllpack& source ) = default;
 
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
index 4aa2383e9..8f90b3aa5 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
@@ -152,8 +152,8 @@ setSegmentsSizes( const SizesHolder& sizes )
       slices_view[ i ] = res * SliceSize;
       slice_segment_size_view[ i ] = res;
    };
-   ellpack.allReduction( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
-   inplaceExclusiveScan( this->sliceOffsets );
+   ellpack.reduceAllSegments( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
+   this->sliceOffsets.template scan< Algorithms::ScanType::Exclusive >();
    this->size = sum( sizes );
    this->alignedSize = this->sliceOffsets.getElement( slicesCount );
 }
@@ -354,7 +354,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.h b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
index dccb55fae..5b052848c 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
@@ -113,7 +113,7 @@ class SlicedEllpackView
       void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       SlicedEllpackView& operator=( const SlicedEllpackView& view );
 
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
index 8282ea46c..13ef6b038 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
@@ -378,7 +378,7 @@ template< typename Device,
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
 SlicedEllpackView< Device, Index, Organization, SliceSize >::
-allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
 }
diff --git a/src/TNL/Algorithms/Segments/detail/CSR.h b/src/TNL/Algorithms/Segments/detail/CSR.h
index 5b17e5373..b6ce3b6e4 100644
--- a/src/TNL/Algorithms/Segments/detail/CSR.h
+++ b/src/TNL/Algorithms/Segments/detail/CSR.h
@@ -108,7 +108,7 @@ class CSR
       void reduceSegments( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      void reduceAllSegments( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 };
          } // namespace detail
       } // namespace Segments
diff --git a/src/UnitTests/Algorithms/Segments/SegmentsTest.hpp b/src/UnitTests/Algorithms/Segments/SegmentsTest.hpp
index 7073bdb8a..5b92a02c8 100644
--- a/src/UnitTests/Algorithms/Segments/SegmentsTest.hpp
+++ b/src/UnitTests/Algorithms/Segments/SegmentsTest.hpp
@@ -112,7 +112,7 @@ void test_SetSegmentsSizes_EqualSizes_EllpackOnly()
 }
 
 template< typename Segments >
-void test_AllReduction_MaximumInSegments()
+void test_reduceAllSegments_MaximumInSegments()
 {
    using DeviceType = typename Segments::DeviceType;
    using IndexType = typename Segments::IndexType;
@@ -147,13 +147,13 @@ void test_AllReduction_MaximumInSegments()
    auto keep = [=] __cuda_callable__ ( const IndexType i, const IndexType a ) mutable {
       result_view[ i ] = a;
    };
-   segments.allReduction( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
+   segments.reduceAllSegments( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
 
    for( IndexType i = 0; i < segmentsCount; i++ )
       EXPECT_EQ( result.getElement( i ), ( i + 1 ) * segmentSize );
 
    result_view = 0;
-   segments.getView().allReduction( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
+   segments.getView().reduceAllSegments( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
    for( IndexType i = 0; i < segmentsCount; i++ )
       EXPECT_EQ( result.getElement( i ), ( i + 1 ) * segmentSize );
 }
diff --git a/src/UnitTests/Algorithms/Segments/SegmentsTest_CSR.h b/src/UnitTests/Algorithms/Segments/SegmentsTest_CSR.h
index 74219f7db..b1b587712 100644
--- a/src/UnitTests/Algorithms/Segments/SegmentsTest_CSR.h
+++ b/src/UnitTests/Algorithms/Segments/SegmentsTest_CSR.h
@@ -44,11 +44,11 @@ TYPED_TEST( CSRSegmentsTest, setSegmentsSizes_EqualSizes )
     test_SetSegmentsSizes_EqualSizes< CSRSegmentsType >();
 }
 
-TYPED_TEST( CSRSegmentsTest, allReduction_MaximumInSegments )
+TYPED_TEST( CSRSegmentsTest, reduceAllSegments_MaximumInSegments )
 {
     using CSRSegmentsType = typename TestFixture::CSRSegmentsType;
 
-    test_AllReduction_MaximumInSegments< CSRSegmentsType >();
+    test_reduceAllSegments_MaximumInSegments< CSRSegmentsType >();
 }
 
 #endif
diff --git a/src/UnitTests/Algorithms/Segments/SegmentsTest_Ellpack.h b/src/UnitTests/Algorithms/Segments/SegmentsTest_Ellpack.h
index 262ddce6d..af9816ee4 100644
--- a/src/UnitTests/Algorithms/Segments/SegmentsTest_Ellpack.h
+++ b/src/UnitTests/Algorithms/Segments/SegmentsTest_Ellpack.h
@@ -51,11 +51,11 @@ TYPED_TEST( EllpackSegmentsTest, setSegmentsSizes_EqualSizes_EllpackOnly )
     test_SetSegmentsSizes_EqualSizes_EllpackOnly< EllpackSegmentsType >();
 }
 
-TYPED_TEST( EllpackSegmentsTest, allReduction_MaximumInSegments )
+TYPED_TEST( EllpackSegmentsTest, reduceAllSegments_MaximumInSegments )
 {
     using EllpackSegmentsType = typename TestFixture::EllpackSegmentsType;
 
-    test_AllReduction_MaximumInSegments< EllpackSegmentsType >();
+    test_reduceAllSegments_MaximumInSegments< EllpackSegmentsType >();
 }
 
 #endif
diff --git a/src/UnitTests/Algorithms/Segments/SegmentsTest_SlicedEllpack.h b/src/UnitTests/Algorithms/Segments/SegmentsTest_SlicedEllpack.h
index 42a9e7652..2cd9fcd1c 100644
--- a/src/UnitTests/Algorithms/Segments/SegmentsTest_SlicedEllpack.h
+++ b/src/UnitTests/Algorithms/Segments/SegmentsTest_SlicedEllpack.h
@@ -44,11 +44,11 @@ TYPED_TEST( SlicedEllpackSegmentsTest, setSegmentsSizes_EqualSizes )
     test_SetSegmentsSizes_EqualSizes< SlicedEllpackSegmentsType >();
 }
 
-TYPED_TEST( SlicedEllpackSegmentsTest, allReduction_MaximumInSegments )
+TYPED_TEST( SlicedEllpackSegmentsTest, reduceAllSegments_MaximumInSegments )
 {
     using SlicedEllpackSegmentsType = typename TestFixture::SlicedEllpackSegmentsType;
 
-    test_AllReduction_MaximumInSegments< SlicedEllpackSegmentsType >();
+    test_reduceAllSegments_MaximumInSegments< SlicedEllpackSegmentsType >();
 }
 
 #endif
-- 
GitLab


From 90a43d2ca78ea08c0037d7e0c768e7de418d1339 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 31 Mar 2021 21:29:42 +0200
Subject: [PATCH 009/117] Removing Args... from segments reductions.

---
 src/TNL/Algorithms/Segments/BiEllpack.h       | 206 +++++++++---------
 src/TNL/Algorithms/Segments/BiEllpack.hpp     |  12 +-
 src/TNL/Algorithms/Segments/BiEllpackView.h   |   8 +-
 src/TNL/Algorithms/Segments/BiEllpackView.hpp |  30 ++-
 src/TNL/Algorithms/Segments/CSR.h             |   8 +-
 src/TNL/Algorithms/Segments/CSR.hpp           |  12 +-
 src/TNL/Algorithms/Segments/CSRView.h         |   8 +-
 src/TNL/Algorithms/Segments/CSRView.hpp       |  14 +-
 src/TNL/Algorithms/Segments/ChunkedEllpack.h  |   8 +-
 .../Algorithms/Segments/ChunkedEllpack.hpp    |  12 +-
 .../Algorithms/Segments/ChunkedEllpackView.h  |   8 +-
 .../Segments/ChunkedEllpackView.hpp           |  32 ++-
 src/TNL/Algorithms/Segments/Ellpack.h         |   8 +-
 src/TNL/Algorithms/Segments/Ellpack.hpp       |  12 +-
 src/TNL/Algorithms/Segments/EllpackView.h     |   8 +-
 src/TNL/Algorithms/Segments/EllpackView.hpp   |  22 +-
 src/TNL/Algorithms/Segments/SlicedEllpack.h   |   8 +-
 src/TNL/Algorithms/Segments/SlicedEllpack.hpp |  12 +-
 .../Algorithms/Segments/SlicedEllpackView.h   |   8 +-
 .../Algorithms/Segments/SlicedEllpackView.hpp |  22 +-
 20 files changed, 225 insertions(+), 233 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/BiEllpack.h b/src/TNL/Algorithms/Segments/BiEllpack.h
index 347bfe628..9f0aee613 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/BiEllpack.h
@@ -22,163 +22,163 @@ namespace TNL
       namespace Segments
       {
 
-         template <typename Device,
-                   typename Index,
-                   typename IndexAllocator = typename Allocators::Default<Device>::template Allocator<Index>,
-                   ElementsOrganization Organization = Algorithms::Segments::DefaultElementsOrganization<Device>::getOrganization(),
-                   int WarpSize = 32>
-         class BiEllpack
-         {
-         public:
-            using DeviceType = Device;
-            using IndexType = std::remove_const_t<Index>;
-            using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType, IndexAllocator>;
-            static constexpr ElementsOrganization getOrganization() { return Organization; }
-            using ViewType = BiEllpackView< Device, Index, Organization, WarpSize >;
-            template <typename Device_, typename Index_>
-            using ViewTemplate = BiEllpackView<Device_, Index_, Organization, WarpSize >;
-            using ConstViewType = typename ViewType::ConstViewType;
-            using SegmentViewType = typename ViewType::SegmentViewType;
+template <typename Device,
+            typename Index,
+            typename IndexAllocator = typename Allocators::Default<Device>::template Allocator<Index>,
+            ElementsOrganization Organization = Algorithms::Segments::DefaultElementsOrganization<Device>::getOrganization(),
+            int WarpSize = 32>
+class BiEllpack
+{
+   public:
+      using DeviceType = Device;
+      using IndexType = std::remove_const_t<Index>;
+      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType, IndexAllocator>;
+      static constexpr ElementsOrganization getOrganization() { return Organization; }
+      using ViewType = BiEllpackView< Device, Index, Organization, WarpSize >;
+      template <typename Device_, typename Index_>
+      using ViewTemplate = BiEllpackView<Device_, Index_, Organization, WarpSize >;
+      using ConstViewType = typename ViewType::ConstViewType;
+      using SegmentViewType = typename ViewType::SegmentViewType;
 
-            static constexpr bool havePadding() { return true; };
+      static constexpr bool havePadding() { return true; };
 
-            BiEllpack() = default;
+      BiEllpack() = default;
 
-            BiEllpack(const Containers::Vector<IndexType, DeviceType, IndexType> &sizes);
+      BiEllpack(const Containers::Vector<IndexType, DeviceType, IndexType> &sizes);
 
-            BiEllpack(const BiEllpack &segments);
+      BiEllpack(const BiEllpack &segments);
 
-            BiEllpack(const BiEllpack &&segments);
+      BiEllpack(const BiEllpack &&segments);
 
-            static String getSerializationType();
+      static String getSerializationType();
 
-            static String getSegmentsType();
+      static String getSegmentsType();
 
-            ViewType getView();
+      ViewType getView();
 
-            const ConstViewType getConstView() const;
+      const ConstViewType getConstView() const;
 
-            /**
+      /**
        * \brief Number of segments.
        */
-            __cuda_callable__
-                IndexType
-                getSegmentsCount() const;
+      __cuda_callable__
+            IndexType
+            getSegmentsCount() const;
 
-            /**
+      /**
        * \brief Set sizes of particular segments.
        */
-            template <typename SizesHolder = OffsetsHolder>
-            void setSegmentsSizes(const SizesHolder &sizes);
+      template <typename SizesHolder = OffsetsHolder>
+      void setSegmentsSizes(const SizesHolder &sizes);
 
-            void reset();
+      void reset();
 
-            IndexType getSegmentSize(const IndexType segmentIdx) const;
+      IndexType getSegmentSize(const IndexType segmentIdx) const;
 
-            /**
+      /**
        * \brief Number segments.
        */
-            __cuda_callable__
-                IndexType
-                getSize() const;
+      __cuda_callable__
+            IndexType
+            getSize() const;
 
-            __cuda_callable__
-                IndexType
-                getStorageSize() const;
+      __cuda_callable__
+            IndexType
+            getStorageSize() const;
 
-            __cuda_callable__
-                IndexType
-                getGlobalIndex(const IndexType segmentIdx, const IndexType localIdx) const;
+      __cuda_callable__
+            IndexType
+            getGlobalIndex(const IndexType segmentIdx, const IndexType localIdx) const;
 
-            __cuda_callable__
-                SegmentViewType
-                getSegmentView(const IndexType segmentIdx) const;
+      __cuda_callable__
+            SegmentViewType
+            getSegmentView(const IndexType segmentIdx) const;
 
-            /***
+      /***
        * \brief Go over all segments and for each segment element call
        * function 'f' with arguments 'args'. The return type of 'f' is bool.
        * When its true, the for-loop continues. Once 'f' returns false, the for-loop
        * is terminated.
        */
-            template <typename Function>
-            void forElements(IndexType first, IndexType last, Function &&f) const;
+      template< typename Function >
+      void forElements( IndexType first, IndexType last, Function&& f ) const;
 
-            template <typename Function>
-            void forAllElements(Function &&f) const;
+      template <typename Function>
+      void forAllElements(Function&& f ) const;
 
-            template <typename Function>
-            void forSegments(IndexType begin, IndexType end, Function &&f) const;
+      template <typename Function>
+      void forSegments(IndexType begin, IndexType end, Function&& f ) const;
 
-            template <typename Function>
-            void forAllSegments(Function &&f) const;
+      template <typename Function>
+      void forAllSegments( Function&& f ) const;
 
-            /***
+      /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-            template <typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args>
-            void reduceSegments(IndexType first, IndexType last, Fetch &fetch, const Reduction &reduction, ResultKeeper &keeper, const Real &zero, Args... args) const;
-
-            template <typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args>
-            void reduceAllSegments(Fetch &fetch, const Reduction &reduction, ResultKeeper &keeper, const Real &zero, Args... args) const;
+      template <typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments(IndexType first, IndexType last, Fetch &fetch, const Reduction &reduction, ResultKeeper &keeper, const Real &zero ) const;
 
-            BiEllpack &operator=(const BiEllpack &source) = default;
+      template <typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments(Fetch &fetch, const Reduction &reduction, ResultKeeper &keeper, const Real &zero ) const;
 
-            template <typename Device_, typename Index_, typename IndexAllocator_, ElementsOrganization Organization_>
-            BiEllpack &operator=(const BiEllpack<Device_, Index_, IndexAllocator_, Organization_, WarpSize> &source);
+      BiEllpack &operator=(const BiEllpack &source) = default;
 
-            void save(File &file) const;
+      template <typename Device_, typename Index_, typename IndexAllocator_, ElementsOrganization Organization_>
+      BiEllpack &operator=(const BiEllpack<Device_, Index_, IndexAllocator_, Organization_, WarpSize> &source);
 
-            void load(File &file);
+      void save(File &file) const;
 
-            void printStructure(std::ostream &str) const;
+      void load(File &file);
 
-            // TODO: nvcc needs this public because of lambda function used inside
-            template <typename SizesHolder = OffsetsHolder>
-            void performRowBubbleSort(const SizesHolder &segmentsSize);
+      void printStructure(std::ostream &str) const;
 
-            // TODO: the same as  above
-            template <typename SizesHolder = OffsetsHolder>
-            void computeColumnSizes(const SizesHolder &segmentsSizes);
+      // TODO: nvcc needs this public because of lambda function used inside
+      template <typename SizesHolder = OffsetsHolder>
+      void performRowBubbleSort(const SizesHolder &segmentsSize);
 
-         protected:
-            static constexpr int getWarpSize() { return WarpSize; };
+      // TODO: the same as  above
+      template <typename SizesHolder = OffsetsHolder>
+      void computeColumnSizes(const SizesHolder &segmentsSizes);
 
-            static constexpr int getLogWarpSize() { return std::log2(WarpSize); };
+   protected:
+      static constexpr int getWarpSize() { return WarpSize; };
 
-            template <typename SizesHolder = OffsetsHolder>
-            void verifyRowPerm(const SizesHolder &segmentsSizes);
+      static constexpr int getLogWarpSize() { return std::log2(WarpSize); };
 
-            template <typename SizesHolder = OffsetsHolder>
-            void verifyRowLengths(const SizesHolder &segmentsSizes);
+      template <typename SizesHolder = OffsetsHolder>
+      void verifyRowPerm(const SizesHolder &segmentsSizes);
 
-            IndexType getStripLength(const IndexType stripIdx) const;
+      template <typename SizesHolder = OffsetsHolder>
+      void verifyRowLengths(const SizesHolder &segmentsSizes);
 
-            IndexType getGroupLength(const IndexType strip, const IndexType group) const;
+      IndexType getStripLength(const IndexType stripIdx) const;
 
-            IndexType size = 0, storageSize = 0;
+      IndexType getGroupLength(const IndexType strip, const IndexType group) const;
 
-            IndexType virtualRows = 0;
+      IndexType size = 0, storageSize = 0;
 
-            OffsetsHolder rowPermArray;
+      IndexType virtualRows = 0;
 
-            OffsetsHolder groupPointers;
+      OffsetsHolder rowPermArray;
 
-            // TODO: Replace later
-            __cuda_callable__ Index power(const IndexType number, const IndexType exponent) const
-            {
-               if (exponent >= 0)
-               {
-                  IndexType result = 1;
-                  for (IndexType i = 0; i < exponent; i++)
-                     result *= number;
-                  return result;
-               }
-               return 0;
-            };
+      OffsetsHolder groupPointers;
 
-            template <typename Device_, typename Index_, typename IndexAllocator_, ElementsOrganization Organization_, int WarpSize_>
-            friend class BiEllpack;
-         };
+      // TODO: Replace later
+      __cuda_callable__ Index power(const IndexType number, const IndexType exponent) const
+      {
+         if (exponent >= 0)
+         {
+            IndexType result = 1;
+            for (IndexType i = 0; i < exponent; i++)
+               result *= number;
+            return result;
+         }
+         return 0;
+      };
+
+      template <typename Device_, typename Index_, typename IndexAllocator_, ElementsOrganization Organization_, int WarpSize_>
+      friend class BiEllpack;
+};
 
       } // namespace Segments
    }    // namespace Algorithms
diff --git a/src/TNL/Algorithms/Segments/BiEllpack.hpp b/src/TNL/Algorithms/Segments/BiEllpack.hpp
index 6dfaa0ab1..16de5a45c 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpack.hpp
@@ -497,12 +497,12 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int WarpSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
-reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -510,12 +510,12 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int WarpSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
-reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h
index 344f96c00..f728eb1ac 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.h
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.h
@@ -126,11 +126,11 @@ class BiEllpackView
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       BiEllpackView& operator=( const BiEllpackView& view );
 
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
index 29d554e0a..98c5c05c8 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
@@ -352,10 +352,10 @@ template< typename Device,
           typename Index,
           ElementsOrganization Organization,
           int WarpSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 BiEllpackView< Device, Index, Organization, WarpSize >::
-reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
    using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( this->getStorageSize() == 0 )
@@ -425,9 +425,9 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
          dim3 cudaGridSize = Cuda::getMaxGridSize();
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         details::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim, Args...  >
+         details::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
          cudaThreadSynchronize();
          TNL_CHECK_CUDA_DEVICE;
       }
@@ -439,12 +439,12 @@ template< typename Device,
           typename Index,
           ElementsOrganization Organization,
           int WarpSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 BiEllpackView< Device, Index, Organization, WarpSize >::
-reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -513,8 +513,7 @@ template< typename Device,
              typename Reduction,
              typename ResultKeeper,
              typename Real,
-             int BlockDim,
-             typename... Args >
+             int BlockDim >
 __device__
 void
 BiEllpackView< Device, Index, Organization, WarpSize >::
@@ -524,10 +523,9 @@ reduceSegmentsKernelWithAllParameters( IndexType gridIdx,
                                           Fetch fetch,
                                           Reduction reduction,
                                           ResultKeeper keeper,
-                                          Real zero,
-                                          Args... args ) const
+                                          Real zero ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >() ) );
    const IndexType segmentIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + first;
    if( segmentIdx >= last )
       return;
@@ -569,8 +567,7 @@ template< typename Device,
              typename Reduction,
              typename ResultKeeper,
              typename Real,
-             int BlockDim,
-             typename... Args >
+             int BlockDim >
 __device__
 void
 BiEllpackView< Device, Index, Organization, WarpSize >::
@@ -580,10 +577,9 @@ reduceSegmentsKernel( IndexType gridIdx,
                          Fetch fetch,
                          Reduction reduction,
                          ResultKeeper keeper,
-                         Real zero,
-                         Args... args ) const
+                         Real zero ) const
 {
-   using RealType = decltype( fetch( IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = decltype( fetch( IndexType(), std::declval< bool& >() ) );
    Index segmentIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + first;
 
    const IndexType strip = segmentIdx >> getLogWarpSize();
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index aee35bbde..f3761ace5 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -123,11 +123,11 @@ class CSR
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       CSR& operator=( const CSR& rhsSegments ) = default;
 
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index f0e1d7881..de3b77693 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -275,24 +275,24 @@ template< typename Device,
           typename Index,
           typename Kernel,
           typename IndexAllocator >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 CSR< Device, Index, Kernel, IndexAllocator >::
-reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
           typename Index,
           typename Kernel,
           typename IndexAllocator >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 CSR< Device, Index, Kernel, IndexAllocator >::
-reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 6b47a4a3a..39a97b8c5 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -120,11 +120,11 @@ class CSRView
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       CSRView& operator=( const CSRView& view );
 
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index e6cbcfff4..f4cfc2c78 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -241,26 +241,26 @@ forAllSegments( Function&& f ) const
 template< typename Device,
           typename Index,
           typename Kernel >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 CSRView< Device, Index, Kernel >::
-reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
    if( std::is_same< DeviceType, TNL::Devices::Host >::value )
-      TNL::Algorithms::Segments::CSRScalarKernel< IndexType, DeviceType >::reduceSegments( offsets, first, last, fetch, reduction, keeper, zero, args... );
+      TNL::Algorithms::Segments::CSRScalarKernel< IndexType, DeviceType >::reduceSegments( offsets, first, last, fetch, reduction, keeper, zero );
    else
-      kernel.reduceSegments( offsets, first, last, fetch, reduction, keeper, zero, args... );
+      kernel.reduceSegments( offsets, first, last, fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
           typename Index,
           typename Kernel >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 CSRView< Device, Index, Kernel >::
-reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
index 52ab9cd21..5ed48a7f7 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
@@ -111,11 +111,11 @@ class ChunkedEllpack
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       ChunkedEllpack& operator=( const ChunkedEllpack& source ) = default;
 
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
index 3031c7baa..2228b7e62 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
@@ -440,24 +440,24 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           ElementsOrganization Organization >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
-reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
           typename Index,
           typename IndexAllocator,
           ElementsOrganization Organization >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
-reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
index 77352f081..8f8177dbb 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
@@ -140,11 +140,11 @@ class ChunkedEllpackView
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       ChunkedEllpackView& operator=( const ChunkedEllpackView& view );
 
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
index 92bc43b77..c43b57321 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
@@ -393,15 +393,15 @@ forAllSegments( Function&& f ) const
 template< typename Device,
           typename Index,
           ElementsOrganization Organization >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 ChunkedEllpackView< Device, Index, Organization >::
-reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
    using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( std::is_same< DeviceType, Devices::Host >::value )
    {
-      //reduceSegmentsKernel( 0, first, last, fetch, reduction, keeper, zero, args... );
+      //reduceSegmentsKernel( 0, first, last, fetch, reduction, keeper, zero );
       //return;
 
       for( IndexType segmentIdx = first; segmentIdx < last; segmentIdx++ )
@@ -456,9 +456,9 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
       {
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         details::ChunkedEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
+         details::ChunkedEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
-            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
+            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
       }
 #endif
    }
@@ -467,12 +467,12 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
 template< typename Device,
           typename Index,
           ElementsOrganization Organization >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 ChunkedEllpackView< Device, Index, Organization >::
-reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -543,8 +543,7 @@ template< typename Device,
    template< typename Fetch,
              typename Reduction,
              typename ResultKeeper,
-             typename Real,
-             typename... Args >
+             typename Real >
 __device__
 void
 ChunkedEllpackView< Device, Index, Organization >::
@@ -554,10 +553,9 @@ reduceSegmentsKernelWithAllParameters( IndexType gridIdx,
                                           Fetch fetch,
                                           Reduction reduction,
                                           ResultKeeper keeper,
-                                          Real zero,
-                                          Args... args ) const
+                                          Real zero ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >() ) );
 
    const IndexType firstSlice = rowToSliceMapping[ first ];
    const IndexType lastSlice = rowToSliceMapping[ last - 1 ];
@@ -621,8 +619,7 @@ template< typename Device,
    template< typename Fetch,
              typename Reduction,
              typename ResultKeeper,
-             typename Real,
-             typename... Args >
+             typename Real >
 __device__
 void
 ChunkedEllpackView< Device, Index, Organization >::
@@ -632,10 +629,9 @@ reduceSegmentsKernel( IndexType gridIdx,
                          Fetch fetch,
                          Reduction reduction,
                          ResultKeeper keeper,
-                         Real zero,
-                         Args... args ) const
+                         Real zero ) const
 {
-   using RealType = decltype( fetch( IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = decltype( fetch( IndexType(), std::declval< bool& >() ) );
 
    const IndexType firstSlice = rowToSliceMapping[ first ];
    const IndexType lastSlice = rowToSliceMapping[ last - 1 ];
diff --git a/src/TNL/Algorithms/Segments/Ellpack.h b/src/TNL/Algorithms/Segments/Ellpack.h
index 368d435eb..52372e64b 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.h
+++ b/src/TNL/Algorithms/Segments/Ellpack.h
@@ -111,11 +111,11 @@ class Ellpack
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       Ellpack& operator=( const Ellpack& source ) = default;
 
diff --git a/src/TNL/Algorithms/Segments/Ellpack.hpp b/src/TNL/Algorithms/Segments/Ellpack.hpp
index 0355ee62b..9de1f5b97 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.hpp
+++ b/src/TNL/Algorithms/Segments/Ellpack.hpp
@@ -304,12 +304,12 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int Alignment >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
-reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -317,12 +317,12 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int Alignment >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
-reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h
index 37e6290db..23df8813b 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.h
+++ b/src/TNL/Algorithms/Segments/EllpackView.h
@@ -110,11 +110,11 @@ class EllpackView
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       EllpackView& operator=( const EllpackView& view );
 
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index b39915185..120f8f436 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -271,16 +271,16 @@ template< typename Device,
           typename Index,
           ElementsOrganization Organization,
           int Alignment >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void EllpackView< Device, Index, Organization, Alignment >::
-reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
-   using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >() ) );
+   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( Organization == RowMajorOrder )
    {
       const IndexType segmentSize = this->segmentSize;
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable {
          const IndexType begin = segmentIdx * segmentSize;
          const IndexType end = begin + segmentSize;
          RealType aux( zero );
@@ -290,13 +290,13 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
             aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
          keeper( segmentIdx, aux );
       };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+      Algorithms::ParallelFor< Device >::exec( first, last, l );
    }
    else
    {
       const IndexType storageSize = this->getStorageSize();
       const IndexType alignedSize = this->alignedSize;
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable {
          const IndexType begin = segmentIdx;
          const IndexType end = storageSize;
          RealType aux( zero );
@@ -306,7 +306,7 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
             aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
          keeper( segmentIdx, aux );
       };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+      Algorithms::ParallelFor< Device >::exec( first, last, l );
    }
 }
 
@@ -314,11 +314,11 @@ template< typename Device,
           typename Index,
           ElementsOrganization Organization,
           int Alignment >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void EllpackView< Device, Index, Organization, Alignment >::
-reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.h b/src/TNL/Algorithms/Segments/SlicedEllpack.h
index edb35fac3..c69a27142 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.h
@@ -108,11 +108,11 @@ class SlicedEllpack
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       SlicedEllpack& operator=( const SlicedEllpack& source ) = default;
 
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
index 8f90b3aa5..af4184747 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
@@ -338,12 +338,12 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int SliceSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
-reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero, args... );
+   this->getConstView().reduceSegments( first, last, fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
@@ -351,12 +351,12 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int SliceSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
-reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.h b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
index 5b052848c..634d193c0 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
@@ -109,11 +109,11 @@ class SlicedEllpackView
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
-      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
+      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
 
       SlicedEllpackView& operator=( const SlicedEllpackView& view );
 
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
index 13ef6b038..703da8d3c 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
@@ -326,18 +326,18 @@ template< typename Device,
           typename Index,
           ElementsOrganization Organization,
           int SliceSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 SlicedEllpackView< Device, Index, Organization, SliceSize >::
-reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
-   //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
+   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >() ) );
    const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView();
    const auto sliceOffsets_view = this->sliceOffsets.getConstView();
    if( Organization == RowMajorOrder )
    {
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable {
          const IndexType sliceIdx = segmentIdx / SliceSize;
          const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
          const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
@@ -350,11 +350,11 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
             aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+      Algorithms::ParallelFor< Device >::exec( first, last, l );
    }
    else
    {
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
+      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable {
          const IndexType sliceIdx = segmentIdx / SliceSize;
          const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
          //const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
@@ -367,7 +367,7 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
             aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
-      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+      Algorithms::ParallelFor< Device >::exec( first, last, l );
    }
 }
 
@@ -375,12 +375,12 @@ template< typename Device,
           typename Index,
           ElementsOrganization Organization,
           int SliceSize >
-   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
+   template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
 void
 SlicedEllpackView< Device, Index, Organization, SliceSize >::
-reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
+reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... );
+   this->reduceSegments( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero );
 }
 
 template< typename Device,
-- 
GitLab


From fddebacf7dc76d3dfb089a44cf5f6b819811adcc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 2 Apr 2021 15:13:45 +0200
Subject: [PATCH 010/117] Added segments printing.

---
 .../Algorithms/Segments/SegmentsPrinting.h    | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 src/TNL/Algorithms/Segments/SegmentsPrinting.h

diff --git a/src/TNL/Algorithms/Segments/SegmentsPrinting.h b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
new file mode 100644
index 000000000..30c027b74
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
@@ -0,0 +1,71 @@
+/***************************************************************************
+                          SegmentsPrinting.h -  description
+                             -------------------
+    begin                : Apr 1, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <iostream>
+#include <TNL/Containers/Array.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+/** \brief Prints sizes of all segments into \e str as a comma-separated list of the form " [ s0, s1, ... ] ". */
+template< typename Segments >
+std::ostream& printSegments( const Segments& segments, std::ostream& str )
+{
+   using IndexType = typename Segments::IndexType;
+
+   const auto segmentsCount = segments.getSegmentsCount();
+   str << " [";
+   for( IndexType segmentIdx = 0; segmentIdx < segmentsCount; segmentIdx++ )
+   {
+      str << " " << segments.getSegmentSize( segmentIdx );
+      // The separator belongs only between two sizes, never after the last one.
+      if( segmentIdx + 1 < segmentsCount )
+         str << ",";
+   }
+   str << " ] " << std::endl;
+   return str;
+}
+
+
+/** \brief Prints elements of all segments, one segment per line; the values are obtained as fetch( globalIdx ). */
+template< typename Segments, typename Fetch >
+std::ostream& printSegments( const Segments& segments, Fetch&& fetch, std::ostream& str )
+{
+   using IndexType = typename Segments::IndexType;
+   using DeviceType = typename Segments::DeviceType;
+   using ValueType = decltype( fetch( IndexType() ) );
+
+   TNL::Containers::Array< ValueType, DeviceType, IndexType > aux( 1 );
+   auto view = segments.getConstView();
+   for( IndexType segmentIdx = 0; segmentIdx < segments.getSegmentsCount(); segmentIdx++ )
+   {
+      str << "Seg. " << segmentIdx << ": [ ";
+      auto segmentSize = segments.getSegmentSize( segmentIdx );
+      for( IndexType localIdx = 0; localIdx < segmentSize; localIdx++ )
+      {
+         // aux is allocated on DeviceType: the fetched value is stored into it and read back with getElement.
+         aux.forAllElements( [=] __cuda_callable__ ( IndexType elementIdx, ValueType& v ) mutable {
+            v = fetch( view.getGlobalIndex( segmentIdx, localIdx ) );
+         } );
+         str << aux.getElement( 0 );
+         if( localIdx < segmentSize - 1 )
+            str << ", ";
+      }
+      str << " ] " << std::endl;
+   }
+   return str;
+}
+
+      } // namespace Segments
+   } // namespace Algorithms
+} // namespace TNL
-- 
GitLab


From c2ee4999a01e8dbed9a2a977b02e8027c66b3dbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 2 Apr 2021 15:16:55 +0200
Subject: [PATCH 011/117] Added constructors with segments sizes to segments
 and added insertion operators of segments to out streams.

---
 src/TNL/Algorithms/Segments/BiEllpack.h       | 15 ++++++++++++-
 src/TNL/Algorithms/Segments/BiEllpack.hpp     | 17 ++++++++++++--
 src/TNL/Algorithms/Segments/BiEllpackView.h   |  8 +++++++
 src/TNL/Algorithms/Segments/CSR.h             | 22 +++++++++++++++++--
 src/TNL/Algorithms/Segments/CSR.hpp           | 14 +++++++++++-
 src/TNL/Algorithms/Segments/CSRView.h         |  7 ++++++
 src/TNL/Algorithms/Segments/ChunkedEllpack.h  | 14 +++++++++++-
 .../Algorithms/Segments/ChunkedEllpack.hpp    | 16 ++++++++++++--
 src/TNL/Algorithms/Segments/Ellpack.h         | 13 ++++++++++-
 src/TNL/Algorithms/Segments/Ellpack.hpp       | 16 +++++++++++++-
 src/TNL/Algorithms/Segments/EllpackView.h     |  8 +++++++
 src/TNL/Algorithms/Segments/SlicedEllpack.h   | 13 ++++++++++-
 src/TNL/Algorithms/Segments/SlicedEllpack.hpp | 18 +++++++++++++--
 .../Algorithms/Segments/SlicedEllpackView.h   |  6 +++++
 14 files changed, 173 insertions(+), 14 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/BiEllpack.h b/src/TNL/Algorithms/Segments/BiEllpack.h
index 9f0aee613..3cf46e0ef 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/BiEllpack.h
@@ -44,7 +44,12 @@ class BiEllpack
 
       BiEllpack() = default;
 
-      BiEllpack(const Containers::Vector<IndexType, DeviceType, IndexType> &sizes);
+      template< typename SizesContainer >
+      BiEllpack( const SizesContainer& sizes );
+
+      template< typename ListIndex >
+      BiEllpack( const std::initializer_list< ListIndex >& segmentsSizes );
+
 
       BiEllpack(const BiEllpack &segments);
 
@@ -180,6 +185,14 @@ class BiEllpack
       friend class BiEllpack;
 };
 
+/** \brief Insertion operator writing the sizes of the BiEllpack segments into the stream \e str. */
+template< typename Device, typename Index, typename IndexAllocator, ElementsOrganization Organization, int WarpSize >
+std::ostream& operator<<( std::ostream& str, const BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >& segments )
+{
+   return printSegments( segments, str );
+}
+
+
       } // namespace Segments
    }    // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/BiEllpack.hpp b/src/TNL/Algorithms/Segments/BiEllpack.hpp
index 16de5a45c..1412d1be5 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpack.hpp
@@ -26,10 +26,23 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int WarpSize >
+   template< typename SizesContainer >
 BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
-BiEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes )
+BiEllpack( const SizesContainer& segmentsSizes )
 {
-   this->setSegmentsSizes( sizes );
+   this->setSegmentsSizes( segmentsSizes );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization,
+          int WarpSize >
+   template< typename ListIndex >
+BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
+BiEllpack( const std::initializer_list< ListIndex >& segmentsSizes )
+{
+   this->setSegmentsSizes( Containers::Vector< IndexType, DeviceType, IndexType >( segmentsSizes ) );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h
index f728eb1ac..0bb603a3a 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.h
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.h
@@ -210,6 +210,14 @@ class BiEllpackView
       friend struct details::BiEllpackreduceSegmentsDispatcher;
 #endif
 };
+
+/** \brief Insertion operator writing the sizes of the BiEllpackView segments into the stream \e str. */
+template< typename Device, typename Index, ElementsOrganization Organization, int WarpSize >
+std::ostream&
+operator<<( std::ostream& str, const BiEllpackView< Device, Index, Organization, WarpSize >& segments )
+{ return printSegments( segments, str ); }
+
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index f3761ace5..e6eb5ad1d 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -21,6 +21,14 @@ namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
+/**
+ * \brief Segments data structure based on CSR format.
+ *
+ * \tparam Device 
+ * \tparam Index 
+ * \tparam Kernel is the kernel type, CSRScalarKernel< Index, Device > by default.
+ * \tparam IndexAllocator is the type of the index allocator.
+ */
 template< typename Device,
           typename Index,
           typename Kernel = CSRScalarKernel< Index, Device >,
@@ -46,7 +54,11 @@ class CSR
 
       CSR();
 
-      CSR( const SegmentsSizes& sizes );
+      template< typename SizesContainer >
+      CSR( const SizesContainer& sizes );
+
+      template< typename ListIndex >
+      CSR( const std::initializer_list< ListIndex >& segmentsSizes );
 
       CSR( const CSR& segments );
 
@@ -59,7 +71,7 @@ class CSR
       /**
        * \brief Set sizes of particular segments.
        */
-      template< typename SizesHolder = OffsetsHolder >
+      template< typename SizesHolder >
       void setSegmentsSizes( const SizesHolder& sizes );
 
       void reset();
@@ -145,6 +157,12 @@ class CSR
       KernelType kernel;
 };
 
+template< typename Device,
+          typename Index,
+          typename Kernel,
+          typename IndexAllocator >
+std::ostream& operator<<( std::ostream& str, const CSR< Device, Index, Kernel, IndexAllocator >& segments ) { return printSegments( segments, str ); }
+
 template< typename Device,
           typename Index,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index de3b77693..a3b4055c3 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -33,12 +33,24 @@ template< typename Device,
           typename Index,
           typename Kernel,
           typename IndexAllocator >
+   template< typename SizesContainer >
 CSR< Device, Index, Kernel, IndexAllocator >::
-CSR( const SegmentsSizes& segmentsSizes )
+CSR( const SizesContainer& segmentsSizes )
 {
    this->setSegmentsSizes( segmentsSizes );
 }
 
+template< typename Device,
+          typename Index,
+          typename Kernel,
+          typename IndexAllocator >
+   template< typename ListIndex >
+CSR< Device, Index, Kernel, IndexAllocator >::
+CSR( const std::initializer_list< ListIndex >& segmentsSizes )
+{
+   this->setSegmentsSizes( Containers::Vector< IndexType, DeviceType, IndexType >( segmentsSizes ) );
+}
+
 template< typename Device,
           typename Index,
           typename Kernel,
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 39a97b8c5..59300eaa3 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -18,6 +18,7 @@
 #include <TNL/Algorithms/Segments/CSRVectorKernel.h>
 #include <TNL/Algorithms/Segments/CSRHybridKernel.h>
 #include <TNL/Algorithms/Segments/CSRAdaptiveKernel.h>
+#include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -139,6 +140,12 @@ class CSRView
       KernelView kernel;
 };
 
+
+/** \brief Insertion operator writing the sizes of the CSRView segments into the stream \e str. */
+template< typename Device, typename Index, typename Kernel >
+std::ostream& operator<<( std::ostream& str, const CSRView< Device, Index, Kernel >& segments )
+{ return printSegments( segments, str ); }
+
 template< typename Device,
           typename Index >
 using CSRViewScalar = CSRView< Device, Index, CSRScalarKernel< Index, Device > >;
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
index 5ed48a7f7..6c0bf2a22 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
@@ -45,7 +45,11 @@ class ChunkedEllpack
 
       ChunkedEllpack() = default;
 
-      ChunkedEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes );
+      template< typename SizesContainer >
+      ChunkedEllpack( const SizesContainer& sizes );
+
+      template< typename ListIndex >
+      ChunkedEllpack( const std::initializer_list< ListIndex >& segmentsSizes );
 
       ChunkedEllpack( const ChunkedEllpack& segments );
 
@@ -168,6 +172,14 @@ class ChunkedEllpack
       friend class ChunkedEllpack;
 };
 
+/** \brief Insertion operator writing the sizes of the ChunkedEllpack segments into the stream \e str. */
+template< typename Device, typename Index, typename IndexAllocator, ElementsOrganization Organization >
+std::ostream& operator<<( std::ostream& str, const ChunkedEllpack< Device, Index, IndexAllocator, Organization >& segments )
+{
+   return printSegments( segments, str );
+}
+
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
index 2228b7e62..8992d0951 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
@@ -24,10 +24,22 @@ template< typename Device,
           typename Index,
           typename IndexAllocator,
           ElementsOrganization Organization >
+   template< typename SizesContainer >
 ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
-ChunkedEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes )
+ChunkedEllpack( const SizesContainer& segmentsSizes )
 {
-   this->setSegmentsSizes( sizes );
+   this->setSegmentsSizes( segmentsSizes );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization >
+   template< typename ListIndex >
+ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
+ChunkedEllpack( const std::initializer_list< ListIndex >& segmentsSizes )
+{
+   this->setSegmentsSizes( Containers::Vector< IndexType, DeviceType, IndexType >( segmentsSizes ) );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/Ellpack.h b/src/TNL/Algorithms/Segments/Ellpack.h
index 52372e64b..442a1b7c6 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.h
+++ b/src/TNL/Algorithms/Segments/Ellpack.h
@@ -43,7 +43,11 @@ class Ellpack
 
       Ellpack();
 
-      Ellpack( const SegmentsSizes& sizes );
+      template< typename SizesContainer >
+      Ellpack( const SizesContainer& sizes );
+
+      template< typename ListIndex >
+      Ellpack( const std::initializer_list< ListIndex >& segmentsSizes );
 
       Ellpack( const IndexType segmentsCount, const IndexType segmentSize );
 
@@ -131,6 +135,13 @@ class Ellpack
       IndexType segmentSize, size, alignedSize;
 };
 
+template <typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization,
+          int Alignment >
+std::ostream& operator<<( std::ostream& str, const Ellpack< Device, Index, IndexAllocator, Organization, Alignment >& segments ) { return printSegments( segments, str ); }
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/Ellpack.hpp b/src/TNL/Algorithms/Segments/Ellpack.hpp
index 9de1f5b97..589d9f944 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.hpp
+++ b/src/TNL/Algorithms/Segments/Ellpack.hpp
@@ -35,13 +35,27 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int Alignment >
+   template< typename SizesContainer >
 Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
-Ellpack( const SegmentsSizes& segmentsSizes )
+Ellpack( const SizesContainer& segmentsSizes )
    : segmentSize( 0 ), size( 0 ), alignedSize( 0 )
 {
    this->setSegmentsSizes( segmentsSizes );
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization,
+          int Alignment >
+   template< typename ListIndex >
+Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
+Ellpack( const std::initializer_list< ListIndex >& segmentsSizes )
+   : segmentSize( 0 ), size( 0 ), alignedSize( 0 )
+{
+   this->setSegmentsSizes( Containers::Vector< IndexType, DeviceType, IndexType >( segmentsSizes ) );
+}
+
 template< typename Device,
           typename Index,
           typename IndexAllocator,
diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h
index 23df8813b..aebe7b591 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.h
+++ b/src/TNL/Algorithms/Segments/EllpackView.h
@@ -15,6 +15,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/SegmentView.h>
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
+#include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 
 namespace TNL {
@@ -127,6 +128,13 @@ class EllpackView
       IndexType segmentSize, segmentsCount, alignedSize;
 };
 
+/** \brief Insertion operator writing the sizes of the EllpackView segments into the stream \e str. */
+template< typename Device, typename Index, ElementsOrganization Organization, int Alignment >
+std::ostream&
+operator<<( std::ostream& str, const EllpackView< Device, Index, Organization, Alignment >& ellpack )
+{ return printSegments( ellpack, str ); }
+
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.h b/src/TNL/Algorithms/Segments/SlicedEllpack.h
index c69a27142..69b86c100 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.h
@@ -43,7 +43,11 @@ class SlicedEllpack
 
       SlicedEllpack();
 
-      SlicedEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes );
+      template< typename SizesContainer >
+      SlicedEllpack( const SizesContainer& sizes );
+
+      template< typename ListIndex >
+      SlicedEllpack( const std::initializer_list< ListIndex >& segmentsSizes );
 
       SlicedEllpack( const SlicedEllpack& segments );
 
@@ -130,6 +134,13 @@ class SlicedEllpack
       OffsetsHolder sliceOffsets, sliceSegmentSizes;
 };
 
+/** \brief Insertion operator writing the sizes of the SlicedEllpack segments into the stream \e str. */
+template< typename Device, typename Index, typename IndexAllocator, ElementsOrganization Organization, int SliceSize >
+std::ostream& operator<<( std::ostream& str, const SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >& segments )
+{
+   return printSegments( segments, str );
+}
+
       } // namespace Segements
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
index af4184747..6c58c3ed1 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
@@ -37,11 +37,25 @@ template< typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int SliceSize >
+   template< typename SizesContainer >
 SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
-SlicedEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes )
+SlicedEllpack( const SizesContainer& segmentsSizes )
    : size( 0 ), alignedSize( 0 ), segmentsCount( 0 )
 {
-   this->setSegmentsSizes( sizes );
+   this->setSegmentsSizes( segmentsSizes );
+}
+
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization,
+          int SliceSize >
+   template< typename ListIndex >
+SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
+SlicedEllpack( const std::initializer_list< ListIndex >& segmentsSizes )
+   : size( 0 ), alignedSize( 0 ), segmentsCount( 0 )
+{
+   this->setSegmentsSizes( Containers::Vector< IndexType, DeviceType, IndexType >( segmentsSizes ) );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.h b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
index 634d193c0..aa20e758d 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
@@ -128,6 +128,12 @@ class SlicedEllpackView
       OffsetsView sliceOffsets, sliceSegmentSizes;
 };
 
+/** \brief Insertion operator writing the sizes of the SlicedEllpackView segments into the stream \e str. */
+template< typename Device, typename Index, ElementsOrganization Organization, int SliceSize >
+std::ostream&
+operator<<( std::ostream& str, const SlicedEllpackView< Device, Index, Organization, SliceSize >& segments )
+{ return printSegments( segments, str ); }
+
       } // namespace Segements
    }  // namespace Algorithms
 } // namespace TNL
-- 
GitLab


From 3bbf860dfc81425138ef058bae674236bc48d7aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 2 Apr 2021 15:17:28 +0200
Subject: [PATCH 012/117] Added documentation for Segments namespace.

---
 .../Examples/Algorithms/CMakeLists.txt        |   2 +
 .../Segments/.SegmentsExample_General.cpp.swp | Bin 0 -> 12288 bytes
 .../Algorithms/Segments/CMakeLists.txt        |  23 ++++
 .../Segments/SegmentsExample_General.cpp      |  79 ++++++++++++
 .../Segments/SegmentsExample_General.cu       |   1 +
 src/TNL/Algorithms/Segments/_NamespaceDoxy.h  | 112 ++++++++++++++++++
 6 files changed, 217 insertions(+)
 create mode 100644 Documentation/Examples/Algorithms/Segments/.SegmentsExample_General.cpp.swp
 create mode 100644 Documentation/Examples/Algorithms/Segments/CMakeLists.txt
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cu
 create mode 100644 src/TNL/Algorithms/Segments/_NamespaceDoxy.h

diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt
index 982b9c06f..8afdc50cc 100644
--- a/Documentation/Examples/Algorithms/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
@@ -1,3 +1,5 @@
+ADD_SUBDIRECTORY( Segments )
+
 IF( BUILD_CUDA )
    CUDA_ADD_EXECUTABLE( SortingExampleCuda SortingExample.cu)
    ADD_CUSTOM_COMMAND( COMMAND SortingExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out )
diff --git a/Documentation/Examples/Algorithms/Segments/.SegmentsExample_General.cpp.swp b/Documentation/Examples/Algorithms/Segments/.SegmentsExample_General.cpp.swp
new file mode 100644
index 0000000000000000000000000000000000000000..f50e0038f2a01cc4ed7c7742797ac5bb2531788d
GIT binary patch
literal 12288
zcmYc?2=nw+u+TGPU|?VnU|=W_Pfw5w@MHK8B*2iLlv<Qgnv_}ulEZ_O6N`#6Q*{&b
z@>21rVqmC)8>nBNUzA;3keHmRpPZkPs$Y_qqnnairtgxUT$-DjSCUwgnV+ZcT9KGr
zkds=h@0gRGUzAyrkz1@EoSF_&fM7rs#k;5Gr4}XT=p`2v;B~~P<Y)+t#1JSgNz=99
zWiU1}Gyr*DSxHerSSS?49L1v{Fd71*Aut*OqaiRF0;3@?8UmvsFd70QBm_zd7#Zpr
z7#NtK{`H5_jA%5JJ4%g)z-S1JhQMeDjE2By2#kinXb6mkz-S1JhQMeDjE2By2n@jx
zNK9d1_`uA-;K2%+|A+Pe-|#aqT;pe8*v!ws(8bTd5X;ZNV9(FMpvcd_puo?-z{JnM
z@S2Z-;Uymf!xlaUhDJUHh6X+ch6+9g26sLN1_wR{21z~!h8w&L4A*%X7|!!DFs$HZ
zU|7h@z~INrz+lhIz+lJAz+lG9z`(`Jz`)GQz;K_3fgy;8fkB^#fq{{Sf#D}N1H%t)
z28IvZ3=D_385nkRGcfpbGcd?=Gcd?;Gcbs8GcereVqmzz#lTR`#lR5E#lUcelYwD5
zCj-MmP6h^BP6h@sP6h@6P6mck91IMTIT#owaxgGV;9y|L<zQgQ=3rou<zQeq&d$J4
z$<Dx_%+A1|#LmE=$j-oUpN)ZG9~%S1UN#1XayABrAT|buKsE*j2Q~(VU#tuaH(41N
zZm=>i^szE9WU?|aB(X9uc(Fpl(F_!pLoh9jI(ak%Mniy@5U6EPP*5mJEh#O^Q*e#&
z42ciM6bW^9b`1`;W>8MeOUX<FX@IR=f-YIIQE)CzNp#K0DM(DtR<P61v<4|EE=jSn
zO3p7WQLwR5P=YE|$WK#%DJ+JrfKtfMQ*aJ-aa3>*2(?mB0x1XUP0dTmv1R}nh1EXi
z;2^?wffNyEk8);ON@|*dhhvy)JjgZJE%wMSE`hn9ERSa77ncy}NswJ2_Y<@SqzKuQ
zIP3t0WNC3`UOGy2*(!wi`B+&YR;F25!Ghb$3hGrGh#HsFvdrYvVk;|9bn7T&=9MVe
zfukB>D#$5#O>_<pBF;3B@1fQ!;58Ad2;EeWeoQxlM9Fa@NFUs^Y6g(ka}zW3G{Aw7
zSd^ZuqmZ1DSfr(pSd?BC9jl<pzyQuj;Dij%1R)uz3dN<l#R`c<sjx%<;@a3KfE7Yg
zyn=!PEH?FuQd3HkQyp`1U~!@WG9W%JwIn%12coGUr?l9{uDVu7Av-m-Ku5tq&p-i^
z18c#N12V2Ow>Z8mGqpS#X0m5Wg+i=?twLF1PH8GwSz>8PKG*~WTZL%bScUlb<kFPH
z_~gW#oW!J@)cAM>4Ta?Vyy6lC&%Bh>ijc~JR5c{ibfA(c`K3uYscH~|6*LucOG`l7
z6{<n05AGmniIb98k_dHIdQN^)Vh-3@2<H@+=0d!R$4zk8pg0C0nv<UlHdhDXZUwky
zItoeo`8jF|$@#ejr6s8fnhLsh3J@<qTnr8sP?Uk4sbC8V{bIfJ)RM5w)N*J+2hKmv
z`FSOYnR%&2#a32fsmUezMK(}_bQB=@1{7k*j<5rpr2%s`*i=|7JLi|?m1t-}<559N
z9~62DS_;lkcY<65DoRsxQo)Nz6*BV_3KEM-GLuVl5{nQ<g0$&tX=#BAvx1_`yb_oV
z2zzxDkW&*ltwNFy*zaIDqJkeDsHmX`t6gx!wE{Fvz`Ym%a-~8^MydixheB>*USfJ`
zib7H)SPY&qP<&Pk3n&E;0O<oqS2QA`!HFHJ*)ydAoWv9q6f)B^6krKN!NwLAaG+$U
z39<pIqgXF3zsNBs#}yjf8aQ(ZBHcl?dZtt$M>@<hWHETaVWviu+yjmexaSmX!2yJp
z77<CsGp{(cs06GAR7m9$P9Gqn6>LG}2RL{zlC@(|QDP;wRIOkKwg!=;K#mG7$uCMw
zPYuqjO2v|p9dke-QIe{V2r^V5Ex!mQdch8WCtPsZ2u_%wmIN#XD->r|rJ@w2u=WH-
zaf;9aRZ^{BsH0${qhPF~V4|a7ijfGNi&7IyQenxVSfM<#Bts!Rvn(|aAqKWy50pya
z4u#Z%h%{iUP*PctnwOZHssM9>l@(kX>`90gL<ymQRS!bmnxUGZEI%^^ZXmQ_rJ>1C
zlA2qPlUR~!gK4XR9Rq`MW?phmX-cYsO^Bb5KE%XgeNa`dmtluP7F6wF$|Bkdh*j*+
rT9`P+&cQ*L3K1o!KBNFeRgzYkmke4=pO}N}*v$OmlA_eaTssB;8bsWH

literal 0
HcmV?d00001

diff --git a/Documentation/Examples/Algorithms/Segments/CMakeLists.txt b/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
new file mode 100644
index 000000000..c7f05c37f
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
@@ -0,0 +1,23 @@
+set( COMMON_EXAMPLES
+   SegmentsExample_General
+)
+
+if( BUILD_CUDA )
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
+   endforeach()
+else()
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      add_executable( ${target} ${target}.cpp )
+      add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )
+   endforeach()
+endif()
+
+IF( BUILD_CUDA )
+   ADD_CUSTOM_TARGET( RunSegmentsExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
+ELSE()
+   ADD_CUSTOM_TARGET( RunSegmentsExamples ALL DEPENDS ${HOST_OUTPUTS} )
+ENDIF()
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
new file mode 100644
index 000000000..8e148af90
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
@@ -0,0 +1,79 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Segments >
+void SegmentsExample()
+{
+   using DeviceType = typename Segments::DeviceType;
+   using IndexType = typename Segments::IndexType;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   Segments segments{ 1, 2, 3, 4, 5 };
+   std::cout << "Segments sizes are: " << segments << std::endl;
+
+   /***
+    * Allocate array for the segments;
+    */
+   TNL::Containers::Array< double, DeviceType > data( segments.getStorageSize() );
+   data = 0.0;
+
+   /***
+    * Insert data into particular segments.
+    */
+   auto data_view = data.getView();
+   segments.forAllElements( [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable {
+      if( localIdx <= segmentIdx )
+         data_view[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Print the data managed by the segments.
+    */
+   auto fetch = [=] __cuda_callable__ ( IndexType globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+
+   /***
+    * Compute sums of elements in particular segments.
+    */
+   TNL::Containers::Vector< double, DeviceType, IndexType > sums( segments.getSegmentsCount() );
+   auto sums_view = sums.getView();
+   auto sum_fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) -> double {
+      return data_view[ globalIdx ];
+   };
+   auto keep = [=] __cuda_callable__ ( const IndexType& segmentIdx, const double& value ) mutable {
+      sums_view[ segmentIdx ] = value;
+   };
+   segments.reduceAllSegments( sum_fetch, std::plus<>{}, keep, 0.0 );
+   std::cout << "The sums are: " << sums << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   using HostCSR = TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int >;
+   using HostEllpack = TNL::Algorithms::Segments::Ellpack< TNL::Devices::Host, int >;
+   using CudaCSR = TNL::Algorithms::Segments::CSR< TNL::Devices::Cuda, int >;
+   using CudaEllpack = TNL::Algorithms::Segments::Ellpack< TNL::Devices::Cuda, int >;
+
+
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< HostCSR >();
+
+   std::cout << "Example of Ellpack segments on host: " << std::endl;
+   SegmentsExample< HostEllpack >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< CudaCSR >();
+
+   std::cout << "Example of Ellpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< CudaEllpack >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cu
new file mode 120000
index 000000000..64abaf44d
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cu
@@ -0,0 +1 @@
+SegmentsExample_General.cpp
\ No newline at end of file
diff --git a/src/TNL/Algorithms/Segments/_NamespaceDoxy.h b/src/TNL/Algorithms/Segments/_NamespaceDoxy.h
new file mode 100644
index 000000000..d9fdeb64d
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/_NamespaceDoxy.h
@@ -0,0 +1,112 @@
+/***************************************************************************
+                          _NamespaceDoxy.h -  description
+                             -------------------
+    begin                : Apr 1, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+namespace TNL {
+   namespace Algorithms {
+/**
+ * \brief Namespace holding segments data structures.
+
+ *Segments* represent a data structure for manipulation with several local arrays (also referred to as segments)
+ which, in general, have different sizes. All the local arrays are supposed to be allocated in one contiguous global array.
+ The segments data structure offers a mapping between indexes of particular local arrays and indexes
+ of the global array. In addition, one can perform parallel operations like for or flexible reduction on particular
+ local arrays.
+
+ A typical example for use of *segments* is implementation of sparse matrices. Sparse matrix like the following
+ \f[
+  \left(
+  \begin{array}{ccccc}
+   1  &  0  &  2  &  0  &  0 \\
+    0  &  0  &  5  &  0  &  0 \\
+    3  &  4  &  7  &  9  &  0 \\
+    0  &  0  &  0  &  0  & 12 \\
+   0  &  0  & 15  & 17  & 20
+  \end{array}
+  \right)
+ \f]
+ is usually first compressed which means that the zero elements are omitted to get the following "matrix":
+
+ \f[
+ \begin{array}{ccccc}
+    1  &   2  \\
+    5   \\
+    3  &   4  &  7 &  9   \\
+    12 \\
+    15 & 17  & 20
+ \end{array}
+ \f]
+ We have to store the column index of each matrix element as well, in a "matrix" like this:
+ \f[
+ \begin{array}{ccccc}
+    0  &   2  \\
+    2   \\
+    0  &   1  &  2 &  3   \\
+    4 \\
+    2 & 3  & 4
+ \end{array}
+ \f]
+
+ Such "matrices" can be stored in memory in a row-wise manner in one contiguous array for performance reasons. The first "matrix" (i.e. values of the matrix elements)
+ would be stored as follows
+
+ \f[
+    \begin{array}{|cc|c|cccc|c|ccc|} 1 & 2 &  5 & 3 & 4 & 7 & 9 & 12 & 15 & 17 & 20 \end{array}
+ \f]
+
+and the second one (i.e. column indexes of the matrix values) as follows
+
+\f[
+    \begin{array}{|cc|c|cccc|c|ccc|} 0 & 2 & 2 & 0 & 1 & 2 & 3 & 4 & 2 & 3 & 4 \end{array}
+ \f]
+
+What we see above is so called [CSR sparse matrix format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)).
+It is the most popular format for storage of sparse matrices designed for high performance. However, it may not be the most efficient format for storage
+of sparse matrices on GPUs. Therefore many other formats have been developed to get better performance. These formats often have different layout
+of the matrix elements in the memory. They have to deal especially with two difficulties:
+
+1. Efficient storage of matrix elements in the memory to fulfill the requirements of coalesced memory accesses on GPUs or good spatial locality
+ for efficient use of caches on CPUs.
+2. Efficient mapping of GPU threads to different matrix rows.
+
+The need to work with this kind of data structure is not limited to sparse matrices. We could name at least a few others:
+
+1. Efficient storage of [graphs](https://en.wikipedia.org/wiki/Graph_(discrete_mathematics)) - one segment represents one graph node,
+   the elements in one segment are indexes of its neighbors.
+2. [Unstructured numerical meshes](https://en.wikipedia.org/wiki/Types_of_mesh) - unstructured numerical mesh is a graph in fact.
+3. [Particle in cell method](https://en.wikipedia.org/wiki/Particle-in-cell) - one segment represents one cell, the elements in one segment
+   are indexes of the particles.
+4. [K-means clustering](https://en.wikipedia.org/wiki/K-means_clustering) - one segment represents one cluster, the elements represent vectors
+   belonging to the given cluster.
+5. [Hashing](https://arxiv.org/abs/1907.02900) - segments are particular rows of the hash table, elements in segments correspond to colliding
+   hashed elements.
+
+In general, segments can be used for problems that somehow correspond to a 2D data structure where each row can have a different size and we need
+to perform miscellaneous operations within the rows. The name *segments* comes from segmented parallel reduction or
+[segmented scan (prefix-sum)](https://en.wikipedia.org/wiki/Segmented_scan).
+
+The following example demonstrates the essence of *segments* in TNL:
+
+\includelineno Algorithms/Segments/SegmentsExample_General.cpp
+
+The result looks as follows:
+
+\include SegmentsExample_General.out
+
+*/
+
+
+
+      namespace Segments {
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
-- 
GitLab


From 5864a51cf3b49f07536a594d21d599353eaaec11 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 3 Apr 2021 15:03:01 +0200
Subject: [PATCH 013/117] Writing documentation on segmenets.

---
 .../Segments/SegmentsExample_General.cpp      |  3 +-
 src/TNL/Algorithms/Segments/CSR.h             |  4 ++-
 src/TNL/Algorithms/Segments/_NamespaceDoxy.h  | 30 +++++++++++++++++++
 3 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
index 8e148af90..e50c6d1ed 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
@@ -21,8 +21,7 @@ void SegmentsExample()
    /***
     * Allocate array for the segments;
     */
-   TNL::Containers::Array< double, DeviceType > data( segments.getStorageSize() );
-   data = 0.0;
+   TNL::Containers::Array< double, DeviceType > data( segments.getStorageSize(), 0.0 );
 
    /***
     * Insert data into particular segments.
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index e6eb5ad1d..5622a139b 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -22,7 +22,9 @@ namespace TNL {
       namespace Segments {
 
 /**
- * \brief Segments data structure based on CSR format.
+ * \brief Data structure for CSR segments format.
+ *
+ * See \ref TNL::Algorithms::Segments for more details about segments.
  *
  * \tparam Device 
  * \tparam Index 
diff --git a/src/TNL/Algorithms/Segments/_NamespaceDoxy.h b/src/TNL/Algorithms/Segments/_NamespaceDoxy.h
index d9fdeb64d..064e6ab45 100644
--- a/src/TNL/Algorithms/Segments/_NamespaceDoxy.h
+++ b/src/TNL/Algorithms/Segments/_NamespaceDoxy.h
@@ -97,10 +97,40 @@ The following example demonstrates the essence of *segments* in TNL:
 
 \includelineno Algorithms/Segments/SegmentsExample_General.cpp
 
+We demonstrate two formats of segments - \ref TNL::Algorithms::Segments::CSR and \ref TNL::Algorithms::Segments::Ellpack running on both CPU and GPU
+(lines 58-76). For each of them, we call function `SegmentsExample` which first creates given segments (line 18). The segments are defined by the sizes of
+particular segments.
+
+Next we allocate an array with data related to the segments (line 24). The number of elements managed by the segments is given by
+\ref TNL::Algorithms::Segments::CSR::getStorageSize and \ref TNL::Algorithms::Segments::Ellpack::getStorageSize respectively.
+
+Next we setup the segments elements (lines 29-33) by calling \ref TNL::Algorithms::Segments::CSR::forAllElements
+(and \ref TNL::Algorithms::Segments::Ellpack::forAllElements respectively) which iterates over all elements of the segments
+in parallel and performs given lambda function. The lambda function receives index of the segment (`segmentIdx`),
+index of the element within the segment (`localIdx`), index of the element within the array `data` and a reference to boolean (`compute`) which serves as a
+hint for interrupting the iteration over the elements of given segment when it is set to `false`. The value of the elements having the local index smaller or equal
+to the segments index is set to the value of the segment index. It creates, in fact, a lower triangular matrix whose elements have values equal to the row index.
+
+Next we use a function \ref TNL::Algorithms::Segments::printSegments to print the content of the segments (lines 38-39). To do this we have to provide a lambda function
+`fetch` (line 38) which returns value of elements with given global index.
+
+Finally we show how to compute sum of all elements in each segment. Firstly, we create vector into which we will store the sums (line 44) and get its view (line 45).
+The size of the vector is given by the number of the segments which can be obtained by the means of the method \ref TNL::Algorithms::Segments::CSR::getSegmentsCount
+(and \ref TNL::Algorithms::Segments::Ellpack::getSegmentsCount respectively). The sums are computed using the method \ref TNL::Algorithms::Segments::CSR::reduceAllSegments
+(and \ref TNL::Algorithms::Segments::Ellpack::reduceAllSegments respectively) which works the same way as the flexible parallel reduction (\ref TNL::Algorithms::Reduction).
+It requires lambda functions `fetch` for reading the data related to particular elements of the segments, function `reduce` which is \ref std::plus in this case and a
+function `keep` to store the result of sums in particular segments.
+
 The result looks as follows:
 
 \include SegmentsExample_General.out
 
+Note that the Ellpack format manages more elements than we asked for. It is because some formats use padding elements for more efficient memory accesses. The padding
+elements are available to the user as well and so we must ensure that we work only with those elements we want to. This is the reason why we use the if statement on the
+line 31 when setting up the values of the elements in segments. The padding elements can be used in case when we later need more elements than we requested. However,
+the segments data structure does not allow any resizing of the segments. One can change the sizes of the segments, however, the access to the originally managed data
+becomes invalid at that moment.
+
 */
 
 
-- 
GitLab


From 31bfdd99c66924643b481dd79a536d21cf39ff55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 4 Apr 2021 13:41:07 +0200
Subject: [PATCH 014/117] Moving CSR kernels to Algorithms::Segments::Kernels.

---
 src/TNL/Algorithms/Segments/CSRView.h         |   8 +-
 src/TNL/Algorithms/Segments/ChunkedEllpack.h  |   5 +-
 .../{ => Kernels}/CSRAdaptiveKernel.h         |   8 +-
 .../{ => Kernels}/CSRAdaptiveKernel.hpp       |   4 +-
 .../{ => Kernels}/CSRAdaptiveKernelView.h     |   6 +-
 .../{ => Kernels}/CSRAdaptiveKernelView.hpp   |   8 +-
 .../Segments/{ => Kernels}/CSRHybridKernel.h  |   2 +-
 .../{ => Kernels}/CSRHybridKernel.hpp         |   2 +-
 .../Segments/{ => Kernels}/CSRScalarKernel.h  |   2 +-
 .../{ => Kernels}/CSRScalarKernel.hpp         |   2 +-
 .../Segments/{ => Kernels}/CSRVectorKernel.h  |   2 +-
 .../{ => Kernels}/CSRVectorKernel.hpp         |   2 +-
 .../CSRAdaptiveKernelBlockDescriptor.h        | 251 ++++++++++++++++++
 .../details/CSRAdaptiveKernelParameters.h     | 112 ++++++++
 .../Algorithms/Segments/SlicedEllpackView.h   |   2 +-
 15 files changed, 389 insertions(+), 27 deletions(-)
 rename src/TNL/Algorithms/Segments/{ => Kernels}/CSRAdaptiveKernel.h (92%)
 rename src/TNL/Algorithms/Segments/{ => Kernels}/CSRAdaptiveKernel.hpp (97%)
 rename src/TNL/Algorithms/Segments/{ => Kernels}/CSRAdaptiveKernelView.h (90%)
 rename src/TNL/Algorithms/Segments/{ => Kernels}/CSRAdaptiveKernelView.hpp (97%)
 rename src/TNL/Algorithms/Segments/{ => Kernels}/CSRHybridKernel.h (96%)
 rename src/TNL/Algorithms/Segments/{ => Kernels}/CSRHybridKernel.hpp (99%)
 rename src/TNL/Algorithms/Segments/{ => Kernels}/CSRScalarKernel.h (96%)
 rename src/TNL/Algorithms/Segments/{ => Kernels}/CSRScalarKernel.hpp (98%)
 rename src/TNL/Algorithms/Segments/{ => Kernels}/CSRVectorKernel.h (96%)
 rename src/TNL/Algorithms/Segments/{ => Kernels}/CSRVectorKernel.hpp (98%)
 create mode 100644 src/TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h
 create mode 100644 src/TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelParameters.h

diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 59300eaa3..2d550aada 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -14,10 +14,10 @@
 
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/SegmentView.h>
-#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
-#include <TNL/Algorithms/Segments/CSRVectorKernel.h>
-#include <TNL/Algorithms/Segments/CSRHybridKernel.h>
-#include <TNL/Algorithms/Segments/CSRAdaptiveKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.h>
 #include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 namespace TNL {
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
index 6c0bf2a22..97abb3864 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
@@ -175,9 +175,8 @@ class ChunkedEllpack
 template <typename Device,
           typename Index,
           typename IndexAllocator,
-          ElementsOrganization Organization,
-          int Alignment >
-std::ostream& operator<<( std::ostream& str, const Ellpack< Device, Index, IndexAllocator, Organization, Alignment >& segments ) { return printSegments( str, segments ); }
+          ElementsOrganization Organization >
+std::ostream& operator<<( std::ostream& str, const ChunkedEllpack< Device, Index, IndexAllocator, Organization >& segments ) { return printSegments( str, segments ); }
 
 
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.h
similarity index 92%
rename from src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
rename to src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.h
index 40f06e1f9..53a59d229 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.h
@@ -15,9 +15,9 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
-#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
-#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h>
-#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.h>
+#include <TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -117,4 +117,4 @@ struct CSRAdaptiveKernel
    }  // namespace Algorithms
 } // namespace TNL
 
-#include <TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp>
+#include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.hpp>
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.hpp
similarity index 97%
rename from src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
rename to src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.hpp
index c5d809920..de72daf77 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.hpp
@@ -15,8 +15,8 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
-#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
-#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h>
 
 namespace TNL {
    namespace Algorithms {
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.h
similarity index 90%
rename from src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
rename to src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.h
index a48d2aa34..f7521d558 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.h
@@ -11,8 +11,8 @@
 #pragma once
 
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
-#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h>
+#include <TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelParameters.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -70,4 +70,4 @@ struct CSRAdaptiveKernelView
    }  // namespace Algorithms
 } // namespace TNL
 
-#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp>
+#include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp>
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
similarity index 97%
rename from src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
rename to src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
index fb1ab36c1..f213e9523 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
@@ -15,10 +15,10 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
-#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
-#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h>
-#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
-#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.h>
+#include <TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelParameters.h>
 
 namespace TNL {
    namespace Algorithms {
diff --git a/src/TNL/Algorithms/Segments/CSRHybridKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h
similarity index 96%
rename from src/TNL/Algorithms/Segments/CSRHybridKernel.h
rename to src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h
index 479b2b287..138819352 100644
--- a/src/TNL/Algorithms/Segments/CSRHybridKernel.h
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h
@@ -62,4 +62,4 @@ struct CSRHybridKernel
    }  // namespace Algorithms
 } // namespace TNL
 
-#include <TNL/Algorithms/Segments/CSRHybridKernel.hpp>
+#include <TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp>
diff --git a/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
similarity index 99%
rename from src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
rename to src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
index ad431050f..64f414cf8 100644
--- a/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
@@ -15,7 +15,7 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
-#include <TNL/Algorithms/Segments/CSRHybridKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h>
 
 namespace TNL {
    namespace Algorithms {
diff --git a/src/TNL/Algorithms/Segments/CSRScalarKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h
similarity index 96%
rename from src/TNL/Algorithms/Segments/CSRScalarKernel.h
rename to src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h
index bd04670d7..f0c8accd3 100644
--- a/src/TNL/Algorithms/Segments/CSRScalarKernel.h
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h
@@ -60,4 +60,4 @@ struct CSRScalarKernel
    }  // namespace Algorithms
 } // namespace TNL
 
-#include <TNL/Algorithms/Segments/CSRScalarKernel.hpp>
\ No newline at end of file
+#include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp>
diff --git a/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp
similarity index 98%
rename from src/TNL/Algorithms/Segments/CSRScalarKernel.hpp
rename to src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp
index 4bed6934c..d98f88661 100644
--- a/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp
@@ -14,7 +14,7 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h>
 #include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 namespace TNL {
diff --git a/src/TNL/Algorithms/Segments/CSRVectorKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h
similarity index 96%
rename from src/TNL/Algorithms/Segments/CSRVectorKernel.h
rename to src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h
index fccc1022a..0654b5ef6 100644
--- a/src/TNL/Algorithms/Segments/CSRVectorKernel.h
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h
@@ -60,4 +60,4 @@ struct CSRVectorKernel
    }  // namespace Algorithms
 } // namespace TNL
 
-#include <TNL/Algorithms/Segments/CSRVectorKernel.hpp>
+#include <TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp>
diff --git a/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
similarity index 98%
rename from src/TNL/Algorithms/Segments/CSRVectorKernel.hpp
rename to src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
index 3bf799288..cf7d80af6 100644
--- a/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
@@ -15,7 +15,7 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
-#include <TNL/Algorithms/Segments/CSRVectorKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h>
 
 namespace TNL {
    namespace Algorithms {
diff --git a/src/TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h
new file mode 100644
index 000000000..83faa105d
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelBlockDescriptor.h
@@ -0,0 +1,251 @@
+/***************************************************************************
+                          CSRAdaptiveKernelBlockDescriptor.h -  description
+                             -------------------
+    begin                : Jan 25, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+         namespace detail {
+
+enum class Type {
+   /* LONG = 0!!! Non zero value rewrites index[1] */
+   LONG = 0,
+   STREAM = 1,
+   VECTOR = 2
+};
+
+//#define CSR_ADAPTIVE_UNION
+
+#ifdef CSR_ADAPTIVE_UNION
+template< typename Index >
+union CSRAdaptiveKernelBlockDescriptor
+{
+   CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0, uint8_t warpsCount = 0) noexcept
+   {
+      this->index[0] = row;
+      this->index[1] = index;
+      this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;
+   }
+
+   CSRAdaptiveKernelBlockDescriptor(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept
+   {
+      this->index[0] = row;
+      this->index[1] = 0;
+      this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID;
+
+      if (type == Type::STREAM)
+         this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row;
+
+      if (type == Type::STREAM)
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000;
+      else if (type == Type::VECTOR)
+         this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000;
+   }
+
+   CSRAdaptiveKernelBlockDescriptor() = default;
+
+   __cuda_callable__ Type getType() const
+   {
+      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 )
+         return Type::STREAM;
+      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 )
+         return Type::VECTOR;
+      return Type::LONG;
+   }
+
+   __cuda_callable__ const Index& getFirstSegment() const
+   {
+      return index[ 0 ];
+   }
+
+   /***
+    * \brief Returns number of elements covered by the block.
+    */
+   __cuda_callable__ const Index getSize() const
+   {
+      return twobytes[ sizeof(Index) == 4 ? 2 : 4 ];
+   }
+
+   /***
+    * \brief Returns number of segments covered by the block.
+    */
+   __cuda_callable__ const Index getSegmentsInBlock() const
+   {
+      return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
+   }
+
+   __cuda_callable__ uint8_t getWarpIdx() const
+   {
+      return index[ 1 ];
+   }
+
+   __cuda_callable__ uint8_t getWarpsCount() const
+   {
+      return 1;
+   }
+
+   void print( std::ostream& str ) const
+   {
+      Type type = this->getType();
+      str << "Type: ";
+      switch( type )
+      {
+         case Type::STREAM:
+            str << " Stream ";
+            break;
+         case Type::VECTOR:
+            str << " Vector ";
+            break;
+         case Type::LONG:
+            str << " Long ";
+            break;
+      }
+      str << " first segment: " << getFirstSegment();
+      str << " block end: " << getSize();
+      str << " index in warp: " << index[ 1 ];
+   }
+   Index index[2]; // index[0] is row pointer, index[1] is index in warp
+   uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
+   uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
+                                                //twobytes[3/5] is nextRow - row
+};
+#else
+
+template< typename Index >
+struct CSRAdaptiveKernelBlockDescriptor
+{
+   // Creates a descriptor from the first segment index, block type and warp index/count.
+   CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx,
+                                     Type type = Type::VECTOR,
+                                     uint8_t warpIdx = 0,
+                                     uint8_t warpsCount = 0 ) noexcept
+   {
+      this->firstSegmentIdx = firstSegmentIdx;
+      this->type = ( uint8_t ) type;
+      this->warpIdx = warpIdx;
+      this->warpsCount = warpsCount;
+      this->blockSize = 0;        // not supplied here; zeroed so getSize()/print() never read garbage
+      this->segmentsInBlock = 0;  // not supplied here; zeroed for the same reason
+   }
+
+   /**
+    * \brief Creates a descriptor of a block spanning several segments.
+    *
+    * \param firstSegmentIdx index of the first segment of the block.
+    * \param type type of the block.
+    * \param lastSegmentIdx segment index used to compute the number of segments
+    *        in the block as lastSegmentIdx - firstSegmentIdx.
+    * \param end together with \e begin gives the block size as end - begin.
+    * \param begin see \e end.
+    */
+   CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx,
+                                     Type type,
+                                     Index lastSegmentIdx,
+                                     Index end,
+                                     Index begin ) noexcept
+   {
+      this->firstSegmentIdx = firstSegmentIdx;
+      this->type = ( uint8_t ) type;
+      this->warpIdx = 0;
+      // This constructor takes no warps count; zero it so getWarpsCount() never reads garbage.
+      this->warpsCount = 0;
+      this->blockSize = end - begin;
+      this->segmentsInBlock = lastSegmentIdx - firstSegmentIdx;
+   }
+
+   CSRAdaptiveKernelBlockDescriptor() = default;
+
+   __cuda_callable__ Type getType() const
+   {
+      return ( Type ) this->type;
+      /*if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 )
+         return Type::STREAM;
+      if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 )
+         return Type::VECTOR;
+      return Type::LONG;*/
+   }
+
+   __cuda_callable__ const Index& getFirstSegment() const
+   {
+      return this->firstSegmentIdx;
+      //return index[ 0 ];
+   }
+
+   /***
+    * \brief Returns number of elements covered by the block.
+    */
+   __cuda_callable__ const Index getSize() const
+   {
+      return this->blockSize;
+      //return twobytes[ sizeof(Index) == 4 ? 2 : 4 ];
+   }
+
+   /***
+    * \brief Returns number of segments covered by the block.
+    */
+   __cuda_callable__ const Index getSegmentsInBlock() const
+   {
+      return this->segmentsInBlock;
+      //return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
+   }
+
+   __cuda_callable__ uint8_t getWarpIdx() const
+   {
+      return this->warpIdx;
+   }
+
+   __cuda_callable__ uint8_t getWarpsCount() const
+   {
+      return this->warpsCount;
+   }
+
+   void print( std::ostream& str ) const
+   {
+      str << "Type: ";
+      switch( this->getType() )
+      {
+         case Type::STREAM:
+            str << " Stream ";
+            break;
+         case Type::VECTOR:
+            str << " Vector ";
+            break;
+         case Type::LONG:
+            str << " Long ";
+            break;
+      }
+      str << " first segment: " << this->getFirstSegment();
+      str << " block end: " << this->getSize();
+      str << " index in warp: " << this->getWarpIdx();
+   }
+
+   uint8_t type;
+   Index firstSegmentIdx, blockSize, segmentsInBlock;
+   uint8_t warpIdx, warpsCount;
+
+   //Index index[2]; // index[0] is row pointer, index[1] is index in warp
+   //uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
+   //uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
+                                                //twobytes[3/5] is nextRow - row
+};
+
+#endif
+
+template< typename Index >
+std::ostream& operator<< ( std::ostream& str, const CSRAdaptiveKernelBlockDescriptor< Index >& block )
+{
+   block.print( str );
+   return str;
+}
+         } // namespace detail
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelParameters.h
new file mode 100644
index 000000000..4af0197d2
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/Kernels/details/CSRAdaptiveKernelParameters.h
@@ -0,0 +1,112 @@
+/***************************************************************************
+                          CSRAdaptiveKernelParameters.h -  description
+                             -------------------
+    begin                : Jan 25, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+         namespace detail {
+
+// This can be used for tuning the number of CUDA threads per block depending on the size of Value
+// TODO: Perform some tests
+static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 256, 256, 256 };
+
+template< int SizeOfValue = 1,
+          int StreamedSharedMemory_ = 24576 >
+struct CSRAdaptiveKernelParameters
+{
+   static constexpr int MaxValueSizeLog = 6;
+
+   static constexpr int getSizeValueLogConstexpr( const int i );
+
+   static constexpr int getSizeOfValue() { return SizeOfValue; };
+
+   static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue );
+
+   static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." );
+
+   /**
+    * \brief Computes number of CUDA threads per block depending on Value type.
+    *
+    * \return CUDA block size.
+    */
+   static constexpr int CudaBlockSize() { return CSRAdaptiveKernelParametersCudaBlockSizes[ SizeOfValueLog ]; };
+   //{ return SizeOfValue == 8 ? 128 : 256; };
+
+   /**
+    * \brief Returns amount of shared memory dedicated for stream CSR kernel.
+    *
+    * \return Stream shared memory.
+    */
+   static constexpr size_t StreamedSharedMemory() { return StreamedSharedMemory_; };
+
+   /**
+    * \brief Number of elements fitting into streamed shared memory.
+    */
+   static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / SizeOfValue; };
+
+   /**
+    * \brief Computes number of warps in one CUDA block.
+    */
+   static constexpr size_t WarpsCount() { return CudaBlockSize() / Cuda::getWarpSize(); };
+
+   /**
+    * \brief Computes number of elements to be streamed into the shared memory.
+    *
+    * \return Number of elements to be streamed into the shared memory.
+    */
+   static constexpr size_t StreamedSharedElementsPerWarp() { return StreamedSharedElementsCount() / WarpsCount(); };
+
+   /**
+    * \brief Returns maximum number of elements per warp for vector and hybrid kernel.
+    *
+    * \return Maximum number of elements per warp for vector and hybrid kernel.
+    */
+   static constexpr int MaxVectorElementsPerWarp() { return 384; };
+
+   /**
+    * \brief Returns maximum number of elements per warp for adaptive kernel.
+    *
+    * \return Maximum number of elements per warp for adaptive kernel.
+    */
+   static constexpr int MaxAdaptiveElementsPerWarp() { return 512; };
+
+   static int getSizeValueLog( const int i )
+   {
+      if( i ==  1 ) return 0;
+      if( i ==  2 ) return 1;
+      if( i <=  4 ) return 2;
+      if( i <=  8 ) return 3;
+      if( i <= 16 ) return 4;
+      return 5;
+   }
+};
+
+
+template< int SizeOfValue,
+          int StreamedSharedMemory_ >
+constexpr int
+CSRAdaptiveKernelParameters< SizeOfValue, StreamedSharedMemory_ >::
+getSizeValueLogConstexpr( const int i )
+{
+   if( i ==  1 ) return 0;
+   if( i ==  2 ) return 1;
+   if( i <=  4 ) return 2;
+   if( i <=  8 ) return 3;
+   if( i <= 16 ) return 4;
+   if( i <= 32 ) return 5;
+   return 6;
+};
+
+         } // namespace detail
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.h b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
index aa20e758d..cd3b1fbe4 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
@@ -132,7 +132,7 @@ template <typename Device,
           typename Index,
           ElementsOrganization Organization,
           int SliceSize >
-std::ostream& operator<<( std::ostream& str, const SlicedEllpack< Device, Index, Organization, SliceSize >& segments ) { return printSegments( str, segments ); }
+std::ostream& operator<<( std::ostream& str, const SlicedEllpackView< Device, Index, Organization, SliceSize >& segments ) { return printSegments( str, segments ); }
 
       } // namespace Segements
    }  // namespace Algorithms
-- 
GitLab


From c4969701619c8903bba4503310b965dac7412ffb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 5 Apr 2021 11:58:10 +0200
Subject: [PATCH 015/117] Renaming OffsetsHolder to OffsetsContainer.

---
 src/TNL/Algorithms/Segments/BiEllpack.h       |  16 +--
 src/TNL/Algorithms/Segments/CSR.h             | 114 ++++++++++++++++--
 src/TNL/Algorithms/Segments/CSR.hpp           |   4 +-
 src/TNL/Algorithms/Segments/ChunkedEllpack.h  |  12 +-
 src/TNL/Algorithms/Segments/Ellpack.h         |   6 +-
 src/TNL/Algorithms/Segments/EllpackView.h     |   4 +-
 src/TNL/Algorithms/Segments/SlicedEllpack.h   |   6 +-
 .../Algorithms/Segments/detail/BiEllpack.h    |   6 +-
 .../Segments/detail/ChunkedEllpack.h          |   8 +-
 9 files changed, 134 insertions(+), 42 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/BiEllpack.h b/src/TNL/Algorithms/Segments/BiEllpack.h
index 3cf46e0ef..48aa4e6be 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/BiEllpack.h
@@ -32,7 +32,7 @@ class BiEllpack
    public:
       using DeviceType = Device;
       using IndexType = std::remove_const_t<Index>;
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType, IndexAllocator>;
+      using OffsetsContainer = Containers::Vector< IndexType, DeviceType, IndexType, IndexAllocator>;
       static constexpr ElementsOrganization getOrganization() { return Organization; }
       using ViewType = BiEllpackView< Device, Index, Organization, WarpSize >;
       template <typename Device_, typename Index_>
@@ -73,7 +73,7 @@ class BiEllpack
       /**
        * \brief Set sizes of particular segments.
        */
-      template <typename SizesHolder = OffsetsHolder>
+      template <typename SizesHolder = OffsetsContainer>
       void setSegmentsSizes(const SizesHolder &sizes);
 
       void reset();
@@ -138,11 +138,11 @@ class BiEllpack
       void printStructure(std::ostream &str) const;
 
       // TODO: nvcc needs this public because of lambda function used inside
-      template <typename SizesHolder = OffsetsHolder>
+      template <typename SizesHolder = OffsetsContainer>
       void performRowBubbleSort(const SizesHolder &segmentsSize);
 
       // TODO: the same as  above
-      template <typename SizesHolder = OffsetsHolder>
+      template <typename SizesHolder = OffsetsContainer>
       void computeColumnSizes(const SizesHolder &segmentsSizes);
 
    protected:
@@ -150,10 +150,10 @@ class BiEllpack
 
       static constexpr int getLogWarpSize() { return std::log2(WarpSize); };
 
-      template <typename SizesHolder = OffsetsHolder>
+      template <typename SizesHolder = OffsetsContainer>
       void verifyRowPerm(const SizesHolder &segmentsSizes);
 
-      template <typename SizesHolder = OffsetsHolder>
+      template <typename SizesHolder = OffsetsContainer>
       void verifyRowLengths(const SizesHolder &segmentsSizes);
 
       IndexType getStripLength(const IndexType stripIdx) const;
@@ -164,9 +164,9 @@ class BiEllpack
 
       IndexType virtualRows = 0;
 
-      OffsetsHolder rowPermArray;
+      OffsetsContainer rowPermArray;
 
-      OffsetsHolder groupPointers;
+      OffsetsContainer groupPointers;
 
       // TODO: Replace later
       __cuda_callable__ Index power(const IndexType number, const IndexType exponent) const
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index 5622a139b..221c02b8a 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -26,10 +26,16 @@ namespace TNL {
  *
  * See \ref TNL::Algorithms::Segments for more details about segments.
  *
- * \tparam Device 
- * \tparam Index 
- * \tparam CSRScalarKernel< Index, Device > 
- * \tparam Allocator< Index > 
+ * \tparam Device is type of device where the segments will be operating.
+ * \tparam Index is type for indexing of the elements managed by the segments.
+ * \tparam Kernel is type of kernel used for parallel operations with segments.
+ *    It can be any of the following:
+ *    \ref TNL::Algorithms::Segments::CSRAdaptiveKernel,
+ *    \ref TNL::Algorithms::Segments::CSRHybridKernel,
+ *    \ref TNL::Algorithms::Segments::CSRScalarKernel,
+ *    \ref TNL::Algorithms::Segments::CSRVectorKernel
+ *
+ * \tparam IndexAllocator is allocator for supporting index containers.
  */
 template< typename Device,
           typename Index,
@@ -39,31 +45,117 @@ class CSR
 {
    public:
 
+      /**
+       * \brief The device where the segments are operating.
+       */
       using DeviceType = Device;
+
+      /**
+       * \brief The type used for indexing of segments elements.
+       */
       using IndexType = std::remove_const_t< Index >;
+
+      /**
+       * \brief Type of kernel used for reduction operations.
+       */
       using KernelType = Kernel;
-      using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
-      using SegmentsSizes = OffsetsHolder;
+
+      /**
+       * \brief Type of container storing offsets of particular rows.
+       */
+      using OffsetsContainer = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
+
+      /**
+       * \brief Templated view type.
+       *
+       * \tparam Device_ is alternative device type for the view.
+       * \tparam Index_ is alternative index type for the view.
+       */
       template< typename Device_, typename Index_ >
       using ViewTemplate = CSRView< Device_, Index_, KernelType >;
+
+      /**
+       * \brief Type of segments view.
+       */
       using ViewType = CSRView< Device, Index, KernelType >;
+
+      /**
+       * \brief Type of constant segments view.
+       */
       using ConstViewType = CSRView< Device, std::add_const_t< IndexType >, KernelType >;
+
+      /**
+       * \brief Accessor type for one particular segment.
+       */
       using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
 
-      static constexpr ElementsOrganization getOrganization() { return ColumnMajorOrder; }
+      /**
+       * \brief This function says that CSR format is always organised in row-major order.
+       */
+      static constexpr ElementsOrganization getOrganization() { return RowMajorOrder; }
 
+      /**
+       * \brief This function says that CSR format does not use padding elements.
+       */
       static constexpr bool havePadding() { return false; };
 
+      /**
+       * \brief Construct with no parameters to create empty segments.
+       */
       CSR();
 
+      /**
+       * \brief Construct with segments sizes.
+       *
+       * The number of segments is given by the size of \e segmentsSizes. Particular elements
+       * of this container define sizes of particular segments.
+       *
+       * \tparam SizesContainer is a type of container for segments sizes.
+       * \param sizes is an instance of the container with the segments sizes.
+       *
+       * See the following example:
+       *
+       * \includelineno Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
+       *
+       * The result looks as follows:
+       *
+       * \include SegmentsExample_CSR_constructor_1.out
+       */
       template< typename SizesContainer >
-      CSR( const SizesContainer& sizes );
+      CSR( const SizesContainer& segmentsSizes );
 
+      /**
+       * \brief Construct with segments sizes in initializer list.
+       *
+       * The number of segments is given by the size of \e segmentsSizes. Particular elements
+       * of this initializer list define sizes of particular segments.
+       *
+       * \tparam ListIndex is a type of indexes of the initializer list.
+       * \param segmentsSizes is an initializer list with the segments sizes.
+       *
+       * See the following example:
+       *
+       * \includelineno Algorithms/Segments/SegmentsExample_constructor_2.cpp
+       *
+       * The result looks as follows:
+       *
+       * \include SegmentsExample_constructor_1.out
+       */
       template< typename ListIndex >
       CSR( const std::initializer_list< ListIndex >& segmentsSizes );
 
+      /**
+       * \brief Copy constructor.
+       *
+       * \param segments are the source segments.
+       */
       CSR( const CSR& segments );
 
+      /**
+       * \brief Move constructor.
+       *
+       * \param segments  are the source segments.
+       */
       CSR( const CSR&& segments );
 
       static String getSerializationType();
@@ -112,9 +204,9 @@ class CSR
       __cuda_callable__
       SegmentViewType getSegmentView( const IndexType segmentIdx ) const;
 
-      const OffsetsHolder& getOffsets() const;
+      const OffsetsContainer& getOffsets() const;
 
-      OffsetsHolder& getOffsets();
+      OffsetsContainer& getOffsets();
 
       /***
        * \brief Go over all segments and for each segment element call
@@ -154,7 +246,7 @@ class CSR
 
    protected:
 
-      OffsetsHolder offsets;
+      OffsetsContainer offsets;
 
       KernelType kernel;
 };
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index a3b4055c3..10bbc5847 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -219,7 +219,7 @@ template< typename Device,
           typename IndexAllocator >
 auto
 CSR< Device, Index, Kernel, IndexAllocator >::
-getOffsets() const -> const OffsetsHolder&
+getOffsets() const -> const OffsetsContainer&
 {
    return this->offsets;
 }
@@ -230,7 +230,7 @@ template< typename Device,
           typename IndexAllocator >
 auto
 CSR< Device, Index, Kernel, IndexAllocator >::
-getOffsets() -> OffsetsHolder&
+getOffsets() -> OffsetsContainer&
 {
    return this->offsets;
 }
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
index 97abb3864..d5d459a00 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
@@ -29,7 +29,7 @@ class ChunkedEllpack
 
       using DeviceType = Device;
       using IndexType = std::remove_const_t< Index >;
-      using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
+      using OffsetsContainer = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
       static constexpr ElementsOrganization getOrganization() { return Organization; }
       using ViewType = ChunkedEllpackView< Device, Index, Organization >;
       template< typename Device_, typename Index_ >
@@ -72,7 +72,7 @@ class ChunkedEllpack
       /**
        * \brief Set sizes of particular segments.
        */
-      template< typename SizesHolder = OffsetsHolder >
+      template< typename SizesHolder = OffsetsContainer >
       void setSegmentsSizes( const SizesHolder& sizes );
 
       void reset();
@@ -150,19 +150,19 @@ class ChunkedEllpack
        * For each segment, this keeps index of the slice which contains the
        * segment.
        */
-      OffsetsHolder rowToSliceMapping;
+      OffsetsContainer rowToSliceMapping;
 
       /**
        * For each row, this keeps index of the first chunk within a slice.
        */
-      OffsetsHolder rowToChunkMapping;
+      OffsetsContainer rowToChunkMapping;
 
-      OffsetsHolder chunksToSegmentsMapping;
+      OffsetsContainer chunksToSegmentsMapping;
 
       /**
        * Keeps index of the first segment index.
        */
-      OffsetsHolder rowPointers;
+      OffsetsContainer rowPointers;
 
       ChunkedEllpackSliceInfoContainer slices;
 
diff --git a/src/TNL/Algorithms/Segments/Ellpack.h b/src/TNL/Algorithms/Segments/Ellpack.h
index 442a1b7c6..e68ebdf62 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.h
+++ b/src/TNL/Algorithms/Segments/Ellpack.h
@@ -31,8 +31,8 @@ class Ellpack
       using IndexType = std::remove_const_t< Index >;
       static constexpr int getAlignment() { return Alignment; }
       static constexpr ElementsOrganization getOrganization() { return Organization; }
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
-      using SegmentsSizes = OffsetsHolder;
+      using OffsetsContainer = Containers::Vector< IndexType, DeviceType, IndexType >;
+      using SegmentsSizes = OffsetsContainer;
       template< typename Device_, typename Index_ >
       using ViewTemplate = EllpackView< Device_, Index_, Organization, Alignment >;
       using ViewType = EllpackView< Device, Index, Organization, Alignment >;
@@ -66,7 +66,7 @@ class Ellpack
       /**
        * \brief Set sizes of particular segments.
        */
-      template< typename SizesHolder = OffsetsHolder >
+      template< typename SizesHolder = OffsetsContainer >
       void setSegmentsSizes( const SizesHolder& sizes );
 
       void setSegmentsSizes( const IndexType segmentsCount, const IndexType segmentSize );
diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h
index aebe7b591..865c0d848 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.h
+++ b/src/TNL/Algorithms/Segments/EllpackView.h
@@ -34,8 +34,8 @@ class EllpackView
       using IndexType = std::remove_const_t< Index >;
       static constexpr int getAlignment() { return Alignment; }
       static constexpr bool getOrganization() { return Organization; }
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
-      using SegmentsSizes = OffsetsHolder;
+      using OffsetsContainer = Containers::Vector< IndexType, DeviceType, IndexType >;
+      using SegmentsSizes = OffsetsContainer;
       template< typename Device_, typename Index_ >
       using ViewTemplate = EllpackView< Device_, Index_, Organization, Alignment >;
       using ViewType = EllpackView;
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.h b/src/TNL/Algorithms/Segments/SlicedEllpack.h
index 69b86c100..092af6a1f 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.h
@@ -30,7 +30,7 @@ class SlicedEllpack
 
       using DeviceType = Device;
       using IndexType = std::remove_const_t< Index >;
-      using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
+      using OffsetsContainer = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >;
       static constexpr int getSliceSize() { return SliceSize; }
       static constexpr ElementsOrganization getOrganization() { return Organization; }
       using ViewType = SlicedEllpackView< Device, Index, Organization, SliceSize >;
@@ -64,7 +64,7 @@ class SlicedEllpack
       /**
        * \brief Set sizes of particular segments.
        */
-      template< typename SizesHolder = OffsetsHolder >
+      template< typename SizesHolder = OffsetsContainer >
       void setSegmentsSizes( const SizesHolder& sizes );
 
       void reset();
@@ -131,7 +131,7 @@ class SlicedEllpack
 
       IndexType size, alignedSize, segmentsCount;
 
-      OffsetsHolder sliceOffsets, sliceSegmentSizes;
+      OffsetsContainer sliceOffsets, sliceSegmentSizes;
 };
 
 template <typename Device,
diff --git a/src/TNL/Algorithms/Segments/detail/BiEllpack.h b/src/TNL/Algorithms/Segments/detail/BiEllpack.h
index 5605c8fb6..43c42e43c 100644
--- a/src/TNL/Algorithms/Segments/detail/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/detail/BiEllpack.h
@@ -31,10 +31,10 @@ class BiEllpack
       using DeviceType = Device;
       using IndexType = Index;
       static constexpr bool getOrganization() { return Organization; }
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
-      using OffsetsHolderView = typename OffsetsHolder::ViewType;
+      using OffsetsContainer = Containers::Vector< IndexType, DeviceType, IndexType >;
+      using OffsetsHolderView = typename OffsetsContainer::ViewType;
       using ConstOffsetsHolderView = typename OffsetsHolderView::ConstViewType;
-      using SegmentsSizes = OffsetsHolder;
+      using SegmentsSizes = OffsetsContainer;
       using SegmentViewType = BiEllpackSegmentView< IndexType, Organization >;
 
       static constexpr int getWarpSize() { return WarpSize; };
diff --git a/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
index 19169b558..3e279b02b 100644
--- a/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
@@ -62,10 +62,10 @@ class ChunkedEllpack
       using DeviceType = Device;
       using IndexType = Index;
       static constexpr ElementsOrganization getOrganization() { return Organization; }
-      using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
-      using OffsetsHolderView = typename OffsetsHolder::ViewType;
-      using SegmentsSizes = OffsetsHolder;
-      using ChunkedEllpackSliceInfoType = detail::ChunkedEllpackSliceInfo< IndexType >;
+      using OffsetsContainer = Containers::Vector< IndexType, DeviceType, IndexType >;
+      using OffsetsHolderView = typename OffsetsContainer::ViewType;
+      using SegmentsSizes = OffsetsContainer;
+      using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >;
       using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
       using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
       using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType;
-- 
GitLab


From 7b225e4629a9f29e11bcde25299b1cf6588ac7ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 5 Apr 2021 17:13:52 +0200
Subject: [PATCH 016/117] Writing documentation on CSR format.

---
 .../Segments/.SegmentsExample_General.cpp.swp | Bin 12288 -> 0 bytes
 .../Algorithms/Segments/CMakeLists.txt        |   5 +
 .../SegmentsExample_CSR_constructor_1.cpp     |  51 ++++++++++
 .../SegmentsExample_CSR_constructor_1.cu      |   1 +
 .../SegmentsExample_CSR_constructor_2.cpp     |  50 ++++++++++
 .../SegmentsExample_CSR_constructor_2.cu      |   1 +
 .../SegmentsExample_CSR_getSegmentsType.cpp   |  29 ++++++
 ...gmentsExample_CSR_getSerializationType.cpp |  29 ++++++
 ...egmentsExample_CSR_getSerializationType.cu |   1 +
 .../SegmentsExample_CSR_setSegmentsSizes.cpp  |  32 +++++++
 .../SegmentsExample_CSR_setSegmentsSizes.cu   |   1 +
 src/TNL/Algorithms/Segments/CSR.h             |  90 ++++++++++++++++--
 src/TNL/Algorithms/Segments/CSR.hpp           |   2 +-
 13 files changed, 281 insertions(+), 11 deletions(-)
 delete mode 100644 Documentation/Examples/Algorithms/Segments/.SegmentsExample_General.cpp.swp
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cu
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cu
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cpp
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cu
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_setSegmentsSizes.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_setSegmentsSizes.cu

diff --git a/Documentation/Examples/Algorithms/Segments/.SegmentsExample_General.cpp.swp b/Documentation/Examples/Algorithms/Segments/.SegmentsExample_General.cpp.swp
deleted file mode 100644
index f50e0038f2a01cc4ed7c7742797ac5bb2531788d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 12288
zcmYc?2=nw+u+TGPU|?VnU|=W_Pfw5w@MHK8B*2iLlv<Qgnv_}ulEZ_O6N`#6Q*{&b
z@>21rVqmC)8>nBNUzA;3keHmRpPZkPs$Y_qqnnairtgxUT$-DjSCUwgnV+ZcT9KGr
zkds=h@0gRGUzAyrkz1@EoSF_&fM7rs#k;5Gr4}XT=p`2v;B~~P<Y)+t#1JSgNz=99
zWiU1}Gyr*DSxHerSSS?49L1v{Fd71*Aut*OqaiRF0;3@?8UmvsFd70QBm_zd7#Zpr
z7#NtK{`H5_jA%5JJ4%g)z-S1JhQMeDjE2By2#kinXb6mkz-S1JhQMeDjE2By2n@jx
zNK9d1_`uA-;K2%+|A+Pe-|#aqT;pe8*v!ws(8bTd5X;ZNV9(FMpvcd_puo?-z{JnM
z@S2Z-;Uymf!xlaUhDJUHh6X+ch6+9g26sLN1_wR{21z~!h8w&L4A*%X7|!!DFs$HZ
zU|7h@z~INrz+lhIz+lJAz+lG9z`(`Jz`)GQz;K_3fgy;8fkB^#fq{{Sf#D}N1H%t)
z28IvZ3=D_385nkRGcfpbGcd?=Gcd?;Gcbs8GcereVqmzz#lTR`#lR5E#lUcelYwD5
zCj-MmP6h^BP6h@sP6h@6P6mck91IMTIT#owaxgGV;9y|L<zQgQ=3rou<zQeq&d$J4
z$<Dx_%+A1|#LmE=$j-oUpN)ZG9~%S1UN#1XayABrAT|buKsE*j2Q~(VU#tuaH(41N
zZm=>i^szE9WU?|aB(X9uc(Fpl(F_!pLoh9jI(ak%Mniy@5U6EPP*5mJEh#O^Q*e#&
z42ciM6bW^9b`1`;W>8MeOUX<FX@IR=f-YIIQE)CzNp#K0DM(DtR<P61v<4|EE=jSn
zO3p7WQLwR5P=YE|$WK#%DJ+JrfKtfMQ*aJ-aa3>*2(?mB0x1XUP0dTmv1R}nh1EXi
z;2^?wffNyEk8);ON@|*dhhvy)JjgZJE%wMSE`hn9ERSa77ncy}NswJ2_Y<@SqzKuQ
zIP3t0WNC3`UOGy2*(!wi`B+&YR;F25!Ghb$3hGrGh#HsFvdrYvVk;|9bn7T&=9MVe
zfukB>D#$5#O>_<pBF;3B@1fQ!;58Ad2;EeWeoQxlM9Fa@NFUs^Y6g(ka}zW3G{Aw7
zSd^ZuqmZ1DSfr(pSd?BC9jl<pzyQuj;Dij%1R)uz3dN<l#R`c<sjx%<;@a3KfE7Yg
zyn=!PEH?FuQd3HkQyp`1U~!@WG9W%JwIn%12coGUr?l9{uDVu7Av-m-Ku5tq&p-i^
z18c#N12V2Ow>Z8mGqpS#X0m5Wg+i=?twLF1PH8GwSz>8PKG*~WTZL%bScUlb<kFPH
z_~gW#oW!J@)cAM>4Ta?Vyy6lC&%Bh>ijc~JR5c{ibfA(c`K3uYscH~|6*LucOG`l7
z6{<n05AGmniIb98k_dHIdQN^)Vh-3@2<H@+=0d!R$4zk8pg0C0nv<UlHdhDXZUwky
zItoeo`8jF|$@#ejr6s8fnhLsh3J@<qTnr8sP?Uk4sbC8V{bIfJ)RM5w)N*J+2hKmv
z`FSOYnR%&2#a32fsmUezMK(}_bQB=@1{7k*j<5rpr2%s`*i=|7JLi|?m1t-}<559N
z9~62DS_;lkcY<65DoRsxQo)Nz6*BV_3KEM-GLuVl5{nQ<g0$&tX=#BAvx1_`yb_oV
z2zzxDkW&*ltwNFy*zaIDqJkeDsHmX`t6gx!wE{Fvz`Ym%a-~8^MydixheB>*USfJ`
zib7H)SPY&qP<&Pk3n&E;0O<oqS2QA`!HFHJ*)ydAoWv9q6f)B^6krKN!NwLAaG+$U
z39<pIqgXF3zsNBs#}yjf8aQ(ZBHcl?dZtt$M>@<hWHETaVWviu+yjmexaSmX!2yJp
z77<CsGp{(cs06GAR7m9$P9Gqn6>LG}2RL{zlC@(|QDP;wRIOkKwg!=;K#mG7$uCMw
zPYuqjO2v|p9dke-QIe{V2r^V5Ex!mQdch8WCtPsZ2u_%wmIN#XD->r|rJ@w2u=WH-
zaf;9aRZ^{BsH0${qhPF~V4|a7ijfGNi&7IyQenxVSfM<#Bts!Rvn(|aAqKWy50pya
z4u#Z%h%{iUP*PctnwOZHssM9>l@(kX>`90gL<ymQRS!bmnxUGZEI%^^ZXmQ_rJ>1C
zlA2qPlUR~!gK4XR9Rq`MW?phmX-cYsO^Bb5KE%XgeNa`dmtluP7F6wF$|Bkdh*j*+
rT9`P+&cQ*L3K1o!KBNFeRgzYkmke4=pO}N}*v$OmlA_eaTssB;8bsWH

diff --git a/Documentation/Examples/Algorithms/Segments/CMakeLists.txt b/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
index c7f05c37f..7479762c3 100644
--- a/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
@@ -1,5 +1,10 @@
 set( COMMON_EXAMPLES
    SegmentsExample_General
+   SegmentsExample_CSR_constructor_1
+   SegmentsExample_CSR_constructor_2
+   SegmentsExample_CSR_getSerializationType
+   SegmentsExample_CSR_getSegmentsType
+   SegmentsExample_CSR_setSegmentsSizes
 )
 
 if( BUILD_CUDA )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
new file mode 100644
index 000000000..4c6e28575
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
@@ -0,0 +1,51 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   TNL::Containers::Vector< int, Device > segmentsSizes{ 1, 2, 3, 4, 5 };
+   SegmentsType segments( segmentsSizes );
+   std::cout << "Segments sizes are: " << segments << std::endl;
+
+   /***
+    * Allocate array for the data managed by the segments.
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Insert data into particular segments.
+    */
+   auto data_view = data.getView();
+   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx, bool& compute ) mutable {
+      if( localIdx <= segmentIdx )
+         data_view[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Print the data managed by the segments.
+    */
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cu
new file mode 120000
index 000000000..9daf42ace
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_constructor_1.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
new file mode 100644
index 000000000..c15f5791e
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
@@ -0,0 +1,50 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+   std::cout << "Segments sizes are: " << segments << std::endl;
+
+   /***
+    * Allocate array for the data managed by the segments.
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Insert data into particular segments.
+    */
+   auto data_view = data.getView();
+   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx, bool& compute ) mutable {
+      if( localIdx <= segmentIdx )
+         data_view[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Print the data managed by the segments.
+    */
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cu
new file mode 120000
index 000000000..9286174a1
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_constructor_2.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cpp
new file mode 100644
index 000000000..fea901173
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cpp
@@ -0,0 +1,29 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Create segments and print the segments type.
+    */
+   SegmentsType segments;
+   std::cout << "The segments type is: " << segments.getSegmentsType() << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cpp
new file mode 100644
index 000000000..a52a18e50
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cpp
@@ -0,0 +1,29 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Create segments and print the serialization type.
+    */
+   SegmentsType segments;
+   std::cout << "The serialization type is: " << segments.getSerializationType() << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cu
new file mode 120000
index 000000000..31c65453c
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_getSerializationType.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_setSegmentsSizes.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_setSegmentsSizes.cpp
new file mode 100644
index 000000000..59a9e1bfa
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_setSegmentsSizes.cpp
@@ -0,0 +1,32 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   TNL::Containers::Vector< int, Device > segmentsSizes{ 1, 2, 3, 4, 5 };
+   SegmentsType segments;
+   segments.setSegmentsSizes( segmentsSizes );
+   std::cout << "Segments sizes are: " << segments << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_setSegmentsSizes.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_setSegmentsSizes.cu
new file mode 120000
index 000000000..f56df02ad
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_setSegmentsSizes.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_setSegmentsSizes.cpp
\ No newline at end of file
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index 221c02b8a..5290ac9f4 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -110,7 +110,8 @@ class CSR
        * The number of segments is given by the size of \e segmentsSizes. Particular elements
        * of this container define sizes of particular segments.
        *
-       * \tparam SizesContainer is a type of container for segments sizes.
+       * \tparam SizesContainer is a type of container for segments sizes.  It can be \ref TNL::Containers::Array or
+       *  \ref TNL::Containers::Vector for example.
        * \param sizes is an instance of the container with the segments sizes.
        *
        * See the following example:
@@ -135,11 +136,11 @@ class CSR
        *
        * See the following example:
        *
-       * \includelineno Algorithms/Segments/SegmentsExample_constructor_2.cpp
+       * \includelineno Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
        *
        * The result looks as follows:
        *
-       * \include SegmentsExample_constructor_1.out
+       * \include SegmentsExample_CSR_constructor_2.out
        */
       template< typename ListIndex >
       CSR( const std::initializer_list< ListIndex >& segmentsSizes );
@@ -158,49 +159,118 @@ class CSR
        */
       CSR( const CSR&& segments );
 
+      /**
+       * \brief Returns string with serialization type.
+       *
+       * The string has a form `CSR< [any_device], IndexType, KernelType >` where `IndexType` and `KernelType` stand for the serialization types of the index and the kernel.
+       *
+       * \return \ref String with the serialization type.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsExample_CSR_getSerializationType.cpp
+       * \par Output
+       * \include SegmentsExample_CSR_getSerializationType.out
+       */
       static String getSerializationType();
 
+      /**
+       * \brief Returns string with segments type.
+       *
+       * The string has a form `CSR< KernelType >`.
+       *
+       * \return \ref String with the segments type.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cpp
+       * \par Output
+       * \include SegmentsExample_CSR_getSegmentsType.out
+       */
       static String getSegmentsType();
 
       /**
        * \brief Set sizes of particular segments.
+       *
+       * \tparam SizesContainer is a container with segments sizes. It can be \ref TNL::Containers::Array or
+       *  \ref TNL::Containers::Vector for example.
+       *
+       * \param segmentsSizes is an instance of the container with segments sizes.
        */
-      template< typename SizesHolder >
-      void setSegmentsSizes( const SizesHolder& sizes );
+      template< typename SizesContainer >
+      void setSegmentsSizes( const SizesContainer& segmentsSizes );
 
+      /**
+       * \brief Reset the segments to empty states.
+       *
+       * It means that there is no segment in the CSR segments.
+       */
       void reset();
 
+      /**
+       * \brief Getter of a view object.
+       *
+       * \return View for this instance of CSR segments which can be used for example in
+       *  lambda functions running in GPU kernels.
+       */
       ViewType getView();
 
+      /**
+       * \brief Getter of a view object for constant instances.
+       *
+       * \return View for this instance of CSR segments which can be used for example in
+       *  lambda functions running in GPU kernels.
+       */
       const ConstViewType getConstView() const;
 
       /**
-       * \brief Number of segments.
+       * \brief Getter of number of segments.
+       *
+       * \return number of segments within this object.
        */
       __cuda_callable__
       IndexType getSegmentsCount() const;
 
-      /***
-       * \brief Returns size of the segment number \r segmentIdx
+      /**
+       * \brief Returns size of particular segment.
+       *
+       * \return size of the segment number \e segmentIdx.
        */
       __cuda_callable__
       IndexType getSegmentSize( const IndexType segmentIdx ) const;
 
-      /***
+      /**
        * \brief Returns number of elements managed by all segments.
+       *
+       * \return number of elements managed by all segments.
        */
       __cuda_callable__
       IndexType getSize() const;
 
-      /***
-       * \brief Returns number of elements that needs to be allocated.
+      /**
+       * \brief Returns number of elements that needs to be allocated by a container connected to this segments.
+       *
+       * \return size of container connected to this segments.
        */
       __cuda_callable__
       IndexType getStorageSize() const;
 
+      /**
+       * \brief Computes the global index of an element managed by the segments.
+       *
+       * The global index serves as a reference to the element in its container.
+       *
+       * \param segmentIdx is index of a segment with the element.
+       * \param localIdx is the local index of the element within the segment.
+       * \return global index of the element.
+       */
       __cuda_callable__
       IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const;
 
+      /**
+       * \brief Returns segment view (i.e. segment accessor) of segment with given index.
+       *
+       * \param segmentIdx is index of the request segment.
+       * \return segment view of given segment.
+       */
       __cuda_callable__
       SegmentViewType getSegmentView( const IndexType segmentIdx ) const;
 
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index 10bbc5847..ce91e7dc5 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -79,7 +79,7 @@ CSR< Device, Index, Kernel, IndexAllocator >::
 getSerializationType()
 {
    return "CSR< [any_device], " +
-      TNL::getSerializationType< IndexType >() +
+      TNL::getSerializationType< IndexType >() + ", " +
       TNL::getSerializationType< KernelType >() + " >";
 }
 
-- 
GitLab


From 3916fab849addd312d2346fd3c4ecbe87de72416 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 6 Apr 2021 13:24:40 +0200
Subject: [PATCH 017/117] Added SequentialFor.

---
 .../Examples/Algorithms/CMakeLists.txt        | 81 +++++++------------
 .../Algorithms/SequentialForExample.cpp       | 37 +++++++++
 .../Algorithms/SequentialForExample.cu        |  1 +
 src/TNL/Algorithms/SequentialFor.h            | 54 +++++++++++++
 4 files changed, 122 insertions(+), 51 deletions(-)
 create mode 100644 Documentation/Examples/Algorithms/SequentialForExample.cpp
 create mode 120000 Documentation/Examples/Algorithms/SequentialForExample.cu
 create mode 100644 src/TNL/Algorithms/SequentialFor.h

diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt
index 8afdc50cc..c200642f5 100644
--- a/Documentation/Examples/Algorithms/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
@@ -1,56 +1,36 @@
 ADD_SUBDIRECTORY( Segments )
 
-IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE( SortingExampleCuda SortingExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND SortingExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out )
-
-   CUDA_ADD_EXECUTABLE( SortingExample2Cuda SortingExample2.cu)
-   ADD_CUSTOM_COMMAND( COMMAND SortingExample2Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out )
-
-   CUDA_ADD_EXECUTABLE( SortingExample3Cuda SortingExample3.cu)
-   ADD_CUSTOM_COMMAND( COMMAND SortingExample3Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out )
-
-   CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
-
-   CUDA_ADD_EXECUTABLE(reduceArrayExampleCuda reduceArrayExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND reduceArrayExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceArrayExample.out OUTPUT reduceArrayExample.out )
-
-   CUDA_ADD_EXECUTABLE(reduceWithArgumentArrayExampleCuda reduceWithArgumentArrayExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND reduceWithArgumentArrayExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceWithArgumentArrayExample.out OUTPUT reduceWithArgumentArrayExample.out )
-ELSE()
-   ADD_EXECUTABLE( SortingExample SortingExample.cpp)
-   ADD_CUSTOM_COMMAND( COMMAND SortingExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out )
-
-   ADD_EXECUTABLE( SortingExample2 SortingExample2.cpp)
-   ADD_CUSTOM_COMMAND( COMMAND SortingExample2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out )
-
-   ADD_EXECUTABLE( SortingExample3 SortingExample3.cpp)
-   ADD_CUSTOM_COMMAND( COMMAND SortingExample3 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out )
-
-   ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp)
-   ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
-
-   ADD_EXECUTABLE(reduceArrayExample reduceArrayExample.cpp)
-   ADD_CUSTOM_COMMAND( COMMAND reduceArrayExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceArrayExample.out OUTPUT reduceArrayExample.out )
-
-   ADD_EXECUTABLE(reduceWithArgumentArrayExample reduceWithArgumentArrayExample.cpp)
-   ADD_CUSTOM_COMMAND( COMMAND reduceWithArgumentArrayExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceWithArgumentArrayExample.out OUTPUT reduceWithArgumentArrayExample.out )
-ENDIF()
-
-ADD_EXECUTABLE(staticForExample staticForExample.cpp)
-ADD_CUSTOM_COMMAND( COMMAND staticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/staticForExample.out OUTPUT staticForExample.out )
-
-ADD_EXECUTABLE(unrolledForExample unrolledForExample.cpp)
-ADD_CUSTOM_COMMAND( COMMAND unrolledForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/unrolledForExample.out OUTPUT unrolledForExample.out )
-
-ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS
-   SortingExample.out
-   SortingExample2.out
-   SortingExample3.out
-   ParallelForExample.out
-   reduceArrayExample.out
-   reduceWithArgumentArrayExample.out
-   unrolledForExample.out
-   staticForExample.out
+# Examples which have both a .cpp source (host build) and a .cu source (CUDA build).
+set( COMMON_EXAMPLES
+   SortingExample
+   SortingExample2
+   SortingExample3
+   ParallelForExample
+   SequentialForExample
 )
+
+# Examples which are built only in the host (non-CUDA) branch below.
+set( HOST_EXAMPLES
+   staticForExample
+   unrolledForExample
+)
+if( BUILD_CUDA )
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
+   endforeach()
+else()
+   # The lists must stay unquoted, a quoted string would be passed to foreach as one single item.
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} ${HOST_EXAMPLES} )
+      add_executable( ${target} ${target}.cpp )
+      add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )
+   endforeach()
+endif()
+
+IF( BUILD_CUDA )
+   ADD_CUSTOM_TARGET( RunAlgorithmsExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
+ELSE()
+   ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS ${HOST_OUTPUTS} )
+ENDIF()
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/SequentialForExample.cpp b/Documentation/Examples/Algorithms/SequentialForExample.cpp
new file mode 100644
index 000000000..d127a33a9
--- /dev/null
+++ b/Documentation/Examples/Algorithms/SequentialForExample.cpp
@@ -0,0 +1,37 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/SequentialFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+template< typename Device >
+void printVector()
+{
+   const int size( 36 );
+   TNL::Containers::Vector< float, Device > v( size, 1.0 );
+   auto view = v.getView();
+   auto print = [=] __cuda_callable__  ( int i ) mutable {
+      printf( "v[ %d ] = %f \n", i, view[ i ] );  // we use printf because of compatibility with GPU kernels
+   };
+   std::cout << "Printing vector using parallel for: " << std::endl;
+   Algorithms::ParallelFor< Device >::exec( 0, v.getSize(), print );
+
+   std::cout << "Printing vector using sequential for: " << std::endl;
+   Algorithms::SequentialFor< Device >::exec( 0, v.getSize(), print );
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example on the host:" << std::endl;
+   printVector< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example on CUDA GPU:" << std::endl;
+   printVector< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Examples/Algorithms/SequentialForExample.cu b/Documentation/Examples/Algorithms/SequentialForExample.cu
new file mode 120000
index 000000000..ac78b379b
--- /dev/null
+++ b/Documentation/Examples/Algorithms/SequentialForExample.cu
@@ -0,0 +1 @@
+SequentialForExample.cpp
\ No newline at end of file
diff --git a/src/TNL/Algorithms/SequentialFor.h b/src/TNL/Algorithms/SequentialFor.h
new file mode 100644
index 000000000..ea783ca33
--- /dev/null
+++ b/src/TNL/Algorithms/SequentialFor.h
@@ -0,0 +1,54 @@
+/***************************************************************************
+                          SequentialFor.h  -  description
+                             -------------------
+    begin                : Apr 5, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Algorithms/ParallelFor.h>
+
+
+namespace TNL {
+   namespace Algorithms {
+
+/**
+ * \brief Wrapper to ParallelFor which makes it run sequentially.
+ *
+ *  It is helpful for debugging or just sequential for loops on GPUs.
+ */
+template< typename Device = Devices::Sequential >
+struct SequentialFor
+{
+   /**
+    * \brief Static method for execution of the loop.
+    *
+    * \tparam Index defines the type of indexes over which the loop iterates.
+    * \tparam Function is the type of function to be called in each iteration.
+    *
+    * \param start the for-loop iterates over index interval [start, end).
+    * \param end the for-loop iterates over index interval [start, end).
+    * \param f is the function to be called in each iteration
+    *
+    * \par Example
+    * \include Algorithms/SequentialForExample.cpp
+    * \par Output
+    * \include SequentialForExample.out
+    *
+    */
+   template< typename Index,
+             typename Function >
+   static void exec( Index start, Index end, Function f )
+   {
+      for( Index i = start; i < end; i++ )
+         ParallelFor< Device >::exec( i, i + 1, f );
+   }
+};
+
+
+   } // namespace Algorithms
+} // namespace TNL
\ No newline at end of file
-- 
GitLab


From afa089e1afce3338fd1fa60fe9da0b326c66f1d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 6 Apr 2021 14:11:39 +0200
Subject: [PATCH 018/117] Improving SequentialFor example.

---
 Documentation/Examples/Algorithms/SequentialForExample.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Documentation/Examples/Algorithms/SequentialForExample.cpp b/Documentation/Examples/Algorithms/SequentialForExample.cpp
index d127a33a9..4bf83a64a 100644
--- a/Documentation/Examples/Algorithms/SequentialForExample.cpp
+++ b/Documentation/Examples/Algorithms/SequentialForExample.cpp
@@ -10,11 +10,12 @@ using namespace TNL::Containers;
 template< typename Device >
 void printVector()
 {
-   const int size( 36 );
+   const int size( 60 );
    TNL::Containers::Vector< float, Device > v( size, 1.0 );
    auto view = v.getView();
    auto print = [=] __cuda_callable__  ( int i ) mutable {
-      printf( "v[ %d ] = %f \n", i, view[ i ] );  // we use printf because of compatibility with GPU kernels
+      if( i % 5 == 0 )
+         printf( "v[ %d ] = %f \n", i, view[ i ] );  // we use printf because of compatibility with GPU kernels
    };
    std::cout << "Printing vector using parallel for: " << std::endl;
    Algorithms::ParallelFor< Device >::exec( 0, v.getSize(), print );
-- 
GitLab


From 1cd288897218a7f72d3a9ab07ca3f2173d7619fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 6 Apr 2021 15:14:25 +0200
Subject: [PATCH 019/117] Added segment element and segment view iterator.

---
 src/TNL/Algorithms/Segments/SegmentElement.h  | 57 +++++++++++++
 src/TNL/Algorithms/Segments/SegmentView.h     | 69 +++++++++++++++
 .../Algorithms/Segments/SegmentViewIterator.h | 84 +++++++++++++++++++
 .../Segments/SegmentViewIterator.hpp          | 83 ++++++++++++++++++
 src/TNL/Matrices/MatrixRowViewIterator.hpp    |  2 +-
 5 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 src/TNL/Algorithms/Segments/SegmentElement.h
 create mode 100644 src/TNL/Algorithms/Segments/SegmentViewIterator.h
 create mode 100644 src/TNL/Algorithms/Segments/SegmentViewIterator.hpp

diff --git a/src/TNL/Algorithms/Segments/SegmentElement.h b/src/TNL/Algorithms/Segments/SegmentElement.h
new file mode 100644
index 000000000..68088ba22
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/SegmentElement.h
@@ -0,0 +1,85 @@
+/***************************************************************************
+                          SegmentElement.h -  description
+                             -------------------
+    begin                : Apr 5, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <ostream>
+
+#include <TNL/Cuda/CudaCallable.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+/**
+ * \brief Simple structure describing the position of one element of a segment.
+ *
+ * It holds the index of the segment, the local index of the element within
+ * the segment and the global index of the element. All indexes are stored by
+ * value, so the object stays valid independently of the lifetime of the
+ * objects it was created from.
+ *
+ * \tparam Index is the type used for the indexes.
+ */
+template< typename Index >
+class SegmentElement
+{
+   public:
+
+      /**
+       * \brief Type used for the indexes.
+       */
+      using IndexType = Index;
+
+      /**
+       * \brief Constructor setting all three indexes.
+       *
+       * \param segmentIdx is the index of the segment the element belongs to.
+       * \param localIdx is the local index of the element within the segment.
+       * \param globalIdx is the global index of the element.
+       */
+      __cuda_callable__
+      SegmentElement( const IndexType& segmentIdx,
+                      const IndexType& localIdx,
+                      const IndexType globalIdx )
+      : segmentIdx( segmentIdx ), localIdx( localIdx ), globalIdx( globalIdx ) {};
+
+      /**
+       * \brief Returns the index of the segment the element belongs to.
+       */
+      __cuda_callable__
+      const IndexType& segmentIndex() const { return segmentIdx; };
+
+      /**
+       * \brief Returns the local index of the element within its segment.
+       */
+      __cuda_callable__
+      const IndexType& localIndex() const { return localIdx; };
+
+      /**
+       * \brief Returns the global index of the element.
+       */
+      __cuda_callable__
+      const IndexType& globalIndex() const { return globalIdx; };
+
+   protected:
+
+      // Stored by value: references would dangle as soon as the iterator or
+      // the segment view which produced this element is destroyed.
+      const IndexType segmentIdx;
+
+      const IndexType localIdx;
+
+      const IndexType globalIdx;
+};
+
+      } // namespace Segments
+   } // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/SegmentView.h b/src/TNL/Algorithms/Segments/SegmentView.h
index ecf1c95f6..399e3ddd1 100644
--- a/src/TNL/Algorithms/Segments/SegmentView.h
+++ b/src/TNL/Algorithms/Segments/SegmentView.h
@@ -11,6 +11,7 @@
 #pragma once
 
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
+#include <TNL/Algorithms/Segments/SegmentViewIterator.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -27,6 +28,8 @@ class SegmentView< Index, ColumnMajorOrder >
 
       using IndexType = Index;
 
+      using IteratorType = SegmentViewIterator< SegmentView >;
+
       __cuda_callable__
       SegmentView( const IndexType segmentIdx,
                    const IndexType offset,
@@ -57,6 +60,38 @@ class SegmentView< Index, ColumnMajorOrder >
          return this->segmentIdx;
       };
 
+      /**
+       * \brief Returns iterator pointing at the beginning of the segment.
+       *
+       * \return iterator pointing at the beginning.
+       */
+      __cuda_callable__
+      IteratorType begin() const { return IteratorType( *this, 0 ); };
+
+      /**
+       * \brief Returns iterator pointing at the end of the segment.
+       *
+       * \return iterator pointing at the end.
+       */
+      __cuda_callable__
+      IteratorType end() const { return IteratorType( *this, this->getSize() ); };
+
+      /**
+       * \brief Returns constant iterator pointing at the beginning of the segment.
+       *
+       * \return iterator pointing at the beginning.
+       */
+      __cuda_callable__
+      const IteratorType cbegin() const { return IteratorType( *this, 0 ); };
+
+      /**
+       * \brief Returns constant iterator pointing at the end of the segment.
+       *
+       * \return iterator pointing at the end.
+       */
+      __cuda_callable__
+      const IteratorType cend() const { return IteratorType( *this, this->getSize() ); };
+
       protected:
 
          IndexType segmentIdx, segmentOffset, segmentSize, step;
@@ -69,6 +104,8 @@ class SegmentView< Index, RowMajorOrder >
 
       using IndexType = Index;
 
+      using IteratorType = SegmentViewIterator< SegmentView >;
+
       __cuda_callable__
       SegmentView( const IndexType segmentIdx,
                    const IndexType offset,
@@ -95,6 +132,38 @@ class SegmentView< Index, RowMajorOrder >
          return this->segmentIdx;
       };
 
+      /**
+       * \brief Returns iterator pointing at the beginning of the segment.
+       *
+       * \return iterator pointing at the beginning.
+       */
+      __cuda_callable__
+      IteratorType begin() const { return IteratorType( *this, 0 ); };
+
+      /**
+       * \brief Returns iterator pointing at the end of the segment.
+       *
+       * \return iterator pointing at the end.
+       */
+      __cuda_callable__
+      IteratorType end() const { return IteratorType( *this, this->getSize() ); };
+
+      /**
+       * \brief Returns constant iterator pointing at the beginning of the segment.
+       *
+       * \return iterator pointing at the beginning.
+       */
+      __cuda_callable__
+      const IteratorType cbegin() const { return IteratorType( *this, 0 ); };
+
+      /**
+       * \brief Returns constant iterator pointing at the end of the segment.
+       *
+       * \return iterator pointing at the end.
+       */
+      __cuda_callable__
+      const IteratorType cend() const { return IteratorType( *this, this->getSize() ); };
+
       protected:
 
          IndexType segmentIdx, segmentOffset, segmentSize;
diff --git a/src/TNL/Algorithms/Segments/SegmentViewIterator.h b/src/TNL/Algorithms/Segments/SegmentViewIterator.h
new file mode 100644
index 000000000..335ce91aa
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/SegmentViewIterator.h
@@ -0,0 +1,84 @@
+ /***************************************************************************
+                          SegmentViewIterator.h -  description
+                             -------------------
+    begin                : Apr 5, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <ostream>
+
+#include <TNL/Cuda/CudaCallable.h>
+#include <TNL/Algorithms/Segments/SegmentElement.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+template< typename SegmentView >
+class SegmentViewIterator
+{
+   public:
+
+      /**
+       * \brief Type of SegmentView
+       */
+      using SegmentViewType = SegmentView;
+
+      /**
+       * \brief The type used for indexing of segment elements.
+       */
+      using IndexType = typename SegmentViewType::IndexType;
+
+      /**
+       * \brief The type of related segment element.
+       */
+      using SegmentElementType = SegmentElement< IndexType >;
+
+      __cuda_callable__
+      SegmentViewIterator( const SegmentViewType& segmentView,
+                           const IndexType& localIdx );
+
+      /**
+       * \brief Comparison of two segment view iterators.
+       *
+       * \param other is another segment view iterator.
+       * \return \e true if both iterators point at the same element of the same segment view, \e false otherwise.
+       */
+      __cuda_callable__
+      bool operator==( const SegmentViewIterator& other ) const;
+
+      /**
+       * \brief Comparison of two segment view iterators.
+       *
+       * \param other is another segment view iterator.
+       * \return \e false if both iterators point at the same element of the same segment view, \e true otherwise.
+       */
+      __cuda_callable__
+      bool operator!=( const SegmentViewIterator& other ) const;
+
+      __cuda_callable__
+      SegmentViewIterator& operator++();
+
+      __cuda_callable__
+      SegmentViewIterator& operator--();
+
+      __cuda_callable__
+      const SegmentElementType operator*() const;
+
+   protected:
+
+      const SegmentViewType& segmentView;
+
+      IndexType localIdx = 0;
+};
+
+      } // namespace Segments
+   } // namespace Algorithms
+} // namespace TNL
+
+#include <TNL/Algorithms/Segments/SegmentViewIterator.hpp>
diff --git a/src/TNL/Algorithms/Segments/SegmentViewIterator.hpp b/src/TNL/Algorithms/Segments/SegmentViewIterator.hpp
new file mode 100644
index 000000000..47154da99
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/SegmentViewIterator.hpp
@@ -0,0 +1,83 @@
+/***************************************************************************
+                          SegmentViewIterator.hpp -  description
+                             -------------------
+    begin                : Apr 5, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Algorithms/Segments/SegmentView.h>
+#include <TNL/Assert.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+template< typename SegmentView >
+__cuda_callable__
+SegmentViewIterator< SegmentView >::
+SegmentViewIterator( const SegmentViewType& segmentView,
+                     const IndexType& localIdx )
+: segmentView( segmentView ), localIdx( localIdx )
+{
+}
+
+template< typename SegmentView >
+__cuda_callable__ bool
+SegmentViewIterator< SegmentView >::
+operator==( const SegmentViewIterator& other ) const
+{
+   if( &this->segmentView == &other.segmentView &&
+       localIdx == other.localIdx )
+      return true;
+   return false;
+}
+
+template< typename SegmentView >
+__cuda_callable__ bool
+SegmentViewIterator< SegmentView >::
+operator!=( const SegmentViewIterator& other ) const
+{
+   return ! ( other == *this );
+}
+
+template< typename SegmentView >
+__cuda_callable__
+SegmentViewIterator< SegmentView >&
+SegmentViewIterator< SegmentView >::
+operator++()
+{
+   if( localIdx < segmentView.getSize() )
+      localIdx ++;
+   return *this;
+}
+
+template< typename SegmentView >
+__cuda_callable__
+SegmentViewIterator< SegmentView >&
+SegmentViewIterator< SegmentView >::
+operator--()
+{
+   if( localIdx > 0 )
+      localIdx --;
+   return *this;
+}
+
+template< typename SegmentView >
+__cuda_callable__ auto
+SegmentViewIterator< SegmentView >::
+operator*() const -> const SegmentElementType
+{
+   return SegmentElementType(
+      this->segmentView.getSegmentIndex(),
+      this->localIdx,
+      this->segmentView.getGlobalIndex( this->localIdx ) );
+}
+
+      } // namespace Segments
+   } // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Matrices/MatrixRowViewIterator.hpp b/src/TNL/Matrices/MatrixRowViewIterator.hpp
index 7b233e47b..7d217bc7a 100644
--- a/src/TNL/Matrices/MatrixRowViewIterator.hpp
+++ b/src/TNL/Matrices/MatrixRowViewIterator.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          SparseMatrixRowView.hpp -  description
+                          MatrixRowViewIterator.hpp -  description
                              -------------------
     begin                : Mar 20, 2021
     copyright            : (C) 2021 by Tomas Oberhuber
-- 
GitLab


From 6ca4d1254bb26496c5e6493e4d04780f30674b81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 6 Apr 2021 15:15:34 +0200
Subject: [PATCH 020/117] Writting documentation on CSR format.

---
 .../Algorithms/Segments/CMakeLists.txt        |  1 +
 .../SegmentsExample_CSR_getSegmentView.cpp    | 47 +++++++++++++++++++
 .../SegmentsExample_CSR_getSegmentView.cu     |  1 +
 .../SegmentsExample_CSR_getSegmentsType.cu    |  1 +
 ...tsExample_CSR_sequentialForAllSegments.cpp | 42 +++++++++++++++++
 src/TNL/Algorithms/Segments/CSR.h             | 22 +++++++++
 src/TNL/Algorithms/Segments/CSR.hpp           | 24 ++++++++++
 src/TNL/Algorithms/Segments/CSRView.h         |  6 +++
 src/TNL/Algorithms/Segments/CSRView.hpp       | 23 +++++++++
 9 files changed, 167 insertions(+)
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cu
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cu
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForAllSegments.cpp

diff --git a/Documentation/Examples/Algorithms/Segments/CMakeLists.txt b/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
index 7479762c3..dcc32305e 100644
--- a/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
@@ -5,6 +5,7 @@ set( COMMON_EXAMPLES
    SegmentsExample_CSR_getSerializationType
    SegmentsExample_CSR_getSegmentsType
    SegmentsExample_CSR_setSegmentsSizes
+   SegmentsExample_CSR_getSegmentView
 )
 
 if( BUILD_CUDA )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cpp
new file mode 100644
index 000000000..e9ef92da5
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cpp
@@ -0,0 +1,47 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/SequentialFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+   using SegmentView = typename SegmentsType::SegmentViewType;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   const int size( 5 );
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+   auto view = segments.getView();
+
+   /***
+    * Print the elements mapping using segment view.
+    */
+   std::cout << "Mapping of local indexes to global indexes:" << std::endl;
+
+   auto f = [=] __cuda_callable__ ( int segmentIdx ) {
+      printf( "Segment idx. %d: ", segmentIdx );                 // printf works even in GPU kernels
+      auto segment = view.getSegmentView( segmentIdx );
+      for( auto element : segment )
+         printf( "%d -> %d \t", element.localIndex(), element.globalIndex() );
+      printf( "\n" );
+   };
+   TNL::Algorithms::SequentialFor< Device >::exec( 0, size, f );
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cu
new file mode 120000
index 000000000..fd9d23822
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_getSegmentView.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cu
new file mode 120000
index 000000000..fcb8d7eb7
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_getSegmentsType.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_getSegmentsType.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForAllSegments.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForAllSegments.cpp
new file mode 100644
index 000000000..433ae6a61
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForAllSegments.cpp
@@ -0,0 +1,42 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >  // Device is the device tag the segments are created for (main passes Host and Cuda)
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+   using SegmentViewType = typename SegmentsType::SegmentViewType;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+   std::cout << "Segments sizes are: " << segments << std::endl;
+
+   /***
+    * Print the elements mapping using segment view. The lambda captures nothing, so it may use only its 'segment' argument.
+    */
+   std::cout << "Elements mapping:" << std::endl;
+   segments.sequentialForAllSegments( [] __cuda_callable__ ( const SegmentViewType segment ) {
+      printf( "Segment idx. %d: ", segment.getSegmentIndex() );                 // printf works even in GPU kernels
+      for( auto element : segment )
+         printf( "%d -> %d  ", element.localIndex(), element.globalIndex() );
+      printf( "\n" );
+   } );
+}
+
+int main( int argc, char* argv[] )  // Entry point of the example; argc and argv are accepted but never read.
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();  // run the example with the Host device tag
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();  // this variant is compiled in only when HAVE_CUDA is defined
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index 5290ac9f4..4fe24934f 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -270,12 +270,27 @@ class CSR
        *
        * \param segmentIdx is index of the request segment.
        * \return segment view of given segment.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsExample_CSR_getSegmentView.cpp
+       * \par Output
+       * \include SegmentsExample_CSR_getSegmentView.out
        */
       __cuda_callable__
       SegmentViewType getSegmentView( const IndexType segmentIdx ) const;
 
+      /**
+       * \brief Returns a reference to the constant vector with row offsets used in the CSR format.
+       *
+       * \return reference to the constant vector with row offsets used in the CSR format.
+       */
       const OffsetsContainer& getOffsets() const;
 
+      /**
+       * \brief Returns a reference to the vector with row offsets used in the CSR format.
+       *
+       * \return reference to the vector with row offsets used in the CSR format.
+       */
       OffsetsContainer& getOffsets();
 
       /***
@@ -296,6 +311,13 @@ class CSR
       template< typename Function >
       void forAllSegments( Function&& f ) const;
 
+      /** \brief Calls forSegments with \e f once per segment index in [\e begin, \e end), in increasing order. */
+      template< typename Function >
+      void sequentialForSegments( IndexType begin, IndexType end, Function&& f ) const;
+      /** \brief Calls sequentialForSegments with \e f for all segments of this container. */
+      template< typename Function >
+      void sequentialForAllSegments( Function&& f ) const;
+
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index ce91e7dc5..b427f4acd 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -283,6 +283,30 @@ forAllSegments( Function&& f ) const
    this->getConstView().forAllSegments( f );
 }
 
+template< typename Device,
+          typename Index,
+          typename Kernel,
+          typename IndexAllocator >
+   template< typename Function >
+void
+CSR< Device, Index, Kernel, IndexAllocator >::
+sequentialForSegments( IndexType begin, IndexType end, Function&& f ) const  // thin wrapper: the work is done by the constant view
+{
+   this->getConstView().sequentialForSegments( begin, end, f );  // f is passed on as an lvalue, as forAllSegments does
+}
+
+template< typename Device,
+          typename Index,
+          typename Kernel,
+          typename IndexAllocator >
+   template< typename Function >
+void
+CSR< Device, Index, Kernel, IndexAllocator >::
+sequentialForAllSegments( Function&& f ) const  // thin wrapper: the work is done by the constant view
+{
+   this->getConstView().sequentialForAllSegments( f );  // f is passed on as an lvalue, as forAllSegments does
+}
+
 template< typename Device,
           typename Index,
           typename Kernel,
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 2d550aada..5daa3e7c2 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -118,6 +118,12 @@ class CSRView
       template< typename Function >
       void forAllSegments( Function&& f ) const;
 
+      template< typename Function >
+      void sequentialForSegments( IndexType begin, IndexType end, Function&& f ) const;  ///< Calls forSegments separately for each segment index in [begin, end), in increasing order.
+
+      template< typename Function >
+      void sequentialForAllSegments( Function&& f ) const;  ///< Calls sequentialForSegments over the range [0, getSegmentsCount()).
+
       /***
        * \brief Go over all segments and perform a reduction in each of them.
        */
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index f4cfc2c78..7de193837 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -238,6 +238,29 @@ forAllSegments( Function&& f ) const
    this->forSegments( 0, this->getSegmentsCount(), f );
 }
 
+template< typename Device,
+          typename Index,
+          typename Kernel >
+   template< typename Function >
+void
+CSRView< Device, Index, Kernel >::
+sequentialForSegments( IndexType begin, IndexType end, Function&& function ) const  // visits segments begin, ..., end - 1 in increasing order
+{
+   for( IndexType i = begin; i < end; i++ )  // no call is made when begin >= end
+      forSegments( i, i + 1, function );  // one forSegments call per single-segment range [i, i + 1)
+}
+
+template< typename Device,
+          typename Index,
+          typename Kernel >
+   template< typename Function >
+void
+CSRView< Device, Index, Kernel >::
+sequentialForAllSegments( Function&& f ) const  // shorthand for sequentialForSegments over every segment
+{
+   this->sequentialForSegments( 0, this->getSegmentsCount(), f );  // covers the range [0, getSegmentsCount())
+}
+
 template< typename Device,
           typename Index,
           typename Kernel >
-- 
GitLab


From 219abe6c3cc9bce48bdb2391f062f38461bc60cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 6 Apr 2021 21:44:29 +0200
Subject: [PATCH 021/117] Removing boolean variable compute from forElements.

---
 .../SegmentsExample_CSR_constructor_1.cpp     |  2 +-
 .../SegmentsExample_CSR_constructor_2.cpp     |  2 +-
 .../Segments/SegmentsExample_General.cpp      |  2 +-
 .../DenseMatrixExample_forAllElements.cpp     |  6 +--
 .../DenseMatrixExample_forElements.cpp        |  6 +--
 .../DenseMatrixViewExample_forAllElements.cpp |  6 +--
 .../DenseMatrixViewExample_forElements.cpp    |  6 +--
 .../LambdaMatrixExample_forAllElements.cpp    |  2 +-
 .../LambdaMatrixExample_forElements.cpp       |  2 +-
 ...tidiagonalMatrixExample_forAllElements.cpp |  2 +-
 ...MultidiagonalMatrixExample_forElements.cpp |  2 +-
 ...agonalMatrixViewExample_forAllElements.cpp |  2 +-
 ...idiagonalMatrixViewExample_forElements.cpp |  2 +-
 .../SparseMatrixExample_forAllElements.cpp    |  6 +--
 .../SparseMatrixExample_forElements.cpp       |  6 +--
 ...SparseMatrixViewExample_forAllElements.cpp |  6 +--
 .../SparseMatrixViewExample_forElements.cpp   |  6 +--
 ...ridiagonalMatrixExample_forAllElements.cpp |  2 +-
 .../TridiagonalMatrixExample_forElements.cpp  |  2 +-
 ...agonalMatrixViewExample_forAllElements.cpp |  2 +-
 ...idiagonalMatrixViewExample_forElements.cpp |  2 +-
 .../Matrices/DenseMatrixSetup_Benchmark.cpp   |  2 +-
 .../MultidiagonalMatrixSetup_Benchmark.cpp    |  2 +-
 .../Matrices/SparseMatrixSetup_Benchmark.cpp  |  2 +-
 src/TNL/Algorithms/Segments/BiEllpackView.h   | 18 +++------
 src/TNL/Algorithms/Segments/BiEllpackView.hpp |  7 ++--
 src/TNL/Algorithms/Segments/CSR.h             |  2 +-
 src/TNL/Algorithms/Segments/CSRView.hpp       |  5 +--
 .../Algorithms/Segments/ChunkedEllpackView.h  | 18 +++------
 .../Segments/ChunkedEllpackView.hpp           |  9 ++---
 src/TNL/Algorithms/Segments/EllpackView.hpp   | 10 ++---
 .../Algorithms/Segments/SlicedEllpackView.hpp | 14 +++----
 src/TNL/Matrices/DenseMatrix.h                | 16 ++------
 src/TNL/Matrices/DenseMatrix.hpp              |  8 ++--
 src/TNL/Matrices/DenseMatrixView.h            |  8 ++--
 src/TNL/Matrices/DenseMatrixView.hpp          |  8 ++--
 src/TNL/Matrices/LambdaMatrix.h               |  6 +--
 src/TNL/Matrices/LambdaMatrix.hpp             |  5 +--
 src/TNL/Matrices/MultidiagonalMatrix.h        | 18 ++-------
 src/TNL/Matrices/MultidiagonalMatrix.hpp      |  6 +--
 src/TNL/Matrices/MultidiagonalMatrixView.h    | 18 ++-------
 src/TNL/Matrices/MultidiagonalMatrixView.hpp  | 10 ++---
 src/TNL/Matrices/SparseMatrix.h               | 16 ++------
 src/TNL/Matrices/SparseMatrix.hpp             | 12 +++---
 src/TNL/Matrices/SparseMatrixView.h           | 16 ++------
 src/TNL/Matrices/SparseMatrixView.hpp         | 12 +++---
 src/TNL/Matrices/TridiagonalMatrix.h          | 24 +++--------
 src/TNL/Matrices/TridiagonalMatrix.hpp        |  4 +-
 src/TNL/Matrices/TridiagonalMatrixView.h      | 12 ++----
 src/TNL/Matrices/TridiagonalMatrixView.hpp    | 40 +++++++++----------
 src/UnitTests/Matrices/DenseMatrixTest.h      |  2 +-
 src/UnitTests/Matrices/SparseMatrixTest.hpp   |  2 +-
 .../SparseMatrixVectorProductTest.hpp         |  6 +--
 53 files changed, 155 insertions(+), 257 deletions(-)

diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
index 4c6e28575..0ceb7a6bd 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
@@ -26,7 +26,7 @@ void SegmentsExample()
     * Insert data into particular segments.
     */
    auto data_view = data.getView();
-   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx, bool& compute ) mutable {
+   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
       if( localIdx <= segmentIdx )
          data_view[ globalIdx ] = segmentIdx;
    } );
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
index c15f5791e..9493758b4 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
@@ -25,7 +25,7 @@ void SegmentsExample()
     * Insert data into particular segments.
     */
    auto data_view = data.getView();
-   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx, bool& compute ) mutable {
+   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
       if( localIdx <= segmentIdx )
          data_view[ globalIdx ] = segmentIdx;
    } );
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
index e50c6d1ed..ade0263fb 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
@@ -27,7 +27,7 @@ void SegmentsExample()
     * Insert data into particular segments.
     */
    auto data_view = data.getView();
-   segments.forAllElements( [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable {
+   segments.forAllElements( [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx ) mutable {
       if( localIdx <= segmentIdx )
          data_view[ globalIdx ] = segmentIdx;
    } );
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forAllElements.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forAllElements.cpp
index 4fd7d3b47..f143164a2 100644
--- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forAllElements.cpp
@@ -8,10 +8,8 @@ void forAllElementsExample()
 {
    TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int columnIdx_, double& value, bool& compute ) {
-      if( rowIdx < columnIdx )
-         compute = false;
-      else
+   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int columnIdx_, double& value ) {
+      if( rowIdx >= columnIdx )
          value = rowIdx + columnIdx;
    };
 
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forElements.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forElements.cpp
index 0764eecdf..b37470c43 100644
--- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_forElements.cpp
@@ -8,10 +8,8 @@ void forElementsExample()
 {
    TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int columnIdx_, double& value, bool& compute ) {
-      if( rowIdx < columnIdx )
-         compute = false;
-      else
+   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int columnIdx_, double& value ) {
+      if( rowIdx >= columnIdx )
          value = rowIdx + columnIdx;
    };
 
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forAllElements.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forAllElements.cpp
index 66b394130..c5802a0e1 100644
--- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forAllElements.cpp
@@ -9,10 +9,8 @@ void forAllElementsExample()
    TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
    auto matrixView = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int globalIdx, double& value, bool& compute ) {
-      if( rowIdx < columnIdx )
-         compute = false;
-      else
+   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int globalIdx, double& value ) {
+      if( rowIdx >= columnIdx )
          value = rowIdx + columnIdx;
    };
 
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forElements.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forElements.cpp
index 6a980d23c..572c526f0 100644
--- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_forElements.cpp
@@ -9,10 +9,8 @@ void forElementsExample()
    TNL::Matrices::DenseMatrix< double, Device > matrix( 5, 5 );
    auto matrixView = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int globalIdx, double& value, bool& compute ) {
-      if( columnIdx > rowIdx )
-         compute = false;
-      else
+   auto f = [=] __cuda_callable__ ( int rowIdx, int columnIdx, int globalIdx, double& value ) {
+      if( columnIdx <= rowIdx )
          value = rowIdx + columnIdx;
    };
 
diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllElements.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllElements.cpp
index 293f173d2..6b335f5f2 100644
--- a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllElements.cpp
@@ -22,7 +22,7 @@ void forAllElementsExample()
    TNL::Matrices::DenseMatrix< double, Device > denseMatrix( 5, 5 );
    auto denseView = denseMatrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double value, bool& compute ) mutable {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double value ) mutable {
       denseView.setElement( rowIdx, columnIdx, value );
    };
 
diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forElements.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forElements.cpp
index f23f031b1..8472ef28d 100644
--- a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forElements.cpp
@@ -22,7 +22,7 @@ void forElementsExample()
    TNL::Matrices::DenseMatrix< double, Device > denseMatrix( 5, 5 );
    auto denseView = denseMatrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double value, bool& compute ) mutable {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double value ) mutable {
       denseView.setElement( rowIdx, columnIdx, value );
    };
 
diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forAllElements.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forAllElements.cpp
index b29543d9e..96a4668b9 100644
--- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forAllElements.cpp
@@ -23,7 +23,7 @@ void forAllElementsExample()
       5,               // number of matrix columns
       { -2, -1, 0 } ); // matrix diagonals offsets
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forElements.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forElements.cpp
index dd30694e6..1dc957af2 100644
--- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_forElements.cpp
@@ -23,7 +23,7 @@ void forElementsExample()
       5,               // number of matrix columns
       { -2, -1, 0 } ); // matrix diagonals offsets
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forAllElements.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forAllElements.cpp
index b05da1d82..4ca0940cb 100644
--- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forAllElements.cpp
@@ -24,7 +24,7 @@ void forAllElementsExample()
       { -2, -1, 0 } ); // matrix diagonals offsets
    auto view = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forElements.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forElements.cpp
index 9663a2c0d..d941fc4a2 100644
--- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forElements.cpp
@@ -24,7 +24,7 @@ void forElementsExample()
       { -2, -1, 0 } ); // matrix diagonals offsets
    auto view = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllElements.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllElements.cpp
index c603fe32f..8c9fb368c 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllElements.cpp
@@ -8,11 +8,9 @@ void forAllElementsExample()
 {
    TNL::Matrices::SparseMatrix< double, Device > matrix( { 1, 2, 3, 4, 5 }, 5 );
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) {
-      if( rowIdx < localIdx )  // This is important, some matrix formats may allocate more matrix elements
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value ) {
+      if( rowIdx >= localIdx )  // This is important, some matrix formats may allocate more matrix elements
                                // than we requested. These padding elements are processed here as well.
-         compute = false;
-      else
       {
          columnIdx = localIdx;
          value = rowIdx + localIdx + 1;
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
index 216433b63..2d7bbeba5 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
@@ -8,11 +8,9 @@ void forElementsExample()
 {
    TNL::Matrices::SparseMatrix< double, Device > matrix( { 1, 2, 3, 4, 5 }, 5 );
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) {
-      if( rowIdx < localIdx )  // This is important, some matrix formats may allocate more matrix elements
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value ) {
+      if( rowIdx >= localIdx )  // This is important, some matrix formats may allocate more matrix elements
                                // than we requested. These padding elements are processed here as well.
-         compute = false;
-      else
       {
          columnIdx = localIdx;
          value = rowIdx + localIdx + 1;
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllElements.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllElements.cpp
index 4000107eb..79fb7890d 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllElements.cpp
@@ -9,11 +9,9 @@ void forAllElementsExample()
    TNL::Matrices::SparseMatrix< double, Device > matrix( { 1, 2, 3, 4, 5 }, 5 );
    auto view = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) {
-      if( rowIdx < localIdx )  // This is important, some matrix formats may allocate more matrix elements
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value ) {
+      if( rowIdx >= localIdx )  // This is important, some matrix formats may allocate more matrix elements
                                // than we requested. These padding elements are processed here as well.
-         compute = false;
-      else
       {
          columnIdx = localIdx;
          value = rowIdx + localIdx + 1;
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forElements.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forElements.cpp
index 4ffb2ee83..6e296d3de 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forElements.cpp
@@ -9,11 +9,9 @@ void forElementsExample()
    TNL::Matrices::SparseMatrix< double, Device > matrix( { 1, 2, 3, 4, 5 }, 5 );
    auto view = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) {
-      if( rowIdx < localIdx )  // This is important, some matrix formats may allocate more matrix elements
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value ) {
+      if( rowIdx >= localIdx )  // This is important, some matrix formats may allocate more matrix elements
                                // than we requested. These padding elements are processed here as well.
-         compute = false;
-      else
       {
          columnIdx = localIdx;
          value = rowIdx + localIdx + 1;
diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forAllElements.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forAllElements.cpp
index c29b439a6..314cd6e4b 100644
--- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forAllElements.cpp
@@ -20,7 +20,7 @@ void forAllElementsExample()
       5,      // number of matrix rows
       5 );    // number of matrix columns
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forElements.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forElements.cpp
index 243e9468e..b15a9f581 100644
--- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_forElements.cpp
@@ -20,7 +20,7 @@ void forElementsExample()
       5,      // number of matrix rows
       5 );    // number of matrix columns
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forAllElements.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forAllElements.cpp
index 0ef430462..8d90c989e 100644
--- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forAllElements.cpp
+++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forAllElements.cpp
@@ -21,7 +21,7 @@ void forAllElementsExample()
       5 );    // number of matrix columns
    auto view = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forElements.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forElements.cpp
index 3045bc655..b077c008c 100644
--- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forElements.cpp
+++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forElements.cpp
@@ -21,7 +21,7 @@ void forElementsExample()
       5 );    // number of matrix columns
    auto view = matrix.getView();
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value, bool& compute ) {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, double& value ) {
       /***
        * 'forElements' method iterates only over matrix elements lying on given subdiagonals
        * and so we do not need to check anything. The element value can be expressed
diff --git a/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp
index 7696e9d0d..64979b0d4 100644
--- a/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp
+++ b/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp
@@ -62,7 +62,7 @@ void forElements( const int matrixSize, Matrix& matrix )
 {
    matrix.setDimensions( matrixSize, matrixSize );
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, float& value, bool& compute ) mutable {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, float& value ) mutable {
       value = rowIdx + columnIdx;
    };
    matrix.forElements( 0, matrixSize, f );
diff --git a/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp
index d323105cd..75186957e 100644
--- a/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp
+++ b/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp
@@ -143,7 +143,7 @@ void forElements( const int gridSize, Matrix& matrix )
    const int matrixSize = gridSize * gridSize;
    matrix.setDimensions( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) );
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, float& value, bool& compute ) mutable {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, float& value ) mutable {
       const int i = rowIdx % gridSize;
       const int j = rowIdx / gridSize;
       if( ( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) && localIdx == 0 )
diff --git a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp
index 7af7de1e1..31a2a039c 100644
--- a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp
+++ b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp
@@ -168,7 +168,7 @@ void forElements( const int gridSize, Matrix& matrix )
    matrix.setDimensions( matrixSize, matrixSize );
    matrix.setRowCapacities( rowCapacities );
 
-   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, float& value, bool& compute ) mutable {
+   auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, float& value ) mutable {
       const int i = rowIdx % gridSize;
       const int j = rowIdx / gridSize;
       if( ( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) && localIdx == 0 )
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h
index 0bb603a3a..c37ed6d73 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.h
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.h
@@ -159,8 +159,7 @@ class BiEllpackView
                 typename Reduction,
                 typename ResultKeeper,
                 typename Real,
-                int BlockDim,
-                typename... Args >
+                int BlockDim >
       __device__
       void reduceSegmentsKernelWithAllParameters( IndexType gridIdx,
                                                      IndexType first,
@@ -168,15 +167,13 @@ class BiEllpackView
                                                      Fetch fetch,
                                                      Reduction reduction,
                                                      ResultKeeper keeper,
-                                                     Real zero,
-                                                     Args... args ) const;
+                                                     Real zero ) const;
 
       template< typename Fetch,
                 typename Reduction,
                 typename ResultKeeper,
                 typename Real_,
-                int BlockDim,
-                typename... Args >
+                int BlockDim >
       __device__
       void reduceSegmentsKernel( IndexType gridIdx,
                                     IndexType first,
@@ -184,8 +181,7 @@ class BiEllpackView
                                     Fetch fetch,
                                     Reduction reduction,
                                     ResultKeeper keeper,
-                                    Real_ zero,
-                                    Args... args ) const;
+                                    Real_ zero ) const;
 
       template< typename View_,
                 typename Index_,
@@ -193,8 +189,7 @@ class BiEllpackView
                 typename Reduction_,
                 typename ResultKeeper_,
                 typename Real_,
-                int BlockDim,
-                typename... Args_ >
+                int BlockDim >
       friend __global__
       void BiEllpackreduceSegmentsKernel( View_ chunkedEllpack,
                                              Index_ gridIdx,
@@ -203,8 +198,7 @@ class BiEllpackView
                                              Fetch_ fetch,
                                              Reduction_ reduction,
                                              ResultKeeper_ keeper,
-                                             Real_ zero,
-                                             Args_... args );
+                                             Real_ zero );
 
       template< typename Index_, typename Fetch_, int BlockDim_, int WarpSize_, bool B_ >
       friend struct details::BiEllpackreduceSegmentsDispatcher;
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
index 98c5c05c8..b480deac0 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
@@ -275,9 +275,8 @@ forElements( IndexType first, IndexType last, Function&& f ) const
       const IndexType groupsCount = detail::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getActiveGroupsCountDirect( segmentsPermutationView, segmentIdx );
       IndexType groupHeight = getWarpSize();
       //printf( "segmentIdx = %d strip = %d firstGroupInStrip = %d rowStripPerm = %d groupsCount = %d \n", segmentIdx, strip, firstGroupInStrip, rowStripPerm, groupsCount );
-      bool compute( true );
       IndexType localIdx( 0 );
-      for( IndexType groupIdx = firstGroupInStrip; groupIdx < firstGroupInStrip + groupsCount && compute; groupIdx++ )
+      for( IndexType groupIdx = firstGroupInStrip; groupIdx < firstGroupInStrip + groupsCount; groupIdx++ )
       {
          IndexType groupOffset = groupPointersView[ groupIdx ];
          const IndexType groupSize = groupPointersView[ groupIdx + 1 ] - groupOffset;
@@ -289,14 +288,14 @@ forElements( IndexType first, IndexType last, Function&& f ) const
             {
                if( Organization == RowMajorOrder )
                {
-                  f( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute );
+                  f( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i );
                }
                else
                {
                   /*printf( "segmentIdx = %d localIdx = %d globalIdx = %d groupIdx = %d groupSize = %d groupWidth = %d\n",
                      segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight,
                      groupIdx, groupSize, groupWidth );*/
-                  f( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute );
+                  f( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight );
                }
                localIdx++;
             }
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index 4fe24934f..af05a9f61 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -293,7 +293,7 @@ class CSR
        */
       OffsetsContainer& getOffsets();
 
-      /***
+      /**
        * \brief Go over all segments and for each segment element call
        * function 'f'. The return type of 'f' is bool.
        * When its true, the for-loop continues. Once 'f' returns false, the for-loop
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index 7de193837..08822ca94 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -193,9 +193,8 @@ forElements( IndexType begin, IndexType end, Function&& f ) const
       const IndexType begin = offsetsView[ segmentIdx ];
       const IndexType end = offsetsView[ segmentIdx + 1 ];
       IndexType localIdx( 0 );
-      bool compute( true );
-      for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-         f( segmentIdx, localIdx++, globalIdx, compute );
+      for( IndexType globalIdx = begin; globalIdx < end; globalIdx++  )
+         f( segmentIdx, localIdx++, globalIdx );
    };
    Algorithms::ParallelFor< Device >::exec( begin, end, l );
 }
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
index 8f8177dbb..123bc1cb9 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
@@ -158,8 +158,7 @@ class ChunkedEllpackView
       template< typename Fetch,
                 typename Reduction,
                 typename ResultKeeper,
-                typename Real,
-                typename... Args >
+                typename Real >
       __device__
       void reduceSegmentsKernelWithAllParameters( IndexType gridIdx,
                                                      IndexType first,
@@ -167,14 +166,12 @@ class ChunkedEllpackView
                                                      Fetch fetch,
                                                      Reduction reduction,
                                                      ResultKeeper keeper,
-                                                     Real zero,
-                                                     Args... args ) const;
+                                                     Real zero ) const;
 
       template< typename Fetch,
                 typename Reduction,
                 typename ResultKeeper,
-                typename Real,
-                typename... Args >
+                typename Real >
       __device__
       void reduceSegmentsKernel( IndexType gridIdx,
                                     IndexType first,
@@ -182,8 +179,7 @@ class ChunkedEllpackView
                                     Fetch fetch,
                                     Reduction reduction,
                                     ResultKeeper keeper,
-                                    Real zero,
-                                    Args... args ) const;
+                                    Real zero ) const;
 #endif
 
       IndexType size = 0, storageSize = 0, numberOfSlices = 0;
@@ -216,8 +212,7 @@ class ChunkedEllpackView
                 typename Fetch_,
                 typename Reduction_,
                 typename ResultKeeper_,
-                typename Real_,
-                typename... Args_ >
+                typename Real_ >
       friend __global__
       void ChunkedEllpackreduceSegmentsKernel( View_ chunkedEllpack,
                                                   Index_ gridIdx,
@@ -226,8 +221,7 @@ class ChunkedEllpackView
                                                   Fetch_ fetch,
                                                   Reduction_ reduction,
                                                   ResultKeeper_ keeper,
-                                                  Real_ zero,
-                                                  Args_... args );
+                                                  Real_ zero );
 
       template< typename Index_, typename Fetch_, bool B_ >
       friend struct details::ChunkedEllpackreduceSegmentsDispatcher;
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
index c43b57321..5f73fd8ab 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
@@ -323,14 +323,13 @@ forElements( IndexType first, IndexType last, Function&& f ) const
       const IndexType chunkSize = slices[ sliceIdx ].chunkSize;
 
       const IndexType segmentSize = segmentChunksCount * chunkSize;
-      bool compute( true );
       if( Organization == RowMajorOrder )
       {
          IndexType begin = sliceOffset + firstChunkOfSegment * chunkSize;
          IndexType end = begin + segmentSize;
          IndexType localIdx( 0 );
-         for( IndexType j = begin; j < end && compute; j++ )
-            f( segmentIdx, localIdx++, j, compute );
+         for( IndexType j = begin; j < end; j++ )
+            f( segmentIdx, localIdx++, j );
       }
       else
       {
@@ -339,9 +338,9 @@ forElements( IndexType first, IndexType last, Function&& f ) const
          {
             IndexType begin = sliceOffset + firstChunkOfSegment + chunkIdx;
             IndexType end = begin + chunksInSlice * chunkSize;
-            for( IndexType j = begin; j < end && compute; j += chunksInSlice )
+            for( IndexType j = begin; j < end; j += chunksInSlice )
             {
-               f( segmentIdx, localIdx++, j, compute );
+               f( segmentIdx, localIdx++, j );
             }
          }
       }
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index 120f8f436..1ec928336 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -207,9 +207,8 @@ forElements( IndexType first, IndexType last, Function&& f ) const
          const IndexType begin = segmentIdx * segmentSize;
          const IndexType end = begin + segmentSize;
          IndexType localIdx( 0 );
-         bool compute( true );
-         for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-            f( segmentIdx, localIdx++, globalIdx, compute );
+         for( IndexType globalIdx = begin; globalIdx < end; globalIdx++  )
+            f( segmentIdx, localIdx++, globalIdx );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l );
    }
@@ -221,9 +220,8 @@ forElements( IndexType first, IndexType last, Function&& f ) const
          const IndexType begin = segmentIdx;
          const IndexType end = storageSize;
          IndexType localIdx( 0 );
-         bool compute( true );
-         for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx += alignedSize )
-            f( segmentIdx, localIdx++, globalIdx, compute );
+         for( IndexType globalIdx = begin; globalIdx < end; globalIdx += alignedSize )
+            f( segmentIdx, localIdx++, globalIdx );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l );
    }
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
index 703da8d3c..871aa2da0 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
@@ -242,15 +242,14 @@ forElements( IndexType first, IndexType last, Function&& f ) const
          const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize;
          const IndexType end = begin + segmentSize;
          IndexType localIdx( 0 );
-         bool compute( true );
-         for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
+         for( IndexType globalIdx = begin; globalIdx < end; globalIdx++  )
          {
             // The following is a workaround of a bug in nvcc 11.2
 #if CUDART_VERSION == 11020
-             f( segmentIdx, localIdx, globalIdx, compute );
+             f( segmentIdx, localIdx, globalIdx );
              localIdx++;
 #else
-             f( segmentIdx, localIdx++, globalIdx, compute );
+             f( segmentIdx, localIdx++, globalIdx );
 #endif
          }
       };
@@ -265,15 +264,14 @@ forElements( IndexType first, IndexType last, Function&& f ) const
          const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx;
          const IndexType end = sliceOffsets_view[ sliceIdx + 1 ];
          IndexType localIdx( 0 );
-         bool compute( true );
-         for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx += SliceSize )
+         for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize )
          {
             // The following is a workaround of a bug in nvcc 11.2
 #if CUDART_VERSION == 11020
-            f( segmentIdx, localIdx, globalIdx, compute );
+            f( segmentIdx, localIdx, globalIdx );
             localIdx++;
 #else
-            f( segmentIdx, localIdx++, globalIdx, compute );
+            f( segmentIdx, localIdx++, globalIdx );
 #endif
          }
       };
diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
index a65c12d80..0036ba240 100644
--- a/src/TNL/Matrices/DenseMatrix.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -457,10 +457,8 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -479,10 +477,8 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -633,10 +629,8 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -650,10 +644,8 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
diff --git a/src/TNL/Matrices/DenseMatrix.hpp b/src/TNL/Matrices/DenseMatrix.hpp
index 16b844cda..46c85a373 100644
--- a/src/TNL/Matrices/DenseMatrix.hpp
+++ b/src/TNL/Matrices/DenseMatrix.hpp
@@ -1139,7 +1139,7 @@ operator=( const DenseMatrixView< RHSReal, RHSDevice, RHSIndex, RHSOrganization
    auto this_view = this->view;
    if( std::is_same< DeviceType, RHSDeviceType >::value )
    {
-      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIdx, const RHSRealType& value, bool& compute ) mutable {
+      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIdx, const RHSRealType& value ) mutable {
          this_view( rowIdx, columnIdx ) = value;
       };
       matrix.forAllElements( f );
@@ -1162,7 +1162,7 @@ operator=( const DenseMatrixView< RHSReal, RHSDevice, RHSIndex, RHSOrganization
 
          ////
          // Copy matrix elements into buffer
-         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIdx, const RHSRealType& value, bool& compute ) mutable {
+         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIdx, const RHSRealType& value ) mutable {
             const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + columnIdx;
             matrixValuesBuffer_view[ bufferIdx ] = value;
          };
@@ -1214,7 +1214,7 @@ operator=( const RHSMatrix& matrix )
    if( std::is_same< DeviceType, RHSDeviceType >::value )
    {
       const auto segments_view = this->segments.getView();
-      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx_, RHSIndexType columnIdx, const RHSRealType& value, bool& compute ) mutable {
+      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx_, RHSIndexType columnIdx, const RHSRealType& value ) mutable {
          if( value != 0.0 && columnIdx != padding_index )
             values_view[ segments_view.getGlobalIndex( rowIdx, columnIdx ) ] = value;
       };
@@ -1244,7 +1244,7 @@ operator=( const RHSMatrix& matrix )
 
          ////
          // Copy matrix elements into buffer
-         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value, bool& compute ) mutable {
+         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
             if( columnIndex != padding_index )
             {
                const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
diff --git a/src/TNL/Matrices/DenseMatrixView.h b/src/TNL/Matrices/DenseMatrixView.h
index ea7f6dbe7..5fd6cda8e 100644
--- a/src/TNL/Matrices/DenseMatrixView.h
+++ b/src/TNL/Matrices/DenseMatrixView.h
@@ -518,7 +518,7 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx, const RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
        *  If the 'compute' variable is set to false the iteration over the row can
        *  be interrupted.
@@ -540,7 +540,7 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx, RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx, RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
        *  If the 'compute' variable is set to false the iteration over the row can
        *  be interrupted.
@@ -694,7 +694,7 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
        *  If the 'compute' variable is set to false the iteration over the row can
        *  be interrupted.
@@ -711,7 +711,7 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
        *  If the 'compute' variable is set to false the iteration over the row can
        *  be interrupted.
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 68b2de7ee..1ace77056 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -355,8 +355,8 @@ DenseMatrixView< Real, Device, Index, Organization >::
 forElements( IndexType begin, IndexType end, Function&& function ) const
 {
    const auto values_view = this->values.getConstView();
-   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx, bool& compute ) mutable {
-      function( rowIdx, columnIdx, columnIdx, values_view[ globalIdx ], compute );
+   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx ) mutable {
+      function( rowIdx, columnIdx, columnIdx, values_view[ globalIdx ] );
    };
    this->segments.forElements( begin, end, f );
 }
@@ -371,8 +371,10 @@ DenseMatrixView< Real, Device, Index, Organization >::
 forElements( IndexType begin, IndexType end, Function&& function )
 {
    auto values_view = this->values.getView();
-   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx, bool& compute ) mutable {
-      function( rowIdx, columnIdx, globalIdx, values_view[ globalIdx ], compute );
+   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx ) mutable {
+      // Pass the column index twice, as the const overload does; globalIdx only
+      // addresses the stored value and must not be handed to the callback as a column.
+      function( rowIdx, columnIdx, columnIdx, values_view[ globalIdx ] );
    };
    this->segments.forElements( begin, end, f );
 }
diff --git a/src/TNL/Matrices/LambdaMatrix.h b/src/TNL/Matrices/LambdaMatrix.h
index 01d3a0b91..cfdd77d2e 100644
--- a/src/TNL/Matrices/LambdaMatrix.h
+++ b/src/TNL/Matrices/LambdaMatrix.h
@@ -260,10 +260,8 @@ class LambdaMatrix
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -348,7 +346,7 @@ class LambdaMatrix
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
        *  If the 'compute' variable is set to false the iteration over the row can
        *  be interrupted.
diff --git a/src/TNL/Matrices/LambdaMatrix.hpp b/src/TNL/Matrices/LambdaMatrix.hpp
index 867016d47..77de0872c 100644
--- a/src/TNL/Matrices/LambdaMatrix.hpp
+++ b/src/TNL/Matrices/LambdaMatrix.hpp
@@ -317,14 +317,13 @@ forElements( IndexType first, IndexType last, Function& function ) const
    auto matrixElements = this->matrixElementsLambda;
    auto processRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
       const IndexType rowLength = rowLengths( rows, columns, rowIdx );
-      bool compute( true );
-      for( IndexType localIdx = 0; localIdx < rowLength && compute; localIdx++ )
+      for( IndexType localIdx = 0; localIdx < rowLength; localIdx++ )
       {
         IndexType elementColumn( 0 );
         RealType elementValue( 0.0 );
         matrixElements( rows, columns, rowIdx, localIdx, elementColumn, elementValue );
         if( elementValue != 0.0 )
-            function( rowIdx, localIdx, elementColumn, elementValue, compute );
+            function( rowIdx, localIdx, elementColumn, elementValue );
       }
    };
    Algorithms::ParallelFor< DeviceType >::exec( first, last, processRow );
diff --git a/src/TNL/Matrices/MultidiagonalMatrix.h b/src/TNL/Matrices/MultidiagonalMatrix.h
index d938a1062..622b79da4 100644
--- a/src/TNL/Matrices/MultidiagonalMatrix.h
+++ b/src/TNL/Matrices/MultidiagonalMatrix.h
@@ -715,7 +715,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
        *
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`,
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`,
        *
        * where
        *
@@ -728,9 +728,6 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \e value is the matrix element value.
        *
-       * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can
-       *  be interrupted.
-       *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
@@ -749,7 +746,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
        *
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`,
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`,
        *
        * where
        *
@@ -762,9 +759,6 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \e value is a reference to the matrix element value. It can be used even for changing the matrix element value.
        *
-       * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can
-       *  be interrupted.
-       *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
@@ -914,10 +908,8 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -931,10 +923,8 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
diff --git a/src/TNL/Matrices/MultidiagonalMatrix.hpp b/src/TNL/Matrices/MultidiagonalMatrix.hpp
index b432814c6..244188831 100644
--- a/src/TNL/Matrices/MultidiagonalMatrix.hpp
+++ b/src/TNL/Matrices/MultidiagonalMatrix.hpp
@@ -873,7 +873,7 @@ operator=( const MultidiagonalMatrix< Real_, Device_, Index_, Organization_, Rea
       if( std::is_same< Device, Device_ >::value )
       {
          const auto matrix_view = matrix.getView();
-         auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value, bool& compute ) mutable {
+         auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value ) mutable {
             value = matrix_view.getValues()[ matrix_view.getIndexer().getGlobalIndex( rowIdx, localIdx ) ];
          };
          this->forAllElements( f );
@@ -898,7 +898,7 @@ operator=( const MultidiagonalMatrix< Real_, Device_, Index_, Organization_, Rea
 
             ////
             // Copy matrix elements into buffer
-            auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value, bool& compute ) mutable {
+            auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
                   const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
                   matrixValuesBuffer_view[ bufferIdx ] = value;
             };
@@ -910,7 +910,7 @@ operator=( const MultidiagonalMatrix< Real_, Device_, Index_, Organization_, Rea
 
             ////
             // Copy matrix elements from the buffer to the matrix
-            auto f2 = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType localIdx, const IndexType columnIndex, RealType& value, bool& compute  ) mutable {
+            auto f2 = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType localIdx, const IndexType columnIndex, RealType& value ) mutable {
                const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
                   value = thisValuesBuffer_view[ bufferIdx ];
             };
diff --git a/src/TNL/Matrices/MultidiagonalMatrixView.h b/src/TNL/Matrices/MultidiagonalMatrixView.h
index 357560213..fc5799fa6 100644
--- a/src/TNL/Matrices/MultidiagonalMatrixView.h
+++ b/src/TNL/Matrices/MultidiagonalMatrixView.h
@@ -477,7 +477,7 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
        *
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`,
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`,
        *
        * where
        *
@@ -490,9 +490,6 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        *
        * \e value is the matrix element value.
        *
-       * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can
-       *  be interrupted.
-       *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
@@ -511,7 +508,7 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
        *
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`,
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`,
        *
        * where
        *
@@ -524,9 +521,6 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        *
        * \e value is a reference to the matrix element value. It can be used even for changing the matrix element value.
        *
-       * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can
-       *  be interrupted.
-       *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
@@ -676,10 +670,8 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -693,10 +685,8 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
diff --git a/src/TNL/Matrices/MultidiagonalMatrixView.hpp b/src/TNL/Matrices/MultidiagonalMatrixView.hpp
index 7621f1678..550158e4d 100644
--- a/src/TNL/Matrices/MultidiagonalMatrixView.hpp
+++ b/src/TNL/Matrices/MultidiagonalMatrixView.hpp
@@ -216,7 +216,7 @@ setValue( const RealType& v )
    // we dont do this->values = v here because it would set even elements 'outside' the matrix
    // method getNumberOfNonzeroElements would not work well then
    const RealType newValue = v;
-   auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType columnIdx, RealType& value, bool& compute ) mutable {
+   auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType columnIdx, RealType& value ) mutable {
       value = newValue;
    };
    this->forAllElements( f );
@@ -443,13 +443,12 @@ forElements( IndexType first, IndexType last, Function& function ) const
    const IndexType diagonalsCount = this->diagonalsOffsets.getSize();
    const IndexType columns = this->getColumns();
    const auto indexer = this->indexer;
-   bool compute( true );
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
       for( IndexType localIdx = 0; localIdx < diagonalsCount; localIdx++ )
       {
          const IndexType columnIdx = rowIdx + diagonalsOffsets_view[ localIdx ];
          if( columnIdx >= 0 && columnIdx < columns )
-            function( rowIdx, localIdx, columnIdx, values_view[ indexer.getGlobalIndex( rowIdx, localIdx ) ], compute );
+            function( rowIdx, localIdx, columnIdx, values_view[ indexer.getGlobalIndex( rowIdx, localIdx ) ] );
       }
    };
    Algorithms::ParallelFor< DeviceType >::exec( first, last, f );
@@ -469,13 +468,12 @@ forElements( IndexType first, IndexType last, Function& function )
    const IndexType diagonalsCount = this->diagonalsOffsets.getSize();
    const IndexType columns = this->getColumns();
    const auto indexer = this->indexer;
-   bool compute( true );
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
-      for( IndexType localIdx = 0; localIdx < diagonalsCount && compute; localIdx++ )
+      for( IndexType localIdx = 0; localIdx < diagonalsCount; localIdx++ )
       {
          const IndexType columnIdx = rowIdx + diagonalsOffsets_view[ localIdx ];
          if( columnIdx >= 0 && columnIdx < columns )
-            function( rowIdx, localIdx, columnIdx, values_view[ indexer.getGlobalIndex( rowIdx, localIdx ) ], compute );
+            function( rowIdx, localIdx, columnIdx, values_view[ indexer.getGlobalIndex( rowIdx, localIdx ) ] );
       }
    };
    Algorithms::ParallelFor< DeviceType >::exec( first, last, f );
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 237417d66..ef7fe0580 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -724,12 +724,10 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * The lambda function `function` should be declared like follows:
        *
        * ```
-       * auto function = [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute ) { ... };
+       * auto function = [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
        * ```
        *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
@@ -751,12 +749,10 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * The lambda function `function` should be declared like follows:
        *
        * ```
-       * auto function = [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute ) mutable { ... }
+       * auto function = [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) mutable { ... }
        * ```
        *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
@@ -903,10 +899,8 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -920,10 +914,8 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index 08e55190f..40c568b4d 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -890,7 +890,7 @@ operator=( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocato
    if( std::is_same< DeviceType, RHSDeviceType >::value )
    {
       const auto segments_view = this->segments.getView();
-      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIdx, const RHSRealType& value, bool& compute ) mutable {
+      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIdx, const RHSRealType& value ) mutable {
          if( value != 0.0 )
          {
             IndexType thisGlobalIdx = segments_view.getGlobalIndex( rowIdx, rowLocalIndexes_view[ rowIdx ]++ );
@@ -921,7 +921,7 @@ operator=( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocato
 
          ////
          // Copy matrix elements into buffer
-         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value, bool& compute ) mutable {
+         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
             const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
             matrixValuesBuffer_view[ bufferIdx ] = value;
          };
@@ -935,7 +935,7 @@ operator=( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocato
          // Copy matrix elements from the buffer to the matrix and ignoring
          // zero matrix elements.
          const IndexType matrix_columns = this->getColumns();
-         auto f2 = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIndex, RealType& value, bool& compute  ) mutable {
+         auto f2 = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIndex, RealType& value ) mutable {
             RealType inValue( 0.0 );
             IndexType bufferIdx, column( rowLocalIndexes_view[ rowIdx ] );
             while( inValue == 0.0 && column < matrix_columns )
@@ -1001,7 +1001,7 @@ operator=( const RHSMatrix& matrix )
    if( std::is_same< DeviceType, RHSDeviceType >::value )
    {
       const auto segments_view = this->segments.getView();
-      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx_, RHSIndexType columnIndex, const RHSRealType& value, bool& compute ) mutable {
+      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx_, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
          IndexType localIdx( rowLocalIndexes_view[ rowIdx ] );
          if( value != 0.0 && columnIndex != paddingIndex )
          {
@@ -1043,7 +1043,7 @@ operator=( const RHSMatrix& matrix )
 
          ////
          // Copy matrix elements into buffer
-         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value, bool& compute ) mutable {
+         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
             if( columnIndex != paddingIndex )
             {
                TNL_ASSERT_LT( rowIdx - baseRow, bufferRowsCount, "" );
@@ -1066,7 +1066,7 @@ operator=( const RHSMatrix& matrix )
          // zero matrix elements
          //const IndexType matrix_columns = this->getColumns();
          const auto thisRowLengths_view = thisRowLengths.getConstView();
-         auto f2 = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIndex, RealType& value, bool& compute ) mutable {
+         auto f2 = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIndex, RealType& value ) mutable {
             RealType inValue( 0.0 );
             size_t bufferIdx;
             IndexType bufferLocalIdx( rowLocalIndexes_view[ rowIdx ] );
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index 40a89b628..10310d802 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -508,10 +508,8 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -530,10 +528,8 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -685,10 +681,8 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -702,10 +696,8 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 5df969867..72377f847 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -594,11 +594,11 @@ forElements( IndexType begin, IndexType end, Function& function ) const
    const auto columns_view = this->columnIndexes.getConstView();
    const auto values_view = this->values.getConstView();
    //const IndexType paddingIndex_ = this->getPaddingIndex();
-   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> bool {
+   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx ) mutable -> bool {
       if( isBinary() )
-         function( rowIdx, localIdx, columns_view[ globalIdx ], 1, compute );
+         function( rowIdx, localIdx, columns_view[ globalIdx ], 1 );
       else
-         function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ], compute );
+         function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ] );
       return true;
    };
    this->segments.forElements( begin, end, f );
@@ -618,14 +618,14 @@ forElements( IndexType begin, IndexType end, Function& function )
    auto columns_view = this->columnIndexes.getView();
    auto values_view = this->values.getView();
    const IndexType paddingIndex_ = this->getPaddingIndex();
-   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable {
+   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx ) mutable {
       if( isBinary() )
       {
          RealType one( columns_view[ globalIdx ] != paddingIndex_ );
-         function( rowIdx, localIdx, columns_view[ globalIdx ], one, compute );
+         function( rowIdx, localIdx, columns_view[ globalIdx ], one );
       }
       else
-         function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ], compute );
+         function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ] );
    };
    this->segments.forElements( begin, end, f );
 }
diff --git a/src/TNL/Matrices/TridiagonalMatrix.h b/src/TNL/Matrices/TridiagonalMatrix.h
index b74e0dcb9..4789a079f 100644
--- a/src/TNL/Matrices/TridiagonalMatrix.h
+++ b/src/TNL/Matrices/TridiagonalMatrix.h
@@ -610,10 +610,8 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -632,10 +630,8 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -654,10 +650,8 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -676,10 +670,8 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -798,10 +790,8 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -815,10 +805,8 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
diff --git a/src/TNL/Matrices/TridiagonalMatrix.hpp b/src/TNL/Matrices/TridiagonalMatrix.hpp
index a76ae3ef9..93ebcd8b3 100644
--- a/src/TNL/Matrices/TridiagonalMatrix.hpp
+++ b/src/TNL/Matrices/TridiagonalMatrix.hpp
@@ -717,7 +717,7 @@ operator=( const TridiagonalMatrix< Real_, Device_, Index_, Organization_, RealA
       if( std::is_same< Device, Device_ >::value )
       {
          const auto matrix_view = matrix.getView();
-         auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value, bool& compute ) mutable {
+         auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value ) mutable {
             value = matrix_view.getValues()[ matrix_view.getIndexer().getGlobalIndex( rowIdx, localIdx ) ];
          };
          this->forAllElements( f );
@@ -727,7 +727,7 @@ operator=( const TridiagonalMatrix< Real_, Device_, Index_, Organization_, RealA
          TridiagonalMatrix< Real, Device, Index, Organization_ > auxMatrix;
          auxMatrix = matrix;
          const auto matrix_view = auxMatrix.getView();
-         auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value, bool& compute ) mutable {
+         auto f = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value ) mutable {
             value = matrix_view.getValues()[ matrix_view.getIndexer().getGlobalIndex( rowIdx, localIdx ) ];
          };
          this->forAllElements( f );
diff --git a/src/TNL/Matrices/TridiagonalMatrixView.h b/src/TNL/Matrices/TridiagonalMatrixView.h
index e05a8b059..6b68f4197 100644
--- a/src/TNL/Matrices/TridiagonalMatrixView.h
+++ b/src/TNL/Matrices/TridiagonalMatrixView.h
@@ -463,10 +463,8 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -485,10 +483,8 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -639,7 +635,5 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
@@ -656,7 +650,5 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value, bool& compute )`.
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
diff --git a/src/TNL/Matrices/TridiagonalMatrixView.hpp b/src/TNL/Matrices/TridiagonalMatrixView.hpp
index 0ebad8cb6..c67073381 100644
--- a/src/TNL/Matrices/TridiagonalMatrixView.hpp
+++ b/src/TNL/Matrices/TridiagonalMatrixView.hpp
@@ -393,26 +393,25 @@ forElements( IndexType first, IndexType last, Function& function ) const
 {
    const auto values_view = this->values.getConstView();
    const auto indexer = this->indexer;
-   bool compute( true );
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
       if( rowIdx == 0 )
       {
-         function( 0, 1, 0, values_view[ indexer.getGlobalIndex( 0, 1 ) ], compute );
-         function( 0, 2, 1, values_view[ indexer.getGlobalIndex( 0, 2 ) ], compute );
+         function( 0, 1, 0, values_view[ indexer.getGlobalIndex( 0, 1 ) ] );
+         function( 0, 2, 1, values_view[ indexer.getGlobalIndex( 0, 2 ) ] );
       }
       else if( rowIdx + 1 < indexer.getColumns() )
       {
-         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ], compute );
-         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ], compute );
-         function( rowIdx, 2, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ], compute );
+         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] );
+         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] );
+         function( rowIdx, 2, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ] );
       }
       else if( rowIdx < indexer.getColumns() )
       {
-         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ], compute );
-         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ], compute );
+         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] );
+         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] );
       }
       else
-         function( rowIdx, 0, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ], compute );
+         function( rowIdx, 0, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] );
    };
    Algorithms::ParallelFor< DeviceType >::exec( first, last, f );
 }
@@ -428,26 +427,25 @@ forElements( IndexType first, IndexType last, Function& function )
 {
    auto values_view = this->values.getView();
    const auto indexer = this->indexer;
-   bool compute( true );
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
       if( rowIdx == 0 )
       {
-         function( 0, 1, 0, values_view[ indexer.getGlobalIndex( 0, 1 ) ], compute );
-         function( 0, 2, 1, values_view[ indexer.getGlobalIndex( 0, 2 ) ], compute );
+         function( 0, 1, 0, values_view[ indexer.getGlobalIndex( 0, 1 ) ] );
+         function( 0, 2, 1, values_view[ indexer.getGlobalIndex( 0, 2 ) ] );
       }
       else if( rowIdx + 1 < indexer.getColumns() )
       {
-         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ], compute );
-         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ], compute );
-         function( rowIdx, 2, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ], compute );
+         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] );
+         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] );
+         function( rowIdx, 2, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ] );
       }
       else if( rowIdx < indexer.getColumns() )
       {
-         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ], compute );
-         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ], compute );
+         function( rowIdx, 0, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] );
+         function( rowIdx, 1, rowIdx,     values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] );
       }
       else
-         function( rowIdx, 0, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ], compute );
+         function( rowIdx, 0, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] );
    };
    Algorithms::ParallelFor< DeviceType >::exec( first, last, f );
 }
@@ -663,13 +661,13 @@ addMatrix( const TridiagonalMatrixView< Real_, Device_, Index_, Organization_ >&
       const auto matrix_view = matrix;
       const auto matrixMult = matrixMultiplicator;
       const auto thisMult = thisMatrixMultiplicator;
-      auto add0 = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value, bool& compute ) mutable {
+      auto add0 = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value ) mutable {
          value = matrixMult * matrix.getValues()[ matrix.getIndexer().getGlobalIndex( rowIdx, localIdx ) ];
       };
-      auto add1 = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value, bool& compute ) mutable {
+      auto add1 = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value ) mutable {
          value += matrixMult * matrix.getValues()[ matrix.getIndexer().getGlobalIndex( rowIdx, localIdx ) ];
       };
-      auto addGen = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value, bool& compute ) mutable {
+      auto addGen = [=] __cuda_callable__ ( const IndexType& rowIdx, const IndexType& localIdx, const IndexType& column, Real& value ) mutable {
          value = thisMult * value + matrixMult * matrix.getValues()[ matrix.getIndexer().getGlobalIndex( rowIdx, localIdx ) ];
       };
       if( thisMult == 0.0 )
diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index ceb7ae358..9a5fee6e9 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -791,7 +791,7 @@ void test_ForElements()
    const IndexType rows = 8;
 
    Matrix m( rows, cols  );
-   m.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, const IndexType& columnIdx, RealType& value, bool compute ) mutable {
+   m.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, const IndexType& columnIdx, RealType& value ) mutable {
       value = rowIdx + 1.0;
    } );
 
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index cca22d857..1716b0ab8 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -1050,7 +1050,7 @@ void test_ForElements()
    const IndexType rows = 8;
 
    Matrix m( { 3, 3, 3, 3, 3, 3, 3, 3, 3 }, cols  );
-   m.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIdx, RealType& value, bool compute ) mutable {
+   m.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIdx, RealType& value ) mutable {
       value = rowIdx + 1.0;
       columnIdx = localIdx;
    } );
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp
index c3f8a39db..d39593e23 100644
--- a/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp
@@ -365,7 +365,7 @@ void test_VectorProduct_largeMatrix()
       TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowCapacities( size );
       rowCapacities.forAllElements( [] __cuda_callable__ ( IndexType i, IndexType& value ) { value = 1; } );
       m1.setRowCapacities( rowCapacities );
-      auto f1 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
+      auto f1 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value ) {
          if( localIdx == 0  )
          {
             value = row + 1;
@@ -389,7 +389,7 @@ void test_VectorProduct_largeMatrix()
       rowCapacities.setSize( rows );
       rowCapacities.forAllElements( [=] __cuda_callable__ ( IndexType i, IndexType& value ) { value = i + 1; } );
       m2.setRowCapacities( rowCapacities );
-      auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
+      auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value ) {
          if( localIdx <= row )
          {
             value = localIdx + 1;
@@ -436,7 +436,7 @@ void test_VectorProduct_longRowsMatrix()
          TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowsCapacities( rows );
          rowsCapacities = columns;
          m3.setRowCapacities( rowsCapacities );
-         auto f = [] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
+         auto f = [] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value ) {
             column = localIdx;
             value = localIdx + row;
          };
-- 
GitLab


From 9eb2d3180e217147984e9f32a70f65fa4521e700 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 8 Apr 2021 21:08:24 +0200
Subject: [PATCH 022/117] Writting documentation on CSR segments.

---
 .../Algorithms/Segments/CMakeLists.txt        |   4 +
 .../SegmentsExample_CSR_forElements.cpp       |  49 +++++
 .../SegmentsExample_CSR_forElements.cu        |   1 +
 .../SegmentsExample_CSR_forSegments.cpp       |  52 +++++
 .../SegmentsExample_CSR_forSegments.cu        |   1 +
 .../SegmentsExample_CSR_reduceSegments.cpp    |  69 +++++++
 .../SegmentsExample_CSR_reduceSegments.cu     |   1 +
 ...mentsExample_CSR_sequentialForSegments.cpp |  45 ++++
 ...gmentsExample_CSR_sequentialForSegments.cu |   1 +
 src/TNL/Algorithms/Segments/CSR.h             | 195 ++++++++++++++++--
 src/TNL/Algorithms/Segments/SegmentElement.h  |  33 ++-
 src/TNL/Algorithms/Segments/SegmentView.h     |  89 ++++++++
 .../Algorithms/Segments/SegmentViewIterator.h |  23 +++
 13 files changed, 543 insertions(+), 20 deletions(-)
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cu
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cu
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cu
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cu

diff --git a/Documentation/Examples/Algorithms/Segments/CMakeLists.txt b/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
index dcc32305e..8df20f637 100644
--- a/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
@@ -6,6 +6,10 @@ set( COMMON_EXAMPLES
    SegmentsExample_CSR_getSegmentsType
    SegmentsExample_CSR_setSegmentsSizes
    SegmentsExample_CSR_getSegmentView
+   SegmentsExample_CSR_forElements
+   SegmentsExample_CSR_forSegments
+   SegmentsExample_CSR_sequentialForSegments
+   SegmentsExample_CSR_reduceSegments
 )
 
 if( BUILD_CUDA )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp
new file mode 100644
index 000000000..37267a889
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp
@@ -0,0 +1,49 @@
+#include <iostream>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   const int size( 5 );
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Allocate array for the segments.
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Insert data into particular segments.
+    */
+   auto data_view = data.getView();
+   segments.forElements( 0, size, [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
+      if( localIdx <= segmentIdx )
+         data_view[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Print the data managed by the segments.
+    */
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cu
new file mode 120000
index 000000000..59a419856
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_forElements.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp
new file mode 100644
index 000000000..3bf7cc50b
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp
@@ -0,0 +1,52 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+   using SegmentViewType = typename SegmentsType::SegmentViewType;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   const int size( 5 );
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Allocate array for the segments.
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Insert data into particular segments.
+    */
+   auto data_view = data.getView();
+   segments.forSegments( 0, size, [=] __cuda_callable__ ( const SegmentViewType& segment ) mutable {
+      for( auto element : segment )
+         if( element.localIndex() <= element.segmentIndex() )
+            data_view[ element.globalIndex() ] = element.segmentIndex() + element.localIndex();
+   } );
+
+   /***
+    * Print the data managed by the segments.
+    */
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cu
new file mode 120000
index 000000000..07825a022
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_forSegments.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cpp
new file mode 100644
index 000000000..f784177af
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cpp
@@ -0,0 +1,69 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   const int size( 5 );
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Allocate array for the segments.
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Insert data into particular segments.
+    */
+   auto data_view = data.getView();
+   segments.forElements( 0, size, [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
+      if( localIdx <= segmentIdx )
+         data_view[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Compute sums of elements in each segment.
+    */
+   TNL::Containers::Vector< double, Device > sums( size );
+   auto sums_view = sums.getView();
+   auto fetch_full = [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx, bool& compute ) -> double {
+      if( localIdx <= segmentIdx )
+         return data_view[ globalIdx ];
+      else
+      {
+         compute = false;
+         return 0.0;
+      }
+   };
+   auto fetch_brief = [=] __cuda_callable__ ( int globalIdx, bool& compute ) -> double {
+      return data_view[ globalIdx ];
+   };
+
+   auto keep = [=] __cuda_callable__ ( int globalIdx, const double& value  ) mutable {
+      sums_view[ globalIdx ] = value; };
+   segments.reduceAllSegments( fetch_full, std::plus<>{}, keep, 0.0 );
+   std::cout << "The sums with full fetch form are: " << sums << std::endl;
+   segments.reduceAllSegments( fetch_brief, std::plus<>{}, keep, 0.0 );
+   std::cout << "The sums with brief fetch form are: " << sums << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cu
new file mode 120000
index 000000000..c133b0c2d
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_reduceSegments.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cpp
new file mode 100644
index 000000000..76affa43b
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cpp
@@ -0,0 +1,45 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/SequentialFor.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+   using SegmentView = typename SegmentsType::SegmentViewType;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   const int size( 5 );
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Print the elements mapping using segment view.
+    */
+   std::cout << "Mapping of local indexes to global indexes:" << std::endl;
+
+   auto f = [=] __cuda_callable__ ( const SegmentView& segment ) {
+      printf( "Segment idx. %d: ", segment.getSegmentIndex() );                 // printf works even in GPU kernels
+      for( auto element : segment )
+         printf( "%d -> %d \t", element.localIndex(), element.globalIndex() );
+      printf( "\n" );
+   };
+   segments.sequentialForSegments( 0, size, f );
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cu
new file mode 120000
index 000000000..06e162fd7
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cu
@@ -0,0 +1 @@
+SegmentsExample_CSR_sequentialForSegments.cpp
\ No newline at end of file
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index af05a9f61..f3f1aa881 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -294,46 +294,196 @@ class CSR
       OffsetsContainer& getOffsets();
 
       /**
-       * \brief Go over all segments and for each segment element call
-       * function 'f'. The return type of 'f' is bool.
-       * When its true, the for-loop continues. Once 'f' returns false, the for-loop
-       * is terminated.
+       * \brief Iterate over all elements of given segments in parallel and call given lambda function.
+       *
+       * \tparam Function is a type of the lambda function to be performed on each element.
+       * \param begin defines beginning of an interval [ \e begin, \e end ) of segments on
+       *    elements of which we want to apply the lambda function.
+       * \param end defines end of an interval [ \e begin, \e end ) of segments on
+       *    elements of which we want to apply the lambda function.
+       * \param function is the lambda function to be applied on the elements of the segments.
+       *
+       * Declaration of the lambda function \e function is supposed to be
+       *
+       * ```
+       * auto f = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx ) {...}
+       * ```
+       * where \e segmentIdx is index of the segment to which the given element belongs, \e localIdx is rank of the element
+       * within the segment and \e globalIdx is index of the element within the related container.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsExample_CSR_forElements.cpp
+       * \par Output
+       * \include SegmentsExample_CSR_forElements.out
        */
       template< typename Function >
-      void forElements( IndexType begin, IndexType end, Function&& f ) const;
+      void forElements( IndexType begin, IndexType end, Function&& function ) const;
 
+      /**
+       * \brief Call \ref TNL::Algorithms::Segments::CSR::forElements for all elements of the segments.
+       *
+       * See \ref TNL::Algorithms::Segments::CSR::forElements for more details.
+       */
       template< typename Function >
-      void forAllElements( Function&& f ) const;
+      void forAllElements( Function&& function ) const;
 
+      /**
+       * \brief Iterate over all segments in parallel and call given lambda function.
+       *
+       * \tparam Function is a type of the lambda function to be performed on each segment.
+       * \param begin defines beginning of an interval [ \e begin, \e end ) of segments
+       *    on which we want to apply the lambda function.
+       * \param end defines end of an interval [ \e begin, \e end ) of segments
+       *    on which we want to apply the lambda function.
+       * \param function is the lambda function to be applied on the segments.
+       *
+       *  Declaration of the lambda function \e function is supposed to be
+       *
+       * ```
+       * auto f = [=] __cuda_callable__ ( const SegmentView& segment ) {...}
+       * ```
+       * where \e segment represents given segment (see \ref TNL::Algorithms::Segments::SegmentView).
+       * Its type is given by \ref SegmentViewType.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp
+       * \par Output
+       * \include SegmentsExample_CSR_forSegments.out
+       */
       template< typename Function >
-      void forSegments( IndexType begin, IndexType end, Function&& f ) const;
+      void forSegments( IndexType begin, IndexType end, Function&& function ) const;
 
+      /**
+       * \brief Call \ref TNL::Algorithms::Segments::CSR::forSegments for all segments.
+       *
+       * See \ref TNL::Algorithms::Segments::CSR::forSegments for more details.
+       */
       template< typename Function >
-      void forAllSegments( Function&& f ) const;
+      void forAllSegments( Function&& function ) const;
 
+      /**
+       * \brief Call \ref TNL::Algorithms::Segments::CSR::forSegments sequentially for particular segments.
+       *
+       * With this method, the given segments are processed sequentially one-by-one. This is useful, for example,
+       * for printing of segment-based data structures or for debugging.
+       *
+       * \param begin defines beginning of an interval [ \e begin, \e end ) of segments on
+       *    elements of which we want to apply the lambda function.
+       * \param end defines end of an interval [ \e begin, \e end ) of segments on
+       *    elements of which we want to apply the lambda function.
+       * \param function is the lambda function to be applied on the elements of the segments.
+       *
+       * See \ref TNL::Algorithms::Segments::CSR::forSegments for more details.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsExample_CSR_sequentialForSegments.cpp
+       * \par Output
+       * \include SegmentsExample_CSR_sequentialForSegments.out
+       */
       template< typename Function >
-      void sequentialForSegments( IndexType begin, IndexType end, Function&& f ) const;
+      void sequentialForSegments( IndexType begin, IndexType end, Function&& function ) const;
 
+      /**
+       * \brief Call \ref TNL::Algorithms::Segments::CSR::sequentialForSegments for all segments.
+       *
+       * See \ref TNL::Algorithms::Segments::CSR::sequentialForSegments for more details.
+       */
       template< typename Function >
       void sequentialForAllSegments( Function&& f ) const;
 
-
-      /***
-       * \brief Go over all segments and perform a reduction in each of them.
+      /**
+       * \brief Compute reduction in each segment.
+       *
+       * \tparam Fetch is type of lambda function for data fetching.
+       * \tparam Reduce is a reduction operation.
+       * \tparam Keep is lambda function for storing results from particular segments.
+       *
+       * \param begin defines beginning of an interval [ \e begin, \e end ) of segments in
+       *    which we want to perform the reduction.
+       * \param end defines end of an interval [ \e begin, \e end ) of segments in
+       *    which we want to perform the reduction.
+       * \param fetch is a lambda function for fetching of data. It is supposed to have one of the
+       *  following forms:
+       * 1. Full form
+       *  ```
+       *  auto fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) { ... }
+       *  ```
+       * 2. Brief form
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType globalIdx, bool& compute ) { ... }
+       * ```
+       * where for both variants \e segmentIdx is segment index, \e localIdx is a rank of element in the segment, \e globalIdx is index of the element
+       * in related container and \e compute is a boolean variable which serves for stopping the reduction if it is set to \e false. It is, however,
+       * only a hint and the real behaviour depends on the type of kernel used for the reduction.
+       * Some kernels are optimized so that they can be significantly faster with the brief variant of the \e fetch lambda function.
+       * \param reduce is a lambda function representing the reduction operation. It is supposed to be defined as:
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const Value& a, const Value& b ) -> Value { ... }
+       * ```
+       *
+       * where \e a and \e b are values to be reduced and the lambda function returns result of the reduction.
+       * \param keep is a lambda function for saving results from particular segments. It is supposed to be defined as:
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( IndexType segmentIdx, const Value& value ) { ... }
+       * ```
+       *
+       * where \e segmentIdx is an index of the segment and \e value is the result of the reduction in given segment to be stored.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsExample_CSR_reduceSegments.cpp
+       * \par Output
+       * \include SegmentsExample_CSR_reduceSegments.out
        */
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
-      void reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
+      template< typename Fetch, typename Reduce, typename Keep, typename Value >
+      void reduceSegments( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const Value& zero ) const;
 
-      template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
-      void reduceAllSegments( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const;
+      /**
+       * \brief Call \ref TNL::Algorithms::Segments::CSR::reduceSegments for all segments.
+       *
+       * See \ref TNL::Algorithms::Segments::CSR::reduceSegments for more details.
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename Value >
+      void reduceAllSegments( Fetch& fetch, const Reduce& reduce, Keep& keep, const Value& zero ) const;
 
-      CSR& operator=( const CSR& rhsSegments ) = default;
+      /**
+       * \brief Assignment operator.
+       *
+       * It makes a deep copy of the source segments.
+       *
+       * \param source are the CSR segments to be assigned.
+       * \return reference to this instance.
+       */
+      CSR& operator=( const CSR& source ) = default;
 
+      /**
+       * \brief Assignment operator with CSR segments with different template parameters.
+       *
+       * It makes a deep copy of the source segments.
+       *
+       * \tparam Device_ is device type of the source segments.
+       * \tparam Index_ is the index type of the source segments.
+       * \tparam Kernel_ is the kernel type of the source segments.
+       * \tparam IndexAllocator_ is the index allocator of the source segments.
+       * \param source is the source segments object.
+       * \return reference to this instance.
+       */
       template< typename Device_, typename Index_, typename Kernel_, typename IndexAllocator_ >
       CSR& operator=( const CSR< Device_, Index_, Kernel_, IndexAllocator_ >& source );
 
+      /**
+       * \brief Method for saving the segments to a file in a binary form.
+       *
+       * \param file is the target file.
+       */
       void save( File& file ) const;
 
+      /**
+       * \brief Method for loading the segments from a file in a binary form.
+       *
+       * \param file is the source file.
+       */
       void load( File& file );
 
    protected:
@@ -343,6 +493,17 @@ class CSR
       KernelType kernel;
 };
 
+/**
+ * \brief Insertion operator of CSR segments to output stream.
+ *
+ * \tparam Device is the device type of the source segments.
+ * \tparam Index is the index type of the source segments.
+ * \tparam Kernel is kernel type of the source segments.
+ * \tparam IndexAllocator is the index allocator of the source segments.
+ * \param str is the output stream.
+ * \param segments are the source segments.
+ * \return reference to the output stream.
+ */
 template< typename Device,
           typename Index,
           typename Kernel,
diff --git a/src/TNL/Algorithms/Segments/SegmentElement.h b/src/TNL/Algorithms/Segments/SegmentElement.h
index 68088ba22..71f78cdd3 100644
--- a/src/TNL/Algorithms/Segments/SegmentElement.h
+++ b/src/TNL/Algorithms/Segments/SegmentElement.h
@@ -18,26 +18,55 @@ namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
-
+/**
+ * \brief Simple structure representing one element of a segment.
+ *
+ * \tparam Index is type used for indexing of the elements.
+ */
 template< typename Index >
 class SegmentElement
 {
    public:
 
+      /**
+       * \brief Type used for indexing of the elements.
+       */
       using IndexType = Index;
 
+      /**
+       * \brief Constructor of the segment element with all parameters.
+       *
+       * \param segmentIdx is an index of the parent segment.
+       * \param localIdx is a rank of the element in the segment.
+       * \param globalIdx is an index of the element in the related container.
+       */
       __cuda_callable__
       SegmentElement( const IndexType& segmentIdx,
                       const IndexType& localIdx,
                       const IndexType globalIdx )
       : segmentIdx( segmentIdx ), localIdx( localIdx ), globalIdx( globalIdx ) {};
 
+      /**
+       * \brief Returns index of the parent segment.
+       *
+       * \return index of the parent segment.
+       */
       __cuda_callable__
       const IndexType& segmentIndex() const { return segmentIdx; };
 
+      /**
+       * \brief Returns rank of the element in the segment.
+       *
+       * \return rank of the element in the segment.
+       */
       __cuda_callable__
       const IndexType& localIndex() const { return localIdx; };
 
+      /**
+       * \brief Returns index of the element in the related container.
+       *
+       * \return index of the element in the related container.
+       */
       __cuda_callable__
       const IndexType& globalIndex() const { return globalIdx; };
 
@@ -48,8 +77,6 @@ class SegmentElement
       const IndexType& localIdx;
 
       const IndexType globalIdx;
-
-
 };
 
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/SegmentView.h b/src/TNL/Algorithms/Segments/SegmentView.h
index 399e3ddd1..aac6e0a94 100644
--- a/src/TNL/Algorithms/Segments/SegmentView.h
+++ b/src/TNL/Algorithms/Segments/SegmentView.h
@@ -17,19 +17,48 @@ namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
+/**
+ * \brief Data structure for accessing particular segment.
+ *
+ * \tparam Index is type for indexing elements in related segments.
+ *
+ * See the template specializations \ref TNL::Algorithms::Segments::SegmentView< Index, ColumnMajorOrder >
+ *  and \ref TNL::Algorithms::Segments::SegmentView< Index, RowMajorOrder > for column-major
+ * and row-major elements organization respectively. They have equivalent interface.
+ */
 template< typename Index,
           ElementsOrganization Organization >
 class SegmentView;
 
+
+/**
+ * \brief Data structure for accessing particular segment.
+ *
+ * \tparam Index is type for indexing elements in related segments.
+ */
 template< typename Index >
 class SegmentView< Index, ColumnMajorOrder >
 {
    public:
 
+      /**
+       * \brief Type for indexing elements in related segments.
+       */
       using IndexType = Index;
 
+      /**
+       * \brief Type of iterator for iterating over elements of the segment.
+       */
       using IteratorType = SegmentViewIterator< SegmentView >;
 
+      /**
+       * \brief Constructor with all parameters.
+       *
+       * \param segmentIdx is an index of segment the segment view will point to.
+       * \param offset is an offset of the segment in the parent segments.
+       * \param size is a size of the segment.
+       * \param step is stepping between neighbouring elements in the segment.
+       */
       __cuda_callable__
       SegmentView( const IndexType segmentIdx,
                    const IndexType offset,
@@ -37,16 +66,32 @@ class SegmentView< Index, ColumnMajorOrder >
                    const IndexType step )
       : segmentIdx( segmentIdx ), segmentOffset( offset ), segmentSize( size ), step( step ){};
 
+      /**
+       * \brief Copy constructor.
+       *
+       * \param view is the source view.
+       */
       __cuda_callable__
       SegmentView( const SegmentView& view )
       : segmentIdx( view.segmentIdx ), segmentOffset( view.segmentOffset ), segmentSize( view.segmentSize ), step( view.step ){};
 
+      /**
+       * \brief Get the size of the segment, i.e. number of elements in the segment.
+       *
+       * \return number of elements in the segment.
+       */
       __cuda_callable__
       const IndexType& getSize() const
       {
          return this->segmentSize;
       };
 
+      /**
+       * \brief Get global index of an element with rank \e localIndex in the segment.
+       *
+       * \param localIndex is the rank of the element in the segment.
+       * \return global index of the element.
+       */
       __cuda_callable__
       IndexType getGlobalIndex( const IndexType localIndex ) const
       {
@@ -54,6 +99,11 @@ class SegmentView< Index, ColumnMajorOrder >
          return segmentOffset + localIndex * step;
       };
 
+      /**
+       * \brief Get index of the segment.
+       *
+       * \return index of the segment.
+       */
       __cuda_callable__
       const IndexType& getSegmentIndex() const
       {
@@ -102,10 +152,24 @@ class SegmentView< Index, RowMajorOrder >
 {
    public:
 
+      /**
+       * \brief Type for indexing elements in related segments.
+       */
       using IndexType = Index;
 
+      /**
+       * \brief Type of iterator for iterating over elements of the segment.
+       */
       using IteratorType = SegmentViewIterator< SegmentView >;
 
+      /**
+       * \brief Constructor with all parameters.
+       *
+       * \param segmentIdx is an index of segment the segment view will point to.
+       * \param offset is an offset of the segment in the parent segments.
+       * \param size is a size of the segment.
+       * \param step is ignored; it is accepted only for compatibility with the column-major specialization.
+       */
       __cuda_callable__
       SegmentView( const IndexType segmentIdx,
                    const IndexType offset,
@@ -113,12 +177,32 @@ class SegmentView< Index, RowMajorOrder >
                    const IndexType step = 1 ) // For compatibility with previous specialization
       : segmentIdx( segmentIdx ), segmentOffset( offset ), segmentSize( size ){};
 
+      /**
+       * \brief Copy constructor.
+       *
+       * \param view is the source view.
+       */
+      __cuda_callable__
+      SegmentView( const SegmentView& view )
+      : segmentIdx( view.segmentIdx ), segmentOffset( view.segmentOffset ), segmentSize( view.segmentSize ) {};
+
+      /**
+       * \brief Get the size of the segment, i.e. number of elements in the segment.
+       *
+       * \return number of elements in the segment.
+       */
       __cuda_callable__
       const IndexType& getSize() const
       {
          return this->segmentSize;
       };
 
+      /**
+       * \brief Get global index of an element with rank \e localIndex in the segment.
+       *
+       * \param localIndex is the rank of the element in the segment.
+       * \return global index of the element.
+       */
       __cuda_callable__
       IndexType getGlobalIndex( const IndexType localIndex ) const
       {
@@ -126,6 +210,11 @@ class SegmentView< Index, RowMajorOrder >
          return segmentOffset + localIndex;
       };
 
+      /**
+       * \brief Get index of the segment.
+       *
+       * \return index of the segment.
+       */
       __cuda_callable__
       const IndexType& getSegmentIndex() const
       {
diff --git a/src/TNL/Algorithms/Segments/SegmentViewIterator.h b/src/TNL/Algorithms/Segments/SegmentViewIterator.h
index 335ce91aa..a0e788832 100644
--- a/src/TNL/Algorithms/Segments/SegmentViewIterator.h
+++ b/src/TNL/Algorithms/Segments/SegmentViewIterator.h
@@ -19,6 +19,13 @@ namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
+/**
+ * \brief Iterator for iterating over elements of a segment.
+ *
+ * The iterator can be used even in GPU kernels.
+ *
+ * \tparam SegmentView is a type of related segment view.
+ */
 template< typename SegmentView >
 class SegmentViewIterator
 {
@@ -61,12 +68,28 @@ class SegmentViewIterator
       __cuda_callable__
       bool operator!=( const SegmentViewIterator& other ) const;
 
+      /**
+       * \brief Operator for incrementing the iterator, i.e. moving to the next element.
+       *
+       * \return reference to this iterator.
+       */
       __cuda_callable__
       SegmentViewIterator& operator++();
 
+      /**
+       * \brief Operator for decrementing the iterator, i.e. moving to the previous element.
+       *
+       * \return reference to this iterator.
+       */
       __cuda_callable__
       SegmentViewIterator& operator--();
 
+      /**
+       * \brief Operator for dereferencing the iterator.
+       *
+       * It returns structure \ref SegmentElementType which represents one element of a segment.
+       * \return segment element the iterator points to.
+       */
       __cuda_callable__
       const SegmentElementType operator*() const;
 
-- 
GitLab


From 1597e56a6a1e888f89635bb1ae37153ac918683b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 9 Apr 2021 18:33:35 +0200
Subject: [PATCH 023/117] Fixing unit test on segments.

---
 src/UnitTests/Algorithms/Segments/SegmentsTest.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/UnitTests/Algorithms/Segments/SegmentsTest.hpp b/src/UnitTests/Algorithms/Segments/SegmentsTest.hpp
index 5b92a02c8..f6d5a9c8c 100644
--- a/src/UnitTests/Algorithms/Segments/SegmentsTest.hpp
+++ b/src/UnitTests/Algorithms/Segments/SegmentsTest.hpp
@@ -128,7 +128,7 @@ void test_reduceAllSegments_MaximumInSegments()
    TNL::Containers::Vector< IndexType, DeviceType, IndexType > v( segments.getStorageSize() );
 
    auto view = v.getView();
-   auto init = [=] __cuda_callable__ ( const IndexType segmentIdx, const IndexType localIdx, const IndexType globalIdx, bool& compute ) mutable -> bool {
+   auto init = [=] __cuda_callable__ ( const IndexType segmentIdx, const IndexType localIdx, const IndexType globalIdx ) mutable -> bool {
       view[ globalIdx ] =  segmentIdx * 5 + localIdx + 1;
       return true;
    };
-- 
GitLab


From 9ffb70a5f9ffe55f45ca72403ab6648b2e635453 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 10 Apr 2021 11:30:03 +0200
Subject: [PATCH 024/117] Fixing formating in segments printing.

---
 .../Algorithms/Segments/SegmentsPrinting.h    | 35 ++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Algorithms/Segments/SegmentsPrinting.h b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
index 30c027b74..5018471b3 100644
--- a/src/TNL/Algorithms/Segments/SegmentsPrinting.h
+++ b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
@@ -17,6 +17,19 @@ namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
+/**
+ * \brief Print segments sizes, i.e. the segments setup.
+ *
+ * \tparam Segments is type of segments.
+ * \param segments is an instance of segments.
+ * \param str is output stream.
+ * \return reference to the output stream.
+ *
+ * \par Example
+ * \include Algorithms/Segments/SegmentsPrintingExample-1.cpp
+ * \par Output
+ * \include SegmentsPrintingExample-1.out
+ */
 template< typename Segments >
 std::ostream& printSegments( const Segments& segments, std::ostream& str )
 {
@@ -32,11 +45,31 @@ std::ostream& printSegments( const Segments& segments, std::ostream& str )
       if( segmentIdx < segmentsCount )
          str << ",";
    }
-   str << " ] " << std::endl;
+   str << " ] ";
    return str;
 }
 
 
+/**
+ * \brief Print segments with related content.
+ *
+ * \tparam Segments is type of segments.
+ * \tparam Fetch is a lambda function for reading related data.
+ * \param segments is an instance of segments.
+ * \param fetch is an instance of a lambda function reading related data. It is supposed to be defined as
+ *
+ * ```
+ * auto fetch = [=] __cuda_callable__ ( IndexType globalIdx ) -> ValueType { return data_view[ globalIdx ]; };
+ * ```
+ *
+ * \param str is output stream.
+ * \return reference to the output stream.
+ *
+ * \par Example
+ * \include Algorithms/Segments/SegmentsPrintingExample-2.cpp
+ * \par Output
+ * \include SegmentsPrintingExample-2.out
+ */
 template< typename Segments,
           typename Fetch >
 std::ostream& printSegments( const Segments& segments, Fetch&& fetch, std::ostream& str )
-- 
GitLab


From 8c2f2c26fdb76be3f95e18277f5e1ea2d2f10c47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 10 Apr 2021 11:30:44 +0200
Subject: [PATCH 025/117] Fixing const views for segments.

---
 src/TNL/Algorithms/Segments/BiEllpack.h             | 2 +-
 src/TNL/Algorithms/Segments/BiEllpackView.h         | 2 +-
 src/TNL/Algorithms/Segments/ChunkedEllpack.h        | 5 +++--
 src/TNL/Algorithms/Segments/ChunkedEllpack.hpp      | 8 ++++----
 src/TNL/Algorithms/Segments/ChunkedEllpackView.h    | 4 ++--
 src/TNL/Algorithms/Segments/detail/BiEllpack.h      | 2 +-
 src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h | 4 ++--
 7 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/BiEllpack.h b/src/TNL/Algorithms/Segments/BiEllpack.h
index 48aa4e6be..ee202d25c 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/BiEllpack.h
@@ -190,7 +190,7 @@ template <typename Device,
           typename IndexAllocator,
           ElementsOrganization Organization,
           int WarpSize >
-std::ostream& operator<<( std::ostream& str, const BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >& segments ) { return printSegments( str, segments ); }
+std::ostream& operator<<( std::ostream& str, const BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >& segments ) { return printSegments( segments, str ); }
 
 
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h
index c37ed6d73..c0ae1559e 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.h
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.h
@@ -32,7 +32,7 @@ class BiEllpackView
 
       using DeviceType = Device;
       using IndexType = std::remove_const_t< Index >;
-      using OffsetsView = typename Containers::VectorView< IndexType, DeviceType, IndexType >;
+      using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
       using ConstOffsetsView = typename OffsetsView::ConstViewType;
       using ViewType = BiEllpackView;
       template< typename Device_, typename Index_ >
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
index d5d459a00..5b0916d20 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
@@ -14,6 +14,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpackView.h>
 #include <TNL/Algorithms/Segments/SegmentView.h>
+#include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -34,7 +35,7 @@ class ChunkedEllpack
       using ViewType = ChunkedEllpackView< Device, Index, Organization >;
       template< typename Device_, typename Index_ >
       using ViewTemplate = ChunkedEllpackView< Device_, Index_, Organization >;
-      using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< IndexType >, Organization >;
+      using ConstViewType = typename ViewType::ConstViewType;
       using SegmentViewType = typename ViewType::SegmentViewType;
       using ChunkedEllpackSliceInfoType = typename ViewType::ChunkedEllpackSliceInfoType; // detail::ChunkedEllpackSliceInfo< IndexType >;
       //TODO: using ChunkedEllpackSliceInfoAllocator = typename IndexAllocatorType::retype< ChunkedEllpackSliceInfoType >;
@@ -176,7 +177,7 @@ template <typename Device,
           typename Index,
           typename IndexAllocator,
           ElementsOrganization Organization >
-std::ostream& operator<<( std::ostream& str, const ChunkedEllpack< Device, Index, IndexAllocator, Organization >& segments ) { return printSegments( str, segments ); }
+std::ostream& operator<<( std::ostream& str, const ChunkedEllpack< Device, Index, IndexAllocator, Organization >& segments ) { return printSegments( segments, str ); }
 
 
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
index 8992d0951..82ddd7d8e 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
@@ -348,10 +348,10 @@ template< typename Device,
 auto ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
-   return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
-      rowToSliceMapping.getView(),
-      slices.getView(),
-      rowToChunkMapping.getView(),
+   return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
+      rowToSliceMapping.getConstView(),
+      slices.getConstView(),
+      rowToChunkMapping.getConstView(),
       segmentIdx );
 }
 
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
index 123bc1cb9..a20d5a41a 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
@@ -32,7 +32,7 @@ class ChunkedEllpackView
 
       using DeviceType = Device;
       using IndexType = std::remove_const_t< Index >;
-      using OffsetsView = typename Containers::VectorView< IndexType, DeviceType, IndexType >;
+      using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
       using ConstOffsetsView = typename OffsetsView::ConstViewType;
       using ViewType = ChunkedEllpackView;
       template< typename Device_, typename Index_ >
@@ -41,7 +41,7 @@ class ChunkedEllpackView
       using SegmentViewType = ChunkedEllpackSegmentView< IndexType, Organization >;
       using ChunkedEllpackSliceInfoType = detail::ChunkedEllpackSliceInfo< IndexType >;
       using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
-      using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
+      using ChunkedEllpackSliceInfoContainer = Containers::Array< typename TNL::copy_const< ChunkedEllpackSliceInfoType >::template from< Index >::type, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
       using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType;
 
       static constexpr bool havePadding() { return true; };
diff --git a/src/TNL/Algorithms/Segments/detail/BiEllpack.h b/src/TNL/Algorithms/Segments/detail/BiEllpack.h
index 43c42e43c..db64d392d 100644
--- a/src/TNL/Algorithms/Segments/detail/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/detail/BiEllpack.h
@@ -32,7 +32,7 @@ class BiEllpack
       using IndexType = Index;
       static constexpr bool getOrganization() { return Organization; }
       using OffsetsContainer = Containers::Vector< IndexType, DeviceType, IndexType >;
-      using OffsetsHolderView = typename OffsetsContainer::ViewType;
+      using OffsetsHolderView = typename OffsetsContainer::ConstViewType;
       using ConstOffsetsHolderView = typename OffsetsHolderView::ConstViewType;
       using SegmentsSizes = OffsetsContainer;
       using SegmentViewType = BiEllpackSegmentView< IndexType, Organization >;
diff --git a/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
index 3e279b02b..d9a6c30f2 100644
--- a/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
@@ -63,12 +63,12 @@ class ChunkedEllpack
       using IndexType = Index;
       static constexpr ElementsOrganization getOrganization() { return Organization; }
       using OffsetsContainer = Containers::Vector< IndexType, DeviceType, IndexType >;
-      using OffsetsHolderView = typename OffsetsContainer::ViewType;
+      using OffsetsHolderView = typename OffsetsContainer::ConstViewType;
       using SegmentsSizes = OffsetsContainer;
       using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >;
       using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
       using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
-      using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType;
+      using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ConstViewType;
       using SegmentViewType = ChunkedEllpackSegmentView< IndexType, Organization >;
 
       __cuda_callable__ static
-- 
GitLab


From e7636ac3932e94a8976ed2bade18928365855d9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 10 Apr 2021 11:31:40 +0200
Subject: [PATCH 026/117] Added link to paper by Guy Bleloch to tutorial on
 scan.

---
 .../Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
index 59ab08d3f..0c55abd2e 100644
--- a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
+++ b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
@@ -216,7 +216,7 @@ and exclusive prefix sum of the same sequence is
 [0,1,4,9,16,25,36]
 ```
 
-Both kinds of [scan](https://en.wikipedia.org/wiki/Prefix_sum) are usually applied only on summation, however product or logical operations could be handy as well. In TNL, scan is implemented in similar way as reduction and uses the same functors as the reduction operation. The following example shows how it works:
+Both kinds of [scan](https://en.wikipedia.org/wiki/Prefix_sum) have many different [applications](https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf). They are usually applied only to summation; however, product or logical operations could be handy as well. In TNL, prefix sum is implemented in a similar way as reduction, so it can be easily modified by lambda functions. The following example shows how it works:
 
 ```
 inplaceInclusiveScan( array, 0, array.getSize(), TNL::Plus{} );
-- 
GitLab


From 8c2f7758309223c84d75cf2b72fd4e6be6325104 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 10 Apr 2021 12:39:32 +0200
Subject: [PATCH 027/117] Added examples for segments printing.

---
 .../Segments/SegmentsPrintingExample-1.cpp    | 49 +++++++++++++
 .../Segments/SegmentsPrintingExample-1.cu     |  1 +
 .../Segments/SegmentsPrintingExample-2.cpp    | 68 +++++++++++++++++++
 .../Segments/SegmentsPrintingExample-2.cu     |  1 +
 4 files changed, 119 insertions(+)
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cu
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cu

diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cpp
new file mode 100644
index 000000000..f5fc25337
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cpp
@@ -0,0 +1,49 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Algorithms/Segments/ChunkedEllpack.h>
+#include <TNL/Algorithms/Segments/BiEllpack.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Segments >
+void SegmentsExample()
+{
+   /***
+    * Create segments with given segments sizes and print their setup.
+    */
+   Segments segments{ 1, 2, 3, 4, 5 };
+   std::cout << "Segments sizes are: " << segments << std::endl << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of Ellpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of ChunkedEllpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::ChunkedEllpack< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of BiEllpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::BiEllpack< TNL::Devices::Host, int > >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of Ellpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of ChunkedEllpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::ChunkedEllpack< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of BiEllpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::BiEllpack< TNL::Devices::Cuda, int > >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cu b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cu
new file mode 120000
index 000000000..42cd3852f
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cu
@@ -0,0 +1 @@
+SegmentsPrintingExample-1.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp
new file mode 100644
index 000000000..ceaff0ecd
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp
@@ -0,0 +1,68 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Algorithms/Segments/ChunkedEllpack.h>
+#include <TNL/Algorithms/Segments/BiEllpack.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Segments >
+void SegmentsExample()
+{
+   using Device = typename Segments::DeviceType;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   TNL::Containers::Vector< int, Device > sizes{ 1, 2, 3, 4, 5 };
+   Segments segments( sizes );
+   std::cout << "Segments sizes are: " << segments << std::endl;
+
+   /***
+    * Allocate array for the segments;
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+   data.forAllElements( [=] __cuda_callable__ ( int idx, double& value ) {
+      value = idx + 1.0;
+   } );
+
+   /***
+    * Print the data managed by the segments.
+    */
+   auto data_view = data.getView();
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+   std::cout << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of Ellpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of ChunkedEllpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::ChunkedEllpack< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of BiEllpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::BiEllpack< TNL::Devices::Host, int > >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of Ellpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of ChunkedEllpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::ChunkedEllpack< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of BiEllpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::BiEllpack< TNL::Devices::Cuda, int > >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cu b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cu
new file mode 120000
index 000000000..2f3149802
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cu
@@ -0,0 +1 @@
+SegmentsPrintingExample-2.cpp
\ No newline at end of file
-- 
GitLab


From 1ac8fe84f957b1dac10771e74f4a14dbb2359971 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 17 Apr 2021 16:58:02 +0200
Subject: [PATCH 028/117] Writting tutorial for segments.

---
 .../Algorithms/Segments/CMakeLists.txt        |   6 +
 .../Segments/SegmentsExample_forElements.cpp  |  73 ++++++
 .../Segments/SegmentsExample_forElements.cu   |   1 +
 .../SegmentsExample_forSegments-1.cpp         |  62 +++++
 .../Segments/SegmentsExample_forSegments-1.cu |   1 +
 .../SegmentsExample_forSegments-2.cpp         |  71 ++++++
 .../Segments/SegmentsExample_forSegments-2.cu |   1 +
 .../SegmentsExample_reduceSegments.cpp        |  83 +++++++
 .../SegmentsExample_reduceSegments.cu         |   1 +
 .../Segments/SegmentsPrintingExample-1.cpp    |   1 -
 .../Segments/SegmentsPrintingExample-2.cpp    |   3 +-
 .../Tutorials/Segments/CMakeLists.txt         |  35 +++
 .../Tutorials/Segments/tutorial_Segments.md   | 212 ++++++++++++++++++
 Documentation/Tutorials/index.md              |   3 +-
 src/TNL/Algorithms/Segments/_NamespaceDoxy.h  |   4 +-
 15 files changed, 551 insertions(+), 6 deletions(-)
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cu
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cu
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cu
 create mode 100644 Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp
 create mode 120000 Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cu
 create mode 100644 Documentation/Tutorials/Segments/CMakeLists.txt
 create mode 100644 Documentation/Tutorials/Segments/tutorial_Segments.md

diff --git a/Documentation/Examples/Algorithms/Segments/CMakeLists.txt b/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
index 8df20f637..ccf157446 100644
--- a/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/Segments/CMakeLists.txt
@@ -1,5 +1,7 @@
 set( COMMON_EXAMPLES
    SegmentsExample_General
+   SegmentsPrintingExample-1
+   SegmentsPrintingExample-2
    SegmentsExample_CSR_constructor_1
    SegmentsExample_CSR_constructor_2
    SegmentsExample_CSR_getSerializationType
@@ -10,6 +12,10 @@ set( COMMON_EXAMPLES
    SegmentsExample_CSR_forSegments
    SegmentsExample_CSR_sequentialForSegments
    SegmentsExample_CSR_reduceSegments
+   SegmentsExample_forElements
+   SegmentsExample_forSegments-1
+   SegmentsExample_forSegments-2
+   SegmentsExample_reduceSegments
 )
 
 if( BUILD_CUDA )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp
new file mode 100644
index 000000000..8b3450167
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp
@@ -0,0 +1,73 @@
+#include <iostream>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Segments >
+void SegmentsExample()
+{
+   using Device = typename Segments::DeviceType;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   Segments segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Allocate array for the segments;
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Insert data into particular segments with no check.
+    */
+   auto data_view = data.getView();
+   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
+      data_view[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Print the data managed by the segments.
+    */
+   std::cout << "Data setup with no check ... " << std::endl;
+   std::cout << "Array: " << data << std::endl;
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout ) << std::endl;
+
+   /***
+    * Insert data into particular segments.
+    */
+   data = 0.0;
+   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
+      if( localIdx <= segmentIdx )
+         data_view[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Print the data managed by the segments.
+    */
+   std::cout << "Data setup with check for padding elements..." << std::endl;
+   std::cout << "Array: " << data << std::endl;
+   printSegments( segments, fetch, std::cout ) << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of Ellpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Host, int > >();
+
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of Ellpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Cuda, int > >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cu
new file mode 120000
index 000000000..5f881d30f
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cu
@@ -0,0 +1 @@
+SegmentsExample_forElements.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp
new file mode 100644
index 000000000..d8be1f04c
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp
@@ -0,0 +1,62 @@
+#include <iostream>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Segments >
+void SegmentsExample()
+{
+   using Device = typename Segments::DeviceType;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   Segments segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Allocate array for the segments;
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Insert data into particular segments.
+    */
+   auto data_view = data.getView();
+   using SegmentViewType = typename Segments::SegmentViewType;
+   segments.forAllSegments( [=] __cuda_callable__ ( const SegmentViewType& segment ) mutable {
+      double sum( 0.0 );
+      for( auto element : segment )
+         if( element.localIndex() <= element.segmentIndex() )
+         {
+             sum += element.localIndex() + 1;
+             data_view[ element.globalIndex() ] = sum;
+         }
+   } );
+
+   /***
+    * Print the data managed by the segments.
+    */
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of Ellpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Host, int > >();
+
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of Ellpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Cuda, int > >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cu
new file mode 120000
index 000000000..3df81cf9d
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cu
@@ -0,0 +1 @@
+SegmentsExample_forSegments-1.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp
new file mode 100644
index 000000000..a5d7d0caa
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp
@@ -0,0 +1,71 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void SegmentsExample()
+{
+   using SegmentsType = typename TNL::Algorithms::Segments::CSR< Device, int >;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   SegmentsType segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Allocate array for the segments;
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Insert data into particular segments.
+    */
+   auto data_view = data.getView();
+   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
+      data_view[ globalIdx ] = localIdx + 1;
+   } );
+
+   /***
+    * Print the data by the segments.
+    */
+   std::cout << "Values of elements after intial setup: " << std::endl;
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+
+   /***
+    * Divide elements in each segment by a sum of all elements in the segment
+    */
+   using SegmentViewType = typename SegmentsType::SegmentViewType;
+   segments.forAllSegments( [=] __cuda_callable__ ( const SegmentViewType& segment ) mutable {
+      // Compute the sum first ...
+      double sum = 0.0;
+      for( auto element : segment )
+         if( element.localIndex() <= element.segmentIndex() )
+            sum += data_view[ element.globalIndex() ];
+      // ... divide all elements.
+      for( auto element : segment )
+         if( element.localIndex() <= element.segmentIndex() )
+            data_view[ element.globalIndex() ] /= sum;
+   } );
+
+   /***
+    * Print the data managed by the segments.
+    */
+   std::cout << "Value of elements after dividing by sum in each segment:" << std::endl;
+   printSegments( segments, fetch, std::cout );
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Devices::Cuda >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cu
new file mode 120000
index 000000000..6dde7c891
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cu
@@ -0,0 +1 @@
+SegmentsExample_forSegments-2.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp
new file mode 100644
index 000000000..c9a7476c7
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp
@@ -0,0 +1,83 @@
+#include <iostream>
+#include <functional>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Segments >
+void SegmentsExample()
+{
+   using Device = typename Segments::DeviceType;
+
+   /***
+    * Create segments with given segments sizes.
+    */
+   const int size = 5;
+   Segments segments{ 1, 2, 3, 4, 5 };
+
+   /***
+    * Allocate array for the segments;
+    */
+   TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
+
+   /***
+    * Insert data into particular segments.
+    */
+   auto data_view = data.getView();
+   segments.forAllElements( [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx ) mutable {
+      if( localIdx <= segmentIdx )
+         data_view[ globalIdx ] = segmentIdx;
+   } );
+
+   /***
+    * Print the data by the segments.
+    */
+   std::cout << "Values of elements after intial setup: " << std::endl;
+   auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
+
+   /***
+    * Compute sums of elements in each segment.
+    */
+   TNL::Containers::Vector< double, Device > sums( size );
+   auto sums_view = sums.getView();
+   auto fetch_full = [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx, bool& compute ) -> double {
+      if( localIdx <= segmentIdx )
+         return data_view[ globalIdx ];
+      else
+      {
+         compute = false;
+         return 0.0;
+      }
+   };
+   auto fetch_brief = [=] __cuda_callable__ ( int globalIdx, bool& compute ) -> double {
+      return data_view[ globalIdx ];
+   };
+
+   auto keep = [=] __cuda_callable__ ( int segmentIdx, const double& value  ) mutable {
+      sums_view[ segmentIdx ] = value; };
+   segments.reduceAllSegments( fetch_full, std::plus<>{}, keep, 0.0 );
+   std::cout << "The sums with full fetch form are: " << sums << std::endl;
+   segments.reduceAllSegments( fetch_brief, std::plus<>{}, keep, 0.0 );
+   std::cout << "The sums with brief fetch form are: " << sums << std::endl << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Example of CSR segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int > >();
+
+   std::cout << "Example of Ellpack segments on host: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Host, int > >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Cuda, int > >();
+
+   std::cout << "Example of Ellpack segments on CUDA GPU: " << std::endl;
+   SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Cuda, int > >();
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cu b/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cu
new file mode 120000
index 000000000..ce5db1005
--- /dev/null
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cu
@@ -0,0 +1 @@
+SegmentsExample_reduceSegments.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cpp
index f5fc25337..62b30d7ef 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-1.cpp
@@ -1,5 +1,4 @@
 #include <iostream>
-#include <functional>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/CSR.h>
 #include <TNL/Algorithms/Segments/Ellpack.h>
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp
index ceaff0ecd..8d98455a5 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp
@@ -1,5 +1,4 @@
 #include <iostream>
-#include <functional>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/CSR.h>
 #include <TNL/Algorithms/Segments/Ellpack.h>
@@ -25,7 +24,7 @@ void SegmentsExample()
     */
    TNL::Containers::Array< double, Device > data( segments.getStorageSize(), 0.0 );
    data.forAllElements( [=] __cuda_callable__ ( int idx, double& value ) {
-      value = idx + 1.0;
+      value = idx;
    } );
 
    /***
diff --git a/Documentation/Tutorials/Segments/CMakeLists.txt b/Documentation/Tutorials/Segments/CMakeLists.txt
new file mode 100644
index 000000000..049908708
--- /dev/null
+++ b/Documentation/Tutorials/Segments/CMakeLists.txt
@@ -0,0 +1,35 @@
+set( COMMON_EXAMPLES
+
+)
+
+
+if( BUILD_CUDA )
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
+   endforeach()
+   foreach( target IN ITEMS ${LONG_EXAMPLES} )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      #add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      #set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
+   endforeach()
+else()
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      add_executable( ${target} ${target}.cpp )
+      add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )
+   endforeach()
+   foreach( target IN ITEMS ${LONG_EXAMPLES} )
+      add_executable( ${target} ${target}.cpp )
+      #add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      #set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )
+   endforeach()
+endif()
+
+
+IF( BUILD_CUDA )
+   ADD_CUSTOM_TARGET( RunTutorialsSegmentsExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
+ELSE()
+   ADD_CUSTOM_TARGET( RunTutorialsSegmentsExamples ALL DEPENDS ${HOST_OUTPUTS} )
+ENDIF()
\ No newline at end of file
diff --git a/Documentation/Tutorials/Segments/tutorial_Segments.md b/Documentation/Tutorials/Segments/tutorial_Segments.md
new file mode 100644
index 000000000..619f57682
--- /dev/null
+++ b/Documentation/Tutorials/Segments/tutorial_Segments.md
@@ -0,0 +1,212 @@
+\page tutorial_Segments  Segments tutorial
+
+[TOC]
+
+
+## Introduction
+
+*Segments* represent a data structure for manipulation with several local arrays (denoted also as segments) having different size in general. All the local arrays are supposed to be allocated in one contiguous global array. The data structure segments offers mapping between indexes of particular local arrays and indexes of the global array. Segments do not store any data, segments just represent a layer for efficient access and operations with group of segments of linear containers (i.e. local arrays) with different size in general. One can perform parallel operations like *for* or *flexible reduction* on particular segments (local arrays).
+
+A typical example of *segments* are different formats for sparse matrices. Sparse matrix like the following
+ \f[
+  \left(
+  \begin{array}{ccccc}
+   1  &  0  &  2  &  0  &  0 \\
+    0  &  0  &  5  &  0  &  0 \\
+    3  &  4  &  7  &  9  &  0 \\
+    0  &  0  &  0  &  0  & 12 \\
+   0  &  0  & 15  & 17  & 20
+  \end{array}
+  \right)
+ \f]
+ is usually first compressed which means that the zero elements are omitted to get the following "matrix":
+
+ \f[
+ \begin{array}{ccccc}
+    1  &   2  \\
+    5   \\
+    3  &   4  &  7 &  9   \\
+    12 \\
+    15 & 17  & 20
+ \end{array}
+ \f]
+ We have to store the column index of each matrix element as well in a "matrix" like this:
+ \f[
+ \begin{array}{ccccc}
+    0  &   2  \\
+    2   \\
+    0  &   1  &  2 &  3   \\
+    4 \\
+    2 & 3  & 4
+ \end{array}
+ \f]
+
+ Such "matrices" can be stored in memory in a row-wise manner in one contiguous array because of the performance reasons. The first "matrix" (i.e. values of the matrix elements)  would be stored as follows
+
+ \f[
+    \begin{array}{|cc|c|cccc|c|cc|} 1 & 2 &  5 & 3 & 4 & 7 & 9 & 12 & 15 & 17 & 20 \end{array}
+ \f]
+
+and the second one (i.e. column indexes of the matrix values) as follows
+
+\f[
+    \begin{array}{|cc|c|cccc|c|cc|} 0 & 2 & 2 & 0 & 1 & 2 & 3 & 4 & 2 & 3 & 4 \end{array}
+ \f]
+
+What we see above is so called [CSR sparse matrix format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)). It is the most popular format for storage of sparse matrices designed for high performance. However, it may not be the most efficient format for storage of sparse matrices on GPUs. Therefore many other formats have been developed to get better performance. These formats often have different layout of the matrix elements in the memory. They have to deal especially with two difficulties:
+
+1. Efficient storage of matrix elements in the memory to fulfill the requirements of coalesced memory accesses on GPUs or good spatial locality for efficient use of caches on CPUs.
+2. Efficient mapping of GPU threads to different matrix rows.
+
+TNL offers the following sparse matrix formats in a form of segments (Ellpack formats often use so called *padding elements* like padding zeros in terms of sparse matrices):
+
+1. [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) (\ref TNL::Algorithms::Segments::CSR) is the most popular format for sparse matrices. It is simple and very efficient especially on CPUs and today there are efficient kernels even for GPUs. The following GPU kernels are implemented in TNL:
+   1. [Scalar](http://mgarland.org/files/papers/nvr-2008-004.pdf) which maps one GPU thread for each segment (matrix row).
+   2. [Vector](http://mgarland.org/files/papers/nvr-2008-004.pdf) which maps one warp of GPU threads for each segment (matrix row).
+   3. [Adaptive](https://ieeexplore.ieee.org/document/7397620) ...
+2. [Ellpack format](http://mgarland.org/files/papers/nvr-2008-004.pdf) (\ref TNL::Algorithms::Segments::Ellpack) uses padding elements to have the same number of elements in each segment. It can be highly inefficient in cases when one works with few very long segments.
+3. [SlicedEllpack format](https://link.springer.com/chapter/10.1007/978-3-642-11515-8_10) (\ref TNL::Algorithms::Segments::SlicedEllpack) which was also presented as [Row-grouped CSR format](https://arxiv.org/abs/1012.2270) is similar to common Ellpack. However, SlicedEllpack first merges the segments into groups of 32. It also uses padding elements but only segments within the same group are aligned to have the same size. Therefore there is not such a high performance drop because of few long segments.
+4. [ChunkedEllpack format](http://geraldine.fjfi.cvut.cz/~oberhuber/data/vyzkum/publikace/12-heller-oberhuber-improved-rgcsr-format.pdf) (\ref TNL::Algorithms::Segments::ChunkedEllpack) is similar to SlicedEllpack but it splits segments into chunks which allows to map more GPU threads to one segment.
+5. [BiEllpack format](https://www.sciencedirect.com/science/article/pii/S0743731514000458?casa_token=2phrEj0Ef1gAAAAA:Lgf6rMBUN6T7TJne6mAgI_CSUJ-jR8jz7Eghdv6L0SJeGm4jfso-x6Wh8zgERk3Si7nFtTAJngg) (\ref TNL::Algorithms::Segments::BiEllpack) is similar to ChunkedEllpack. In addition it sorts segments within the same slice w.r.t. their length to achieve higher performance and better memory accesses.
+
+Especially in case of GPUs, the performance of each format strongly depends on distribution of the segment sizes. Therefore we cannot say that one of the previous formats would outperform the others in general. To get the best performance, one should try more of the formats and choose the best one. It is the reason why TNL offers more of them and additional formats will be added in the future.
+
+Necessity of working with this kind of data structures is not limited only to sparse matrices. We could name at least few other applications for segments:
+
+1. [Graphs](https://en.wikipedia.org/wiki/Graph_(discrete_mathematics)) - one segment represents one graph node, the elements in one segments are indexes of its neighbors.
+2. [Unstructured numerical meshes](https://en.wikipedia.org/wiki/Types_of_mesh) - unstructured numerical mesh is a graph in fact.
+3. [Particle in cell method](https://en.wikipedia.org/wiki/Particle-in-cell) - one segment represents one cell, the elements in one segment are indexes of the particles.
+4. [K-means clustering](https://en.wikipedia.org/wiki/K-means_clustering) - segments represent one cluster, the elements represent vectors belonging to given cluster.
+5. [Hashing](https://arxiv.org/abs/1907.02900) - segments are particular rows of the hash table, elements in segments corresponds with colliding hashed elements.
+
+In general, segments can be used for problems that somehow correspond to a 2D data structure where each row can have different size and we need to perform miscellaneous operations within the rows. The name *segments* comes from segmented parallel reduction or [segmented scan (prefix-sum)](https://en.wikipedia.org/wiki/Segmented_scan).
+
+## Segments setup
+
+Segments are defined just by sizes of particular segments. The following example shows how to create them:
+
+\includelineno Algorithms/Segments/SegmentsPrintingExample-1.cpp
+
+We use constructor with initializer list (line 16) where each element of the list defines size of one segment. Next we print sizes of particular segments (line 17). We call this function for different segments types (excluding \ref TNL::Algorithms::Segments::SlicedEllpack since it would behave the same way as \ref TNL::Algorithms::Segments::Ellpack on this small example). The result looks as follows:
+
+\include SegmentsPrintingExample-1.out
+
+We can see, that real sizes of the segments are different for all Ellpack-based formats. As we said already, these formats often use padding elements to get more efficient access to the memory. For example \ref TNL::Algorithms::Segments::ChunkedEllpack format involves multiple of elements. It is, however, only because of very small example we present now, on large examples the overhead is not so significant.
+
+We remind that segments represent rather a sparse format than a data structure because they do not store any data. The following example shows how to connect segments with an array:
+
+\includelineno Algorithms/Segments/SegmentsPrintingExample-2.cpp
+
+On the line 19, we show how to create segments with vector (\ref TNL::Containers::Vector) carrying the segments sizes. Of course, the same constructor works even for arrays and views (i.e. \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView and \ref TNL::Containers::VectorView). Next we print the real segment sizes depending on the format in the background (line 20) the same way as we did in the previous example. On the line 25, we allocate array having the size requested by the `segments` by means of method `getStorageSize` (\ref TNL::Algorithms::Segments::CSR::getStorageSize for example). This method says how many elements the segments need to be able to address all elements by their global index. On the lines 26-28, we mark each element of the array by its rank in the array. On the line 35, we use function \ref TNL::Algorithms::Segments::printSegments which accepts lambda function `fetch` as one of its parameters. The lambda function reads data from our array `data` (with the help of array view `data_view`) according to given global index `globalIdx` (line 34). The result looks as follows:
+
+\include SegmentsPrintingExample-2.out
+
+Frankly, what we see is not so important. It only shows that different segments formats can use very different mapping of elements identified by its *segment index* and *local index* (rank of the element in given segment) to a *global index* which serves as an address in the related container.
+
+## Iteration over elements of segments
+
+In this section, we show how to iterate over the elements of segments and how to manipulate with them. There are three possible ways:
+
+1. Method `forElements` (\ref TNL::Algorithms::Segments::CSR::forElements for example), which iterates in parallel over all elements of segments and performs given lambda function on each of them.
+2. Method `forSegments` (\ref TNL::Algorithms::Segments::CSR::forSegments for example), which iterates in parallel over all segments. It is a better choice when we need to process each segment sequentially or when we have significant amount of computations common for all elements in each segment.
+3. Method `sequentialForSegments` (\ref TNL::Algorithms::Segments::CSR::sequentialForSegments for example), which iterates over all segments sequentially i.e. using only one thread even on GPUs. It is useful for debugging or for printing for example.
+
+Methods iterating over particular segments use a segment view (\ref TNL::Algorithms::Segments::SegmentView) to access the elements of given segment. The segment view offers iterator for better convenience.
+
+### Method forElements
+
+The following example shows use of the method `forElements`:
+
+\includelineno Algorithms/Segments/SegmentsExample_forElements.cpp
+
+On the line 16, we first create segments with linearly increasing size (so it is like lower triangular matrix). Next, we allocate array `data` (line 21) having the same size as the number of elements managed by the segments. It can be obtained by the method `getStorageSize` (\ref TNL::Algorithms::Segments::CSR::getStorageSize for example). We prepare array view `data_view` for the purpose of use in lambda functions (line 26). Finally, we call the method `forAllElements` (lines 27-29) which iterates in parallel over all elements in the segments and for each element it calls given lambda function. The lambda function receives three arguments - `segmentIdx` is an index of the segment the element belongs to, `localIdx` is the rank of the element within the segment and `globalIdx` is an index of the element in the array `data`. We use the global index to set proper element of the array `data` to the index of the segment. On the line 35, we print the array `data`. We can see elements belonging to particular segments by their indexes. The layout of the elements depends on the type of segments (which means sparse format in use). Next we print the elements of array `data` by segments (lines 36 and 37). The function `printSegments` iterates over all elements and it reads the elements of the array `data` with the help of the lambda function defined on the line 36.
+
+Note, that for the Ellpack format, the output looks as follows:
+
+```
+Seg. 0: [ 0, 0, 0, 0, 0 ]
+Seg. 1: [ 1, 1, 1, 1, 1 ]
+Seg. 2: [ 2, 2, 2, 2, 2 ]
+Seg. 3: [ 3, 3, 3, 3, 3 ]
+Seg. 4: [ 4, 4, 4, 4, 4 ]
+```
+
+We see more elements than we have requested. The reason is that the Ellpack format uses padding elements for optimizing access to the memory. Segments give access even to the padding elements, they can be used in case when we get to situation of need of additional elements. Therefore we need to check for relevant and padding elements each time we work with elements of segments. It is demonstrated on the lines 43-46 where we set the array `data` again but we check for the padding elements (line 44). After printing the segments the same way as before (line 53) we get correct result:
+
+```
+Seg. 0: [ 0, 0, 0, 0, 0 ]
+Seg. 1: [ 1, 1, 0, 0, 0 ]
+Seg. 2: [ 2, 2, 2, 0, 0 ]
+Seg. 3: [ 3, 3, 3, 3, 0 ]
+Seg. 4: [ 4, 4, 4, 4, 4 ]
+```
+
+The result of the whole example looks as follows:
+
+\include SegmentsExample_forElements.out
+
+### Method forSegments
+
+Method `forSegments` iterates in parallel over particular segments. Iteration over elements within the segment is sequential. There are two reasons for such proceeding:
+
+1. The iteration over the elements within the same segments must be sequential, i.e. the computation with one element depends on a result of the computation with the previous one.
+2. Some part of computations on all elements in one segment is common. In this case, we can first perform the common part and then iterate over the elements. If we would use the method `forElements`, the common part would have to be performed for each element.
+
+#### Sequential dependency
+
+The first situation is demonstrated in the following example:
+
+\includelineno Algorithms/Segments/SegmentsExample_forSegments-1.cpp
+
+The result looks as follows:
+
+The code is the same as in the previous example up to line 26. Instead of calling the method `forElements` we call the method `forSegments` (line 28) for which we need to define type  `SegmentViewType` (\ref TNL::Algorithms::Segments::CSR::SegmentViewType for example). The lambda function on the line 28 gets the segment view and it iterates over all elements of the segment by means of a for loop. We use auxiliary variable `sum` to compute cumulative sum of elements in each segment which is just the sequential dependency. The result looks as follows:
+
+\include SegmentsExample_forSegments-1.out
+
+#### Common computations
+
+Now let's take a look at the second situation, i.e. there are common computations for all elements of one segment. In the following example, we first set values of each element using the method `forElements` which we are already familiar with (lines 26-29). Next we print values of all elements (lines 34-36) and then we use the method `forAllSegments` (lines 41-52) to divide each element by a sum of values of all elements in a segment. So we first sum up all elements in the segment (lines 43-47). This is the common part of the computation for all elements in the segment. Next we perform the division of all elements by the value of the variable `sum` (lines 48-51).
+
+\includelineno Algorithms/Segments/SegmentsExample_forSegments-2.cpp
+
+The result looks as follows:
+
+\include SegmentsExample_forSegments-2.out
+
+## Flexible reduction within segments
+
+In this section we will explain extension of [flexible reduction]() to segments. It allows to reduce all elements within the same segment and store the result into an array. See the following example:
+
+\includelineno Algorithms/Segments/SegmentsExample_reduceSegments.cpp
+
+We first create the segments `segments` (line 18), related array `data` (line 23) and setup the elements (lines 28-32). After printing the segments (lines 37-39) we are ready for the parallel reduction. It requires three lambda functions:
+
+1. `fetch` which reads data belonging to particular elements of the segments. The fetch function can have two different forms - *brief* and *full*:
+   * *Brief form* - in this case the lambda function gets only global index and the `compute` flag:
+```
+      auto fetch = [=] __cuda_callable__ ( int globalIdx, bool& compute ) -> double { ... };
+```
+   * *Full form* - in this case the lambda function receives even the segment index and element index:
+```
+      auto fetch = [=] __cuda_callable__ ( int segmentIdx, int localIdx, int globalIdx, bool& compute ) -> double { ... };
+```
+   where `segmentIdx` is the index of the segment, `localIdx` is the rank of the element within the segment, `globalIdx` is index of the element in the related array and `compute` serves for the reduction interruption which means that the remaining elements in the segment can be omitted. Many formats used for segments are optimized for much higher performance if the brief variant is used. The form of the `fetch` lambda function is detected automatically using [SFINAE](https://en.cppreference.com/w/cpp/language/sfinae) and so the use of both is very easy for the user.
+2. `reduce` is a function representing the reduction operation, in our case it is defined as follows:
+```
+auto reduce = [=] __cuda_callable__ ( const double& a, const double& b ) -> double { return a + b; }
+```
+   or, in fact, we can use the function `std::plus`.
+3. `keep` is a lambda function responsible for storage of the results. It is supposed to be defined as:
+```
+auto keep = [=] __cuda_callable__ ( int segmentIdx, const double& value ) mutable { ... };
+```
+where `segmentIdx` is an index of the segment of which the reduction result we aim to store and `value` is the result of the reduction in the segment.
+
+We first create vector `sums` where we will store the results (line 44) and prepare a view to this vector for later use in the lambda functions. We demonstrate use of both variants - full by `fetch_full` (lines 46-54) and brief by `fetch_brief` (lines 55-57). The lambda function `keep` for storing the sums from particular segments into the vector `sums` is on the lines 59-60. Finally, we call the method `reduceAllSegments` (\ref TNL::Algorithms::Segments::CSR::reduceSegments for example) to compute the reductions in the segments - first with  `fetch_full` (line 61) and then with `fetch_brief` (line 63). In both cases, we use `std::plus` for the reduction and we pass zero (the last argument) as an identity (neutral) element for summation. In both cases we print the results which are supposed to be the same. The result looks as follows:
+
+\include SegmentsExample_reduceSegments.out
+
+
+
+
diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md
index 739d609ac..031de3fae 100644
--- a/Documentation/Tutorials/index.md
+++ b/Documentation/Tutorials/index.md
@@ -10,4 +10,5 @@
 6. [Sorting](tutorial_Sorting.html)
 7. [Cross-device pointers](tutorial_Pointers.html)
 8. [Matrices](tutorial_Matrices.html)
-9. [Unstructured meshes](tutorial_Meshes.html)
+9. [Segments aka sparse formats](tutorial_Segments.html)
+10. [Unstructured meshes](tutorial_Meshes.html)
diff --git a/src/TNL/Algorithms/Segments/_NamespaceDoxy.h b/src/TNL/Algorithms/Segments/_NamespaceDoxy.h
index 064e6ab45..6a9bab68a 100644
--- a/src/TNL/Algorithms/Segments/_NamespaceDoxy.h
+++ b/src/TNL/Algorithms/Segments/_NamespaceDoxy.h
@@ -14,8 +14,8 @@ namespace TNL {
    namespace Algorithms {
 /**
  * \brief Namespace holding segments data structures.
-
- *Segments* represent data structure for manipulation with several local arrays (denoted also as segments)
+ *
+ * *Segments* represent data structure for manipulation with several local arrays (denoted also as segments)
  having different size in general. All the local arrays are supposed to be allocated in one continuos global array.
  The data structure segments offers mapping between indexes of particular local arrays and indexes
  of the global array. In addition,one can perform parallel operations like for or flexible reduction on partical
-- 
GitLab


From 0812137f11bce1e9bb00219ca71a2e03109e1df1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Tue, 13 Apr 2021 21:21:07 +0200
Subject: [PATCH 029/117] Fixed remaining return types of getOrganization
 methods

Fixes #83
---
 src/TNL/Algorithms/Segments/EllpackView.h       | 2 +-
 src/TNL/Algorithms/Segments/SlicedEllpackView.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h
index 865c0d848..0857886f2 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.h
+++ b/src/TNL/Algorithms/Segments/EllpackView.h
@@ -33,7 +33,7 @@ class EllpackView
       using DeviceType = Device;
       using IndexType = std::remove_const_t< Index >;
       static constexpr int getAlignment() { return Alignment; }
-      static constexpr bool getOrganization() { return Organization; }
+      static constexpr ElementsOrganization getOrganization() { return Organization; }
       using OffsetsContainer = Containers::Vector< IndexType, DeviceType, IndexType >;
       using SegmentsSizes = OffsetsContainer;
       template< typename Device_, typename Index_ >
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.h b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
index cd3b1fbe4..e614f9dc0 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
@@ -32,7 +32,7 @@ class SlicedEllpackView
       using IndexType = std::remove_const_t< Index >;
       using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >;
       static constexpr int getSliceSize() { return SliceSize; }
-      static constexpr bool getOrganization() { return Organization; }
+      static constexpr ElementsOrganization getOrganization() { return Organization; }
       template< typename Device_, typename Index_ >
       using ViewTemplate = SlicedEllpackView< Device_, Index_, Organization, SliceSize >;
       using ViewType = SlicedEllpackView;
-- 
GitLab


From d5984f141602ea10cf5e0908b66e39a65784961d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 18 Apr 2021 12:38:03 +0200
Subject: [PATCH 030/117] Optimizing dense-matrix vector multiplication
 inspired by sparse matrix implementation.

---
 src/TNL/Matrices/DenseMatrixView.hpp | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 1ace77056..6371832ee 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -538,10 +538,33 @@ vectorProduct( const InVector& inVector,
    auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType column, IndexType offset, bool& compute ) -> RealType {
       return valuesView[ offset ] * inVectorView[ column ];
    };
-   auto keeper = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+   auto keeperGeneral = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
       outVectorView[ row ] = matrixMultiplicator * value + outVectorMultiplicator * outVectorView[ row ];
    };
-   this->segments.reduceSegments( begin, end, fetch, std::plus<>{}, keeper, ( RealType ) 0.0 );
+   auto keeperDirect = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+      outVectorView[ row ] = value;
+   };
+   auto keeperMatrixMult = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+      outVectorView[ row ] = matrixMultiplicator * value;
+   };
+   auto keeperVectorMult = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+      outVectorView[ row ] = outVectorMultiplicator * outVectorView[ row ] + value;
+   };
+
+   if( outVectorMultiplicator == 0.0 )
+   {
+      if( matrixMultiplicator == 1.0 )
+         this->segments.reduceSegments( begin, end, fetch, std::plus<>{}, keeperDirect, ( RealType ) 0.0 );
+      else
+         this->segments.reduceSegments( begin, end, fetch, std::plus<>{}, keeperMatrixMult, ( RealType ) 0.0 );
+   }
+   else
+   {
+      if( matrixMultiplicator == 1.0 )
+         this->segments.reduceSegments( begin, end, fetch, std::plus<>{}, keeperVectorMult, ( RealType ) 0.0 );
+      else
+         this->segments.reduceSegments( begin, end, fetch, std::plus<>{}, keeperGeneral, ( RealType ) 0.0 );
+   }
 }
 
 template< typename Real,
-- 
GitLab


From 4df14dcb1b595a09a17a1e196f365142e01ec192 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 18 Apr 2021 12:38:34 +0200
Subject: [PATCH 031/117] Added method print to Segments for better use with
 output streams.

---
 .../SegmentsExample_CSR_constructor_1.cpp     |  2 +-
 .../SegmentsExample_CSR_constructor_2.cpp     |  2 +-
 .../SegmentsExample_CSR_forElements.cpp       |  2 +-
 .../SegmentsExample_CSR_forSegments.cpp       |  2 +-
 .../Segments/SegmentsExample_General.cpp      |  2 +-
 .../Segments/SegmentsExample_forElements.cpp  |  4 ++--
 .../SegmentsExample_forSegments-1.cpp         |  2 +-
 .../SegmentsExample_forSegments-2.cpp         |  4 ++--
 .../SegmentsExample_reduceSegments.cpp        |  2 +-
 .../Segments/SegmentsPrintingExample-2.cpp    |  2 +-
 src/TNL/Algorithms/Segments/BiEllpack.h       |  3 +++
 src/TNL/Algorithms/Segments/BiEllpack.hpp     | 13 ++++++++++
 src/TNL/Algorithms/Segments/BiEllpackView.h   |  3 +++
 src/TNL/Algorithms/Segments/BiEllpackView.hpp | 12 ++++++++++
 src/TNL/Algorithms/Segments/CSR.h             | 21 ++++++++++++++++
 src/TNL/Algorithms/Segments/CSR.hpp           | 12 ++++++++++
 src/TNL/Algorithms/Segments/CSRView.h         |  3 +++
 src/TNL/Algorithms/Segments/CSRView.hpp       | 12 ++++++++++
 src/TNL/Algorithms/Segments/ChunkedEllpack.h  |  3 +++
 .../Algorithms/Segments/ChunkedEllpack.hpp    | 12 ++++++++++
 .../Algorithms/Segments/ChunkedEllpackView.h  |  3 +++
 .../Segments/ChunkedEllpackView.hpp           | 12 ++++++++++
 src/TNL/Algorithms/Segments/Ellpack.h         |  3 +++
 src/TNL/Algorithms/Segments/Ellpack.hpp       | 13 ++++++++++
 src/TNL/Algorithms/Segments/EllpackView.h     |  3 +++
 src/TNL/Algorithms/Segments/EllpackView.hpp   | 12 ++++++++++
 .../Algorithms/Segments/SegmentsPrinting.h    | 24 +++++++++++++++++++
 src/TNL/Algorithms/Segments/SlicedEllpack.h   |  3 +++
 src/TNL/Algorithms/Segments/SlicedEllpack.hpp | 13 ++++++++++
 .../Algorithms/Segments/SlicedEllpackView.h   |  3 +++
 .../Algorithms/Segments/SlicedEllpackView.hpp | 12 ++++++++++
 31 files changed, 207 insertions(+), 12 deletions(-)

diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
index 0ceb7a6bd..ed25d6df4 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
@@ -35,7 +35,7 @@ void SegmentsExample()
     * Print the data managed by the segments.
     */
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   printSegments( segments, fetch, std::cout );
+   std::cout << segments.print( fetch ) << std::endl;
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
index 9493758b4..a71c51519 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
@@ -34,7 +34,7 @@ void SegmentsExample()
     * Print the data managed by the segments.
     */
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   printSegments( segments, fetch, std::cout );
+   std::cout << segments.print( fetch ) << std::endl;
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp
index 37267a889..264998046 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp
@@ -33,7 +33,7 @@ void SegmentsExample()
     * Print the data managed by the segments.
     */
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   printSegments( segments, fetch, std::cout );
+   std::cout << segments.print( fetch ) << std::endl;
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp
index 3bf7cc50b..f2eb0ae13 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp
@@ -36,7 +36,7 @@ void SegmentsExample()
     * Print the data managed by the segments.
     */
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   printSegments( segments, fetch, std::cout );
+   std::cout << segments.print( fetch ) << std::endl;
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
index ade0263fb..7e4aced7e 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
@@ -36,7 +36,7 @@ void SegmentsExample()
     * Print the data managed by the segments.
     */
    auto fetch = [=] __cuda_callable__ ( IndexType globalIdx ) -> double { return data_view[ globalIdx ]; };
-   printSegments( segments, fetch, std::cout );
+   std::cout << segments.print( fetch ) << std::endl;
 
    /***
     * Compute sums of elements in particular segments.
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp
index 8b3450167..621a2123a 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp
@@ -34,7 +34,7 @@ void SegmentsExample()
    std::cout << "Data setup with no check ... " << std::endl;
    std::cout << "Array: " << data << std::endl;
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   printSegments( segments, fetch, std::cout ) << std::endl;
+   std::cout << segments.print( fetch ) << std::endl;
 
    /***
     * Insert data into particular segments.
@@ -50,7 +50,7 @@ void SegmentsExample()
     */
    std::cout << "Data setup with check for padding elements..." << std::endl;
    std::cout << "Array: " << data << std::endl;
-   printSegments( segments, fetch, std::cout ) << std::endl;
+   std::cout << segments.print( fetch ) << std::endl;
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp
index d8be1f04c..fa81662e8 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp
@@ -39,7 +39,7 @@ void SegmentsExample()
     * Print the data managed by the segments.
     */
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   printSegments( segments, fetch, std::cout );
+   std::cout << segments.print( fetch ) << std::endl;
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp
index a5d7d0caa..0439b846a 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp
@@ -33,7 +33,7 @@ void SegmentsExample()
     */
    std::cout << "Values of elements after intial setup: " << std::endl;
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   printSegments( segments, fetch, std::cout );
+   std::cout << segments.print( fetch );
 
    /***
     * Divide elements in each segment by a sum of all elements in the segment
@@ -55,7 +55,7 @@ void SegmentsExample()
     * Print the data managed by the segments.
     */
    std::cout << "Value of elements after dividing by sum in each segment:" << std::endl;
-   printSegments( segments, fetch, std::cout );
+   std::cout << segments.print( fetch ) << std::endl;
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp
index c9a7476c7..e6701f36b 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp
@@ -36,7 +36,7 @@ void SegmentsExample()
     */
    std::cout << "Values of elements after intial setup: " << std::endl;
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   printSegments( segments, fetch, std::cout );
+   std::cout << segments.print( fetch ) << std::endl;
 
    /***
     * Compute sums of elements in each segment.
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp
index 8d98455a5..73d2e415d 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp
@@ -32,7 +32,7 @@ void SegmentsExample()
     */
    auto data_view = data.getView();
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   printSegments( segments, fetch, std::cout );
+   std::cout << segments.print( fetch ) << std::endl;
    std::cout << std::endl;
 }
 
diff --git a/src/TNL/Algorithms/Segments/BiEllpack.h b/src/TNL/Algorithms/Segments/BiEllpack.h
index ee202d25c..3830d8e14 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/BiEllpack.h
@@ -135,6 +135,9 @@ class BiEllpack
 
       void load(File &file);
 
+      template< typename Fetch >
+      SegmentsPrinter< BiEllpack, Fetch > print( Fetch&& fetch ) const;
+
       void printStructure(std::ostream &str) const;
 
       // TODO: nvcc needs this public because of lambda function used inside
diff --git a/src/TNL/Algorithms/Segments/BiEllpack.hpp b/src/TNL/Algorithms/Segments/BiEllpack.hpp
index 1412d1be5..4bbccbb0e 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpack.hpp
@@ -581,6 +581,19 @@ load( File& file )
         >> this->groupPointers;
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization,
+          int WarpSize >
+      template< typename Fetch >
+auto
+BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< BiEllpack, Fetch >
+{
+   return SegmentsPrinter< BiEllpack, Fetch >( *this, fetch );
+}
+
 template< typename Device,
           typename Index,
           typename IndexAllocator,
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h
index c0ae1559e..62d60509c 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.h
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.h
@@ -138,6 +138,9 @@ class BiEllpackView
 
       void load( File& file );
 
+      template< typename Fetch >
+      SegmentsPrinter< BiEllpackView, Fetch > print( Fetch&& fetch ) const;
+
       void printStructure( std::ostream& str ) const;
 
    protected:
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
index b480deac0..ab79d5833 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
@@ -477,6 +477,18 @@ save( File& file ) const
         << this->groupPointers;
 }
 
+template< typename Device,
+          typename Index,
+          ElementsOrganization Organization,
+          int WarpSize >
+      template< typename Fetch >
+auto
+BiEllpackView< Device, Index, Organization, WarpSize >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< BiEllpackView, Fetch >
+{
+   return SegmentsPrinter< BiEllpackView, Fetch >( *this, fetch );
+}
+
 template< typename Device,
           typename Index,
           ElementsOrganization Organization,
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index f3f1aa881..eebd186a6 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -486,6 +486,27 @@ class CSR
        */
       void load( File& file );
 
+      /**
+       * \brief Return simple proxy object for insertion to output stream.
+       *
+       * The proxy object serves for wrapping segments with lambda function mediating access to data managed by the segments.
+       *
+       * \tparam Fetch is type of lambda function for data access.
+       * \param fetch is an instance of lambda function for data access. It is supposed to be defined as
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType globalIdx ) -> ValueType { return data_view[ globalIdx ]; };
+       * ```
+       * \return Proxy object for insertion to output stream.
+       *
+       * \par Example
+       * \include Algorithms/Segments/SegmentsPrintingExample-2.cpp
+       * \par Output
+       * \include SegmentsPrintingExample-2.out
+       */
+      template< typename Fetch >
+      SegmentsPrinter< CSR, Fetch > print( Fetch&& fetch ) const;
+
    protected:
 
       OffsetsContainer offsets;
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index b427f4acd..0d15790bf 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -368,6 +368,18 @@ load( File& file )
    this->kernel.init( this->offsets );
 }
 
+template< typename Device,
+          typename Index,
+          typename Kernel,
+          typename IndexAllocator >
+      template< typename Fetch >
+auto
+CSR< Device, Index, Kernel, IndexAllocator >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< CSR, Fetch >
+{
+   return SegmentsPrinter< CSR, Fetch >( *this, fetch );
+}
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 5daa3e7c2..dee96ba5a 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -139,6 +139,9 @@ class CSRView
 
       void load( File& file );
 
+      template< typename Fetch >
+      SegmentsPrinter< CSRView, Fetch > print( Fetch&& fetch ) const;
+
    protected:
 
       OffsetsView offsets;
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index 08822ca94..b69e61e5a 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -318,6 +318,18 @@ load( File& file )
    this->kernel.init( this->offsets );
 }
 
+template< typename Device,
+          typename Index,
+          typename Kernel >
+      template< typename Fetch >
+auto
+CSRView< Device, Index, Kernel >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< CSRView, Fetch >
+{
+   return SegmentsPrinter< CSRView, Fetch >( *this, fetch );
+}
+
+
       } // namespace Segments
    }  // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
index 5b0916d20..1d4f9fabc 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
@@ -131,6 +131,9 @@ class ChunkedEllpack
 
       void load( File& file );
 
+      template< typename Fetch >
+      SegmentsPrinter< ChunkedEllpack, Fetch > print( Fetch&& fetch ) const;
+
       void printStructure( std::ostream& str ); // TODO const;
 
    protected:
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
index 82ddd7d8e..e39a16670 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
@@ -534,6 +534,18 @@ load( File& file )
    file.load( &this->numberOfSlices );
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization >
+      template< typename Fetch >
+auto
+ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< ChunkedEllpack, Fetch >
+{
+   return SegmentsPrinter< ChunkedEllpack, Fetch >( *this, fetch );
+}
+
 template< typename Device,
           typename Index,
           typename IndexAllocator,
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
index a20d5a41a..a54f8e5ef 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
@@ -150,6 +150,9 @@ class ChunkedEllpackView
 
       void save( File& file ) const;
 
+      template< typename Fetch >
+      SegmentsPrinter< ChunkedEllpackView, Fetch > print( Fetch&& fetch ) const;
+
       void printStructure( std::ostream& str ) const;
 
    protected:
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
index 5f73fd8ab..6fc767107 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
@@ -513,6 +513,18 @@ save( File& file ) const
    file.save( &this->numberOfSlices );
 }
 
+template< typename Device,
+          typename Index,
+          ElementsOrganization Organization >
+      template< typename Fetch >
+auto
+ChunkedEllpackView< Device, Index, Organization >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< ChunkedEllpackView, Fetch >
+{
+   return SegmentsPrinter< ChunkedEllpackView, Fetch >( *this, fetch );
+}
+
+
 template< typename Device,
           typename Index,
           ElementsOrganization Organization >
diff --git a/src/TNL/Algorithms/Segments/Ellpack.h b/src/TNL/Algorithms/Segments/Ellpack.h
index e68ebdf62..c363d3000 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.h
+++ b/src/TNL/Algorithms/Segments/Ellpack.h
@@ -130,6 +130,9 @@ class Ellpack
 
       void load( File& file );
 
+      template< typename Fetch >
+      SegmentsPrinter< Ellpack, Fetch > print( Fetch&& fetch ) const;
+
    protected:
 
       IndexType segmentSize, size, alignedSize;
diff --git a/src/TNL/Algorithms/Segments/Ellpack.hpp b/src/TNL/Algorithms/Segments/Ellpack.hpp
index 589d9f944..27e7dcbe3 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.hpp
+++ b/src/TNL/Algorithms/Segments/Ellpack.hpp
@@ -383,6 +383,19 @@ load( File& file )
    file.load( &alignedSize );
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization,
+          int Alignment >
+      template< typename Fetch >
+auto
+Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< Ellpack, Fetch >
+{
+   return SegmentsPrinter< Ellpack, Fetch >( *this, fetch );
+}
+
       } // namespace Segments
    }  // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h
index 0857886f2..6e4995e1d 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.h
+++ b/src/TNL/Algorithms/Segments/EllpackView.h
@@ -123,6 +123,9 @@ class EllpackView
 
       void load( File& file );
 
+      template< typename Fetch >
+      SegmentsPrinter< EllpackView, Fetch > print( Fetch&& fetch ) const;
+
    protected:
 
       IndexType segmentSize, segmentsCount, alignedSize;
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index 1ec928336..18f1cde7b 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -357,6 +357,18 @@ load( File& file )
    file.load( &alignedSize );
 }
 
+template< typename Device,
+          typename Index,
+          ElementsOrganization Organization,
+          int Alignment >
+      template< typename Fetch >
+auto
+EllpackView< Device, Index, Organization, Alignment >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< EllpackView, Fetch >
+{
+   return SegmentsPrinter< EllpackView, Fetch >( *this, fetch );
+}
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/SegmentsPrinting.h b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
index 5018471b3..260ad71e8 100644
--- a/src/TNL/Algorithms/Segments/SegmentsPrinting.h
+++ b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
@@ -99,6 +99,30 @@ std::ostream& printSegments( const Segments& segments, Fetch&& fetch, std::ostre
    return str;
 }
 
+
+template< typename Segments,
+          typename Fetch >
+struct SegmentsPrinter
+{
+   SegmentsPrinter( const Segments& segments, Fetch& fetch )
+   : segments( segments ), fetch( fetch ) {}
+
+   std::ostream& print( std::ostream& str ) const { return printSegments( segments, fetch, str ); }
+
+   protected:
+
+   const Segments& segments;
+
+   Fetch& fetch;
+};
+
+template< typename Segments,
+          typename Fetch >
+std::ostream& operator<<( std::ostream& str, const SegmentsPrinter< Segments, Fetch >& printer )
+{
+   return printer.print( str );
+}
+
       } // namespace Segments
    } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.h b/src/TNL/Algorithms/Segments/SlicedEllpack.h
index 092af6a1f..974087e4b 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.h
@@ -127,6 +127,9 @@ class SlicedEllpack
 
       void load( File& file );
 
+      template< typename Fetch >
+      SegmentsPrinter< SlicedEllpack, Fetch > print( Fetch&& fetch ) const;
+
    protected:
 
       IndexType size, alignedSize, segmentsCount;
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
index 6c58c3ed1..8a4903cbd 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
@@ -423,6 +423,19 @@ load( File& file )
    file >> this->sliceSegmentSizes;
 }
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator,
+          ElementsOrganization Organization,
+          int SliceSize >
+      template< typename Fetch >
+auto
+SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< SlicedEllpack, Fetch >
+{
+   return SegmentsPrinter< SlicedEllpack, Fetch >( *this, fetch );
+}
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.h b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
index e614f9dc0..16e2c082f 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
@@ -121,6 +121,9 @@ class SlicedEllpackView
 
       void load( File& file );
 
+      template< typename Fetch >
+      SegmentsPrinter< SlicedEllpackView, Fetch > print( Fetch&& fetch ) const;
+
    protected:
 
       IndexType size, alignedSize, segmentsCount;
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
index 871aa2da0..5b97c72e2 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
@@ -427,6 +427,18 @@ load( File& file )
    file >> this->sliceSegmentSizes;
 }
 
+template< typename Device,
+          typename Index,
+          ElementsOrganization Organization,
+          int SliceSize >
+      template< typename Fetch >
+auto
+SlicedEllpackView< Device, Index, Organization, SliceSize >::
+print( Fetch&& fetch ) const -> SegmentsPrinter< SlicedEllpackView, Fetch >
+{
+   return SegmentsPrinter< SlicedEllpackView, Fetch >( *this, fetch );
+}
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
-- 
GitLab


From fd236cf9cbfddf47e8edbb17d56255de25763699 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 18 Apr 2021 13:03:20 +0200
Subject: [PATCH 032/117] Refactoring segments printing.

---
 .../Algorithms/Segments/SegmentsPrinting.h    | 76 ++++++-------------
 1 file changed, 25 insertions(+), 51 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/SegmentsPrinting.h b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
index 260ad71e8..7695cc693 100644
--- a/src/TNL/Algorithms/Segments/SegmentsPrinting.h
+++ b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
@@ -49,65 +49,39 @@ std::ostream& printSegments( const Segments& segments, std::ostream& str )
    return str;
 }
 
-
-/**
- * \brief Print segments with related content.
- *
- * \tparam Segments is type of segments.
- * \tparam Fetch is a lambda function for reading related data.
- * \param segments is an instance of segments.
- * \param fetch is an instance of lambda function reading related data. It is supposed to defined as
- *
- * ```
- * auto fetch = [=] __cuda_callable__ ( IndexType globalIdx ) -> ValueType { return data_view[ globalIdx ]; };
- * ```
- *
- * \param str is output stream.
- * \return reference to the output stream.
- *
- * \par Example
- * \include Algorithms/Segments/SegmentsPrintingExample-2.cpp
- * \par Output
- * \include SegmentsPrintingExample-2.out
- */
 template< typename Segments,
           typename Fetch >
-std::ostream& printSegments( const Segments& segments, Fetch&& fetch, std::ostream& str )
+struct SegmentsPrinter
 {
-   using IndexType = typename Segments::IndexType;
-   using DeviceType = typename Segments::DeviceType;
-   using ValueType = decltype( fetch( IndexType() ) );
+   SegmentsPrinter( const Segments& segments, Fetch& fetch )
+   : segments( segments ), fetch( fetch ) {}
 
-   TNL::Containers::Array< ValueType, DeviceType, IndexType > aux( 1 );
-   auto view = segments.getConstView();
-   for( IndexType segmentIdx = 0; segmentIdx < segments.getSegmentsCount(); segmentIdx++ )
+   std::ostream& print( std::ostream& str ) const
    {
-      str << "Seg. " << segmentIdx << ": [ ";
-      auto segmentSize = segments.getSegmentSize( segmentIdx );
-      for( IndexType localIdx = 0; localIdx < segmentSize; localIdx++ )
+      using IndexType = typename Segments::IndexType;
+      using DeviceType = typename Segments::DeviceType;
+      using ValueType = decltype( fetch( IndexType() ) );
+
+      TNL::Containers::Array< ValueType, DeviceType, IndexType > aux( 1 );
+      auto view = segments.getConstView();
+      for( IndexType segmentIdx = 0; segmentIdx < segments.getSegmentsCount(); segmentIdx++ )
       {
-         aux.forAllElements( [=] __cuda_callable__ ( IndexType elementIdx, double& v ) mutable {
-            v = fetch( view.getGlobalIndex( segmentIdx, localIdx ) );
-         } );
-         auto value = aux.getElement( 0 );
-         str << value;
-         if( localIdx < segmentSize - 1 )
-            str << ", ";
+         str << "Seg. " << segmentIdx << ": [ ";
+         auto segmentSize = segments.getSegmentSize( segmentIdx );
+         for( IndexType localIdx = 0; localIdx < segmentSize; localIdx++ )
+         {
+            aux.forAllElements( [=] __cuda_callable__ ( IndexType elementIdx, double& v ) mutable {
+               v = fetch( view.getGlobalIndex( segmentIdx, localIdx ) );
+            } );
+            auto value = aux.getElement( 0 );
+            str << value;
+            if( localIdx < segmentSize - 1 )
+               str << ", ";
+         }
+         str << " ] " << std::endl;
       }
-      str << " ] " << std::endl;
+      return str;
    }
-   return str;
-}
-
-
-template< typename Segments,
-          typename Fetch >
-struct SegmentsPrinter
-{
-   SegmentsPrinter( const Segments& segments, Fetch& fetch )
-   : segments( segments ), fetch( fetch ) {}
-
-   std::ostream& print( std::ostream& str ) const { return printSegments( segments, fetch, str ); }
 
    protected:
 
-- 
GitLab


From cbf1977e4b625afef8163e861a562da167d3d189 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 18 Apr 2021 14:13:47 +0200
Subject: [PATCH 033/117] Fixing segments printing headers including.

---
 src/TNL/Algorithms/Segments/BiEllpackView.h      | 3 ++-
 src/TNL/Algorithms/Segments/ChunkedEllpackView.h | 3 ++-
 src/TNL/Algorithms/Segments/SlicedEllpackView.h  | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h
index 62d60509c..91b055e26 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.h
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.h
@@ -15,7 +15,8 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
 #include <TNL/Algorithms/Segments/BiEllpackSegmentView.h>
-#include <TNL/Algorithms/Segments/detail/BiEllpack.h>
+#include <TNL/Algorithms/Segments/details/BiEllpack.h>
+#include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 namespace TNL {
    namespace Algorithms {
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
index a54f8e5ef..ae400d2fe 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
@@ -16,7 +16,8 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpackSegmentView.h>
-#include <TNL/Algorithms/Segments/detail/ChunkedEllpack.h>
+#include <TNL/Algorithms/Segments/details/ChunkedEllpack.h>
+#include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 namespace TNL {
    namespace Algorithms {
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.h b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
index 16e2c082f..5cb9ef8f0 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
@@ -15,6 +15,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
 #include <TNL/Algorithms/Segments/SegmentView.h>
+#include <TNL/Algorithms/Segments/SegmentPrinting.h>
 
 namespace TNL {
    namespace Algorithms {
-- 
GitLab


From 045485d48d140aea8d849237a0e1c1c6fdda2c2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 18 Apr 2021 15:11:12 +0200
Subject: [PATCH 034/117] Fixing segments printing headers including.

---
 src/TNL/Algorithms/Segments/SlicedEllpackView.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.h b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
index 5cb9ef8f0..0df58aec6 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.h
@@ -15,7 +15,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
 #include <TNL/Algorithms/Segments/SegmentView.h>
-#include <TNL/Algorithms/Segments/SegmentPrinting.h>
+#include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 namespace TNL {
    namespace Algorithms {
-- 
GitLab


From afcbc3912dd8b2af5972be1c1f6d4c73f260bbe9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 18 Apr 2021 20:30:15 +0200
Subject: [PATCH 035/117] Trying to debug segments printing for GPUs.

---
 .../Segments/SegmentsExample_General.cpp      |  2 +
 .../Algorithms/Segments/SegmentsPrinting.h    | 38 ++++++++++++++++++-
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
index 7e4aced7e..d64fdbfde 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
@@ -35,7 +35,9 @@ void SegmentsExample()
    /***
     * Print the data managed by the segments.
     */
+   std::cerr << data << std::endl;
    auto fetch = [=] __cuda_callable__ ( IndexType globalIdx ) -> double { return data_view[ globalIdx ]; };
+   printSegments( segments, fetch, std::cout );
    std::cout << segments.print( fetch ) << std::endl;
 
    /***
diff --git a/src/TNL/Algorithms/Segments/SegmentsPrinting.h b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
index 7695cc693..fa5d6c628 100644
--- a/src/TNL/Algorithms/Segments/SegmentsPrinting.h
+++ b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
@@ -53,7 +53,7 @@ template< typename Segments,
           typename Fetch >
 struct SegmentsPrinter
 {
-   SegmentsPrinter( const Segments& segments, Fetch& fetch )
+   SegmentsPrinter( const Segments& segments, Fetch&& fetch )
    : segments( segments ), fetch( fetch ) {}
 
    std::ostream& print( std::ostream& str ) const
@@ -71,6 +71,8 @@ struct SegmentsPrinter
          for( IndexType localIdx = 0; localIdx < segmentSize; localIdx++ )
          {
             aux.forAllElements( [=] __cuda_callable__ ( IndexType elementIdx, double& v ) mutable {
+               //printf( "####### localIdx = %d, globalIdx = %d \n", localIdx, view.getGlobalIndex( segmentIdx, localIdx ) );
+               //v = view.getGlobalIndex( segmentIdx, localIdx );
                v = fetch( view.getGlobalIndex( segmentIdx, localIdx ) );
             } );
             auto value = aux.getElement( 0 );
@@ -87,7 +89,7 @@ struct SegmentsPrinter
 
    const Segments& segments;
 
-   Fetch& fetch;
+   Fetch fetch;
 };
 
 template< typename Segments,
@@ -97,6 +99,38 @@ std::ostream& operator<<( std::ostream& str, const SegmentsPrinter< Segments, Fe
    return printer.print( str );
 }
 
+template< typename Segments,
+          typename Fetch >
+std::ostream& printSegments( const Segments& segments, Fetch&& fetch, std::ostream& str )
+{
+   using IndexType = typename Segments::IndexType;
+   using DeviceType = typename Segments::DeviceType;
+   using ValueType = decltype( fetch( IndexType() ) );
+
+   TNL::Containers::Array< ValueType, DeviceType, IndexType > aux( 1 );
+   auto view = segments.getConstView();
+   for( IndexType segmentIdx = 0; segmentIdx < segments.getSegmentsCount(); segmentIdx++ )
+   {
+      str << "Seg. " << segmentIdx << ": [ ";
+      auto segmentSize = segments.getSegmentSize( segmentIdx );
+      //std::cerr << "Segment size = " << segmentSize << std::endl;
+      for( IndexType localIdx = 0; localIdx < segmentSize; localIdx++ )
+      {
+         aux.forAllElements( [=] __cuda_callable__ ( IndexType elementIdx, double& v ) mutable {
+            //printf( "####### localIdx = %d, globalIdx = %d \n", localIdx, view.getGlobalIndex( segmentIdx, localIdx ) );
+            v = fetch( view.getGlobalIndex( segmentIdx, localIdx ) );
+            //v = view.getGlobalIndex( segmentIdx, localIdx );
+         } );
+         auto value = aux.getElement( 0 );
+         str << value;
+         if( localIdx < segmentSize - 1 )
+            str << ", ";
+      }
+      str << " ] " << std::endl;
+   }
+   return str;
+}
+
       } // namespace Segments
    } // namespace Algorithms
 } // namespace TNL
-- 
GitLab


From b4e3741090e6f4816ac6a303b681ebb999a33504 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 19 Apr 2021 16:04:53 +0200
Subject: [PATCH 036/117] Trying to debug segments printing for GPUs.

---
 .../Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp | 2 +-
 .../Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp | 2 +-
 .../Algorithms/Segments/SegmentsExample_CSR_forElements.cpp   | 2 +-
 .../Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp   | 2 +-
 .../Examples/Algorithms/Segments/SegmentsExample_General.cpp  | 2 --
 .../Algorithms/Segments/SegmentsExample_forElements.cpp       | 4 ++--
 .../Algorithms/Segments/SegmentsExample_forSegments-1.cpp     | 2 +-
 .../Algorithms/Segments/SegmentsExample_forSegments-2.cpp     | 4 ++--
 .../Algorithms/Segments/SegmentsExample_reduceSegments.cpp    | 2 +-
 .../Algorithms/Segments/SegmentsPrintingExample-2.cpp         | 3 +--
 src/TNL/Algorithms/Segments/SegmentsPrinting.h                | 4 ++--
 11 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
index ed25d6df4..0ceb7a6bd 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_1.cpp
@@ -35,7 +35,7 @@ void SegmentsExample()
     * Print the data managed by the segments.
     */
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   std::cout << segments.print( fetch ) << std::endl;
+   printSegments( segments, fetch, std::cout );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
index a71c51519..9493758b4 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_constructor_2.cpp
@@ -34,7 +34,7 @@ void SegmentsExample()
     * Print the data managed by the segments.
     */
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   std::cout << segments.print( fetch ) << std::endl;
+   printSegments( segments, fetch, std::cout );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp
index 264998046..37267a889 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forElements.cpp
@@ -33,7 +33,7 @@ void SegmentsExample()
     * Print the data managed by the segments.
     */
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   std::cout << segments.print( fetch ) << std::endl;
+   printSegments( segments, fetch, std::cout );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp
index f2eb0ae13..3bf7cc50b 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_CSR_forSegments.cpp
@@ -36,7 +36,7 @@ void SegmentsExample()
     * Print the data managed by the segments.
     */
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   std::cout << segments.print( fetch ) << std::endl;
+   printSegments( segments, fetch, std::cout );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
index d64fdbfde..ade0263fb 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_General.cpp
@@ -35,10 +35,8 @@ void SegmentsExample()
    /***
     * Print the data managed by the segments.
     */
-   std::cerr << data << std::endl;
    auto fetch = [=] __cuda_callable__ ( IndexType globalIdx ) -> double { return data_view[ globalIdx ]; };
    printSegments( segments, fetch, std::cout );
-   std::cout << segments.print( fetch ) << std::endl;
 
    /***
     * Compute sums of elements in particular segments.
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp
index 621a2123a..7d7eac76c 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forElements.cpp
@@ -34,7 +34,7 @@ void SegmentsExample()
    std::cout << "Data setup with no check ... " << std::endl;
    std::cout << "Array: " << data << std::endl;
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   std::cout << segments.print( fetch ) << std::endl;
+   printSegments( segments, fetch, std::cout );
 
    /***
     * Insert data into particular segments.
@@ -50,7 +50,7 @@ void SegmentsExample()
     */
    std::cout << "Data setup with check for padding elements..." << std::endl;
    std::cout << "Array: " << data << std::endl;
-   std::cout << segments.print( fetch ) << std::endl;
+   printSegments( segments, fetch, std::cout );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp
index fa81662e8..d8be1f04c 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-1.cpp
@@ -39,7 +39,7 @@ void SegmentsExample()
     * Print the data managed by the segments.
     */
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   std::cout << segments.print( fetch ) << std::endl;
+   printSegments( segments, fetch, std::cout );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp
index 0439b846a..a5d7d0caa 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_forSegments-2.cpp
@@ -33,7 +33,7 @@ void SegmentsExample()
     */
    std::cout << "Values of elements after intial setup: " << std::endl;
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   std::cout << segments.print( fetch );
+   printSegments( segments, fetch, std::cout );
 
    /***
     * Divide elements in each segment by a sum of all elements in the segment
@@ -55,7 +55,7 @@ void SegmentsExample()
     * Print the data managed by the segments.
     */
    std::cout << "Value of elements after dividing by sum in each segment:" << std::endl;
-   std::cout << segments.print( fetch ) << std::endl;
+   printSegments( segments, fetch, std::cout );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp
index e6701f36b..c9a7476c7 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsExample_reduceSegments.cpp
@@ -36,7 +36,7 @@ void SegmentsExample()
     */
    std::cout << "Values of elements after intial setup: " << std::endl;
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   std::cout << segments.print( fetch ) << std::endl;
+   printSegments( segments, fetch, std::cout );
 
    /***
     * Compute sums of elements in each segment.
diff --git a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp
index 73d2e415d..8f25b8bad 100644
--- a/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp
+++ b/Documentation/Examples/Algorithms/Segments/SegmentsPrintingExample-2.cpp
@@ -32,8 +32,7 @@ void SegmentsExample()
     */
    auto data_view = data.getView();
    auto fetch = [=] __cuda_callable__ ( int globalIdx ) -> double { return data_view[ globalIdx ]; };
-   std::cout << segments.print( fetch ) << std::endl;
-   std::cout << std::endl;
+   printSegments( segments, fetch, std::cout ) << std::endl;
 }
 
 int main( int argc, char* argv[] )
diff --git a/src/TNL/Algorithms/Segments/SegmentsPrinting.h b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
index fa5d6c628..f8fd7412e 100644
--- a/src/TNL/Algorithms/Segments/SegmentsPrinting.h
+++ b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
@@ -92,12 +92,12 @@ struct SegmentsPrinter
    Fetch fetch;
 };
 
-template< typename Segments,
+/*template< typename Segments,
           typename Fetch >
 std::ostream& operator<<( std::ostream& str, const SegmentsPrinter< Segments, Fetch >& printer )
 {
    return printer.print( str );
-}
+}*/
 
 template< typename Segments,
           typename Fetch >
-- 
GitLab


From 11c8a4b0079fe81b1cc003b55a56014f9d4891c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 22 Apr 2021 19:59:02 +0200
Subject: [PATCH 037/117] Added sandbox sparse matrix.

---
 .../Algorithms/Segments/SegmentsPrinting.h    |    3 +
 .../Matrices/Sandbox/SparseSandboxMatrix.h    | 1176 ++++++++++++++++
 .../Matrices/Sandbox/SparseSandboxMatrix.hpp  | 1197 +++++++++++++++++
 .../Sandbox/SparseSandboxMatrixRowView.h      |  282 ++++
 .../Sandbox/SparseSandboxMatrixRowView.hpp    |  240 ++++
 .../Sandbox/SparseSandboxMatrixView.h         |  871 ++++++++++++
 .../Sandbox/SparseSandboxMatrixView.hpp       | 1026 ++++++++++++++
 src/TNL/Matrices/SparseMatrix.hpp             |    1 +
 src/UnitTests/Matrices/CMakeLists.txt         |    2 +
 .../SparseMatrixTest_SandboxMatrix.cpp        |   11 +
 .../SparseMatrixTest_SandboxMatrix.cu         |    1 +
 .../Matrices/SparseMatrixTest_SandboxMatrix.h |   45 +
 ...eMatrixVectorProductTest_SandboxMatrix.cpp |   11 +
 ...seMatrixVectorProductTest_SandboxMatrix.cu |    1 +
 ...rseMatrixVectorProductTest_SandboxMatrix.h |   45 +
 15 files changed, 4912 insertions(+)
 create mode 100644 src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h
 create mode 100644 src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp
 create mode 100644 src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.h
 create mode 100644 src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.hpp
 create mode 100644 src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.h
 create mode 100644 src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.cpp
 create mode 120000 src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.h
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.cpp
 create mode 120000 src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.h

diff --git a/src/TNL/Algorithms/Segments/SegmentsPrinting.h b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
index f8fd7412e..491f059fa 100644
--- a/src/TNL/Algorithms/Segments/SegmentsPrinting.h
+++ b/src/TNL/Algorithms/Segments/SegmentsPrinting.h
@@ -49,6 +49,8 @@ std::ostream& printSegments( const Segments& segments, std::ostream& str )
    return str;
 }
 
+/// This is to prevent from appearing in Doxygen documentation.
+/// \cond HIDDEN_CLASS
 template< typename Segments,
           typename Fetch >
 struct SegmentsPrinter
@@ -130,6 +132,7 @@ std::ostream& printSegments( const Segments& segments, Fetch&& fetch, std::ostre
    }
    return str;
 }
+/// \endcond
 
       } // namespace Segments
    } // namespace Algorithms
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h
new file mode 100644
index 000000000..6a2a6a565
--- /dev/null
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h
@@ -0,0 +1,1176 @@
+/***************************************************************************
+                          SparseSandboxMatrix.h -  description
+                             -------------------
+    begin                : Apr 19, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <map>
+#include <TNL/Matrices/Matrix.h>
+#include <TNL/Matrices/MatrixType.h>
+#include <TNL/Allocators/Default.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.h>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrixView.h>
+#include <TNL/Matrices/DenseMatrix.h>
+
+namespace TNL {
+   namespace Matrices {
+      namespace Sandbox {
+
+/**
+ * \brief Template of a sparse matrix that can be used for testing of new sparse-matrix formats.
+ *
+ * \tparam Real is a type of matrix elements. If \e Real equals \e bool the matrix is treated
+ *    as binary and so the matrix elements values are not stored in the memory since we need
+ *    to remember only coordinates of non-zero elements (which equal one).
+ * \tparam Device is a device where the matrix is allocated.
+ * \tparam Index is a type for indexing of the matrix elements.
+ * \tparam MatrixType specifies a symmetry of matrix. See \ref MatrixType. Symmetric
+ *    matrices store only lower part of the matrix and its diagonal. The upper part is reconstructed on the fly.
+ *    GeneralMatrix with no symmetry is used by default.
+ * \tparam RealAllocator is allocator for the matrix elements values.
+ * \tparam IndexAllocator is allocator for the matrix elements column indexes.
+ *
+ * This class can be used for rapid testing and development of new formats for sparse matrices. One may profit from
+ * several TNL tools compatible with interface of this templated class like:
+ *
+ * 1. Large set of existing unit tests.
+ * 2. Matrix reading from MTX files - to use \ref TNL::Matrices::MatrixReader, the following methods must be functional
+ *    a. \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::setRowCapacities
+ *    b. \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::setElement
+ *    c. \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::operator= between different devices
+ * 3. Matrix benchmarks - the following methods must be functional
+ *    a. \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::vectorProduct - for SpMV benchmark
+ * 4. Linear solvers
+ * 5. Simple comparison of performance with other matrix formats
+ *
+ * In the core of this class there is:
+ *
+ * 1. Vector `values` (\ref TNL::Matrices::Matrix::values) which is inherited from \ref TNL::Matrices::Matrix. This vector is used for storing
+ *    of matrix elements values.
+ * 2. Vector `columnIndexes` (\ref TNL::Matrices::Sandbox::SparseSandboxMatrix::columnIndexes). This vector is used for storing of matrix elements column indexes.
+ *
+ * This class contains a fully functional implementation of the CSR format, so the user has to replace just what is needed. Once you have
+ * successfully implemented the sparse matrix format in this form, you may consider extracting it into a form of segments to make it accessible
+ * even for algorithms other than SpMV.
+ *
+ * Parts of the code that need to be modified are marked by the SANDBOX_TODO tag. The whole implementation consists of the following classes:
+ *
+ * 1. \ref TNL::Matrices::Sandbox::SparseSandboxMatrix - this class, it serves for matrix setup and performing of the main operations.
+ * 2. \ref TNL::Matrices::Sandbox::SparseSandboxMatrixView - view class which is necessary mainly for passing the matrix to GPU kernels. Most methods of `SparseSandboxMatrix` are common
+ *    with `SparseSandboxMatrixView` and in this case they are implemented in the view class (and there is just redirection from this class). For this reason, `SparseSandboxMatrix` contains instance of the view class
+ *    (\ref TNL::Matrices::Sandbox::SparseSandboxMatrix::view) which needs to be regularly updated each time when metadata are changed. This is usually done by the means of
+ *    method \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::getView.
+ * 3. \ref TNL::Matrices::Sandbox::SparseSandboxMatrixRowView - is a class for accessing particular matrix rows. It will, likely, require some changes as well.
+ *
+ * We suggest the following way of implementation of the new sparse matrix format:
+ *
+ * 1. Add metadata required by your format next to \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::rowPointers but do not replace the row pointers. It will allow you
+ *    to implement your new format next to the original CSR and to check/compare with the valid CSR implementation any time you get into trouble. The advantage is that all
+ *    unit tests are working properly and you may just focus on modifying one method after another. The unit tests are called from
+ *    `src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.h` and `src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.h`.
+ * 2. Modify first the method \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::setRowCapacities which is responsible for the setup of the format metadata.
+ * 3. Continue with modification of constructors, view class, \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::getView and \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::getConstView.
+ * 4. Next you need to modify \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::setElement and \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::getElement methods and assignment operator
+ *    at least for copying the matrix across different devices (i.e. from CPU to GPU). It will allow you to use \ref TNL::Matrices::MatrixReader. We recommend to have the same data layout
+ *    on both CPU and GPU so that the transfer of the matrix from CPU to GPU is trivial.
+ * 5. Finally proceed to \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::vectorProduct to implement SpMV operation. We recommend to implement first the CPU version which is easier to
+ *     debug. Next proceed to GPU version.
+ * 6. When SpMV works it is time to delete the original CSR implementation, i.e. everything around `rowPointers`.
+ * 7. Optimize your implementation to the best performance and test with `tnl-benchmark-spmv` - you need to include your new matrix to `src/Benchmarks/SpMV/spmv.h` and modify this file
+ *    accordingly.
+ * 8. If you want, you may now generalize SpMV to \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::reduceRows method.
+ * 9. If you have `reduceRows` implemented, you may use the original implementation of SpMV based just on the `reduceRows` method.
+ * 10. You may implement \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::forRows and \ref TNL::Matrices::Sandbox::SparseSandboxMatrix::forElements.
+ * 11. Now you have complete implementation of new sparse matrix format. You may turn it into new type of segments (\ref TNL::Algorithms::Segments).
+ *
+ * During the implementation some unit tests may crash. If you do not need them at the moment, you may comment them out in files
+ * `src/UnitTests/Matrices/SparseMatrixTests.h` and `src/UnitTests/Matrices/SparseMatrixVectorProductTests.h`.
+ */
+template< typename Real =  double,
+          typename Device = Devices::Host,
+          typename Index = int,
+          typename MatrixType = GeneralMatrix,
+          typename RealAllocator = typename Allocators::Default< Device >::template Allocator< Real >,
+          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
+class SparseSandboxMatrix : public Matrix< Real, Device, Index, RealAllocator >
+{
+   static_assert(
+         ! MatrixType::isSymmetric() ||
+         ! std::is_same< Device, Devices::Cuda >::value ||
+         ( std::is_same< Real, float >::value || std::is_same< Real, double >::value || std::is_same< Real, int >::value || std::is_same< Real, long long int >::value ),
+         "Given Real type is not supported by atomic operations on GPU which are necessary for symmetric operations." );
+
+   public:
+
+      // Supporting types - they are not important for the user
+      using BaseType = Matrix< Real, Device, Index, RealAllocator >;
+      using ValuesVectorType = typename Matrix< Real, Device, Index, RealAllocator >::ValuesType;
+      using ValuesViewType = typename ValuesVectorType::ViewType;
+      using ConstValuesViewType = typename ValuesViewType::ConstViewType;
+      using ColumnsIndexesVectorType = Containers::Vector< typename TNL::copy_const< Index >::template from< Real >::type, Device, Index, IndexAllocator >;
+      using ColumnsIndexesViewType = typename ColumnsIndexesVectorType::ViewType;
+      using ConstColumnsIndexesViewType = typename ColumnsIndexesViewType::ConstViewType;
+      using RowsCapacitiesType = Containers::Vector< std::remove_const_t< Index >, Device, Index, IndexAllocator >;
+      using RowsCapacitiesView = Containers::VectorView< std::remove_const_t< Index >, Device, Index >;
+      using ConstRowsCapacitiesView = typename RowsCapacitiesView::ConstViewType;
+
+      /**
+       * \brief Tells whether the matrix type is symmetric, as reported by `MatrixType::isSymmetric()`.
+       *
+       * \return \e true if the matrix is stored as symmetric and \e false otherwise.
+       */
+      static constexpr bool isSymmetric() { return MatrixType::isSymmetric(); }
+
+      /**
+       * \brief Tells whether the matrix is binary, i.e. whether \e Real is \e bool.
+       *
+       * \return \e true if the matrix is stored as binary and \e false otherwise.
+       */
+      static constexpr bool isBinary() { return std::is_same< Real, bool >::value; }
+
+      /**
+       * \brief The type of matrix elements.
+       */
+      using RealType = std::remove_const_t< Real >;
+
+      /**
+       * \brief The device where the matrix is allocated.
+       */
+      using DeviceType = Device;
+
+      /**
+       * \brief The type used for matrix elements indexing.
+       */
+      using IndexType = Index;
+
+      /**
+       * \brief The allocator for matrix elements values.
+       */
+      using RealAllocatorType = RealAllocator;
+
+      /**
+       * \brief The allocator for matrix elements column indexes.
+       */
+      using IndexAllocatorType = IndexAllocator;
+
+      /**
+       * \brief Type of related matrix view.
+       *
+       * See \ref SparseSandboxMatrixView.
+       */
+      using ViewType = SparseSandboxMatrixView< Real, Device, Index, MatrixType >;
+
+      /**
+       * \brief Matrix view type for constant instances.
+       *
+       * See \ref SparseSandboxMatrixView.
+       */
+      using ConstViewType = SparseSandboxMatrixView< std::add_const_t< Real >, Device, Index, MatrixType >;
+
+      /**
+       * \brief Type for accessing matrix rows.
+       */
+      using RowView = SparseSandboxMatrixRowView< ValuesViewType, ColumnsIndexesViewType, isBinary() >;
+
+      /**
+       * \brief Type for accessing constant matrix rows.
+       */
+      using ConstRowView = SparseSandboxMatrixRowView< ConstValuesViewType, ConstColumnsIndexesViewType, isBinary() >;
+
+      /**
+       * \brief Helper type for getting self type or its modifications.
+       */
+      template< typename _Real = Real,
+                typename _Device = Device,
+                typename _Index = Index,
+                typename _MatrixType = MatrixType,
+                typename _RealAllocator = typename Allocators::Default< _Device >::template Allocator< _Real >,
+                typename _IndexAllocator = typename Allocators::Default< _Device >::template Allocator< _Index > >
+      using Self = SparseSandboxMatrix< _Real, _Device, _Index, _MatrixType, _RealAllocator, _IndexAllocator >;
+
+      /**
+       * \brief Type of container for CSR row pointers.
+       *
+       * SANDBOX_TODO: You may replace it with containers for metadata of your format.
+       */
+      using RowPointers = TNL::Containers::Vector< IndexType, DeviceType, IndexType >;
+
+      /**
+       * \brief Constructor only with values and column indexes allocators.
+       *
+       * \param realAllocator is used for allocation of matrix elements values.
+       * \param indexAllocator is used for allocation of matrix elements column indexes.
+       */
+      SparseSandboxMatrix( const RealAllocatorType& realAllocator = RealAllocatorType(),
+                           const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
+
+      /**
+       * \brief Copy constructor.
+       *
+       * \param matrix is the source matrix.
+       */
+      SparseSandboxMatrix( const SparseSandboxMatrix& matrix ) = default;
+
+      /**
+       * \brief Move constructor.
+       *
+       * \param matrix is the source matrix
+       */
+      SparseSandboxMatrix( SparseSandboxMatrix&& matrix ) = default;
+
+      /**
+       * \brief Constructor with matrix dimensions.
+       *
+       * \param rows is number of matrix rows.
+       * \param columns is number of matrix columns.
+       * \param realAllocator is used for allocation of matrix elements values.
+       * \param indexAllocator is used for allocation of matrix elements column indexes.
+       */
+      template< typename Index_t, std::enable_if_t< std::is_integral< Index_t >::value, int > = 0 >
+      SparseSandboxMatrix( const Index_t rows,
+                           const Index_t columns,
+                           const RealAllocatorType& realAllocator = RealAllocatorType(),
+                           const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
+
+      /**
+       * \brief Constructor with matrix rows capacities and number of columns.
+       *
+       * The number of matrix rows is given by the size of \e rowCapacities list.
+       *
+       * \tparam ListIndex is the initializer list values type.
+       * \param rowCapacities is a list telling how many matrix elements must be
+       *    allocated in each row.
+       * \param columns is the number of matrix columns.
+       * \param realAllocator is used for allocation of matrix elements values.
+       * \param indexAllocator is used for allocation of matrix elements column indexes.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_Constructor_init_list_1.cpp
+       * \par Output
+       * \include SparseMatrixExample_Constructor_init_list_1.out
+       */
+      template< typename ListIndex >
+      explicit SparseSandboxMatrix( const std::initializer_list< ListIndex >& rowCapacities,
+                                    const IndexType columns,
+                                    const RealAllocatorType& realAllocator = RealAllocatorType(),
+                                    const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
+
+      /**
+       * \brief Constructor with matrix rows capacities given as a vector and number of columns.
+       *
+       * The number of matrix rows is given by the size of \e rowCapacities vector.
+       *
+       * \tparam RowCapacitiesVector is the row capacities vector type. Usually it is some of
+       *    \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or
+       *    \ref TNL::Containers::VectorView.
+       * \param rowCapacities is a vector telling how many matrix elements must be
+       *    allocated in each row.
+       * \param columns is the number of matrix columns.
+       * \param realAllocator is used for allocation of matrix elements values.
+       * \param indexAllocator is used for allocation of matrix elements column indexes.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp
+       * \par Output
+       * \include SparseMatrixExample_Constructor_rowCapacities_vector.out
+       */
+      template< typename RowCapacitiesVector, std::enable_if_t< TNL::IsArrayType< RowCapacitiesVector >::value, int > = 0 >
+      explicit SparseSandboxMatrix( const RowCapacitiesVector& rowCapacities,
+                                    const IndexType columns,
+                                    const RealAllocatorType& realAllocator = RealAllocatorType(),
+                                    const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
+
+      /**
+       * \brief Constructor with matrix dimensions and data in initializer list.
+       *
+       * The matrix elements values are given as a list \e data of triples:
+       * { { row1, column1, value1 },
+       *   { row2, column2, value2 },
+       * ... }.
+       *
+       * \param rows is number of matrix rows.
+       * \param columns is number of matrix columns.
+       * \param data is a list of matrix elements values.
+       * \param realAllocator is used for allocation of matrix elements values.
+       * \param indexAllocator is used for allocation of matrix elements column indexes.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_Constructor_init_list_2.cpp
+       * \par Output
+       * \include SparseMatrixExample_Constructor_init_list_2.out
+       */
+      explicit SparseSandboxMatrix( const IndexType rows,
+                                    const IndexType columns,
+                                    const std::initializer_list< std::tuple< IndexType, IndexType, RealType > >& data,
+                                    const RealAllocatorType& realAllocator = RealAllocatorType(),
+                                    const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
+
+      /**
+       * \brief Constructor with matrix dimensions and data in std::map.
+       *
+       * The matrix elements values are given as a map \e data where keys are
+       * std::pair of matrix coordinates ( {row, column} ) and value is the
+       * matrix element value.
+       *
+       * \tparam MapIndex is a type for indexing rows and columns.
+       * \tparam MapValue is a type for matrix elements values in the map.
+       *
+       * \param rows is number of matrix rows.
+       * \param columns is number of matrix columns.
+       * \param map is std::map containing matrix elements.
+       * \param realAllocator is used for allocation of matrix elements values.
+       * \param indexAllocator is used for allocation of matrix elements column indexes.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_Constructor_std_map.cpp
+       * \par Output
+       * \include SparseMatrixExample_Constructor_std_map.out
+       */
+      template< typename MapIndex,
+                typename MapValue >
+      explicit SparseSandboxMatrix( const IndexType rows,
+                                    const IndexType columns,
+                                    const std::map< std::pair< MapIndex, MapIndex >, MapValue >& map,
+                                    const RealAllocatorType& realAllocator = RealAllocatorType(),
+                                    const IndexAllocatorType& indexAllocator = IndexAllocatorType() );
+
+      /**
+       * \brief Returns a modifiable view of the sparse matrix.
+       *
+       * See \ref SparseSandboxMatrixView.
+       *
+       * \return sparse matrix view.
+       */
+      ViewType getView() const; // TODO: remove const
+
+      /**
+       * \brief Returns a non-modifiable view of the sparse matrix.
+       *
+       * See \ref SparseSandboxMatrixView.
+       *
+       * \return sparse matrix view.
+       */
+      ConstViewType getConstView() const;
+
+      /**
+       * \brief Returns string with serialization type.
+       *
+       * The string has a form `Matrices::SparseSandboxMatrix< RealType,  [any_device], IndexType, General/Symmetric, Format, [any_allocator] >`.
+       *
+       * \return \ref String with the serialization type.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_getSerializationType.cpp
+       * \par Output
+       * \include SparseMatrixExample_getSerializationType.out
+       */
+      static String getSerializationType();
+
+      /**
+       * \brief Returns string with serialization type.
+       *
+       * See \ref SparseSandboxMatrix::getSerializationType.
+       *
+       * \return \e String with the serialization type.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_getSerializationType.cpp
+       * \par Output
+       * \include SparseMatrixExample_getSerializationType.out
+       */
+      virtual String getSerializationTypeVirtual() const override;
+
+      /**
+       * \brief Set number of rows and columns of this matrix.
+       *
+       * \param rows is the number of matrix rows.
+       * \param columns is the number of matrix columns.
+       */
+      virtual void setDimensions( const IndexType rows,
+                                  const IndexType columns ) override;
+
+      /**
+       * \brief Set the number of matrix rows and columns by the given matrix.
+       *
+       * \tparam Matrix is matrix type. This can be any matrix having methods
+       *  \ref getRows and \ref getColumns.
+       *
+       * \param matrix is the input matrix, the dimensions of which are to be adopted.
+       */
+      template< typename Matrix >
+      void setLike( const Matrix& matrix );
+
+      /**
+       * \brief Allocates memory for non-zero matrix elements.
+       *
+       * The size of the input vector must be equal to the number of matrix rows.
+       * The number of allocated matrix elements for each matrix row depends on
+       * the sparse matrix format. Some formats may allocate more elements than
+       * required.
+       *
+       * \tparam RowsCapacitiesVector is a type of vector/array used for row
+       *    capacities setting.
+       *
+       * \param rowCapacities is a vector telling the number of required non-zero
+       *    matrix elements in each row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_setRowCapacities.cpp
+       * \par Output
+       * \include SparseMatrixExample_setRowCapacities.out
+       */
+      template< typename RowsCapacitiesVector >
+      void setRowCapacities( const RowsCapacitiesVector& rowCapacities );
+
+      /**
+       * \brief Compute capacities of all rows.
+       *
+       * The row capacities are not stored explicitly and must be computed.
+       *
+       * \param rowCapacities is a vector where the row capacities will be stored.
+       */
+      template< typename Vector >
+      void getRowCapacities( Vector& rowCapacities ) const;
+
+      /**
+       * \brief This method sets the sparse matrix elements from initializer list.
+       *
+       * The number of matrix rows and columns must be set already.
+       * The matrix elements values are given as a list \e data of triples:
+       * { { row1, column1, value1 },
+       *   { row2, column2, value2 },
+       * ... }.
+       *
+       * \param data is an initializer list of triples { row, column, value }
+       * describing the matrix elements to be set.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_setElements.cpp
+       * \par Output
+       * \include SparseMatrixExample_setElements.out
+       */
+      void setElements( const std::initializer_list< std::tuple< IndexType, IndexType, RealType > >& data );
+
+      /**
+       * \brief This method sets the sparse matrix elements from std::map.
+       *
+       * The matrix elements values are given as a map \e data where keys are
+       * std::pair of matrix coordinates ( {row, column} ) and value is the
+       * matrix element value.
+       *
+       * \tparam MapIndex is a type for indexing rows and columns.
+       * \tparam MapValue is a type for matrix elements values in the map.
+       *
+       * \param map is std::map containing matrix elements.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_setElements_map.cpp
+       * \par Output
+       * \include SparseMatrixExample_setElements_map.out
+       */
+      template< typename MapIndex,
+                typename MapValue >
+      void setElements( const std::map< std::pair< MapIndex, MapIndex > , MapValue >& map );
+
+      /**
+       * \brief Computes number of non-zeros in each row.
+       *
+       * \param rowLengths is a vector into which the number of non-zeros in each row
+       * will be stored.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_getCompressedRowLengths.cpp
+       * \par Output
+       * \include SparseMatrixExample_getCompressedRowLengths.out
+       */
+      template< typename Vector >
+      void getCompressedRowLengths( Vector& rowLengths ) const;
+
+
+      /**
+       * \brief Returns capacity of given matrix row.
+       *
+       * \param row index of matrix row.
+       * \return number of matrix elements allocated for the row.
+       */
+      __cuda_callable__
+      IndexType getRowCapacity( const IndexType row ) const;
+
+      /**
+       * \brief Returns number of non-zero matrix elements.
+       *
+       * This method really counts the non-zero matrix elements and so
+       * it returns zero for matrix having all allocated elements set to zero.
+       *
+       * \return number of non-zero matrix elements.
+       */
+      IndexType getNonzeroElementsCount() const;
+
+      /**
+       * \brief Resets the matrix to zero dimensions.
+       */
+      void reset();
+
+      /**
+       * \brief Constant getter of simple structure for accessing given matrix row.
+       *
+       * \param rowIdx is matrix row index.
+       *
+       * \return ConstRowView for accessing given matrix row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_getConstRow.cpp
+       * \par Output
+       * \include SparseMatrixExample_getConstRow.out
+       *
+       * See \ref SparseSandboxMatrixRowView.
+       */
+      __cuda_callable__
+      const ConstRowView getRow( const IndexType& rowIdx ) const;
+
+      /**
+       * \brief Non-constant getter of simple structure for accessing given matrix row.
+       *
+       * \param rowIdx is matrix row index.
+       *
+       * \return RowView for accessing given matrix row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_getRow.cpp
+       * \par Output
+       * \include SparseMatrixExample_getRow.out
+       *
+       * See \ref SparseSandboxMatrixRowView.
+       */
+      __cuda_callable__
+      RowView getRow( const IndexType& rowIdx );
+
+      /**
+       * \brief Sets element at given \e row and \e column to given \e value.
+       *
+       * This method can be called from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated on GPU this method
+       * can be called even from device kernels. If the matrix is allocated on a GPU device
+       * and this method is called from CPU, it transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref SparseMatrix::getRow
+       * or \ref SparseMatrix::forElements and \ref SparseMatrix::forAllElements.
+       * The call may fail if the matrix row capacity is exhausted.
+       *
+       * \param row is row index of the element.
+       * \param column is column index of the element.
+       * \param value is the value the element will be set to.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_setElement.cpp
+       * \par Output
+       * \include SparseMatrixExample_setElement.out
+       */
+      __cuda_callable__
+      void setElement( const IndexType row,
+                       const IndexType column,
+                       const RealType& value );
+
+      /**
+       * \brief Adds given \e value to the element at given \e row and \e column.
+       *
+       * This method can be called from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated on GPU this method
+       * can be called even from device kernels. If the matrix is allocated on a GPU device
+       * and this method is called from CPU, it transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref SparseMatrix::getRow
+       * or \ref SparseMatrix::forElements and \ref SparseMatrix::forAllElements.
+       * The call may fail if the matrix row capacity is exhausted.
+       *
+       * \param row is row index of the element.
+       * \param column is column index of the element.
+       * \param value is the value to be added to the element.
+       * \param thisElementMultiplicator is multiplicator the original matrix element
+       *   value is multiplied by before addition of given \e value.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_addElement.cpp
+       * \par Output
+       * \include SparseMatrixExample_addElement.out
+       *
+       */
+      __cuda_callable__
+      void addElement( const IndexType row,
+                       const IndexType column,
+                       const RealType& value,
+                       const RealType& thisElementMultiplicator );
+
+      /**
+       * \brief Returns value of matrix element at position given by its row and column index.
+       *
+       * This method can be called from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated on GPU this method
+       * can be called even from device kernels. If the matrix is allocated on a GPU device
+       * and this method is called from CPU, it transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref SparseMatrix::getRow
+       * or \ref SparseMatrix::forElements and \ref SparseMatrix::forAllElements.
+       *
+       * \param row is a row index of the matrix element.
+       * \param column is a column index of the matrix element.
+       *
+       * \return value of given matrix element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_getElement.cpp
+       * \par Output
+       * \include SparseMatrixExample_getElement.out
+       *
+       */
+      __cuda_callable__
+      RealType getElement( const IndexType row,
+                           const IndexType column ) const;
+
+      /**
+       * \brief Method for performing general reduction on matrix rows.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non-void type.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is the type of \e zero; it corresponds to FetchValue, the type returned by the Fetch lambda function.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is zero of given reduction operation, also known as the identity (neutral) element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_reduceRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_reduceRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+
+      /**
+       * \brief Method for performing general reduction on matrix rows for constant instances.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is the type of \e zero; it is expected to be compatible with the FetchValue returned by the Fetch lambda function.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is the identity element of the given reduction operation (e.g. 0 for a sum).
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_reduceRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_reduceRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+
+      /**
+       * \brief Method for performing general reduction on all matrix rows.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is the type of \e zero; it is expected to be compatible with the FetchValue returned by the Fetch lambda function.
+       *
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is the identity element of the given reduction operation (e.g. 0 for a sum).
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_reduceAllRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_reduceAllRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceAllRows( Fetch&& fetch, const Reduce&& reduce, Keep&& keep, const FetchReal& zero );
+
+      /**
+       * \brief Method for performing general reduction on all matrix rows for constant instances.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is the type of \e zero; it is expected to be compatible with the FetchValue returned by the Fetch lambda function.
+       *
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is the identity element of the given reduction operation (e.g. 0 for a sum).
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_reduceAllRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_reduceAllRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+
+      /**
+       * \brief Method for parallel iteration over matrix elements of given rows for constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *
+       * \param begin defines beginning of the range [ \e begin,\e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
+       * \param function is an instance of the lambda function to be called for element of given rows.
+       *
+       * The lambda function `function` should be declared like follows:
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
+       *  The \e localIdx parameter is a rank of the non-zero element in given row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
+       * \par Output
+       * \include SparseMatrixExample_forElements.out
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function&& function ) const;
+
+      /**
+       * \brief Method for parallel iteration over all matrix elements of given rows for non-constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *
+       * \param begin defines beginning of the range [ \e begin,\e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
+       * \param function is an instance of the lambda function to be called for each element of given rows.
+       *
+       * The lambda function `function` should be declared like follows:
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) mutable { ... }
+       * ```
+       *
+       *  The \e localIdx parameter is a rank of the non-zero element in given row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
+       * \par Output
+       * \include SparseMatrixExample_forElements.out
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function&& function );
+
+      /**
+       * \brief Method for parallel iteration over all matrix elements for constant instances.
+       *
+       * See \ref SparseMatrix::forElements.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called for each matrix element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
+       * \par Output
+       * \include SparseMatrixExample_forElements.out
+       */
+      template< typename Function >
+      void forAllElements( Function&& function ) const;
+
+      /**
+       * \brief Method for parallel iteration over all matrix elements for non-constant instances.
+       *
+       * See \ref SparseMatrix::forElements.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called for each matrix element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forElements.cpp
+       * \par Output
+       * \include SparseMatrixExample_forElements.out
+       */
+      template< typename Function >
+      void forAllElements( Function&& function );
+
+      /**
+       * \brief Method for parallel iteration over matrix rows from interval [ \e begin, \e end).
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseMatrix::forElements where more than one thread can be mapped to each row.
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param begin defines beginning of the range [ \e begin,\e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrix::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_forRows.out
+       */
+      template< typename Function >
+      void forRows( IndexType begin, IndexType end, Function&& function );
+
+      /**
+       * \brief Method for parallel iteration over matrix rows from interval [ \e begin, \e end) for constant instances.
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseMatrix::forElements where more than one thread can be mapped to each row.
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param begin defines beginning of the range [ \e begin,\e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrix::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_forRows.out
+       */
+      template< typename Function >
+      void forRows( IndexType begin, IndexType end, Function&& function ) const;
+
+      /**
+       * \brief Method for parallel iteration over all matrix rows.
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseMatrix::forAllElements where more than one thread can be mapped to each row.
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrix::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_forRows.out
+       */
+      template< typename Function >
+      void forAllRows( Function&& function );
+
+      /**
+       * \brief Method for parallel iteration over all matrix rows for constant instances.
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseMatrix::forAllElements where more than one thread can be mapped to each row.
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrix::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixExample_forRows.out
+       */
+      template< typename Function >
+      void forAllRows( Function&& function ) const;
+
+      /**
+       * \brief Method for sequential iteration over all matrix rows for constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It should have a form like
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
+       *  The column index repeats twice only for compatibility with sparse matrices.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForRows( IndexType begin, IndexType end, Function& function ) const;
+
+      /**
+       * \brief Method for sequential iteration over all matrix rows for non-constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It should have a form like
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
+       *  The column index repeats twice only for compatibility with sparse matrices.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForRows( IndexType begin, IndexType end, Function& function );
+
+      /**
+       * \brief This method calls \e sequentialForRows for all matrix rows (for constant instances).
+       *
+       * See \ref SparseMatrix::sequentialForRows.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForAllRows( Function& function ) const;
+
+      /**
+       * \brief This method calls \e sequentialForRows for all matrix rows.
+       *
+       * See \ref SparseMatrix::sequentialForRows.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForAllRows( Function& function );
+
+      /**
+       * \brief Computes product of matrix and vector.
+       *
+       * More precisely, it computes:
+       *
+       * `outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector`
+       *
+       * \tparam InVector is type of input vector.  It can be \ref Vector,
+       *     \ref VectorView, \ref Array, \ref ArrayView or similar container.
+       * \tparam OutVector is type of output vector. It can be \ref Vector,
+       *     \ref VectorView, \ref Array, \ref ArrayView or similar container.
+       *
+       * \param inVector is input vector.
+       * \param outVector is output vector.
+       * \param matrixMultiplicator is a factor by which the matrix is multiplied. It is one by default.
+       * \param outVectorMultiplicator is a factor by which the outVector is multiplied before added
+       *    to the result of matrix-vector product. It is zero by default.
+       * \param firstRow is the beginning of the rows range for which the vector product
+       *    is computed. It is zero by default.
+       * \param lastRow is the end of the rows range for which the vector product
+       *    is computed. Its default value 0 is meant to stand for the number of matrix rows.
+       */
+      template< typename InVector,
+                typename OutVector >
+      void vectorProduct( const InVector& inVector,
+                          OutVector& outVector,
+                          const RealType& matrixMultiplicator = 1.0,
+                          const RealType& outVectorMultiplicator = 0.0,
+                          const IndexType firstRow = 0,
+                          const IndexType lastRow = 0 ) const;
+
+      /*template< typename Real2, typename Index2 >
+      void addMatrix( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
+                      const RealType& matrixMultiplicator = 1.0,
+                      const RealType& thisMatrixMultiplicator = 1.0 );
+
+      template< typename Real2, typename Index2 >
+      void getTransposition( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
+                             const RealType& matrixMultiplicator = 1.0 );
+       */
+
+      template< typename Vector1, typename Vector2 >
+      bool performSORIteration( const Vector1& b,
+                                const IndexType row,
+                                Vector2& x,
+                                const RealType& omega = 1.0 ) const;
+
+      /**
+       * \brief Assignment of exactly the same matrix type.
+       *
+       * \param matrix is input matrix for the assignment.
+       * \return reference to this matrix.
+       */
+      SparseSandboxMatrix& operator=( const SparseSandboxMatrix& matrix );
+
+      /**
+       * \brief Assignment of exactly the same matrix type but different device.
+       *
+       * \param matrix is input matrix for the assignment.
+       * \return reference to this matrix.
+       */
+      template< typename Device_ >
+      SparseSandboxMatrix& operator=( const SparseSandboxMatrix< RealType, Device_, IndexType, MatrixType, RealAllocator, IndexAllocator >& matrix );
+
+      /**
+       * \brief Assignment of dense matrix
+       *
+       * \param matrix is input matrix for the assignment.
+       * \return reference to this matrix.
+       */
+      template< typename Real_, typename Device_, typename Index_, ElementsOrganization Organization, typename RealAllocator_ >
+      SparseSandboxMatrix& operator=( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator_ >& matrix );
+
+
+      /**
+       * \brief Assignment of any matrix type other than this and dense.
+       *
+       * **Warning: Assignment of symmetric sparse matrix to general sparse matrix does not give correct result, currently. Only the diagonal and the lower part of the matrix is assigned.**
+       *
+       * \param matrix is input matrix for the assignment.
+       * \return reference to this matrix.
+       */
+      template< typename RHSMatrix >
+      SparseSandboxMatrix& operator=( const RHSMatrix& matrix );
+
+      /**
+       * \brief Comparison operator with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e true if the RHS matrix is equal, \e false otherwise.
+       */
+      template< typename Matrix >
+      bool operator==( const Matrix& m ) const;
+
+      /**
+       * \brief Inequality comparison operator with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e true if the RHS matrix is not equal, \e false otherwise.
+       */
+      template< typename Matrix >
+      bool operator!=( const Matrix& m ) const;
+
+      /**
+       * \brief Method for saving the matrix to the file with given filename.
+       *
+       * \param fileName is name of the file.
+       */
+      void save( const String& fileName ) const;
+
+      /**
+       * \brief Method for loading the matrix from the file with given filename.
+       *
+       * \param fileName is name of the file.
+       */
+      void load( const String& fileName );
+
+      /**
+       * \brief Method for saving the matrix to a file.
+       *
+       * \param file is the output file.
+       */
+      virtual void save( File& file ) const override;
+
+      /**
+       * \brief Method for loading the matrix from a file.
+       *
+       * \param file is the input file.
+       */
+      virtual void load( File& file ) override;
+
+      /**
+       * \brief Method for printing the matrix to output stream.
+       *
+       * \param str is the output stream.
+       */
+      virtual void print( std::ostream& str ) const override;
+
+      /**
+       * \brief Returns a padding index value.
+       *
+       * Padding index is used for column indexes of padding zeros. Padding zeros
+       * are used in some sparse matrix formats for better data alignment in memory.
+       *
+       * \return value of the padding index.
+       */
+      __cuda_callable__
+      IndexType getPaddingIndex() const;
+
+      /**
+       * \brief Getter of segments for non-constant instances.
+       *
+       * \e Segments are a structure for addressing the matrix elements columns and values.
+       * In fact, \e Segments represent the sparse matrix format.
+       *
+       * \return Non-constant reference to segments.
+       */
+      //SegmentsType& getSegments();
+
+      /**
+       * \brief Getter of segments for constant instances.
+       *
+       * \e Segments are a structure for addressing the matrix elements columns and values.
+       * In fact, \e Segments represent the sparse matrix format.
+       *
+       * \return Constant reference to segments.
+       */
+      //const SegmentsType& getSegments() const;
+
+      /**
+       * \brief Getter of column indexes for constant instances.
+       *
+       * \return Constant reference to a vector with matrix elements column indexes.
+       */
+      const ColumnsIndexesVectorType& getColumnIndexes() const;
+
+      /**
+       * \brief Getter of column indexes for nonconstant instances.
+       *
+       * \return Reference to a vector with matrix elements column indexes.
+       */
+      ColumnsIndexesVectorType& getColumnIndexes();
+
+   protected:
+
+      ColumnsIndexesVectorType columnIndexes;
+
+      IndexAllocator indexAllocator;
+
+      ViewType view;
+
+      /**
+       * \brief Container for CSR row pointers.
+       *
+       * SANDBOX_TODO: You may replace it with containers and metadata required by your format.
+       */
+
+      RowPointers rowPointers;
+};
+
+      } // namespace Sandbox
+   } // namespace Matrices
+} // namespace TNL
+
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp>
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp
new file mode 100644
index 000000000..63f49e6c8
--- /dev/null
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp
@@ -0,0 +1,1197 @@
+/***************************************************************************
+                          SparseSandboxMatrix.hpp -  description
+                             -------------------
+    begin                : Apr 19, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <functional>
+#include <sstream>
+#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrix.h>
+
+namespace TNL {
+   namespace Matrices {
+      namespace Sandbox {
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+SparseSandboxMatrix( const RealAllocatorType& realAllocator,
+                     const IndexAllocatorType& indexAllocator )
+: BaseType( realAllocator ), columnIndexes( indexAllocator ), rowPointers( ( IndexType ) 1, ( IndexType ) 0, indexAllocator )
+{  // Allocator-only constructor; rowPointers is built from ( 1, 0 ) - presumably a single entry equal to zero, TODO confirm.
+   this->view = this->getView();  // Store a view of the freshly constructed containers.
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename Index_t, std::enable_if_t< std::is_integral< Index_t >::value, int > >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+SparseSandboxMatrix( const Index_t rows,
+                     const Index_t columns,
+                     const RealAllocatorType& realAllocator,
+                     const IndexAllocatorType& indexAllocator )
+: BaseType( rows, columns, realAllocator ), columnIndexes( indexAllocator ), rowPointers( rows + 1, ( IndexType ) 0, indexAllocator )
+{  // Dimensions-only constructor (enabled for integral Index_t); rowPointers is built from ( rows + 1, 0 ), no row capacities are set.
+   this->view = this->getView();  // Store a view of the freshly constructed containers.
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename ListIndex >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+SparseSandboxMatrix( const std::initializer_list< ListIndex >& rowCapacities,
+                     const IndexType columns,
+                     const RealAllocatorType& realAllocator,
+                     const IndexAllocatorType& indexAllocator )
+: BaseType( rowCapacities.size(), columns, realAllocator ), columnIndexes( indexAllocator ), rowPointers( rowCapacities.size() + 1, ( IndexType ) 0, indexAllocator )
+{  // The number of rows is taken from the size of the rowCapacities list.
+   this->setRowCapacities( RowsCapacitiesType( rowCapacities ) );  // NOTE(review): this->view is not assigned here - presumably setRowCapacities refreshes it, confirm.
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename RowCapacitiesVector, std::enable_if_t< TNL::IsArrayType< RowCapacitiesVector >::value, int > >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+SparseSandboxMatrix( const RowCapacitiesVector& rowCapacities,
+                     const IndexType columns,
+                     const RealAllocatorType& realAllocator,
+                     const IndexAllocatorType& indexAllocator )
+: BaseType( rowCapacities.getSize(), columns, realAllocator ), columnIndexes( indexAllocator ), rowPointers( rowCapacities.getSize() + 1, ( IndexType ) 0, indexAllocator )
+{  // The number of rows is taken from rowCapacities.getSize(); enabled only for types accepted by TNL::IsArrayType.
+   this->setRowCapacities( rowCapacities );  // NOTE(review): this->view is not assigned here - presumably setRowCapacities refreshes it, confirm.
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+SparseSandboxMatrix( const IndexType rows,
+                     const IndexType columns,
+                     const std::initializer_list< std::tuple< IndexType, IndexType, RealType > >& data,
+                     const RealAllocatorType& realAllocator,
+                     const IndexAllocatorType& indexAllocator )
+: BaseType( rows, columns, realAllocator ), columnIndexes( indexAllocator ), rowPointers( rows + 1, ( IndexType ) 0, indexAllocator )
+{  // Constructs a rows x columns matrix and fills it from the list of tuples.
+   this->setElements( data );  // Each tuple is presumably ( row, column, value ) - TODO confirm against setElements.
+   this->view = this->getView();  // Store a view of the containers after the elements were set.
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename MapIndex,
+             typename MapValue >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+SparseSandboxMatrix( const IndexType rows,
+              const IndexType columns,
+              const std::map< std::pair< MapIndex, MapIndex > , MapValue >& map,
+              const RealAllocatorType& realAllocator,
+              const IndexAllocatorType& indexAllocator )
+: BaseType( rows, columns, realAllocator ), columnIndexes( indexAllocator ), rowPointers( rows + 1, ( IndexType ) 0, indexAllocator )
+{  // Constructs a rows x columns matrix and fills it from a std::map keyed by a pair of indexes.
+   this->setDimensions( rows, columns );  // NOTE(review): BaseType was already constructed with the same dimensions; this call looks redundant.
+   this->setElements( map );  // The key pair is presumably ( row, column ) - TODO confirm against setElements.
+   this->view = this->getView();  // Store a view of the containers after the elements were set.
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+auto
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getView() const -> ViewType  // Returns a modifiable view although the method is const - hence the const_casts below.
+{
+   return ViewType( this->getRows(),
+                    this->getColumns(),
+                    const_cast< SparseSandboxMatrix* >( this )->getValues().getView(),  // TODO: remove const_cast
+                    const_cast< SparseSandboxMatrix* >( this )->columnIndexes.getView(),  // view of the column indexes
+                    const_cast< SparseSandboxMatrix* >( this )->rowPointers.getView() );  // view of the row pointers
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+auto
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getConstView() const -> ConstViewType  // Builds a read-only view from the dimensions, values, column indexes and row pointers.
+{
+   return ConstViewType( this->getRows(),
+                         this->getColumns(),
+                         this->getValues().getConstView(),
+                         this->columnIndexes.getConstView(),
+                         this->rowPointers.getConstView() );  // the sandbox matrix keeps rowPointers (see getView()), not segments
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+String
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getSerializationType()
+{
+   return ViewType::getSerializationType();  // The matrix reports the same serialization type string as its view type.
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+String
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getSerializationTypeVirtual() const
+{
+   return this->getSerializationType();  // Forwards to getSerializationType() of this class.
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+setDimensions( const IndexType rows,
+               const IndexType columns )
+{  // NOTE(review): rowPointers is not resized here; setRowCapacities sets its size to rows + 1.
+   BaseType::setDimensions( rows, columns );  // The base class handles the new dimensions.
+   this->view = this->getView();  // Refresh the stored view afterwards.
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename Matrix_ >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+setLike( const Matrix_& matrix )
+{
+   BaseType::setLike( matrix );
+   // SANDBOX_TODO: Replace the following line with an assignment of the metadata required by your format.
+   //               Do not assign matrix elements here.
+   this->rowPointers = matrix.rowPointers;  // NOTE(review): needs Matrix_ to expose an accessible, assignable rowPointers member.
+   this->view = this->getView();  // Refresh the stored view after the metadata changed.
+}
+
+// Sets up the sparse format for the given row capacities (one entry per row):
+// the capacities are turned into row pointers by an exclusive scan, the values
+// and column indexes are allocated accordingly and the member view is refreshed.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename RowsCapacitiesVector >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+setRowCapacities( const RowsCapacitiesVector& rowsCapacities )
+{
+   TNL_ASSERT_EQ( rowsCapacities.getSize(), this->getRows(), "Number of matrix rows does not fit with rowCapacities vector size." );
+   using SourceDevice = typename RowsCapacitiesVector::DeviceType;
+   const auto rowsCount = this->getRows();
+
+   // SANDBOX_TODO: Replace the following lines with the setup of your sparse matrix format based on
+   //               `rowsCapacities`. This container has the same number of elements as is the number of
+   //               rows of this matrix. Each element says how many nonzero elements the user needs to have
+   //               in each row. This number can be increased if the sparse matrix format uses padding zeros.
+   this->rowPointers.setSize( rowsCount + 1 );
+   if( std::is_same< DeviceType, SourceDevice >::value )
+   {
+      // GOTCHA: when the matrix has no rows, getView returns a full view with size == 1
+      if( rowsCount > 0 ) {
+         auto capacitiesSlot = this->rowPointers.getView( 0, rowsCount );
+         capacitiesSlot = rowsCapacities;
+      }
+   }
+   else
+   {
+      RowsCapacitiesType stagedCapacities;
+      stagedCapacities = rowsCapacities;
+      if( rowsCount > 0 ) {
+         auto capacitiesSlot = this->rowPointers.getView( 0, rowsCount );
+         capacitiesSlot = stagedCapacities;
+      }
+   }
+   this->rowPointers.setElement( rowsCount, 0 );
+   this->rowPointers.template scan< Algorithms::ScanType::Exclusive >();
+   // End of sparse matrix format initiation.
+
+   // SANDBOX_TODO: Compute number of all elements that need to be allocated by your format.
+   const auto storageSize = rowPointers.getElement( rowsCount );
+
+   // The rest of this method needs no changes.
+   if( ! isBinary() )
+   {
+      this->values.setSize( storageSize );
+      this->values = ( RealType ) 0;
+   }
+   this->columnIndexes.setSize( storageSize );
+   this->columnIndexes = this->getPaddingIndex();
+   this->view = this->getView();
+}
+
+// Fills `rowCapacities` with the row capacities provided by the matrix view;
+// this method only forwards the call.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Vector >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getRowCapacities( Vector& rowCapacities ) const
+{
+   // The view holds the actual implementation.
+   const auto& matrixView = this->view;
+   matrixView.getRowCapacities( rowCapacities );
+}
+
+// Fills the matrix from (row, column, value) triples: capacities are counted, a host matrix is
+// assembled and assigned to this one. Throws std::logic_error for an index >= the matrix dimension.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+setElements( const std::initializer_list< std::tuple< IndexType, IndexType, RealType > >& data )
+{
+   const auto rowsCount = this->getRows();
+   const auto columnsCount = this->getColumns();
+   Containers::Vector< IndexType, Devices::Host, IndexType > rowCapacities( rowsCount, 0 );
+   for( const auto& entry : data )
+   {
+      const IndexType rowIdx = std::get< 0 >( entry );
+      if( rowIdx >= rowsCount )
+      {
+         std::stringstream message;
+         message << "Wrong row index " << rowIdx << " in an initializer list";
+         throw std::logic_error( message.str() );
+      }
+      rowCapacities[ rowIdx ]++;
+   }
+   SparseSandboxMatrix< Real, Devices::Host, Index, MatrixType > hostMatrix( rowsCount, columnsCount );
+   hostMatrix.setRowCapacities( rowCapacities );
+   for( const auto& entry : data )
+   {
+      const IndexType columnIdx = std::get< 1 >( entry );
+      if( columnIdx >= columnsCount )
+      {
+         std::stringstream message;
+         message << "Wrong column index " << columnIdx << " in an initializer list";
+         throw std::logic_error( message.str() );
+      }
+      hostMatrix.setElement( std::get< 0 >( entry ), columnIdx, std::get< 2 >( entry ) );
+   }
+   ( *this ) = hostMatrix;
+}
+
+// Fills the matrix from a map keyed by (row, column) pairs. Row capacities are
+// counted from the keys; for a non-host device the matrix is assembled in a
+// host matrix first and then assigned to this one. Map entries are visited by
+// const reference so that no key/value pair is copied while iterating.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename MapIndex,
+             typename MapValue >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+setElements( const std::map< std::pair< MapIndex, MapIndex > , MapValue >& map )
+{
+   Containers::Vector< IndexType, Devices::Host, IndexType > rowsCapacities( this->getRows(), 0 );
+   for( const auto& element : map )
+      rowsCapacities[ element.first.first ]++;
+   if( !std::is_same< DeviceType, Devices::Host >::value )
+   {
+      SparseSandboxMatrix< Real, Devices::Host, Index, MatrixType > hostMatrix( this->getRows(), this->getColumns() );
+      hostMatrix.setRowCapacities( rowsCapacities );
+      for( const auto& element : map )
+         hostMatrix.setElement( element.first.first, element.first.second, element.second );
+      *this = hostMatrix;
+   }
+   else
+   {
+      this->setRowCapacities( rowsCapacities );
+      for( const auto& element : map )
+         this->setElement( element.first.first, element.first.second, element.second );
+   }
+}
+
+// Fills `rowLengths` with the compressed row lengths computed by the matrix
+// view; this method only forwards the call.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Vector >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getCompressedRowLengths( Vector& rowLengths ) const
+{
+   // All the work is done by the view.
+   const auto& matrixView = this->view;
+   matrixView.getCompressedRowLengths( rowLengths );
+}
+
+// Returns the capacity of row `row` as reported by the matrix view.
+// The method is marked __cuda_callable__.
+// No range check is done here; `row` is passed to the view as is.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+__cuda_callable__
+Index
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getRowCapacity( const IndexType row ) const
+{
+   // Forward to the view.
+   return view.getRowCapacity( row );
+}
+
+// Returns the number of nonzero matrix elements as counted by the matrix
+// view; this method only forwards the call.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+Index
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getNonzeroElementsCount() const
+{
+   // The counting itself is implemented by the view.
+   const auto& matrixView = this->view;
+   return matrixView.getNonzeroElementsCount();
+}
+
+// Resets the matrix: the base class part, the column indexes and the row
+// pointers are all reset and the member view is rebuilt afterwards.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+reset()
+{
+   // Base class state first, then the containers owned by this class.
+   BaseType::reset();
+   this->columnIndexes.reset();
+   // SANDBOX_TODO: Reset the metadata required by your format here.
+   rowPointers.reset();
+   // Rebuild the member view from the state that was just reset.
+   view = this->getView();
+}
+
+// Returns a constant row view of row `rowIdx`. The row view is created by the
+// matrix view; this method only forwards the call.
+// The method is marked __cuda_callable__.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+__cuda_callable__ auto
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getRow( const IndexType& rowIdx ) const -> const ConstRowView
+{
+   // Forward to the (const) member view.
+   return view.getRow( rowIdx );
+}
+
+// Returns a row view of row `rowIdx`. The row view is created by the matrix
+// view; this method only forwards the call.
+// The method is marked __cuda_callable__.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+__cuda_callable__ auto
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getRow( const IndexType& rowIdx ) -> RowView
+{
+   // Forward to the member view.
+   return view.getRow( rowIdx );
+}
+
+// Sets the element at position (row, column) to `value`. The operation is
+// carried out by the matrix view; the arguments are passed on unchanged.
+// The method is marked __cuda_callable__.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+__cuda_callable__ void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+setElement( const IndexType row,
+            const IndexType column,
+            const RealType& value )
+{
+   // Forward to the member view.
+   view.setElement( row, column, value );
+}
+
+// Forwards an "add element" request for position (row, column) to the matrix
+// view. `value` and `thisElementMultiplicator` are passed on unchanged; how
+// they are combined is defined by the view.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+__cuda_callable__ void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+addElement( const IndexType row,
+            const IndexType column,
+            const RealType& value,
+            const RealType& thisElementMultiplicator )
+{
+   // Forward to the member view.
+   view.addElement( row, column, value, thisElementMultiplicator );
+}
+
+// Returns the element at position (row, column) as read by the matrix view;
+// this method only forwards the call.
+// The method is marked __cuda_callable__.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+__cuda_callable__
+auto
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getElement( const IndexType row,
+            const IndexType column ) const -> RealType
+{
+   // Forward to the (const) member view.
+   return view.getElement( row, column );
+}
+
+// Matrix-vector product entry point of the matrix. The computation is done by
+// the matrix view; both vectors, both multiplicators and the row range
+// (`firstRow`, `lastRow`) are passed on unchanged.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+template< typename InVector,
+       typename OutVector >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+vectorProduct( const InVector& inVector,
+               OutVector& outVector,
+               const RealType& matrixMultiplicator,
+               const RealType& outVectorMultiplicator,
+               const IndexType firstRow,
+               const IndexType lastRow ) const
+{
+   // Forward to the (const) member view.
+   view.vectorProduct( inVector, outVector, matrixMultiplicator, outVectorMultiplicator, firstRow, lastRow );
+}
+
+// Row-wise reduction over the rows given by `begin` and `end`. The functors
+// and the `zero` value are passed on to the matrix view, which implements
+// the reduction.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero )
+{
+   // Forward to the member view.
+   view.reduceRows( begin, end, fetch, reduce, keep, zero );
+}
+
+// Row-wise reduction over the rows given by `begin` and `end` for a constant
+// matrix. The functors and the `zero` value are passed on to the matrix view,
+// which implements the reduction.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+{
+   // Forward to the (const) member view.
+   view.reduceRows( begin, end, fetch, reduce, keep, zero );
+}
+
+// Row-wise reduction over the whole matrix: calls reduceRows() with begin 0
+// and end getRows(). The functors are passed on as lvalues.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+reduceAllRows( Fetch&& fetch, const Reduce&& reduce, Keep&& keep, const FetchReal& zero )
+{
+   // The row range covers all rows of the matrix.
+   const auto rowsCount = this->getRows();
+   this->reduceRows( 0, rowsCount, fetch, reduce, keep, zero );
+}
+
+// Row-wise reduction over the whole constant matrix: calls reduceRows() with
+// begin 0 and end getRows().
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+{
+   // The row range covers all rows of the matrix.
+   const auto rowsCount = this->getRows();
+   this->reduceRows( 0, rowsCount, fetch, reduce, keep, zero );
+}
+
+// Applies `function` to the elements of the rows given by `begin` and `end`
+// of a constant matrix. The traversal is implemented by the matrix view;
+// `function` is passed on as an lvalue.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+forElements( IndexType begin, IndexType end, Function&& function ) const
+{
+   // Forward to the (const) member view.
+   view.forElements( begin, end, function );
+}
+
+// Applies `function` to the elements of the rows given by `begin` and `end`.
+// The traversal is implemented by the matrix view; `function` is passed on
+// as an lvalue.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+forElements( IndexType begin, IndexType end, Function&& function )
+{
+   // Forward to the member view.
+   view.forElements( begin, end, function );
+}
+
+// Applies `function` to the elements of all rows of a constant matrix:
+// calls forElements() with begin 0 and end getRows().
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+forAllElements( Function&& function ) const
+{
+   // The row range covers all rows of the matrix.
+   const auto rowsCount = this->getRows();
+   this->forElements( 0, rowsCount, function );
+}
+
+// Applies `function` to the elements of all rows of the matrix:
+// calls forElements() with begin 0 and end getRows().
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+forAllElements( Function&& function )
+{
+   // The row range covers all rows of the matrix.
+   const auto rowsCount = this->getRows();
+   this->forElements( 0, rowsCount, function );
+}
+
+// Applies `function` to the rows given by `begin` and `end`. Unlike most
+// other methods here, a view freshly obtained from getView() is used instead
+// of the member view.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+forRows( IndexType begin, IndexType end, Function&& function )
+{
+   auto matrixView = this->getView();
+   matrixView.forRows( begin, end, function );
+}
+
+// Applies `function` to the rows given by `begin` and `end` of a constant
+// matrix. A view freshly obtained from getConstView() is used for the
+// traversal.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+forRows( IndexType begin, IndexType end, Function&& function ) const
+{
+   auto matrixView = this->getConstView();
+   matrixView.forRows( begin, end, function );
+}
+
+// Applies `function` to all rows of the matrix. A view freshly obtained from
+// getView() is used and its forAllRows() does the traversal.
+// `function` is passed on as an lvalue.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+forAllRows( Function&& function )
+{
+   auto matrixView = this->getView();
+   matrixView.forAllRows( function );
+}
+
+// Applies `function` to all rows of a constant matrix. A constant view
+// obtained from getConstView() does the traversal.
+// `function` is passed on as an lvalue.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+forAllRows( Function&& function ) const
+{
+   // The accessor is getConstView(), the same one the const forRows() uses.
+   this->getConstView().forAllRows( function );
+}
+
+// Forwards a sequential traversal of the rows given by `begin` and `end` of
+// a constant matrix to the matrix view, together with `function`.
+// The traversal itself is implemented by the view.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+sequentialForRows( IndexType begin, IndexType end, Function& function ) const
+{
+   // Forward to the (const) member view.
+   view.sequentialForRows( begin, end, function );
+}
+
+// Forwards a sequential traversal of the rows given by `first` and `last` to
+// the matrix view, together with `function`.
+// The traversal itself is implemented by the view.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+sequentialForRows( IndexType first, IndexType last, Function& function )
+{
+   // Forward to the member view.
+   view.sequentialForRows( first, last, function );
+}
+
+// Sequential traversal of all rows of a constant matrix: calls
+// sequentialForRows() with begin 0 and end getRows().
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+sequentialForAllRows( Function& function ) const
+{
+   // The row range covers all rows of the matrix.
+   const auto rowsCount = this->getRows();
+   this->sequentialForRows( 0, rowsCount, function );
+}
+
+// Sequential traversal of all rows of the matrix: calls sequentialForRows()
+// with begin 0 and end getRows().
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Function >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+sequentialForAllRows( Function& function )
+{
+   // The row range covers all rows of the matrix.
+   const auto rowsCount = this->getRows();
+   this->sequentialForRows( 0, rowsCount, function );
+}
+
+
+/*template< typename Real,
+          template< typename, typename, typename > class Segments,
+          typename Device,
+          typename Index,
+          typename RealAllocator,
+          typename IndexAllocator >
+template< typename Real2, template< typename, typename > class Segments2, typename Index2, typename RealAllocator2, typename IndexAllocator2 >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+addMatrix( const SparseSandboxMatrix< Real2, Segments2, Device, Index2, RealAllocator2, IndexAllocator2 >& matrix,
+           const RealType& matrixMultiplicator,
+           const RealType& thisMatrixMultiplicator )
+{
+
+}
+
+template< typename Real,
+          template< typename, typename, typename > class Segments,
+          typename Device,
+          typename Index,
+          typename RealAllocator,
+          typename IndexAllocator >
+template< typename Real2, typename Index2 >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getTransposition( const SparseSandboxMatrix< Real2, Device, Index2 >& matrix,
+                  const RealType& matrixMultiplicator )
+{
+
+}*/
+
+// Placeholder for one SOR iteration on row `row`. Nothing is computed here:
+// all arguments are ignored and the method always returns false.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+template< typename Vector1, typename Vector2 >
+bool
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+performSORIteration( const Vector1& b,
+                     const IndexType row,
+                     Vector2& x,
+                     const RealType& omega ) const
+{
+   // Not implemented for the sandbox matrix.
+   const bool iterationPerformed = false;
+   return iterationPerformed;
+}
+
+// Copy assignment.
+// The base class part is assigned by Matrix::operator=, then the column
+// indexes and the row pointers are copied and the member view is rebuilt.
+// There is no special handling of self-assignment.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >&
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+operator=( const SparseSandboxMatrix& matrix )
+{
+   // Base class state first, then the containers owned by this class.
+   Matrix< Real, Device, Index >::operator=( matrix );
+   this->columnIndexes = matrix.columnIndexes;
+   // SANDBOX_TODO: Replace the following line with an assignment of metadata required by your sparse matrix format.
+   rowPointers = matrix.rowPointers;
+   view = this->getView();
+   return *this;
+}
+
+// Assignment from a sandbox matrix of the same real, index and allocator
+// types living on device `Device_`. The base class part, the column indexes
+// and the row pointers are assigned and the member view is rebuilt.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Device_ >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >&
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+operator=( const SparseSandboxMatrix< RealType, Device_, IndexType, MatrixType, RealAllocator, IndexAllocator >& matrix )
+{
+   // Base class state first, then the containers owned by this class.
+   Matrix< Real, Device, Index >::operator=( matrix );
+   this->columnIndexes = matrix.columnIndexes;
+   // SANDBOX_TODO: Replace the following line with an assignment of metadata required by your sparse matrix format.
+   rowPointers = matrix.rowPointers;
+   view = this->getView();
+   return *this;
+}
+
+// Assigns a dense matrix to this sparse matrix; only nonzero elements are stored.
+// Dimensions are set with setDimensions() because setLike() copies `rowPointers`
+// from its argument and a dense matrix is not expected to provide that member.
+// Positions of the stored elements are derived from this matrix's row pointers.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Real_, typename Device_, typename Index_, ElementsOrganization Organization, typename RealAllocator_ >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >&
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+operator=( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator_ >& matrix )
+{
+   using RHSMatrix = DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator_ >;
+   using RHSIndexType = typename RHSMatrix::IndexType;
+   using RHSRealType = typename RHSMatrix::RealType;
+   using RHSDeviceType = typename RHSMatrix::DeviceType;
+   using RHSRealAllocatorType = typename RHSMatrix::RealAllocatorType;
+
+   Containers::Vector< RHSIndexType, RHSDeviceType, RHSIndexType > rowLengths;
+   matrix.getCompressedRowLengths( rowLengths );
+   this->setDimensions( matrix.getRows(), matrix.getColumns() );
+   this->setRowCapacities( rowLengths );
+   Containers::Vector< IndexType, DeviceType, IndexType > rowLocalIndexes( matrix.getRows() );
+   rowLocalIndexes = 0;
+
+   // TODO: use getConstView when it works
+   const auto matrixView = const_cast< RHSMatrix& >( matrix ).getView();
+   const IndexType paddingIndex = this->getPaddingIndex();
+   auto columns_view = this->columnIndexes.getView();
+   auto values_view = this->values.getView();
+   auto rowLocalIndexes_view = rowLocalIndexes.getView();
+   columns_view = paddingIndex;
+
+   if( std::is_same< DeviceType, RHSDeviceType >::value )
+   {
+      auto row_pointers_view = this->rowPointers.getView();  // first storage index of each row
+      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIdx, const RHSRealType& value ) mutable {
+         if( value != 0.0 )
+         {
+            IndexType thisGlobalIdx = row_pointers_view[ rowIdx ] + rowLocalIndexes_view[ rowIdx ]++;
+            columns_view[ thisGlobalIdx ] = columnIdx;
+            if( ! isBinary() )
+               values_view[ thisGlobalIdx ] = value;
+         }
+      };
+      matrix.forAllElements( f );
+   }
+   else
+   {
+      const IndexType maxRowLength = matrix.getColumns();
+      const IndexType bufferRowsCount( 128 );  // rows are transferred in chunks of this many rows
+      const size_t bufferSize = bufferRowsCount * maxRowLength;
+      Containers::Vector< RHSRealType, RHSDeviceType, RHSIndexType, RHSRealAllocatorType > matrixValuesBuffer( bufferSize );
+      Containers::Vector< RealType, DeviceType, IndexType, RealAllocatorType > thisValuesBuffer( bufferSize );
+      Containers::Vector< IndexType, DeviceType, IndexType, IndexAllocatorType > thisColumnsBuffer( bufferSize );
+      auto matrixValuesBuffer_view = matrixValuesBuffer.getView();
+      auto thisValuesBuffer_view = thisValuesBuffer.getView();
+
+      IndexType baseRow( 0 );
+      const IndexType rowsCount = this->getRows();
+      while( baseRow < rowsCount )
+      {
+         const IndexType lastRow = min( baseRow + bufferRowsCount, rowsCount );
+         thisColumnsBuffer = paddingIndex;
+
+         ////
+         // Copy matrix elements into buffer
+         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
+            const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
+            matrixValuesBuffer_view[ bufferIdx ] = value;
+         };
+         matrix.forElements( baseRow, lastRow, f1 );
+
+         ////
+         // Copy the source matrix buffer to this matrix buffer
+         thisValuesBuffer_view = matrixValuesBuffer_view;
+
+         ////
+         // Copy matrix elements from the buffer to the matrix and ignoring
+         // zero matrix elements.
+         const IndexType matrix_columns = this->getColumns();
+         auto f2 = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIndex, RealType& value ) mutable {
+            RealType inValue( 0.0 );
+            IndexType bufferIdx, column( rowLocalIndexes_view[ rowIdx ] );  // resume the scan where the previous slot stopped
+            while( inValue == 0.0 && column < matrix_columns )
+            {
+               bufferIdx = ( rowIdx - baseRow ) * maxRowLength + column++;
+               inValue = thisValuesBuffer_view[ bufferIdx ];
+            }
+            rowLocalIndexes_view[ rowIdx ] = column;
+            if( inValue == 0.0 )
+            {
+               columnIndex = paddingIndex;
+               value = 0.0;
+            }
+            else
+            {
+               columnIndex = column - 1;  // `column` was already advanced past the element found
+               value = inValue;
+            }
+         };
+         this->forElements( baseRow, lastRow, f2 );
+         baseRow += bufferRowsCount;
+      }
+      //std::cerr << "This matrix = " << std::endl << *this << std::endl;
+   }
+   this->view = this->getView();
+   return *this;
+
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+   template< typename RHSMatrix >
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >&
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+operator=( const RHSMatrix& matrix )  // generic assignment from another matrix type, possibly living on a different device
+{
+   using RHSIndexType = typename RHSMatrix::IndexType;
+   using RHSRealType = typename RHSMatrix::RealType;
+   using RHSDeviceType = typename RHSMatrix::DeviceType;
+   using RHSRealAllocatorType = typename RHSMatrix::RealAllocatorType;
+
+   Containers::Vector< RHSIndexType, RHSDeviceType, RHSIndexType > rowCapacities;
+   matrix.getRowCapacities( rowCapacities );
+   this->setDimensions( matrix.getRows(), matrix.getColumns() );
+   this->setRowCapacities( rowCapacities );  // allocates this matrix with the capacities of the source
+   Containers::Vector< IndexType, DeviceType, IndexType > rowLocalIndexes( matrix.getRows() );  // one cursor per row
+   rowLocalIndexes = 0;
+
+   // TODO: use getConstView when it works (matrixView is not referenced anywhere below)
+   const auto matrixView = const_cast< RHSMatrix& >( matrix ).getView();
+   const IndexType paddingIndex = this->getPaddingIndex();
+   auto columns_view = this->columnIndexes.getView();
+   auto values_view = this->values.getView();
+   auto rowLocalIndexes_view = rowLocalIndexes.getView();
+   columns_view = paddingIndex;  // every slot starts as padding
+
+   // SANDBOX_TODO: Modify the following according to your format
+   auto row_pointers_view = this->rowPointers.getView();
+   if( std::is_same< DeviceType, RHSDeviceType >::value )
+   {
+      auto f = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx_, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
+         IndexType localIdx( rowLocalIndexes_view[ rowIdx ] );  // next free slot within this row
+         IndexType thisRowBegin = row_pointers_view[ rowIdx ];
+         if( value != 0.0 && columnIndex != paddingIndex )  // zeros and padding of the source are skipped
+         {
+            IndexType thisGlobalIdx = thisRowBegin + localIdx++;
+            columns_view[ thisGlobalIdx ] = columnIndex;
+            if( ! isBinary() )
+               values_view[ thisGlobalIdx ] = value;
+            rowLocalIndexes_view[ rowIdx ] = localIdx;
+         }
+      };
+      matrix.forAllElements( f );
+   }
+   else
+   {
+      const IndexType maxRowLength = max( rowCapacities );  // stride of one row inside the buffers
+      const IndexType bufferRowsCount( 128 );  // rows are transferred in chunks of this many rows
+      const size_t bufferSize = bufferRowsCount * maxRowLength;
+      Containers::Vector< RHSRealType, RHSDeviceType, RHSIndexType, RHSRealAllocatorType > matrixValuesBuffer( bufferSize );
+      Containers::Vector< RHSIndexType, RHSDeviceType, RHSIndexType > matrixColumnsBuffer( bufferSize );
+      Containers::Vector< RealType, DeviceType, IndexType, RealAllocatorType > thisValuesBuffer( bufferSize );
+      Containers::Vector< IndexType, DeviceType, IndexType > thisColumnsBuffer( bufferSize );
+      Containers::Vector< IndexType, DeviceType, IndexType > thisRowLengths;
+      Containers::Vector< RHSIndexType, RHSDeviceType, RHSIndexType > rhsRowLengths;
+      matrix.getCompressedRowLengths( rhsRowLengths );
+      thisRowLengths= rhsRowLengths;
+      auto matrixValuesBuffer_view = matrixValuesBuffer.getView();
+      auto matrixColumnsBuffer_view = matrixColumnsBuffer.getView();
+      auto thisValuesBuffer_view = thisValuesBuffer.getView();
+      auto thisColumnsBuffer_view = thisColumnsBuffer.getView();
+      matrixValuesBuffer_view = 0.0;
+
+      IndexType baseRow( 0 );
+      const IndexType rowsCount = this->getRows();
+      while( baseRow < rowsCount )
+      {
+         const IndexType lastRow = min( baseRow + bufferRowsCount, rowsCount );
+         thisColumnsBuffer = paddingIndex;
+         matrixColumnsBuffer_view = paddingIndex;
+
+         ////
+         // Copy matrix elements of the rows baseRow..lastRow into the source-side buffers
+         auto f1 = [=] __cuda_callable__ ( RHSIndexType rowIdx, RHSIndexType localIdx, RHSIndexType columnIndex, const RHSRealType& value ) mutable {
+            if( columnIndex != paddingIndex )
+            {
+               TNL_ASSERT_LT( rowIdx - baseRow, bufferRowsCount, "" );
+               TNL_ASSERT_LT( localIdx, maxRowLength, "" );
+               const IndexType bufferIdx = ( rowIdx - baseRow ) * maxRowLength + localIdx;
+               TNL_ASSERT_LT( bufferIdx, ( IndexType ) bufferSize, "" );
+               matrixColumnsBuffer_view[ bufferIdx ] = columnIndex;
+               matrixValuesBuffer_view[ bufferIdx ] = value;
+            }
+         };
+         matrix.forElements( baseRow, lastRow, f1 );
+
+         ////
+         // Copy the source matrix buffers to the buffers of this matrix
+         thisValuesBuffer_view = matrixValuesBuffer_view;
+         thisColumnsBuffer_view = matrixColumnsBuffer_view;
+
+         ////
+         // Copy matrix elements from the buffer to the matrix and ignoring
+         // zero matrix elements
+         //const IndexType matrix_columns = this->getColumns();
+         const auto thisRowLengths_view = thisRowLengths.getConstView();
+         auto f2 = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIndex, RealType& value ) mutable {
+            RealType inValue( 0.0 );
+            size_t bufferIdx;  // assigned inside the loop below before it is read
+            IndexType bufferLocalIdx( rowLocalIndexes_view[ rowIdx ] );  // resume the scan where the previous slot stopped
+            while( inValue == 0.0 && localIdx < thisRowLengths_view[ rowIdx ] )  // NOTE(review): localIdx is not advanced here, the loop ends only when a nonzero is found -- confirm
+            {
+               bufferIdx = ( rowIdx - baseRow ) * maxRowLength + bufferLocalIdx++;
+               TNL_ASSERT_LT( bufferIdx, bufferSize, "" );
+               inValue = thisValuesBuffer_view[ bufferIdx ];
+            }
+            rowLocalIndexes_view[ rowIdx ] = bufferLocalIdx;
+            if( inValue == 0.0 )
+            {
+               columnIndex = paddingIndex;
+               value = 0.0;
+            }
+            else
+            {
+               columnIndex = thisColumnsBuffer_view[ bufferIdx ];//column - 1;
+               value = inValue;
+            }
+         };
+         this->forElements( baseRow, lastRow, f2 );
+         baseRow += bufferRowsCount;
+      }
+   }
+   this->view = this->getView();
+   return *this;
+}
+
+// Equality test against an arbitrary matrix-like object \e m. The comparison
+// is not implemented here; it is forwarded to the view held by this matrix.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Matrix >
+bool
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+operator==( const Matrix& m ) const
+{
+   const auto& self = this->view;
+   const bool same = ( self == m );
+   return same;
+}
+
+// Inequality test against an arbitrary matrix-like object \e m. The comparison
+// is not implemented here; it is forwarded to the view held by this matrix.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+   template< typename Matrix >
+bool
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+operator!=( const Matrix& m ) const
+{
+   const auto& self = this->view;
+   const bool differs = ( self != m );
+   return differs;
+}
+
+// Writes the matrix into \e file. Nothing is written by the matrix class
+// directly; the whole job is handed over to the save() method of the view
+// held by this matrix.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+save( File& file ) const
+{
+   const auto& target = this->view;
+   target.save( file );
+}
+
+// Restores the matrix from \e file. The reading order is fixed: first the part
+// handled by Matrix::load, then the column indexes, then the format metadata
+// (row pointers). Finally the cached view is rebuilt from the loaded data.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+load( File& file )
+{
+   using MatrixBase = Matrix< RealType, DeviceType, IndexType >;
+   MatrixBase::load( file );
+   file >> this->columnIndexes;
+   // SANDBOX_TODO: Replace the following line with loading of metadata required by your sparse matrix format.
+   file >> this->rowPointers;
+   this->view = this->getView();
+}
+
+// File-name overload of save(). It only forwards \e fileName to
+// Object::save.
+// NOTE(review): presumably Object::save opens the file and then calls
+// save( File& ) of this class - confirm against the Object implementation.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+save( const String& fileName ) const
+{
+   Object::save( fileName );
+}
+
+// File-name overload of load(). It only forwards \e fileName to
+// Object::load.
+// NOTE(review): presumably Object::load opens the file and then calls
+// load( File& ) of this class - confirm against the Object implementation.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+load( const String& fileName )
+{
+   Object::load( fileName );
+}
+
+// Prints the matrix into the stream \e str. The output format is not decided
+// here; the call is forwarded to the print() method of the view held by this
+// matrix.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+void
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+print( std::ostream& str ) const
+{
+   const auto& printer = this->view;
+   printer.print( str );
+}
+
+// Returns the padding index, i.e. the constant -1 converted to Index.
+// NOTE(review): presumably this is the column index marking unused (padding)
+// slots of a row - confirm against the callers.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+__cuda_callable__
+Index
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getPaddingIndex() const
+{
+   const Index padding = -1;
+   return padding;
+}
+
+// Read-only access to the container with the column indexes of the matrix
+// elements. The returned reference points at the member of this matrix, no
+// copy is made.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+auto
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getColumnIndexes() const -> const ColumnsIndexesVectorType&
+{
+   const ColumnsIndexesVectorType& indexes = this->columnIndexes;
+   return indexes;
+}
+
+// Modifiable access to the container with the column indexes of the matrix
+// elements. The returned reference points at the member of this matrix, no
+// copy is made.
+template< typename Real, typename Device, typename Index,
+          typename MatrixType, typename RealAllocator, typename IndexAllocator >
+auto
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
+getColumnIndexes() -> ColumnsIndexesVectorType&
+{
+   ColumnsIndexesVectorType& indexes = this->columnIndexes;
+   return indexes;
+}
+
+      } // namespace Sandbox
+   } // namespace Matrices
+} // namespace TNL
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.h b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.h
new file mode 100644
index 000000000..cabf7b7fd
--- /dev/null
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.h
@@ -0,0 +1,282 @@
+ /***************************************************************************
+                          SparseSandboxMatrixRowView.h -  description
+                             -------------------
+    begin                : Apr 20, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <ostream>
+
+#include <TNL/Cuda/CudaCallable.h>
+#include <TNL/Matrices/MatrixRowViewIterator.h>
+
+namespace TNL {
+   namespace Matrices {
+      namespace Sandbox {
+
+/**
+ * \brief RowView is a simple structure for accessing rows of sparse matrix.
+ *
+ * \tparam ValuesView is a vector view storing the matrix elements values.
+ * \tparam ColumnsIndexesView is a vector view storing the column indexes of the matrix element.
+ * \tparam isBinary_ tells if the parent matrix is a binary matrix.
+ *
+ * See \ref SparseSandboxMatrix and \ref SparseSandboxMatrixView.
+ *
+ * \par Example
+ * \include Matrices/SparseMatrix/SparseMatrixExample_getRow.cpp
+ * \par Output
+ * \include SparseMatrixExample_getRow.out
+ *
+ * \par Example
+ * \include Matrices/SparseMatrix/SparseMatrixViewExample_getRow.cpp
+ * \par Output
+ * \include SparseMatrixViewExample_getRow.out
+ */
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+class SparseSandboxMatrixRowView
+{
+   public:
+
+      /**
+       * \brief The type of matrix elements.
+       */
+      using RealType = typename ValuesView::RealType;
+
+      /**
+       * \brief The type used for matrix elements indexing.
+       */
+      using IndexType = typename ColumnsIndexesView::IndexType;
+
+      /**
+       * \brief Type of container view used for storing the matrix elements values.
+       */
+      using ValuesViewType = ValuesView;
+
+      /**
+       * \brief Type of container view used for storing the column indexes of the matrix elements.
+       */
+      using ColumnsIndexesViewType = ColumnsIndexesView;
+
+      /**
+       * \brief Type of constant container view used for storing the matrix elements values.
+       */
+      using ConstValuesViewType = typename ValuesViewType::ConstViewType;
+
+      /**
+       * \brief Type of constant container view used for storing the column indexes of the matrix elements.
+       */
+      using ConstColumnsIndexesViewType = typename ColumnsIndexesViewType::ConstViewType;
+
+      /**
+       * \brief Type of sparse matrix row view.
+       */
+      using RowView = SparseSandboxMatrixRowView< ValuesViewType, ColumnsIndexesViewType, isBinary_ >;
+
+      /**
+       * \brief Type of constant sparse matrix row view.
+       */
+      using ConstView = SparseSandboxMatrixRowView< ConstValuesViewType, ConstColumnsIndexesViewType, isBinary_ >;
+
+      /**
+       * \brief The type of related matrix element.
+       */
+      using MatrixElementType = SparseMatrixElement< RealType, IndexType >;
+
+      /**
+       * \brief Type of iterator for the matrix row.
+       */
+      using IteratorType = MatrixRowViewIterator< RowView >;
+
+      /**
+       * \brief Tells whether the parent matrix is a binary matrix.
+       * \return `true` if the matrix is binary.
+       */
+      static constexpr bool isBinary() { return isBinary_; };
+
+      /**
+       * \brief Constructor with \e rowIdx, \e offset, \e size, \e values and \e columnIndexes.
+       *
+       * \param rowIdx is row index.
+       * \param offset is the beginning of the matrix row in arrays with values and column indexes of matrix elements.
+       * \param size is row size, i.e. number of matrix element slots in the row (operator== treats trailing slots as padding).
+       * \param values is a container view for storing the matrix elements values.
+       * \param columnIndexes is a container view for storing the column indexes of the matrix elements.
+       */
+      __cuda_callable__
+      SparseSandboxMatrixRowView( IndexType rowIdx,
+                                  IndexType offset,
+                                  IndexType size,
+                                  const ValuesViewType& values,
+                                  const ColumnsIndexesViewType& columnIndexes );
+
+      /**
+       * \brief Returns size of the matrix row, i.e. number of matrix elements in this row.
+       *
+       * \return Size of the matrix row.
+       */
+      __cuda_callable__
+      IndexType getSize() const;
+
+      /**
+       * \brief Returns the matrix row index.
+       *
+       * \return matrix row index.
+       */
+      __cuda_callable__
+      const IndexType& getRowIndex() const;
+
+      /**
+       * \brief Returns constant reference to a column index of an element with given rank in the row.
+       *
+       * \param localIdx is the rank of the non-zero element in given row.
+       *
+       * \return constant reference to the matrix element column index.
+       */
+      __cuda_callable__
+      const IndexType& getColumnIndex( const IndexType localIdx ) const;
+
+      /**
+       * \brief Returns non-constant reference to a column index of an element with given rank in the row.
+       *
+       * \param localIdx is the rank of the non-zero element in given row.
+       *
+       * \return non-constant reference to the matrix element column index.
+       */
+      __cuda_callable__
+      IndexType& getColumnIndex( const IndexType localIdx );
+
+      /**
+       * \brief Returns constant reference to value of an element with given rank in the row.
+       *
+       * \param localIdx is the rank of the non-zero element in given row.
+       *
+       * \return constant reference to the matrix element value.
+       */
+      __cuda_callable__
+      const RealType& getValue( const IndexType localIdx ) const;
+
+      /**
+       * \brief Returns non-constant reference to value of an element with given rank in the row.
+       *
+       * \param localIdx is the rank of the non-zero element in given row.
+       *
+       * \return non-constant reference to the matrix element value.
+       */
+      __cuda_callable__
+      RealType& getValue( const IndexType localIdx );
+
+      /**
+       * \brief Sets a value of matrix element with given rank in the matrix row.
+       *
+       * \param localIdx is the rank of the matrix element in the row.
+       * \param value is the new value of the matrix element.
+       */
+      __cuda_callable__
+      void setValue( const IndexType localIdx,
+                     const RealType& value );
+
+      /**
+       * \brief Sets a column index of matrix element with given rank in the matrix row.
+       *
+       * \param localIdx is the rank of the matrix element in the row.
+       * \param columnIndex is the new column index of the matrix element.
+       */
+      __cuda_callable__
+      void setColumnIndex( const IndexType localIdx,
+                           const IndexType& columnIndex );
+
+      /**
+       * \brief Sets both a value and a column index of matrix element with given rank in the matrix row.
+       *
+       * \param localIdx is the rank of the matrix element in the row.
+       * \param columnIndex is the new column index of the matrix element.
+       * \param value is the new value of the matrix element.
+       */
+      __cuda_callable__
+      void setElement( const IndexType localIdx,
+                       const IndexType columnIndex,
+                       const RealType& value );
+
+      /**
+       * \brief Comparison of two matrix rows.
+       *
+       * The other matrix row can be from any other matrix.
+       *
+       * \param other is another matrix row.
+       * \return \e true if both rows are the same, \e false otherwise.
+       */
+      template< typename _ValuesView,
+                typename _ColumnsIndexesView,
+                bool _isBinary >
+      __cuda_callable__
+      bool operator==( const SparseSandboxMatrixRowView< _ValuesView, _ColumnsIndexesView, _isBinary >& other ) const;
+
+      /**
+       * \brief Returns iterator pointing at the beginning of the matrix row.
+       *
+       * \return iterator pointing at the beginning.
+       */
+      __cuda_callable__
+      IteratorType begin();
+
+      /**
+       * \brief Returns iterator pointing at the end of the matrix row.
+       *
+       * \return iterator pointing at the end.
+       */
+      __cuda_callable__
+      IteratorType end();
+
+      /**
+       * \brief Returns constant iterator pointing at the beginning of the matrix row.
+       *
+       * \return iterator pointing at the beginning.
+       */
+      __cuda_callable__
+      const IteratorType cbegin() const;
+
+      /**
+       * \brief Returns constant iterator pointing at the end of the matrix row.
+       *
+       * \return iterator pointing at the end.
+       */
+      __cuda_callable__
+      const IteratorType cend() const;
+
+   protected:
+      // Index of the row in the matrix and the row size returned by getSize().
+      IndexType rowIdx, size;
+
+      // SANDBOX_TODO: Replace the following line with data required by your format.
+      IndexType offset;
+
+      ValuesViewType values;
+
+      ColumnsIndexesViewType columnIndexes;
+};
+
+/**
+ * \brief Insertion operator for a sparse matrix row.
+ *
+ * \param str is an output stream.
+ * \param row is an input sparse matrix row.
+ * \return  reference to the output stream.
+ */
+template< typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+std::ostream& operator<<( std::ostream& str, const SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >& row );
+
+      } // namespace Sandbox
+   } // namespace Matrices
+} // namespace TNL
+
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.hpp>
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.hpp b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.hpp
new file mode 100644
index 000000000..09598edd0
--- /dev/null
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.hpp
@@ -0,0 +1,240 @@
+/***************************************************************************
+                          SparseSandboxMatrixRowView.hpp -  description
+                             -------------------
+    begin                : Apr 20, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.h>
+#include <TNL/Assert.h>
+
+namespace TNL {
+   namespace Matrices {
+      namespace Sandbox {
+
+// SANDBOX_TODO: Modify the following constructor by your needs.
+// Stores the row index, the row size, the offset of the row in the arrays of
+// values and column indexes, and copies of both views. The initializers are
+// listed in the member declaration order (rowIdx, size, offset, values,
+// columnIndexes), which is the order they run in; this avoids -Wreorder.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+__cuda_callable__
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseSandboxMatrixRowView( IndexType rowIdx,
+                            IndexType offset,
+                            IndexType size,
+                            const ValuesViewType& values,
+                            const ColumnsIndexesViewType& columnIndexes )
+ : rowIdx( rowIdx ), size( size ), offset( offset ), values( values ), columnIndexes( columnIndexes ) {}
+
+// Returns the row size that was handed to the constructor.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+getSize() const -> IndexType
+{
+   const IndexType rowSize = size;
+   return rowSize;
+}
+
+// Returns the index of the matrix row this view refers to. The reference
+// points at the member stored inside the row view itself.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+__cuda_callable__
+auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+getRowIndex() const -> const IndexType&
+{
+   return rowIdx;
+}
+
+// Read-only access to the column index in slot \e localIdx of the row. The
+// slot is found at position offset + localIdx of the column indexes view.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+getColumnIndex( const IndexType localIdx ) const -> const IndexType&
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   // SANDBOX_TODO: Modify the following line to match with your sparse format.
+   return this->columnIndexes[ this->offset + localIdx ];
+}
+
+// Modifiable access to the column index in slot \e localIdx of the row. The
+// slot is found at position offset + localIdx of the column indexes view.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+getColumnIndex( const IndexType localIdx ) -> IndexType&
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   // SANDBOX_TODO: Modify the following line to match with your sparse format.
+   return this->columnIndexes[ this->offset + localIdx ];
+}
+
+// Read-only access to the value in slot \e localIdx of the row, i.e. position
+// offset + localIdx of the values view. Asserts that the row is not binary.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+getValue( const IndexType localIdx ) const -> const RealType&
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   TNL_ASSERT_FALSE( isBinary(), "Cannot call this method for binary matrix row." );
+   // SANDBOX_TODO: Modify the following line to match with your sparse format.
+   return this->values[ this->offset + localIdx ];
+}
+
+// Modifiable access to the value in slot \e localIdx of the row, i.e. position
+// offset + localIdx of the values view. Asserts that the row is not binary.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+getValue( const IndexType localIdx ) -> RealType&
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   TNL_ASSERT_FALSE( isBinary(), "Cannot call this method for binary matrix row." );
+   // SANDBOX_TODO: Modify the following line to match with your sparse format.
+   return this->values[ this->offset + localIdx ];
+}
+
+// Writes \e value into slot \e localIdx of the row. For a binary row nothing
+// is written; the call only checks the index.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+__cuda_callable__ void
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+setValue( const IndexType localIdx,
+          const RealType& value )
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   if( isBinary() )
+      return;
+   // SANDBOX_TODO: Modify the following line to match with your sparse format.
+   const IndexType slot = offset + localIdx;
+   this->values[ slot ] = value;
+}
+
+// Overwrites the column index stored in slot \e localIdx of the row; the
+// value stored in that slot is left untouched.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+__cuda_callable__ void
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+setColumnIndex( const IndexType localIdx,
+                const IndexType& columnIndex )
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   // SANDBOX_TODO: Modify the following line to match with your sparse format.
+   const IndexType slot = offset + localIdx;
+   columnIndexes[ slot ] = columnIndex;
+}
+
+// Sets the column index of slot \e localIdx and, unless the row is binary,
+// also the value stored in the same slot.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+__cuda_callable__ void
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+setElement( const IndexType localIdx,
+            const IndexType column,
+            const RealType& value )
+{
+   TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
+   // SANDBOX_TODO: Modify the following line to match with your sparse format.
+   const IndexType slot = this->offset + localIdx;
+   this->columnIndexes[ slot ] = column;
+   if( ! isBinary() )
+      this->values[ slot ] = value;
+}
+
+// Two rows are equal when they agree on their common prefix and every slot
+// beyond it has a negative (padding) column index. Values are compared only
+// when BOTH rows store them: getValue() asserts that the row is not binary,
+// so it must not be called when either side is a binary row.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+   template< typename _ValuesView, typename _ColumnsIndexesView, bool _isBinary >
+__cuda_callable__
+bool
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+operator==( const SparseSandboxMatrixRowView< _ValuesView, _ColumnsIndexesView, _isBinary >& other ) const
+{
+   constexpr bool compareValues = ! isBinary_ && ! _isBinary;
+   IndexType i = 0;
+   while( i < getSize() && i < other.getSize() ) {
+      if( getColumnIndex( i ) != other.getColumnIndex( i ) )
+         return false;
+      if( compareValues && getValue( i ) != other.getValue( i ) )
+         return false;
+      ++i;
+   }
+   for( IndexType j = i; j < getSize(); j++ )
+      // TODO: use ... != getPaddingIndex()
+      if( getColumnIndex( j ) >= 0 )
+         return false;
+   for( IndexType j = i; j < other.getSize(); j++ )
+      // TODO: use ... != getPaddingIndex()
+      if( other.getColumnIndex( j ) >= 0 )
+         return false;
+   return true;
+}
+
+// Returns an iterator built over this row at local index 0.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+begin() -> IteratorType
+{
+   IteratorType first( *this, 0 );
+   return first;
+}
+
+// Returns an iterator built over this row at local index getSize().
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+end() -> IteratorType
+{
+   IteratorType last( *this, this->getSize() );
+   return last;
+}
+
+// Constant iterator at local index 0. NOTE(review): *this is const here while
+// IteratorType is declared over the non-const RowView - confirm it accepts it.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+cbegin() const -> const IteratorType
+{
+   return IteratorType( *this, 0 );
+}
+
+// Constant iterator at local index getSize(). NOTE(review): *this is const here
+// while IteratorType is declared over the non-const RowView - confirm it works.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+__cuda_callable__ auto
+SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >::
+cend() const -> const IteratorType
+{
+   return IteratorType( *this, this->getSize() );
+}
+
+// Streams every slot of the row as " [ column ] = value, "; for a binary row the value is the test `column >= 0`.
+template< typename ValuesView, typename ColumnsIndexesView, bool isBinary_ >
+std::ostream& operator<<( std::ostream& str, const SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >& row )
+{
+   using RowType = SparseSandboxMatrixRowView< ValuesView, ColumnsIndexesView, isBinary_ >;
+   using NonConstIndex = std::remove_const_t< typename RowType::IndexType >;
+   for( NonConstIndex i = 0; i < row.getSize(); i++ ) {
+      str << " [ " << row.getColumnIndex( i ) << " ] = ";
+      // TODO: check getPaddingIndex(), print only the column indices of non-zeros but not the values
+      if( isBinary_ ) str << (row.getColumnIndex( i ) >= 0) << ", ";
+      else            str << row.getValue( i ) << ", ";
+   }
+   return str;
+}
+
+      } // namespace Sandbox
+   } // namespace Matrices
+} // namespace TNL
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.h b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.h
new file mode 100644
index 000000000..66247a349
--- /dev/null
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.h
@@ -0,0 +1,871 @@
+/***************************************************************************
+                          SparseSandboxMatrixView.h -  description
+                             -------------------
+    begin                : Apr 20, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Matrices/Matrix.h>
+#include <TNL/Matrices/MatrixType.h>
+#include <TNL/Allocators/Default.h>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.h>
+#include <TNL/TypeTraits.h>
+
+namespace TNL {
+   namespace Matrices {
+      namespace Sandbox {
+
+/**
+ * \brief Implementation of sparse sandbox matrix view.
+ *
+ * It serves as an accessor to \ref SparseSandboxMatrix for example when passing the
+ * matrix to lambda functions. SparseSandboxMatrix view can be also created in CUDA kernels.
+ *
+ * \tparam Real is a type of matrix elements. If \e Real equals \e bool the matrix is treated
+ *    as binary and so the matrix elements values are not stored in the memory since we need
+ *    to remember only coordinates of non-zero elements( which equal one).
+ * \tparam Device is a device where the matrix is allocated.
+ * \tparam Index is a type for indexing of the matrix elements.
+ * \tparam MatrixType specifies a symmetry of matrix. See \ref MatrixType. Symmetric
+ *    matrices store only lower part of the matrix and its diagonal. The upper part is reconstructed on the fly.
+ *    GeneralMatrix with no symmetry is used by default.
+ * \tparam Segments is a structure representing the sparse matrix format. Depending on the pattern of the non-zero elements
+ *    different matrix formats can perform differently especially on GPUs. By default \ref CSR format is used. See also
+ *    \ref Ellpack, \ref SlicedEllpack, \ref ChunkedEllpack or \ref BiEllpack.
+ * \tparam ComputeReal is the same as \e Real mostly but for binary matrices it is set to \e Index type. This can be changed
+ *    by the user, of course.
+ *
+ */
+template< typename Real,
+          typename Device = Devices::Host,
+          typename Index = int,
+          typename MatrixType = GeneralMatrix >
+class SparseSandboxMatrixView : public MatrixView< Real, Device, Index >
+{
+   static_assert(
+      ! MatrixType::isSymmetric() ||
+      ! std::is_same< Device, Devices::Cuda >::value ||
+      ( std::is_same< Real, float >::value || std::is_same< Real, double >::value || std::is_same< Real, int >::value || std::is_same< Real, long long int >::value ),
+      "Given Real type is not supported by atomic operations on GPU which are necessary for symmetric operations." );
+
+   public:
+
+      // Supporting types - they are not important for the user
+      using BaseType = MatrixView< Real, Device, Index >;
+      using ValuesViewType = typename BaseType::ValuesView;
+      using ConstValuesViewType = typename ValuesViewType::ConstViewType;
+      using ColumnsIndexesViewType = Containers::VectorView< typename TNL::copy_const< Index >::template from< Real >::type, Device, Index >;
+      using ConstColumnsIndexesViewType = typename ColumnsIndexesViewType::ConstViewType;
+      using RowsCapacitiesView = Containers::VectorView< Index, Device, Index >;
+      using ConstRowsCapacitiesView = typename RowsCapacitiesView::ConstViewType;
+
+      /**
+       * \brief Test of symmetric matrix type.
+       *
+       * \return \e true if the matrix is stored as symmetric and \e false otherwise.
+       */
+      static constexpr bool isSymmetric() { return MatrixType::isSymmetric(); };
+
+      /**
+       * \brief Test of binary matrix type.
+       *
+       * \return \e true if the matrix is stored as binary and \e false otherwise.
+       */
+      static constexpr bool isBinary() { return std::is_same< Real, bool >::value; };
+
+      /**
+       * \brief The type of matrix elements.
+       */
+      using RealType = Real;
+
+      //using ComputeRealType = ComputeReal;
+
+      /**
+       * \brief The device where the matrix is allocated.
+       */
+      using DeviceType = Device;
+
+      /**
+       * \brief The type used for matrix elements indexing.
+       */
+      using IndexType = Index;
+
+      /**
+       * \brief Templated type of segments view, i.e. sparse matrix format.
+       */
+      //template< typename Device_, typename Index_ >
+      //using SegmentsViewTemplate = SegmentsView< Device_, Index_ >;
+
+      /**
+       * \brief Type of segments view used by this matrix. It represents the sparse matrix format.
+       */
+      //using SegmentsViewType = SegmentsView< Device, Index >;
+
+      /**
+       * \brief Type of related matrix view.
+       */
+      using ViewType = SparseSandboxMatrixView< Real, Device, Index, MatrixType >;
+
+      /**
+       * \brief Matrix view type for constant instances.
+       */
+      using ConstViewType = SparseSandboxMatrixView< std::add_const_t< Real >, Device, Index, MatrixType >;
+
+      /**
+       * \brief Type for accessing matrix rows.
+       */
+      using RowView = SparseSandboxMatrixRowView< ValuesViewType, ColumnsIndexesViewType, isBinary() >;
+
+      /**
+       * \brief Type for accessing constant matrix rows.
+       */
+      using ConstRowView = SparseSandboxMatrixRowView< ConstValuesViewType, ConstColumnsIndexesViewType, isBinary() >;;
+
+      /**
+       * \brief Helper type for getting self type or its modifications.
+       */
+      template< typename _Real = Real,
+                typename _Device = Device,
+                typename _Index = Index,
+                typename _MatrixType = MatrixType >
+      using Self = SparseSandboxMatrixView< _Real, _Device, _Index, _MatrixType >;
+
+      /**
+       * \brief Type of container view for CSR row pointers.
+       *
+       * SANDBOX_TODO: You may replace it with containers views for metadata of your format.
+       */
+      using RowPointersView = TNL::Containers::VectorView< IndexType, DeviceType, IndexType >;
+
+      /**
+       * \brief Constructor with no parameters.
+       */
+      __cuda_callable__
+      SparseSandboxMatrixView();
+
+      /**
+       * \brief Constructor with all necessary data and views.
+       *
+       * \param rows is a number of matrix rows.
+       * \param columns is a number of matrix columns.
+       * \param values is a vector view with matrix elements values.
+       * \param columnIndexes is a vector view with matrix elements column indexes.
+       * \param rowPointers is a container view with row pointers.
+       *
+       * SANDBOX_TODO: Replace `rowPointers` with metadata by your needs.
+       */
+      __cuda_callable__
+      SparseSandboxMatrixView( const IndexType rows,
+                               const IndexType columns,
+                               const ValuesViewType& values,
+                               const ColumnsIndexesViewType& columnIndexes,
+                               const RowPointersView& rowPointers );
+
+      /**
+       * \brief Copy constructor.
+       *
+       * \param matrix is an input sparse matrix view.
+       */
+      __cuda_callable__
+      SparseSandboxMatrixView( const SparseSandboxMatrixView& matrix ) = default;
+
+      /**
+       * \brief Move constructor.
+       *
+       * \param matrix is an input sparse matrix view.
+       */
+      __cuda_callable__
+      SparseSandboxMatrixView( SparseSandboxMatrixView&& matrix ) = default;
+
+      /**
+       * \brief Returns a modifiable view of the sparse matrix.
+       *
+       * \return sparse matrix view.
+       */
+      __cuda_callable__
+      ViewType getView();
+
+      /**
+       * \brief Returns a non-modifiable view of the sparse matrix.
+       *
+       * \return sparse matrix view.
+       */
+      __cuda_callable__
+      ConstViewType getConstView() const;
+
+      /**
+       * \brief Returns string with serialization type.
+       *
+       * The string has a form `Matrices::SparseSandboxMatrix< RealType,  [any_device], IndexType, General/Symmetric, Format, [any_allocator] >`.
+       *
+       * \return \ref String with the serialization type.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_getSerializationType.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_getSerializationType.out
+       */
+      static String getSerializationType();
+
+      /**
+       * \brief Returns string with serialization type.
+       *
+       * See \ref SparseSandboxMatrix::getSerializationType.
+       *
+       * \return \e String with the serialization type.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixExample_getSerializationType.cpp
+       * \par Output
+       * \include SparseMatrixExample_getSerializationType.out
+       */
+      virtual String getSerializationTypeVirtual() const;
+
+      /**
+       * \brief Computes number of non-zeros in each row.
+       *
+       * \param rowLengths is a vector into which the number of non-zeros in each row
+       * will be stored.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_getCompressedRowLengths.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_getCompressedRowLengths.out
+       */
+      template< typename Vector >
+      void getCompressedRowLengths( Vector& rowLengths ) const;
+
+      /**
+       * \brief Compute capacities of all rows.
+       *
+       * The row capacities are not stored explicitly and must be computed.
+       *
+       * \param rowCapacities is a vector where the row capacities will be stored.
+       */
+      template< typename Vector >
+      void getRowCapacities( Vector& rowCapacities ) const;
+
+      /**
+       * \brief Returns capacity of given matrix row.
+       *
+       * \param row index of matrix row.
+       * \return number of matrix elements allocated for the row.
+       */
+      __cuda_callable__
+      IndexType getRowCapacity( const IndexType row ) const;
+
+      /**
+       * \brief Returns number of non-zero matrix elements.
+       *
+       * This method really counts the non-zero matrix elements and so
+       * it returns zero for matrix having all allocated elements set to zero.
+       *
+       * \return number of non-zero matrix elements.
+       */
+      IndexType getNonzeroElementsCount() const;
+
+      /**
+       * \brief Constant getter of simple structure for accessing given matrix row.
+       *
+       * \param rowIdx is matrix row index.
+       *
+       * \return RowView for accessing given matrix row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_getConstRow.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_getConstRow.out
+       *
+       * See \ref SparseSandboxMatrixRowView.
+       */
+      __cuda_callable__
+      ConstRowView getRow( const IndexType& rowIdx ) const;
+
+      /**
+       * \brief Non-constant getter of simple structure for accessing given matrix row.
+       *
+       * \param rowIdx is matrix row index.
+       *
+       * \return RowView for accessing given matrix row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_getRow.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_getRow.out
+       *
+       * See \ref SparseSandboxMatrixRowView.
+       */
+      __cuda_callable__
+      RowView getRow( const IndexType& rowIdx );
+
+      /**
+       * \brief Sets element at given \e row and \e column to given \e value.
+       *
+       * This method can be called from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated on GPU this method
+       * can be called even from device kernels. If the matrix is allocated on the GPU device and
+       * this method is called from CPU, it transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref SparseSandboxMatrix::getRow
+       * or \ref SparseSandboxMatrix::forElements and \ref SparseSandboxMatrix::forAllElements.
+       * The call may fail if the matrix row capacity is exhausted.
+       *
+       * \param row is row index of the element.
+       * \param column is column index of the element.
+       * \param value is the value the element will be set to.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_setElement.out
+       */
+      __cuda_callable__
+      void setElement( const IndexType row,
+                       const IndexType column,
+                       const RealType& value );
+
+      /**
+       * \brief Adds given \e value to the matrix element at given \e row and \e column.
+       *
+       * This method can be called from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated on GPU this method
+       * can be called even from device kernels. If the matrix is allocated on the GPU device and
+       * this method is called from CPU, it transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref SparseSandboxMatrix::getRow
+       * or \ref SparseSandboxMatrix::forElements and \ref SparseSandboxMatrix::forAllElements.
+       * The call may fail if the matrix row capacity is exhausted.
+       *
+       * \param row is row index of the element.
+       * \param column is column index of the element.
+       * \param value is the value to be added to the matrix element.
+       * \param thisElementMultiplicator is multiplicator the original matrix element
+       *   value is multiplied by before addition of given \e value.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_addElement.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_addElement.out
+       */
+      __cuda_callable__
+      void addElement( IndexType row,
+                       IndexType column,
+                       const RealType& value,
+                       const RealType& thisElementMultiplicator = 1.0 );
+
+      /**
+       * \brief Returns value of matrix element at position given by its row and column index.
+       *
+       * This method can be called from the host system (CPU) no matter
+       * where the matrix is allocated. If the matrix is allocated on GPU this method
+       * can be called even from device kernels. If the matrix is allocated on the GPU device and
+       * this method is called from CPU, it transfers values of each matrix element separately and so the
+       * performance is very low. For higher performance see \ref SparseSandboxMatrix::getRow
+       * or \ref SparseSandboxMatrix::forElements and \ref SparseSandboxMatrix::forAllElements.
+       *
+       * \param row is a row index of the matrix element.
+       * \param column is a column index of the matrix element.
+       *
+       * \return value of given matrix element.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_getElement.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_getElement.out
+       *
+       */
+      __cuda_callable__
+      RealType getElement( IndexType row,
+                           IndexType column ) const;
+
+      /**
+       * \brief Method for performing general reduction on matrix rows.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is type returned by the Fetch lambda function (denoted \e FetchValue above).
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is the identity element of given reduction operation (e.g. zero for addition).
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_reduceRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+
+      /**
+       * \brief Method for performing general reduction on matrix rows for constant instances.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is type returned by the Fetch lambda function (denoted \e FetchValue above).
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is the identity element of given reduction operation (e.g. zero for addition).
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_reduceRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+
+      /**
+       * \brief Method for performing general reduction on all matrix rows.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is type returned by the Fetch lambda function (denoted \e FetchValue above).
+       *
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is the identity element of given reduction operation (e.g. zero for addition).
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceAllRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_reduceAllRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+
+      /**
+       * \brief Method for performing general reduction on all matrix rows for constant instances.
+       *
+       * \tparam Fetch is a type of lambda function for data fetch declared as
+       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
+       *          The return type of this lambda can be any non void.
+       * \tparam Reduce is a type of lambda function for reduction declared as
+       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
+       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       * \tparam FetchReal is type returned by the Fetch lambda function (denoted \e FetchValue above).
+       *
+       * \param fetch is an instance of lambda function for data fetch.
+       * \param reduce is an instance of lambda function for reduction.
+       * \param keep is an instance of lambda function for storing results.
+       * \param zero is the identity element of given reduction operation (e.g. zero for addition).
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceAllRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_reduceAllRows.out
+       */
+      template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+
+      /**
+       * \brief Method for iteration over all matrix rows for constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It should have a form like
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
+       *  The \e localIdx parameter is a rank of the non-zero element in given row.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forRows.out
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function& function ) const;
+
+      /**
+       * \brief Method for iteration over all matrix rows for non-constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It should have a form like
+       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
+       *  The \e localIdx parameter is a rank of the non-zero element in given row.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forRows.out
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function& function );
+
+      /**
+       * \brief This method calls \e forElements for all matrix rows (for constant instances).
+       *
+       * See \ref SparseSandboxMatrix::forElements.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called in each row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forAllRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forAllRows.out
+       */
+      template< typename Function >
+      void forAllElements( Function& function ) const;
+
+      /**
+       * \brief This method calls \e forElements for all matrix rows.
+       *
+       * See \ref SparseSandboxMatrix::forElements.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called in each row.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forAllRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forAllRows.out
+       */
+      template< typename Function >
+      void forAllElements( Function& function );
+
+      /**
+       * \brief Method for parallel iteration over matrix rows from interval [ \e begin, \e end).
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseSandboxMatrixView::forElements where more than one thread can be mapped to each row.
+
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param begin defines beginning of the range [ \e begin,\e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseSandboxMatrixView::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forRows.out
+       */
+      template< typename Function >
+      void forRows( IndexType begin, IndexType end, Function&& function );
+
+      /**
+       * \brief Method for parallel iteration over matrix rows from interval [ \e begin, \e end) for constant instances.
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseSandboxMatrixView::forElements where more than one thread can be mapped to each row.
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param begin defines beginning of the range [ \e begin,\e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseSandboxMatrixView::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forRows.out
+       */
+      template< typename Function >
+      void forRows( IndexType begin, IndexType end, Function&& function ) const;
+
+      /**
+       * \brief Method for parallel iteration over all matrix rows.
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseSandboxMatrixView::forAllElements where more than one thread can be mapped to each row.
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseSandboxMatrixView::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forRows.out
+       */
+      template< typename Function >
+      void forAllRows( Function&& function );
+
+      /**
+       * \brief Method for parallel iteration over all matrix rows for constant instances.
+       *
+       * In each row, given lambda function is performed. Each row is processed by at most one thread unlike the method
+       * \ref SparseSandboxMatrixView::forAllElements where more than one thread can be mapped to each row.
+       *
+       * \tparam Function is type of the lambda function.
+       *
+       * \param function is an instance of the lambda function to be called for each row.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseSandboxMatrixView::RowView.
+       *
+       * \par Example
+       * \include Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp
+       * \par Output
+       * \include SparseMatrixViewExample_forRows.out
+       */
+      template< typename Function >
+      void forAllRows( Function&& function ) const;
+
+      /**
+       * \brief Method for sequential iteration over all matrix rows for constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It should have a form like
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
+       *  The column index repeats twice only for compatibility with sparse matrices.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForRows( IndexType begin, IndexType end, Function& function ) const;
+
+      /**
+       * \brief Method for sequential iteration over all matrix rows for non-constant instances.
+       *
+       * \tparam Function is type of lambda function that will operate on matrix elements.
+       *    It should have a form like
+       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
+       *  The column index repeats twice only for compatibility with sparse matrices.
+       *
+       * \param begin defines beginning of the range [begin,end) of rows to be processed.
+       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param function is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForRows( IndexType begin, IndexType end, Function& function );
+
+      /**
+       * \brief This method calls \e sequentialForRows for all matrix rows (for constant instances).
+       *
+       * See \ref SparseSandboxMatrixView::sequentialForRows.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForAllRows( Function& function ) const;
+
+      /**
+       * \brief This method calls \e sequentialForRows for all matrix rows.
+       *
+       * See \ref SparseSandboxMatrixView::sequentialForRows.
+       *
+       * \tparam Function is a type of lambda function that will operate on matrix elements.
+       * \param function  is an instance of the lambda function to be called in each row.
+       */
+      template< typename Function >
+      void sequentialForAllRows( Function& function );
+
+      /**
+       * \brief Computes product of matrix and vector.
+       *
+       * More precisely, it computes:
+       *
+       * `outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector`
+       *
+       * \tparam InVector is type of input vector. It can be \ref Vector,
+       *     \ref VectorView, \ref Array, \ref ArrayView or similar container.
+       * \tparam OutVector is type of output vector. It can be \ref Vector,
+       *     \ref VectorView, \ref Array, \ref ArrayView or similar container.
+       *
+       * \param inVector is input vector.
+       * \param outVector is output vector.
+       * \param matrixMultiplicator is a factor by which the matrix is multiplied. It is one by default.
+       * \param outVectorMultiplicator is a factor by which the outVector is multiplied before added
+       *    to the result of matrix-vector product. It is zero by default.
+       * \param begin is the beginning of the rows range for which the vector product
+       *    is computed. It is zero by default.
+       * \param end is the end of the rows range for which the vector product
+       *    is computed. It is number of the matrix rows by default.
+       */
+      template< typename InVector,
+                typename OutVector >
+      void vectorProduct( const InVector& inVector,
+                          OutVector& outVector,
+                          const RealType matrixMultiplicator = 1.0,
+                          const RealType outVectorMultiplicator = 0.0,
+                          const IndexType begin = 0,
+                          IndexType end = 0 ) const;
+
+      template< typename Vector1, typename Vector2 >
+      bool performSORIteration( const Vector1& b,
+                                const IndexType row,
+                                Vector2& x,
+                                const RealType& omega = 1.0 ) const;
+
+      /**
+       * \brief Assignment of another sparse sandbox matrix view.
+       *
+       * \param matrix is input matrix for the assignment.
+       * \return reference to this matrix.
+       */
+      SparseSandboxMatrixView& operator=( const SparseSandboxMatrixView& matrix );
+
+      /**
+       * \brief Comparison operator with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e true if the RHS matrix is equal, \e false otherwise.
+       */
+      template< typename Matrix >
+      bool operator==( const Matrix& m ) const;
+
+      /**
+       * \brief Inequality comparison operator with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e true if the RHS matrix is not equal, \e false otherwise.
+       */
+      template< typename Matrix >
+      bool operator!=( const Matrix& m ) const;
+
+      /**
+       * \brief Method for saving the matrix to the file with given filename.
+       *
+       * \param fileName is name of the file.
+       */
+      void save( const String& fileName ) const;
+
+      /**
+       * \brief Method for saving the matrix to a file.
+       *
+       * \param file is the output file.
+       */
+      void save( File& file ) const;
+
+      /**
+       * \brief Method for printing the matrix to output stream.
+       *
+       * \param str is the output stream.
+       */
+      void print( std::ostream& str ) const;
+
+      /**
+       * \brief Getter of segments for non-constant instances.
+       *
+       * \e Segments are a structure for addressing the matrix elements columns and values.
+       * In fact, \e Segments represent the sparse matrix format.
+       *
+       * \return Non-constant reference to segments.
+       */
+      //SegmentsViewType& getSegments();
+
+      /**
+       * \brief Getter of segments for constant instances.
+       *
+       * \e Segments are a structure for addressing the matrix elements columns and values.
+       * In fact, \e Segments represent the sparse matrix format.
+       *
+       * \return Constant reference to segments.
+       */
+      //const SegmentsViewType& getSegments() const;
+
+      /**
+       * \brief Getter of column indexes for constant instances.
+       *
+       * \return Constant reference to a vector with matrix elements column indexes.
+       */
+      const ColumnsIndexesViewType& getColumnIndexes() const;
+
+      /**
+       * \brief Getter of column indexes for nonconstant instances.
+       *
+       * \return Reference to a vector with matrix elements column indexes.
+       */
+      ColumnsIndexesViewType& getColumnIndexes();
+
+      /**
+       * \brief Returns a padding index value.
+       *
+       * Padding index is used for column indexes of padding zeros. Padding zeros
+       * are used in some sparse matrix formats for better data alignment in memory.
+       *
+       * \return value of the padding index.
+       */
+      __cuda_callable__
+      IndexType getPaddingIndex() const;
+
+   protected:
+
+      ColumnsIndexesViewType columnIndexes;
+
+      RowPointersView rowPointers;
+      //SegmentsViewType segments;
+
+   private:
+      // TODO: this should be probably moved into a detail namespace
+      // Overload enabled when HasSetSizeMethod< VectorOrView >::value is true;
+      // it forwards the requested size to v.setSize().
+      template< typename VectorOrView,
+                typename std::enable_if< HasSetSizeMethod< VectorOrView >::value, bool >::type = true >
+      static void set_size_if_resizable( VectorOrView& v, IndexType size )
+      { v.setSize( size ); }
+
+      // Overload enabled when HasSetSizeMethod< VectorOrView >::value is false;
+      // it only checks via TNL_ASSERT_EQ that v.getSize() equals size.
+      template< typename VectorOrView,
+                typename std::enable_if< ! HasSetSizeMethod< VectorOrView >::value, bool >::type = true >
+      static void set_size_if_resizable( VectorOrView& v, IndexType size )
+      { TNL_ASSERT_EQ( v.getSize(), size, "view has wrong size" ); }
+};
+
+      } // namespace Sandbox
+   } // namespace Matrices
+} // namespace TNL
+
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp>
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
new file mode 100644
index 000000000..fb50a7acf
--- /dev/null
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
@@ -0,0 +1,1026 @@
+/***************************************************************************
+                          SparseSandboxMatrixView.hpp -  description
+                             -------------------
+    begin                : Apr 20, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <functional>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrixView.h>
+#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/AtomicOperations.h>
+#include <TNL/Matrices/details/SparseMatrix.h>
+
+namespace TNL {
+   namespace Matrices {
+      namespace Sandbox {
+
+// Default constructor. The body is empty and there is no initializer list, so the
+// base class and the members `columnIndexes` and `rowPointers` are all
+// default-initialized.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+__cuda_callable__
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+SparseSandboxMatrixView()
+{
+}
+
+// Creates a view from existing data: `rows`, `columns` and `values` are forwarded to the
+// MatrixView base class, `columnIndexes` and `rowPointers` initialize the members of the same name.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+__cuda_callable__
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+SparseSandboxMatrixView( const IndexType rows,
+                         const IndexType columns,
+                         const ValuesViewType& values,
+                         const ColumnsIndexesViewType& columnIndexes,
+                         const RowPointersView& rowPointers )
+: MatrixView< Real, Device, Index >( rows, columns, values ),
+  columnIndexes( columnIndexes ), rowPointers( rowPointers )
+{
+}
+
+// Returns a modifiable view built from views of the values, the column indexes and
+// the row pointers of this object. The row pointers are taken from the member
+// `rowPointers`; this class declares no `segments` member.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+__cuda_callable__
+auto
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getView() -> ViewType
+{
+   return ViewType( this->getRows(),
+                    this->getColumns(),
+                    this->getValues().getView(),
+                    this->columnIndexes.getView(),
+                    this->rowPointers.getView() );
+}
+
+// Returns a constant view built from constant views of the values, the column indexes
+// and the row pointers of this object. The members `columnIndexes` and `rowPointers`
+// are accessed directly; this class declares no `segments` member.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+__cuda_callable__
+auto
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getConstView() const -> ConstViewType
+{
+   return ConstViewType( this->getRows(),
+                         this->getColumns(),
+                         this->getValues().getConstView(),
+                         this->columnIndexes.getConstView(),
+                         this->rowPointers.getConstView() );
+}
+
+// Builds the string describing this matrix type: the serialization types of RealType, IndexType
+// and MatrixType followed by two `[any_allocator]` placeholders.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+String
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getSerializationType()
+{
+   const auto realType = TNL::getSerializationType< RealType >();
+   const auto indexType = TNL::getSerializationType< IndexType >();
+   const auto matrixType = MatrixType::getSerializationType();
+   return String( "Matrices::Sandbox::SparseMatrix< " ) + realType + ", " + indexType + ", " +
+             matrixType + ", [any_allocator], [any_allocator] >";
+}
+
+// Instance-level counterpart of getSerializationType(): it returns exactly the string
+// produced by that method.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+String
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getSerializationTypeVirtual() const
+{
+   const String serializationType = this->getSerializationType();
+   return serializationType;
+}
+
+// Stores into `rowLengths[ i ]` the number of elements of the i-th row whose value differs from
+// zero. `rowLengths` is first passed to `details::set_size_if_resizable` with the number of rows.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename Vector >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getCompressedRowLengths( Vector& rowLengths ) const
+{
+   details::set_size_if_resizable( rowLengths, this->getRows() );
+   rowLengths = 0;
+   auto lengthsView = rowLengths.getView();
+   // Each element with a nonzero value contributes one to the sum of its row.
+   auto countNonzero = [] __cuda_callable__ ( IndexType row, IndexType column, const RealType& value ) -> IndexType {
+      return ( value != 0.0 );
+   };
+   auto storeLength = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
+      lengthsView[ rowIdx ] = value;
+   };
+   this->reduceAllRows( countNonzero, std::plus<>{}, storeLength, 0 );
+}
+
+// Stores into `rowLengths[ i ]` the number of elements of the i-th row visited by reduceAllRows(),
+// i.e. those whose column index is not the padding index; the element values are ignored.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename Vector >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getRowCapacities( Vector& rowLengths ) const
+{
+   details::set_size_if_resizable( rowLengths, this->getRows() );
+   rowLengths = 0;
+   auto capacitiesView = rowLengths.getView();
+   // Each visited element contributes one to the sum of its row.
+   auto countElement = [] __cuda_callable__ ( IndexType row, IndexType column, const RealType& value ) -> IndexType {
+      return 1;
+   };
+   auto storeCapacity = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable {
+      capacitiesView[ rowIdx ] = value;
+   };
+   this->reduceAllRows( countElement, std::plus<>{}, storeCapacity, 0 );
+}
+
+// Returns the number of elements allocated for the row `row`, computed as the
+// difference of two consecutive row pointers. The row pointers are read directly
+// because this class declares no `segments` member.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+__cuda_callable__
+Index
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getRowCapacity( const IndexType row ) const
+{
+   return this->rowPointers[ row + 1 ] - this->rowPointers[ row ];
+}
+
+// Returns the number of nonzero matrix elements, i.e. of the stored elements whose
+// column index differs from the padding index. For symmetric matrices, a stored
+// off-diagonal element is counted twice when its mirrored position fits into the matrix.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+Index
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getNonzeroElementsCount() const
+{
+   const auto columns_view = this->columnIndexes.getConstView();
+   const IndexType paddingIndex = this->getPaddingIndex();
+   if( ! isSymmetric() )
+   {
+      // General matrix: count the stored column indexes different from the padding index.
+      auto isUsed = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
+         return ( columns_view[ i ] != paddingIndex );
+      };
+      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->columnIndexes.getSize(), isUsed, std::plus<>{}, 0 );
+   }
+
+   // Symmetric matrix: count the elements row by row and sum the partial results.
+   const auto rowsCount = this->getRows();
+   const auto columnsCount = this->getColumns();
+   Containers::Vector< IndexType, DeviceType, IndexType > rowCounts( this->getRows(), 0 );
+   auto rowCountsView = rowCounts.getView();
+   auto rowPointersView = this->rowPointers.getConstView();
+   const auto columnIndexesView = this->columnIndexes.getConstView();
+   // SANDBOX_TODO: Replace the following lambda function (or more) with code compute number of nonzero matrix elements
+   //               of symmetric matrix. Note, that this is required only by symmetric matrices and that the highest performance
+   //               is not a priority here.
+   auto countRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
+      const auto rowBegin = rowPointersView[ rowIdx ];
+      const auto rowEnd = rowPointersView[ rowIdx + 1 ];
+      IndexType count( 0 );
+      for( IndexType globalIdx = rowBegin; globalIdx < rowEnd; globalIdx++ )
+      {
+         const IndexType column = columnIndexesView[ globalIdx ];
+         if( column == paddingIndex )
+            continue;
+         count += 1 + ( column != rowIdx && column < rowsCount && rowIdx < columnsCount );
+      }
+      rowCountsView[ rowIdx ] = count;
+   };
+   TNL::Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, this->getRows(), countRow );
+   return sum( rowCounts );
+}
+
+// Returns a constant view of the row `rowIdx`. The row begins at the position
+// `rowPointers[ rowIdx ]` of the arrays `values` and `columnIndexes`, and the number
+// of elements allocated for it is the difference of two consecutive row pointers.
+// In debug builds the row index is asserted to be smaller than the number of rows.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+__cuda_callable__ auto
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getRow( const IndexType& rowIdx ) const -> ConstRowView
+{
+   TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." );
+   // SANDBOX_TODO: Replace the following with creation of RowView corresponding with your sparse matrix format.
+   const auto rowBegin = this->rowPointers[ rowIdx ];                 // row beginning
+   const auto rowSize = this->rowPointers[ rowIdx + 1 ] - rowBegin;   // number of elements allocated for given row
+   return ConstRowView( rowIdx, rowBegin, rowSize, this->values, this->columnIndexes );
+}
+
+// Returns a modifiable view of the row `rowIdx`. The row begins at the position
+// `rowPointers[ rowIdx ]` of the arrays `values` and `columnIndexes`, and the number
+// of elements allocated for it is the difference of two consecutive row pointers.
+// In debug builds the row index is asserted to be smaller than the number of rows.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+__cuda_callable__ auto
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getRow( const IndexType& rowIdx ) -> RowView
+{
+   TNL_ASSERT_LT( rowIdx, this->getRows(), "Row index is larger than number of matrix rows." );
+   // SANDBOX_TODO: Replace this with RowView constructor by your needs.
+   const auto rowBegin = rowPointers[ rowIdx ];                 // index of the first element of the row
+   const auto rowSize = rowPointers[ rowIdx + 1 ] - rowBegin;   // number of elements allocated for the row
+   return RowView( rowIdx, rowBegin, rowSize,
+                   this->values, this->columnIndexes );
+}
+
+// Sets the element at position (`row`, `column`) to `value`. It delegates to
+// addElement() with the multiplicator of the stored value equal to zero, i.e. the
+// stored value is multiplied by zero before `value` is added.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+__cuda_callable__ void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+setElement( const IndexType row,
+            const IndexType column,
+            const RealType& value )
+{
+   this->addElement( row, column, value, 0.0 );
+}
+
+// Adds `value` to the element at (`row`, `column`); a missing element is inserted before the first stored column index larger than `column`.
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+__cuda_callable__ void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+addElement( IndexType row,
+            IndexType column,
+            const RealType& value,
+            const RealType& thisElementMultiplicator )   // an existing element becomes thisElementMultiplicator * old + value
+{
+   TNL_ASSERT_GE( row, 0, "Sparse matrix row index cannot be negative." );
+   TNL_ASSERT_LT( row, this->getRows(), "Sparse matrix row index is larger than number of matrix rows." );
+   TNL_ASSERT_GE( column, 0, "Sparse matrix column index cannot be negative." );
+   TNL_ASSERT_LT( column, this->getColumns(), "Sparse matrix column index is larger than number of matrix columns." );
+
+   if( isSymmetric() && row < column )   // symmetric matrices: positions above the diagonal are redirected to the transposed position
+   {
+      swap( row, column );
+      TNL_ASSERT_LT( row, this->getRows(), "Column index is out of the symmetric part of the matrix after transposition." );
+      TNL_ASSERT_LT( column,this->getColumns(), "Row index is out of the symmetric part of the matrix after transposition." );
+   }
+
+   // SANDBOX_TODO: Replace the following line with a code that computes number of matrix elements allocated for matrix row with index `row`.
+   //               The code must work on both host and GPU kernel; you may use macro __CUDA_ARCH__ as can be seen below in this method.
+   const IndexType rowSize = this->rowPointers.getElement( row + 1 ) - this->rowPointers.getElement( row );
+   IndexType col( this->getPaddingIndex() );
+   IndexType i;
+   IndexType globalIdx;
+   for( i = 0; i < rowSize; i++ )   // search the row for `column`, for a free (padding) slot or for the first larger column index
+   {
+      // SANDBOX_TODO: Replace the following line with a code that computes a global index of `i`-th nonzero matrix element
+      //               in the `row`-th matrix row. The global index is a pointer to arrays `values` and `columnIndexes` storing
+      //               the matrix elements values and column indexes respectively.
+      globalIdx = this->rowPointers.getElement( row ) + i;
+      TNL_ASSERT_LT( globalIdx, this->columnIndexes.getSize(), "" );
+      col = this->columnIndexes.getElement( globalIdx );
+      if( col == column )   // the element already exists: update it in place
+      {
+         if( ! isBinary() )   // values are not touched for binary matrices
+            this->values.setElement( globalIdx, thisElementMultiplicator * this->values.getElement( globalIdx ) + value );
+         return;
+      }
+      if( col == this->getPaddingIndex() || col > column )   // insertion position found
+         break;
+   }
+   if( i == rowSize )   // the loop finished without `break`: no slot for the new element was found
+   {
+#ifndef __CUDA_ARCH__
+      std::stringstream msg;
+      msg << "The capacity of the sparse matrix row number "  << row << " was exceeded.";
+      throw std::logic_error( msg.str() );
+#else
+      TNL_ASSERT_TRUE( false, "");
+      return;
+#endif
+   }
+   if( col == this->getPaddingIndex() )   // free slot: store the new element directly
+   {
+      this->columnIndexes.setElement( globalIdx, column );
+      if( ! isBinary() )
+         this->values.setElement( globalIdx, value );
+      return;
+   }
+   else   // slot occupied by a larger column index: shift the rest of the row by one position and insert
+   {
+      // NOTE(review): the shift below overwrites the last slot of the row without checking that it holds the padding index.
+      IndexType j = rowSize - 1;
+      while( j > i )
+      {
+         // SANDBOX_TODO: Replace the following two lines with a code that computes global indexes of the `j`-th and `j-1`-th matrix elements
+         //               in the `row`-th matrix row, i.e. their positions in arrays `values` and `columnIndexes`.
+         const IndexType globalIdx1 = this->rowPointers.getElement( row ) + j;
+         const IndexType globalIdx2 = globalIdx1 - 1;
+         // End of code replacement.
+         TNL_ASSERT_LT( globalIdx1, this->columnIndexes.getSize(), "" );
+         TNL_ASSERT_LT( globalIdx2, this->columnIndexes.getSize(), "" );
+         this->columnIndexes.setElement( globalIdx1, this->columnIndexes.getElement( globalIdx2 ) );
+         if( ! isBinary() )
+            this->values.setElement( globalIdx1, this->values.getElement( globalIdx2 ) );
+         j--;
+      }
+
+      this->columnIndexes.setElement( globalIdx, column );   // `globalIdx` still refers to the slot where the search stopped
+      if( ! isBinary() )
+         this->values.setElement( globalIdx, value );
+      return;
+   }
+}
+
+// Returns the value of the element at position (`row`, `column`), or zero when no
+// element with this column index is stored in the row. Binary matrices return one
+// for every stored element. For symmetric matrices, positions above the diagonal are
+// read from the transposed position.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+__cuda_callable__
+auto
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getElement( IndexType row,
+            IndexType column ) const -> RealType
+{
+   TNL_ASSERT_GE( row, 0, "Sparse matrix row index cannot be negative." );
+   TNL_ASSERT_LT( row, this->getRows(), "Sparse matrix row index is larger than number of matrix rows." );
+   TNL_ASSERT_GE( column, 0, "Sparse matrix column index cannot be negative." );
+   TNL_ASSERT_LT( column, this->getColumns(), "Sparse matrix column index is larger than number of matrix columns." );
+
+   if( isSymmetric() && row < column )
+   {
+      swap( row, column );
+      // The transposed position may lie outside of a non-square matrix.
+      if( row >= this->getRows() || column >= this->getColumns() )
+         return 0.0;
+   }
+
+   // SANDBOX_TODO: Replace the following lines with a code for getting the range of elements allocated for given row.
+   const IndexType rowBegin = this->rowPointers.getElement( row );
+   const IndexType rowEnd = this->rowPointers.getElement( row + 1 );
+   // SANDBOX_TODO: Replace the following loop with a code iterating over indexes of the row elements in arrays `values` and `columnIndexes`.
+   for( IndexType globalIdx = rowBegin; globalIdx < rowEnd; globalIdx++ )
+   {
+      TNL_ASSERT_LT( globalIdx, this->columnIndexes.getSize(), "" );
+      // Skip the elements stored for other columns.
+      if( this->columnIndexes.getElement( globalIdx ) != column )
+         continue;
+      if( isBinary() )
+         return 1;
+      return this->values.getElement( globalIdx );
+   }
+   return 0.0;
+}
+
+// Computes outVector = matrixMultiplicator * ( this matrix ) * inVector + outVectorMultiplicator * outVector
+// for the rows in [ firstRow, lastRow ); `lastRow == 0` stands for the number of matrix rows.
+// Stored elements whose column index equals the padding index are skipped.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename InVector, typename OutVector >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+vectorProduct( const InVector& inVector,
+               OutVector& outVector,
+               const RealType matrixMultiplicator,
+               const RealType outVectorMultiplicator,
+               const IndexType firstRow,
+               IndexType lastRow ) const
+{
+   TNL_ASSERT_EQ( this->getColumns(), inVector.getSize(), "Matrix columns do not fit with input vector." );
+   TNL_ASSERT_EQ( this->getRows(), outVector.getSize(), "Matrix rows do not fit with output vector." );
+
+   using OutVectorReal = typename OutVector::RealType;
+   static_assert(
+         ! MatrixType::isSymmetric() ||
+         ! std::is_same< Device, Devices::Cuda >::value ||
+         ( std::is_same< OutVectorReal, float >::value ||
+           std::is_same< OutVectorReal, double >::value ||
+           std::is_same< OutVectorReal, int >::value ||
+           std::is_same< OutVectorReal, long long int >::value ),
+         "Given Real type is not supported by atomic operations on GPU which are necessary for symmetric operations." );
+
+   const auto inVectorView = inVector.getConstView();
+   auto outVectorView = outVector.getView();
+   const auto valuesView = this->values.getConstView();
+   const auto columnIndexesView = this->columnIndexes.getConstView();
+   const auto rowPointersView = this->rowPointers.getConstView();
+   const IndexType paddingIndex = this->getPaddingIndex();
+   if( lastRow == 0 )
+      lastRow = this->getRows();
+#define HAVE_SANDBOX_SIMPLE_SPMV
+#ifdef HAVE_SANDBOX_SIMPLE_SPMV
+   // SANDBOX_TODO: The following is simple direct implementation of SpMV operation with CSR format. We recommend to start by
+   //               replacing this part with SpMV based on your sparse format.
+   if( std::is_same< DeviceType, TNL::Devices::Host >::value )          // this way you may easily specialize for different device types
+   {
+      // SANDBOX_TODO: This simple and naive implementation for CPU.
+      for( IndexType rowIdx = firstRow; rowIdx < lastRow; rowIdx++ )
+      {
+         const auto begin = rowPointers[ rowIdx ];
+         const auto end = rowPointers[ rowIdx + 1 ];
+         RealType sum( 0.0 );
+         for( IndexType globalIdx = begin; globalIdx < end; globalIdx++ )
+            if( this->columnIndexes[ globalIdx ] != paddingIndex )
+               sum += this->values[ globalIdx ] * inVector[ this->columnIndexes[ globalIdx ] ];
+         // SANDBOX_TODO: The following is quite inefficient, it is better to specialize the code for zero `outVectorMultiplicator` or unit `matrixMultiplicator` - see the full implementation below.
+         outVector[ rowIdx ] = outVector[ rowIdx ] * outVectorMultiplicator + matrixMultiplicator * sum;
+      }
+   }
+   else
+   {
+      //SANDBOX_TODO: The following is general implementation based on ParallelFor and lambda function. It would work even on CPU.
+      auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
+         const auto begin = rowPointersView[ rowIdx ];
+         const auto end = rowPointersView[ rowIdx + 1 ];
+         RealType sum( 0.0 );
+         for( IndexType globalIdx = begin; globalIdx < end; globalIdx++ )
+            if( columnIndexesView[ globalIdx ] != paddingIndex )
+               sum += valuesView[ globalIdx ] * inVectorView[ columnIndexesView[ globalIdx ] ];
+         outVectorView[ rowIdx ] = outVectorView[ rowIdx ] * outVectorMultiplicator + matrixMultiplicator * sum;
+      };
+      TNL::Algorithms::ParallelFor< DeviceType >::exec( firstRow, lastRow, f );
+   }
+#else
+   // SANDBOX_TODO: The following is an implementation based on method `reduceRows`. NOTE(review): the fetch lambdas below take other arguments than the ( row, column, value ) passed by reduceRows of this class - confirm before enabling.
+   if( isSymmetric() )
+      outVector *= outVectorMultiplicator;
+   auto symmetricFetch = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> RealType {
+      const IndexType column = columnIndexesView[ globalIdx ];
+      compute = ( column != paddingIndex );
+      if( ! compute )
+         return 0.0;
+      if( isSymmetric() && column < row )
+      {
+         if( isBinary() )
+            Algorithms::AtomicOperations< DeviceType >::add( outVectorView[ column ], ( OutVectorReal ) matrixMultiplicator * inVectorView[ row ] );
+         else
+            Algorithms::AtomicOperations< DeviceType >::add( outVectorView[ column ], ( OutVectorReal ) matrixMultiplicator * valuesView[ globalIdx ] * inVectorView[ row ] );
+      }
+      if( isBinary() )
+         return inVectorView[ column ];
+      return valuesView[ globalIdx ] * inVectorView[ column ];
+   };
+   auto fetch = [=] __cuda_callable__ ( IndexType globalIdx, bool& compute ) mutable -> RealType {
+      const IndexType column = columnIndexesView[ globalIdx ];
+      if( isBinary() )
+         return inVectorView[ column ];
+      return valuesView[ globalIdx ] * inVectorView[ column ];
+   };
+
+   auto keeperGeneral = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+      if( isSymmetric() )
+      {
+         typename OutVector::RealType aux = matrixMultiplicator * value;
+         Algorithms::AtomicOperations< DeviceType >::add( outVectorView[ row ], aux );
+      }
+      else
+      {
+         if( outVectorMultiplicator == 0.0 )
+            outVectorView[ row ] = matrixMultiplicator * value;
+         else
+            outVectorView[ row ] = outVectorMultiplicator * outVectorView[ row ] + matrixMultiplicator * value;
+      }
+   };
+   auto keeperDirect = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+      outVectorView[ row ] = value;
+   };
+   auto keeperMatrixMult = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+      outVectorView[ row ] = matrixMultiplicator * value;
+   };
+   auto keeperVectorMult = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable {
+      outVectorView[ row ] = outVectorMultiplicator * outVectorView[ row ] + value;
+   };
+
+   if( isSymmetric() )
+      this->reduceRows( firstRow, lastRow, symmetricFetch, std::plus<>{}, keeperGeneral, ( RealType ) 0.0 );
+   else
+   {
+      if( outVectorMultiplicator == 0.0 )
+      {
+         if( matrixMultiplicator == 1.0 )
+            this->reduceRows( firstRow, lastRow, fetch, std::plus<>{}, keeperDirect, ( RealType ) 0.0 );
+         else
+            this->reduceRows( firstRow, lastRow, fetch, std::plus<>{}, keeperMatrixMult, ( RealType ) 0.0 );
+      }
+      else
+      {
+         if( matrixMultiplicator == 1.0 )
+            this->reduceRows( firstRow, lastRow, fetch, std::plus<>{}, keeperVectorMult, ( RealType ) 0.0 );
+         else
+            this->reduceRows( firstRow, lastRow, fetch, std::plus<>{}, keeperGeneral, ( RealType ) 0.0 );
+      }
+   }
+#endif
+}
+
+// Performs a reduction in each row from the range [ begin, end ). For every stored
+// element of the row whose column index differs from the padding index,
+// `fetch( row, column, value )` is evaluated (with the value 1 for binary matrices) and
+// the result is folded by `reduce`, starting from `zero`. The reduced value of each
+// row is finally passed to `keep( row, result )`.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero )
+{
+   auto columnsView = this->columnIndexes.getView();
+   auto valuesView = this->values.getView();
+   auto rowPointersView = this->rowPointers.getConstView();
+   const IndexType padding = this->getPaddingIndex();
+   // SANDBOX_TODO: Replace the following code with the one for computing reduction in rows by your format.
+   //               Note, that this method can be used for implementation of SpMV.
+   auto reduceRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
+      const auto rowEnd = rowPointersView[ rowIdx + 1 ];
+      FetchValue result = zero;
+      for( IndexType globalIdx = rowPointersView[ rowIdx ]; globalIdx < rowEnd; globalIdx++ )
+      {
+         IndexType& columnIdx = columnsView[ globalIdx ];
+         if( columnIdx == padding )
+            continue;
+         if( isBinary() )
+            result = reduce( result, fetch( rowIdx, columnIdx, 1 ) );
+         else
+            result = reduce( result, fetch( rowIdx, columnIdx, valuesView[ globalIdx ] ) );
+      }
+      keep( rowIdx, result );
+   };
+   TNL::Algorithms::ParallelFor< DeviceType >::exec( begin, end, reduceRow );
+}
+
+// Constant variant of the row reduction over the rows from [ begin, end ). For every
+// stored element of the row whose column index differs from the padding index,
+// `fetch( row, column, value )` is evaluated (with the value 1 for binary matrices) and
+// the result is folded by `reduce`, starting from `zero`. The reduced value of each
+// row is finally passed to `keep( row, result )`.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+{
+   auto columnsView = this->columnIndexes.getConstView();
+   auto valuesView = this->values.getConstView();
+   const IndexType padding = this->getPaddingIndex();
+   // SANDBOX_TODO: Replace the following code with the one for computing reduction in rows by your format.
+   //               Note, that this method can be used for implementation of SpMV.
+   auto rowPointersView = this->rowPointers.getConstView();
+   auto reduceRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
+      const auto rowEnd = rowPointersView[ rowIdx + 1 ];
+      FetchValue result = zero;
+      for( IndexType globalIdx = rowPointersView[ rowIdx ]; globalIdx < rowEnd; globalIdx++ )
+      {
+         const IndexType& columnIdx = columnsView[ globalIdx ];
+         if( columnIdx == padding )
+            continue;
+         if( isBinary() )
+            result = reduce( result, fetch( rowIdx, columnIdx, 1 ) );
+         else
+            result = reduce( result, fetch( rowIdx, columnIdx, valuesView[ globalIdx ] ) );
+      }
+      keep( rowIdx, result );
+   };
+   TNL::Algorithms::ParallelFor< DeviceType >::exec( begin, end, reduceRow );
+}
+
+// Performs the reduction of reduceRows() in all matrix rows, i.e. in the rows
+// from the range [ 0, getRows() ). All other arguments are passed on unchanged.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero )
+{
+   const IndexType rowsCount = this->getRows();
+   this->reduceRows( 0, rowsCount, fetch, reduce, keep, zero );
+}
+
+// Constant variant: performs the reduction of reduceRows() in all matrix rows, i.e. in
+// the rows from the range [ 0, getRows() ). All other arguments are passed on unchanged.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+{
+   const IndexType rowsCount = this->getRows();
+   this->reduceRows( 0, rowsCount, fetch, reduce, keep, zero );
+}
+
+// Calls `function( rowIdx, localIdx, columnIdx, value )` for every element allocated
+// in the rows from the range [ begin, end ), including the elements holding the
+// padding index. `localIdx` is the position of the element within its row. For
+// binary matrices the value passed to `function` is one. The rows are processed
+// by ParallelFor, the elements of one row are visited sequentially.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forElements( IndexType begin, IndexType end, Function& function ) const
+{
+   const auto columnsView = this->columnIndexes.getConstView();
+   const auto valuesView = this->values.getConstView();
+   // SANDBOX_TODO: Replace the following code with the one for iterating over all allocated matrix elements.
+   auto rowPointersView = this->rowPointers.getConstView();
+   auto processRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
+      const auto rowBegin = rowPointersView[ rowIdx ];
+      const auto rowEnd = rowPointersView[ rowIdx + 1 ];
+      for( IndexType globalIdx = rowBegin, localIdx = 0; globalIdx < rowEnd; globalIdx++, localIdx++ )
+      {
+         if( isBinary() )
+            function( rowIdx, localIdx, columnsView[ globalIdx ], ( RealType ) 1 );
+         else
+            function( rowIdx, localIdx, columnsView[ globalIdx ], valuesView[ globalIdx ] );
+      }
+   };
+   TNL::Algorithms::ParallelFor< DeviceType >::exec( begin, end, processRow );
+}
+
+// Calls `function( rowIdx, localIdx, columnIdx, value )` for every element allocated
+// in the rows from the range [ begin, end ), including the elements holding the
+// padding index. `localIdx` is the position of the element within its row. For
+// binary matrices a local variable equal to one is passed as the value. The rows
+// are processed by ParallelFor, the elements of one row are visited sequentially.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forElements( IndexType begin, IndexType end, Function& function )
+{
+   auto columnsView = this->columnIndexes.getView();
+   auto valuesView = this->values.getView();
+   // SANDBOX_TODO: Replace the following code with the one for iterating over all allocated matrix elements.
+   auto rowPointersView = this->rowPointers.getConstView();
+   auto processRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
+      const auto rowBegin = rowPointersView[ rowIdx ];
+      const auto rowEnd = rowPointersView[ rowIdx + 1 ];
+      RealType one( 1.0 );
+      for( IndexType globalIdx = rowBegin, localIdx = 0; globalIdx < rowEnd; globalIdx++, localIdx++ )
+      {
+         if( isBinary() )
+            function( rowIdx, localIdx, columnsView[ globalIdx ], one ); // TODO: Fix this without using `one`.
+         else
+            function( rowIdx, localIdx, columnsView[ globalIdx ], valuesView[ globalIdx ] );
+      }
+   };
+   TNL::Algorithms::ParallelFor< DeviceType >::exec( begin, end, processRow );
+}
+
+// Constant variant: calls forElements() for all matrix rows, i.e. for the rows
+// from the range [ 0, getRows() ), with the given `function`.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forAllElements( Function& function ) const
+{
+   const IndexType rowsCount = this->getRows();
+   this->forElements( 0, rowsCount, function );
+}
+
+// Calls forElements() for all matrix rows, i.e. for the rows from the range
+// [ 0, getRows() ), with the given `function`.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forAllElements( Function& function )
+{
+   const IndexType rowsCount = this->getRows();
+   this->forElements( 0, rowsCount, function );
+}
+
+// Calls `function( rowView )` for each row from the range [ begin, end ), where
+// `rowView` is a RowView of the given row built on top of views of the matrix
+// values and column indexes. The rows are processed by ParallelFor and the lambda
+// below captures `function` by value.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forRows( IndexType begin, IndexType end, Function&& function )
+{
+   auto columnsView = this->columnIndexes.getView();
+   auto valuesView = this->values.getView();
+   // SANDBOX_TODO: Replace the following code with the one for iteration over matrix rows.
+   auto rowPointersView = this->rowPointers.getConstView();
+   auto processRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
+      const auto rowBegin = rowPointersView[ rowIdx ];                 // row beginning
+      const auto rowSize = rowPointersView[ rowIdx + 1 ] - rowBegin;   // number of elements allocated for given matrix row
+      RowView rowView( rowIdx, rowBegin, rowSize, valuesView, columnsView );
+      function( rowView );
+   };
+   TNL::Algorithms::ParallelFor< DeviceType >::exec( begin, end, processRow );
+}
+
+// Constant variant: calls `function( rowView )` for each row from the range
+// [ begin, end ), where `rowView` is a ConstRowView of the given row built on top of
+// constant views of the matrix values and column indexes. The rows are processed by
+// ParallelFor and the lambda below captures `function` by value.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forRows( IndexType begin, IndexType end, Function&& function ) const
+{
+   const auto columnsView = this->columnIndexes.getConstView();
+   const auto valuesView = this->values.getConstView();
+   // SANDBOX_TODO: Replace the following code with the one for iteration over matrix rows.
+   auto rowPointersView = this->rowPointers.getConstView();
+   auto processRow = [=] __cuda_callable__ ( IndexType rowIdx ) {
+      const auto rowBegin = rowPointersView[ rowIdx ];                 // row beginning
+      const auto rowSize = rowPointersView[ rowIdx + 1 ] - rowBegin;   // number of elements allocated for given matrix row
+      ConstRowView rowView( rowIdx, rowBegin, rowSize, valuesView, columnsView );
+      function( rowView );
+   };
+   TNL::Algorithms::ParallelFor< DeviceType >::exec( begin, end, processRow );
+}
+
+// Calls forRows() for all matrix rows, i.e. for the rows from the range
+// [ 0, getRows() ), with the given `function`.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forAllRows( Function&& function )
+{
+   const IndexType rowsCount = this->getRows();
+   this->forRows( 0, rowsCount, function );
+}
+
+// Constant variant: calls forRows() for all matrix rows, i.e. for the rows from
+// the range [ 0, getRows() ), with the given `function`.
+template< typename Real, typename Device, typename Index, typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+forAllRows( Function&& function ) const
+{
+   const IndexType rowsCount = this->getRows();
+   this->forRows( 0, rowsCount, function );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+sequentialForRows( IndexType begin, IndexType end, Function& function ) const
+{
+   for( IndexType row = begin; row < end; row ++ )   // rows of [begin, end) are taken one by one in increasing order
+      this->forRows( row, row + 1, function );       // forRows is called with the single-row range [row, row + 1)
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+sequentialForRows( IndexType begin, IndexType end, Function& function )
+{
+   for( IndexType row = begin; row < end; row ++ )   // rows of [begin, end) are taken one by one in increasing order
+      this->forRows( row, row + 1, function );       // forRows is called with the single-row range [row, row + 1)
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+sequentialForAllRows( Function& function ) const
+{
+   this->sequentialForRows( 0, this->getRows(), function );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Function >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+sequentialForAllRows( Function& function )
+{
+   this->sequentialForRows( 0, this->getRows(), function );
+}
+
+/*template< typename Real,
+          template< typename, typename > class SegmentsView,
+          typename Device,
+          typename Index,
+          typename RealAllocator,
+          typename IndexAllocator >
+template< typename Real2, template< typename, typename > class Segments2, typename Index2, typename RealAllocator2, typename IndexAllocator2 >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+addMatrix( const SparseSandboxMatrixView< Real2, Segments2, Device, Index2, RealAllocator2, IndexAllocator2 >& matrix,
+           const RealType& matrixMultiplicator,
+           const RealType& thisMatrixMultiplicator )
+{
+
+}
+
+template< typename Real,
+          template< typename, typename > class SegmentsView,
+          typename Device,
+          typename Index,
+          typename RealAllocator,
+          typename IndexAllocator >
+template< typename Real2, typename Index2 >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getTransposition( const SparseSandboxMatrixView< Real2, Device, Index2 >& matrix,
+                  const RealType& matrixMultiplicator )
+{
+
+}*/
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+template< typename Vector1, typename Vector2 >
+bool
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+performSORIteration( const Vector1& b,
+                     const IndexType row,
+                     Vector2& x,
+                     const RealType& omega ) const
+{
+   return false;   // stub: no SOR step is performed, the arguments are not used and false is always returned
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >&
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+operator=( const SparseSandboxMatrixView< Real, Device, Index, MatrixType >& matrix )
+{
+   MatrixView< Real, Device, Index >::operator=( matrix );
+   this->columnIndexes.bind( matrix.columnIndexes );
+   // SANDBOX_TODO: Replace the following line with assignment of metadata required by your
+   //               sparse format.
+   this->rowPointers.bind( matrix.rowPointers );
+   return *this;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Matrix >
+bool
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+operator==( const Matrix& m ) const
+{
+   const auto& view1 = *this;
+   // FIXME: getConstView does not work
+   //const auto view2 = m.getConstView();
+   const auto view2 = m.getView();
+   auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> bool {
+      return view1.getRow( i ) == view2.getRow( i );
+   };
+   // All rows must be equal; the lower bound is cast so that both range bounds have the type IndexType.
+   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->getRows(), fetch, std::logical_and<>{}, true );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+   template< typename Matrix >
+bool
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+operator!=( const Matrix& m ) const
+{
+   return ! operator==( m );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+save( File& file ) const
+{
+   MatrixView< RealType, DeviceType, IndexType >::save( file );   // the base class writes its part first
+   file << this->columnIndexes
+        << this->rowPointers;  // SANDBOX_TODO: Replace this with metadata required by your format
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+save( const String& fileName ) const
+{
+   Object::save( fileName );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+void
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+print( std::ostream& str ) const
+{
+   if( isSymmetric() )   // symmetric case: every ( row, column ) pair is queried through getElement()
+   {
+      for( IndexType row = 0; row < this->getRows(); row++ )
+      {
+         str <<"Row: " << row << " -> ";
+         for( IndexType column = 0; column < this->getColumns(); column++ )
+         {
+            auto value = this->getElement( row, column );
+            if( value != ( RealType ) 0 )   // only non-zero elements are printed
+               str << " Col:" << column << "->" << value << "\t";
+         }
+         str << std::endl;
+      }
+   }
+   else
+      for( IndexType row = 0; row < this->getRows(); row++ )
+      {
+         str <<"Row: " << row << " -> ";
+         // SANDBOX_TODO: Replace the following line with a code for computing number of elements allocated for given matrix row.
+         const auto rowLength = this->rowPointers.getElement( row + 1 ) - this->rowPointers.getElement( row );
+         for( IndexType i = 0; i < rowLength; i++ )
+         {
+            // SANDBOX_TODO: Replace the following line with a code for getting index of the matrix element in arrays `values` and `columnIndexes`.
+            const IndexType globalIdx = this->rowPointers.getElement( row ) + i;
+            const IndexType column = this->columnIndexes.getElement( globalIdx );
+            if( column == this->getPaddingIndex() )
+               break;   // the first padding index ends the printing of this row
+            RealType value;
+            if( isBinary() )
+               value = ( RealType ) 1.0;   // for binary matrices every stored element is printed as 1
+            else
+               value = this->values.getElement( globalIdx );
+            if( value )   // elements whose value converts to false (zero) are not printed
+            {
+               std::stringstream str_;   // "column:value" is formatted separately so that the pair can be padded as one field
+               str_ << std::setw( 4 ) << std::right << column << ":" << std::setw( 4 ) << std::left << value;
+               str << std::setw( 10 ) << str_.str();
+            }
+         }
+         str << std::endl;
+      }
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+__cuda_callable__
+Index
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getPaddingIndex() const
+{
+   return -1;   // column index value which print() treats as the end of the stored elements of a row
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+auto
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getColumnIndexes() const -> const ColumnsIndexesViewType&
+{
+   return this->columnIndexes;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+auto
+SparseSandboxMatrixView< Real, Device, Index, MatrixType >::
+getColumnIndexes() -> ColumnsIndexesViewType&
+{
+   return this->columnIndexes;
+}
+
+      } // namespace Sandbox
+   } //namespace Matrices
+} // namespace  TNL
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index 40c568b4d..b58620c2d 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -417,6 +417,7 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAlloca
 reset()
 {
    BaseType::reset();
+   this->columnIndexes.reset();
    this->segments.reset();
    this->view = this->getView();
    TNL_ASSERT_EQ( this->getRows(), segments.getSegmentsCount(), "mismatched segments count" );
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index a4b06708e..3210920f5 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -28,6 +28,8 @@ set( COMMON_TESTS
             BinarySparseMatrixCopyTest
             SymmetricSparseMatrixTest_CSR
             LambdaMatrixTest
+            SparseMatrixTest_SandboxMatrix
+            SparseMatrixVectorProductTest_SandboxMatrix
 )
 
 set( CPP_TESTS
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.cpp b/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.cpp
new file mode 100644
index 000000000..dc856310e
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_SandboxMatrix.cpp -  description
+                             -------------------
+    begin                : Apr 19, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_SandboxMatrix.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.cu b/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.cu
new file mode 120000
index 000000000..27787fdf2
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.cu
@@ -0,0 +1 @@
+SparseMatrixTest_SandboxMatrix.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.h b/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.h
new file mode 100644
index 000000000..ad1a0c74d
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_SandboxMatrix.h
@@ -0,0 +1,45 @@
+/***************************************************************************
+                          SparseMatrixTest_SandboxMatrix.h -  description
+                             -------------------
+    begin                : Apr 19, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SandboxMatrixTest_SandboxMatrix";
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.cpp
new file mode 100644
index 000000000..bfa16c02b
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_SandboxMatrix.cpp -  description
+                             -------------------
+    begin                : Mar 30, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_SandboxMatrix.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.cu
new file mode 120000
index 000000000..bd87e1ad0
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_SandboxMatrix.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.h
new file mode 100644
index 000000000..7b06af0f3
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_SandboxMatrix.h
@@ -0,0 +1,45 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_SandboxMatrix.h -  description
+                             -------------------
+    begin                : Apr 22, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixVectorProductTest_SandboxMatrix";  // test-specific name, not shared with the CSRScalar test
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >,
+    TNL::Matrices::Sandbox::SparseSandboxMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
-- 
GitLab


From 86bf24dd323c035b4539fb219b7a20407d77413e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 23 Apr 2021 10:24:16 +0200
Subject: [PATCH 038/117] Small fixes of sparse sandbox matrix.

---
 src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h          | 3 +++
 src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.hpp | 2 +-
 src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp    | 8 ++++++--
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h
index 6a2a6a565..c4b3ae23f 100644
--- a/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h
@@ -21,6 +21,9 @@
 
 namespace TNL {
    namespace Matrices {
+      /**
+       * \brief Namespace for sandbox matrices.
+       */
       namespace Sandbox {
 
 /**
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.hpp b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.hpp
index 09598edd0..b4bd74c44 100644
--- a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.hpp
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixRowView.hpp
@@ -28,7 +28,7 @@ SparseSandboxMatrixRowView( IndexType rowIdx,
                             IndexType size,
                             const ValuesViewType& values,
                             const ColumnsIndexesViewType& columnIndexes )
- : rowIdx( rowIdx ), offset( offset ), size( size ), values( values ), columnIndexes( columnIndexes )
+ : rowIdx( rowIdx ), size( size ), offset( offset ), values( values ), columnIndexes( columnIndexes )
 {
 }
 
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
index fb50a7acf..900e3bf15 100644
--- a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
@@ -426,7 +426,11 @@ vectorProduct( const InVector& inVector,
          const auto end = rowPointers[ rowIdx + 1 ];
          RealType sum( 0.0 );
          for( IndexType globalIdx = begin; globalIdx < end; globalIdx++ )
-               sum += this->values[ globalIdx ] * inVector[ this->columnIndexes[ globalIdx ] ];
+         {
+            const auto columnIdx = this->columnIndexes[ globalIdx ];
+            if( columnIdx != paddingIndex )
+               sum += this->values[ globalIdx ] * inVector[ columnIdx ];
+         }
          // SANDBOX_TODO:The following is quite inefficient, its better to specialized the code for cases when
          // `outVectorMultiplicator` is zero or `matrixMultiplicator` is one - see. the full implementation bellow.
          outVector[ rowIdx ] = outVector[ rowIdx ] * outVectorMultiplicator + matrixMultiplicator * sum;
@@ -440,7 +444,7 @@ vectorProduct( const InVector& inVector,
          const auto end = rowPointersView[ rowIdx + 1 ];
          RealType sum( 0.0 );
          for( IndexType globalIdx = begin; globalIdx < end; globalIdx++ )
-            sum + valuesView[ globalIdx ] * inVectorView[ columnIndexesView[ globalIdx ] ];
+            sum += valuesView[ globalIdx ] * inVectorView[ columnIndexesView[ globalIdx ] ];
          outVectorView[ rowIdx ] = outVectorView[ rowIdx ] * outVectorMultiplicator + matrixMultiplicator * sum;
       };
       TNL::Algorithms::ParallelFor< DeviceType >::exec( firstRow, lastRow, f );
-- 
GitLab


From 0f97bfcd8f65c2a3cc84aa8df3be6870e48d4903 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 23 Apr 2021 10:24:53 +0200
Subject: [PATCH 039/117] Added sparse sandbox matrix optionally to SpMV
 benchmark.

---
 src/Benchmarks/SpMV/spmv.h    | 14 ++++++++++++++
 src/TNL/Matrices/MatrixInfo.h | 29 +++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 652ed9405..49dbf4264 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -35,6 +35,13 @@
 #include <TNL/Algorithms/Segments/SlicedEllpack.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpack.h>
 #include <TNL/Algorithms/Segments/BiEllpack.h>
+
+// Uncomment the following line to enable benchmarking the sandbox sparse matrix.
+//#define WITH_SANDBOX_MATRIX_BENCHMARK
+#ifdef WITH_SANDBOX_MATRIX_BENCHMARK
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrix.h>
+#endif
+
 using namespace TNL::Matrices;
 
 #include <Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrix.h>
@@ -122,6 +129,10 @@ using BiEllpackSegments = Algorithms::Segments::BiEllpack< Device, Index, IndexA
 template< typename Real, typename Device, typename Index >
 using SymmetricSparseMatrix_BiEllpack = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, BiEllpackSegments >;
 
+#ifdef WITH_SANDBOX_MATRIX_BENCHMARK
+template< typename Real, typename Device, typename Index >
+using SparseSandboxMatrix = Matrices::Sandbox::SparseSandboxMatrix< Real, Device, Index, Matrices::GeneralMatrix >;
+#endif
 
 /////
 // Legacy formats
@@ -496,6 +507,9 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack               >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_BiEllpack                    >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+#ifdef WITH_SANDBOX_MATRIX_BENCHMARK
+   benchmarkSpMV< Real, HostMatrixType, SparseSandboxMatrix                       >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+#endif
    hostMatrix.reset();
 
    /////
diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h
index d84afa39a..716423884 100644
--- a/src/TNL/Matrices/MatrixInfo.h
+++ b/src/TNL/Matrices/MatrixInfo.h
@@ -15,6 +15,7 @@
 #include <TNL/Matrices/DenseMatrixView.h>
 #include <TNL/Matrices/SparseMatrix.h>
 #include <TNL/Matrices/SparseMatrixView.h>
+#include <TNL/Matrices/Sandbox/SparseSandboxMatrix.h>
 #include <TNL/Algorithms/Segments/CSRView.h>
 #include <TNL/Algorithms/Segments/EllpackView.h>
 #include <TNL/Algorithms/Segments/SlicedEllpackView.h>
@@ -85,6 +86,34 @@ struct MatrixInfo< SparseMatrix< Real, Device, Index, MatrixType, Segments, Real
 {
 };
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType >
+struct MatrixInfo< Sandbox::SparseSandboxMatrixView< Real, Device, Index, MatrixType > >
+{
+   static String getDensity() { return String( "sparse" ); };
+
+   static String getFormat()
+   {
+      if( MatrixType::isSymmetric() )
+         return TNL::String( "Symmetric Sandbox" );
+      else
+         return TNL::String( "Sandbox" );
+   };
+};
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename MatrixType,
+          typename RealAllocator,
+          typename IndexAllocator >
+struct MatrixInfo< Sandbox::SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator > >
+: public MatrixInfo< typename Sandbox::SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::ViewType >
+{
+};
+
 /////
 // Legacy matrices
 template< typename Real, typename Device, typename Index >
-- 
GitLab


From 70a58c22eec54394046a6d9ae7df9e8d2e41fb89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 24 Apr 2021 12:45:51 +0200
Subject: [PATCH 040/117] Fixing types in asserts of array and arrays
 assignment.

---
 src/TNL/Containers/ArrayView.hpp            | 2 +-
 src/TNL/Containers/detail/ArrayAssignment.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index cd0a4c537..1562508a8 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -49,7 +49,7 @@ void
 ArrayView< Value, Device, Index >::
 bind( ValueType* data, IndexType size )
 {
-   TNL_ASSERT_GE( size, 0, "ArrayView size was initialized with a negative size." );
+   TNL_ASSERT_GE( size, ( IndexType ) 0, "ArrayView size was initialized with a negative size." );
    TNL_ASSERT_TRUE( (data == nullptr && size == 0) || (data != nullptr && size > 0),
                     "ArrayView was initialized with a positive address and zero size or zero address and positive size." );
 
diff --git a/src/TNL/Containers/detail/ArrayAssignment.h b/src/TNL/Containers/detail/ArrayAssignment.h
index c6ac6cb1c..ee1487c76 100644
--- a/src/TNL/Containers/detail/ArrayAssignment.h
+++ b/src/TNL/Containers/detail/ArrayAssignment.h
@@ -38,7 +38,7 @@ struct ArrayAssignment< Array, T, true >
 
    static void assign( Array& a, const T& t )
    {
-      TNL_ASSERT_EQ( a.getSize(), t.getSize(), "The sizes of the arrays must be equal." );
+      TNL_ASSERT_EQ( a.getSize(), ( decltype( a.getSize() ) ) t.getSize(), "The sizes of the arrays must be equal." );
       // skip assignment of empty arrays
       if( a.getSize() == 0 )
          return;
-- 
GitLab


From 7955b02a8a2a0857fd6962b6a0f29e03bc1c0bc6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sat, 24 Apr 2021 12:46:55 +0200
Subject: [PATCH 041/117] Added benchmark of original LightSpMV implementation.

---
 .../SpMV/ReferenceFormats/Legacy/CSR.h        |  15 +-
 .../SpMV/ReferenceFormats/Legacy/CSR_impl.h   |   4 +-
 .../ReferenceFormats/Legacy/Sparse_impl.h     |   2 +-
 .../ReferenceFormats/LightSpMV-1.0/Options.cu | 381 ++++++++
 .../ReferenceFormats/LightSpMV-1.0/Options.h  | 123 +++
 .../ReferenceFormats/LightSpMV-1.0/SpMV.cu    | 874 ++++++++++++++++++
 .../ReferenceFormats/LightSpMV-1.0/SpMV.h     | 152 +++
 .../ReferenceFormats/LightSpMV-1.0/SpMVCSR.cu |  12 +
 .../ReferenceFormats/LightSpMV-1.0/SpMVCSR.h  | 696 ++++++++++++++
 .../ReferenceFormats/LightSpMV-1.0/Types.h    |  59 ++
 .../ReferenceFormats/LightSpMV-1.0/main.cu    |  99 ++
 .../ReferenceFormats/LightSpMVBenchmark.h     | 157 ++++
 src/Benchmarks/SpMV/spmv.h                    |  39 +-
 13 files changed, 2600 insertions(+), 13 deletions(-)
 create mode 100644 src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.cu
 create mode 100644 src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.h
 create mode 100644 src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMV.cu
 create mode 100644 src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMV.h
 create mode 100644 src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.cu
 create mode 100644 src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.h
 create mode 100644 src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Types.h
 create mode 100644 src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/main.cu
 create mode 100644 src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h

diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h
index 2db4c9f0c..efbd997e2 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h
@@ -53,6 +53,17 @@ union Block {
 
    Block() = default;
 
+   template< typename Index2 >
+   Block& operator=( const Block< Index2 >& source ) {   // converting assignment from a Block with another index type
+      index[ 0 ] = source.index[ 0 ];   // row pointer, converted from Index2 to Index
+      index[ 1 ] = source.index[ 1 ];   // index in warp, converted from Index2 to Index
+      for( int i = 0; i < ( sizeof(Index) == 4 ? 8 : 16); i ++ )   // NOTE(review): index, byte and twobytes are members of one union, so this raw copy seems to overwrite the two converted values above - confirm
+         byte[ i ] = source.byte[ i ];
+      for( int i = 0; i < (sizeof(Index) == 4 ? 4 : 8); i++ )   // NOTE(review): both loop bounds depend on sizeof(Index), not sizeof(Index2); a narrower source would be read past its arrays - confirm only equal widths occur
+         twobytes[ i ] = source.twobytes[ i ];
+      return *this;
+   }
+
    Index index[2]; // index[0] is row pointer, index[1] is index in warp
    uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
    uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
@@ -262,8 +273,8 @@ public:
    // copy assignment
    CSR& operator=( const CSR& matrix );
 
-   template< CSRKernel KernelType2 >
-   CSR& operator=( const CSR< RealType, DeviceType, IndexType, KernelType2 >& matrix );
+   template< typename IndexType2, CSRKernel KernelType2 >
+   CSR& operator=( const CSR< RealType, DeviceType, IndexType2, KernelType2 >& matrix );
 
    // cross-device copy assignment
    template< typename Real2, typename Device2, typename Index2, CSRKernel KernelType2,
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
index 2cb2b4784..f71eba123 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
@@ -690,10 +690,10 @@ template< typename Real,
           typename Device,
           typename Index,
           CSRKernel KernelType >
-   template< CSRKernel KernelType2 >
+   template< typename IndexType2, CSRKernel KernelType2 >
 CSR< Real, Device, Index, KernelType >&
 CSR< Real, Device, Index, KernelType >::
-operator=( const CSR< Real, Device, Index, KernelType2 >& matrix )
+operator=( const CSR< Real, Device, IndexType2, KernelType2 >& matrix )
 {
    this->setLike( matrix );
    this->values = matrix.values;
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h
index d87c80eee..ddc851022 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h
@@ -104,7 +104,7 @@ template< typename Real,
           typename Index >
 void Sparse< Real, Device, Index >::allocateMatrixElements( const IndexType& numberOfMatrixElements )
 {
-   TNL_ASSERT_GE( numberOfMatrixElements, 0, "Number of matrix elements must be non-negative." );
+   TNL_ASSERT_GE( numberOfMatrixElements, ( IndexType ) 0, "Number of matrix elements must be non-negative." );
 
    this->values.setSize( numberOfMatrixElements );
    this->columnIndexes.setSize( numberOfMatrixElements );
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.cu b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.cu
new file mode 100644
index 000000000..f9bbeae70
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.cu
@@ -0,0 +1,381 @@
+/*
+ * Options.cu
+ *
+ *  Created on: Nov 24, 2014
+ *      Author: yongchao
+ */
+
+#include "Options.h"
+
+/*print the command-line help, including the current default values, to stderr*/
+void Options::printUsage() {
+	cerr << endl << "LightSpMV (" << VERSION << ")"
+			<< ": GPU-based sparse matrix-vector multiplication using CSR storage format"
+			<< endl;
+	cerr << "Usage: lightspmv -i matrix [options]" << endl << endl;
+	cerr << "Options:" << endl;
+	cerr << "Input:" << endl
+			<< "\t-i <string> sparse matrix A file (in Matrix Market format)"
+			<< endl
+			<< "\t-x <string> vector X file (one element per line) [otherwise, set each element to 1.0]"
+			<< endl
+			<< "\t-y <string> vector Y file (one element per line) [otherwise, set each element to 0.0]"
+			<< endl << "Output:" << endl
+			<< "\t-o <string> output file (one element per line) [otherwise, no output]"
+			<< endl << "Compute:" << endl
+			<< "\t-a <float> alpha value, default = " << _alpha << endl
+			<< "\t-b <float> beta value, default = " << _beta << endl
+			<< "\t-f <int> formula used, default = " << _formula << endl
+			<< "\t    0: y = Ax" << endl << "\t    1: y = alpha * Ax + beta * y"
+			<< endl << "\t-r <int> select the routine to use, default = "
+			<< _routine << endl
+			<< "\t    0: vector-based row dynamic distribution" << endl
+			<< "\t    1: warp-based row dynamic distribution" << endl
+			<< "\t-d <int> double-precision floating point, default = "
+			<< (_singlePrecision ? 0 : 1) << endl
+			<< "\t-g <int> index of the single GPU used, default = "
+			<< _gpuIndex << endl
+			<< "\t-m <int> number of SpMV iterations, default = " << _numIters
+			<< endl << endl;
+}
+/*parse the command line, then load the matrix and the X/Y vectors into page-locked host memory; false on failure*/
+bool Options::parseArgs(int32_t argc, char* argv[]) {
+	int32_t c;
+	if (argc < 2) {
+		printUsage();
+		return false;
+	}
+
+	while ((c = getopt(argc, argv, "i:x:y:o:a:b:g:f:r:d:m:")) != -1) {
+		switch (c) {
+		case 'i':
+			_mmFileName = optarg;
+			break;
+		case 'x':
+			_vecXFileName = optarg;
+			break;
+		case 'y':
+			_vecYFileName = optarg;
+			break;
+		case 'o':
+			_outFileName = optarg;
+			break;
+		case 'a':
+			_alpha = atof(optarg);
+			break;
+		case 'b':
+			_beta = atof(optarg);
+			break;
+		case 'f':
+			_formula = atoi(optarg);
+			break;
+		case 'g':
+			_gpuIndex = atoi(optarg);
+			if (_gpuIndex < 0) {
+				_gpuIndex = 0;
+			}
+			break;
+		case 'r':
+			_routine = atoi(optarg);
+			if (_routine < 0) {
+				_routine = 0;
+			}
+			break;
+		case 'd':
+			_singlePrecision = atoi(optarg) ? false : true;
+			break;
+		case 'm':
+			_numIters = atoi(optarg);
+			if(_numIters < 1){
+				_numIters = 1;
+			}
+			break;
+		default:
+			/*optarg is not set for a rejected option; optopt holds the offending character*/
+			cerr << "Unknown parameter: " << (char) optopt << endl;
+			return false;
+		}
+	}
+
+	/*the matrix file name is mandatory*/
+	if (_mmFileName.length() == 0) {
+		cerr << "Matrix file should be specified" << endl;
+		return false;
+	}
+
+	/*load the list of GPUs*/
+	if (!getGPUs()) {
+		return false;
+	}
+
+	/*load the matrix*/
+	if (!loadMatrixMarketFile(_mmFileName.c_str())) {
+		return false;
+	}
+
+	/*size of vector X: one element per matrix column, in the selected precision*/
+	int64_t elementSize = _singlePrecision ? sizeof(float) : sizeof(double);
+	int64_t numBytes = _numCols * elementSize;
+
+	/*allocate space*/
+	cudaMallocHost(&_vectorX, numBytes);
+	CudaCheckError();
+
+	/*load the vector X*/
+	if (_vecXFileName.length() == 0) {
+		/*initialize X*/
+		cerr << "Initialize each element of vector X to 1.0" << endl;
+		if (_singlePrecision) {
+			float* p = (float*) _vectorX;
+			for (uint32_t i = 0; i < _numCols; ++i) {
+				p[i] = 1.0;
+			}
+		} else {
+			double* p = (double*) _vectorX;
+			for (uint32_t i = 0; i < _numCols; ++i) {
+				p[i] = 1.0;
+			}
+		}
+	} else {
+		cerr << "Load vector X from file" << endl;
+		/*could not get the data*/
+		if (!loadVector(_vecXFileName, _vectorX, _numCols)) {
+			return false;
+		}
+	}
+
+	/*size of vector Y: one element per matrix row*/
+	numBytes = _numRows * elementSize;
+
+	/*allocate space*/
+	cudaMallocHost(&_vectorY, numBytes);
+	CudaCheckError();
+
+	/*load the vector Y*/
+	if (_vecYFileName.length() == 0) {
+		/*initialize Y*/
+		cerr << "Initialize each element of vector Y to 0" << endl;
+		memset(_vectorY, 0, numBytes);
+	} else {
+		cerr << "Load vector Y from file" << endl;
+		/*could not get the data*/
+		if (!loadVector(_vecYFileName, _vectorY, _numRows)) {
+			return false;
+		}
+	}
+
+	return true;
+}
+/*convert the matrix market format to CSR*/
+bool Options::loadMatrixMarketFile(const char* fileName) {
+	uint64_t numBytes;
+
+	cerr << "loading sparse matrix" << endl;
+	if (_singlePrecision) {
+		/*create an empty CSR sparse matrix object*/
+		cusp::csr_matrix<uint32_t, float, cusp::host_memory> matrix;
+
+		// load a matrix stored in MatrixMarket format
+		cusp::io::read_matrix_market_file(matrix, fileName);
+
+		/*save the matrix information*/
+		_numRows = matrix.num_rows;
+		_numCols = matrix.num_cols;
+		_numValues = matrix.num_entries;
+
+		/*reserve memory*/
+		cudaMallocHost(&_rowOffsets, (_numRows + 1) * sizeof(uint32_t));
+		CudaCheckError();
+
+		cudaMallocHost(&_colIndexValues, _numValues * sizeof(uint32_t));
+		CudaCheckError();
+
+		cudaMallocHost(&_numericalValues, _numValues * sizeof(float));
+		CudaCheckError();
+
+		/*copy the elements*/
+		numBytes = (_numRows + 1) * sizeof(uint32_t);
+		cudaMemcpy(_rowOffsets, &matrix.row_offsets[0], numBytes,
+				cudaMemcpyHostToHost);
+		CudaCheckError();
+
+		numBytes = _numValues * sizeof(uint32_t);
+		cudaMemcpy(_colIndexValues, &matrix.column_indices[0], numBytes,
+				cudaMemcpyHostToHost);
+		CudaCheckError();
+
+		numBytes = _numValues * sizeof(float);
+		cudaMemcpy(_numericalValues, &matrix.values[0], numBytes,
+				cudaMemcpyHostToHost);
+		CudaCheckError();
+	} else {
+		/*create an empty CSR sparse matrix object*/
+		cusp::csr_matrix<uint32_t, double, cusp::host_memory> matrix;
+
+		// load a matrix stored in MatrixMarket format
+		cusp::io::read_matrix_market_file(matrix, fileName);
+
+		/*save the matrix information*/
+		_numRows = matrix.num_rows;
+		_numCols = matrix.num_cols;
+		_numValues = matrix.num_entries;
+
+		/*reserve memory*/
+		cudaMallocHost(&_rowOffsets, (_numRows + 1) * sizeof(uint32_t));
+		CudaCheckError();
+
+		cudaMallocHost(&_colIndexValues, _numValues * sizeof(uint32_t));
+		CudaCheckError();
+
+		cudaMallocHost(&_numericalValues, _numValues * sizeof(double));
+		CudaCheckError();
+
+		/*copy the elements*/
+		numBytes = (_numRows + 1) * sizeof(uint32_t);
+		cudaMemcpy(_rowOffsets, &matrix.row_offsets[0], numBytes,
+				cudaMemcpyHostToHost);
+		CudaCheckError();
+
+		numBytes = _numValues * sizeof(uint32_t);
+		cudaMemcpy(_colIndexValues, &matrix.column_indices[0], numBytes,
+				cudaMemcpyHostToHost);
+		CudaCheckError();
+
+		numBytes = _numValues * sizeof(double);
+		cudaMemcpy(_numericalValues, &matrix.values[0], numBytes,
+				cudaMemcpyHostToHost);
+		CudaCheckError();
+	}
+
+	return true;
+}
+/*read maxNumValues numbers (one per line) from fileName into vector; false if the file is missing or too short*/
+bool Options::loadVector(const string& fileName, void* vector,
+		const uint32_t maxNumValues) {
+	char buffer[1024];
+	FILE* file;
+	uint32_t pos = 0;
+	float* fptr = (float*) vector;
+	double* dptr = (double*) vector;
+
+	cerr << "loading vector X" << endl;
+	/*open the file*/
+	if (fileName.length() == 0) {
+		return false;
+	}
+	file = fopen(fileName.c_str(), "r");
+	if (!file) {
+		cerr << "Failed to open file " << fileName << endl;
+		return false;
+	}
+
+	/*read the file line by line; empty lines are skipped*/
+	while (fgets(buffer, 1023, file)) {
+		/*remove the end of line*/
+		for (int32_t i = strlen(buffer) - 1;
+				i >= 0 && (buffer[i] == '\n' || buffer[i] == '\r'); --i) {
+			buffer[i] = '\0';
+		}
+		if (strlen(buffer) == 0) {
+			continue;
+		}
+		/*get the number and save to vector*/
+		if (pos >= maxNumValues) {
+			/*already have enough numbers*/
+			break;
+		}
+		if (_singlePrecision) {
+			float value = 0; /*stays 0 if the line does not parse, instead of being indeterminate*/
+			sscanf(buffer, "%f", &value);
+			fptr[pos++] = value;
+		} else {
+			double value = 0; /*stays 0 if the line does not parse, instead of being indeterminate*/
+			sscanf(buffer, "%lf", &value);
+			dptr[pos++] = value;
+		}
+	}
+	fclose(file); /*the file is no longer needed on either return path below*/
+	if (pos < maxNumValues) {
+		cerr << "Do not have enough numbers in the file" << endl;
+		return false;
+	}
+	cerr << "Finished loading vector X" << endl;
+	return true;
+}
+/*report the matrix dimensions plus the rounded mean and standard deviation of the row lengths on stderr*/
+void Options::getRowSizeVariance() {
+	const uint32_t divisor = _numRows > 1 ? _numRows - 1 : 1;
+	double sumSquares = 0;
+	double previous = _rowOffsets[0];
+	/*accumulate the squared deviation of every row length from the rounded mean*/
+	_mean = rint((double) _numValues / _numRows);
+	for (uint32_t row = 1; row <= _numRows; ++row) {
+		const uint32_t current = _rowOffsets[row];
+		const double deviation = current - previous - _mean;
+		sumSquares += deviation * deviation;
+		previous = current;
+	}
+	_variance = rint(sqrt(sumSquares / divisor));
+
+	/*information*/
+	cerr << "Rows: " << _numRows << " Cols: " << _numCols << " Elements: "
+			<< _numValues << " Mean: " << _mean << " Standard deviation: "
+			<< _variance << endl;
+}
+bool Options::getGPUs() {
+	int32_t numGPUs;
+
+	/*get the number of GPUs*/
+	if (cudaGetDeviceCount(&numGPUs) != cudaSuccess) {
+		cerr << "No CUDA-enabled GPU is available in the host" << endl;
+		return false;
+	}
+
+#if defined(HAVE_SM_35)
+	cerr << "Require GPUs with compute capability >= 3.5" << endl;
+#else
+	cerr << "Require GPUs with compute capability >= 3.0" << endl;
+#endif
+
+	/*iterate each GPU*/
+	cudaDeviceProp prop;
+	for (int32_t i = 0; i < numGPUs; ++i) {
+
+		/*get the property of the device*/
+		cudaGetDeviceProperties(&prop, i);
+
+		/*check the major of the GPU*/
+#if defined(HAVE_SM_35)
+		if ((prop.major * 10 + prop.minor) >= 35) {
+#else
+		if ((prop.major * 10 + prop.minor) >= 30) {
+#endif
+			cerr << "GPU " << _gpus.size() << ": " << prop.name
+					<< " (capability " << prop.major << "." << prop.minor << ")"
+					<< endl;
+
+			/*save the Kepler GPU*/
+			_gpus.push_back(make_pair(i, prop));
+		}
+	}
+	/*check the number of qualified GPUs*/
+	if (_gpus.size() == 0) {
+		cerr << "No qualified GPU is available" << endl;
+		return false;
+	}
+
+	/*check the GPU index*/
+
+	/*reset the number of GPUs*/
+	if (_gpuIndex >= (int32_t) _gpus.size()) {
+		_gpuIndex = _gpus.size() - 1;
+	}
+	if (_gpuIndex < 0) {
+		_gpuIndex = 0;
+	}
+
+	/*move the selected gpu to the first*/
+	swap(_gpus[0], _gpus[_gpuIndex]);
+
+	return true;
+}
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.h
new file mode 100644
index 000000000..877e36352
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Options.h
@@ -0,0 +1,123 @@
+/*
+ * Options.h
+ *
+ *  Created on: Nov 21, 2014
+ *      Author: yongchao
+ */
+
+#ifndef OPTIONS_H_
+#define OPTIONS_H_
+
+#include "Types.h"
+//#include <cusp/io/matrix_market.h>
+
+struct Options {
+	Options() {
+
+		/*input*/
+		_routine = 1;
+		_formula = 1;
+		_numIters = 1000;
+		_singlePrecision = true;
+
+		/*matrix data*/
+		_numRows = 0;
+		_numCols = 0;
+		_rowOffsets = NULL;
+		_numValues = 0;
+		_colIndexValues = NULL;
+		_numericalValues = NULL;
+		_alpha = 1.0;
+		_beta = 1.0;
+
+		/*vector data*/
+		_vectorX = NULL;
+		_vectorY = NULL;
+
+		/*the number of GPUs*/
+		_numGPUs = 1;
+
+		/*GPU index used*/
+		_gpuIndex = 0;
+
+		/*for debug*/
+		_mean = 0;
+		_variance = 0;
+	}
+	~Options() {
+		if (_rowOffsets) {
+			cudaFreeHost(_rowOffsets);
+		}
+		if (_colIndexValues) {
+			cudaFreeHost(_colIndexValues);
+		}
+		if (_numericalValues) {
+			cudaFreeHost(_numericalValues);
+		}
+
+		if (_vectorX) {
+			cudaFreeHost(_vectorX);
+		}
+		if (_vectorY) {
+			cudaFreeHost(_vectorY);
+		}
+	}
+
+	/*parse parameters*/
+	bool parseArgs(int32_t argc, char* argv[]);
+
+	/*load Matrix Market file*/
+	bool loadMatrixMarketFile(const char* fileName);
+
+	/*load vector*/
+	bool loadVector(const string& fileName, void* vector,
+			const uint32_t maxNumValues);
+
+	/*print out usage*/
+	void printUsage();
+
+	/*get row distribution*/
+	void getRowSizeVariance();
+
+	/*retrieve GPU list*/
+	bool getGPUs();
+
+	/*input files*/
+	string _mmFileName;
+	string _vecXFileName;
+	string _vecYFileName;
+	string _outFileName;
+	bool _singlePrecision;
+	int32_t _routine;
+	int32_t _formula;
+	int32_t _numIters;
+	double _alpha;
+	double _beta;
+
+	/*for debugging*/
+	double _mean;
+	double _variance;
+
+	/*matrix data*/
+	uint32_t _numRows;
+	uint32_t _numCols;
+	uint32_t *_rowOffsets;
+	uint32_t _numValues;
+	uint32_t *_colIndexValues;
+	void *_numericalValues;
+
+	/*vector data*/
+	void *_vectorX;
+	void *_vectorY;
+
+	/*number of GPUs to be used*/
+	int32_t _numGPUs;
+
+	/*GPU index used*/
+	int32_t _gpuIndex;
+
+	/*GPU device list*/
+	vector<pair<int32_t, struct cudaDeviceProp> > _gpus;
+};
+
+#endif /* OPTIONS_H_ */
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMV.cu b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMV.cu
new file mode 100644
index 000000000..c4ef3fda5
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMV.cu
@@ -0,0 +1,874 @@
+/*
+ * SpMV.cu
+ *
+ *  Created on: Nov 21, 2014
+ *      Author: yongchao
+ */
+#include "SpMV.h"
+#include "SpMVCSR.h"
+
+extern __constant__ uint32_t _cudaNumRows;
+
+SpMV::SpMV(Options* opt) {
+	_opt = opt;
+
+	/*the number of GPUs*/
+	_numGPUs = _opt->_numGPUs;
+
+	/*compute the mean number of elements per row*/
+	_meanElementsPerRow = (int32_t) rint(
+			(double) _opt->_numValues / _opt->_numRows);
+
+	/*create row counter*/
+	_cudaRowCounters.resize(_numGPUs, NULL);
+
+	/*create streams*/
+	_streams.resize(_numGPUs, 0);
+
+	for (int32_t i = 0; i < _numGPUs; ++i) {
+		cudaSetDevice(_opt->_gpus[i].first);
+		CudaCheckError();
+
+		cudaStreamCreate(&_streams[i]);
+		CudaCheckError();
+	}
+#if defined(FLOAT_USE_TEXTURE_MEMORY) || defined(DOUBLE_USE_TEXTURE_MEMORY)
+	_texVectorX.resize(_numGPUs, 0);
+#endif
+}
+SpMV::~SpMV() {
+	/*destroy the streams*/
+	for (int32_t i = 0; i < _numGPUs; ++i) {
+
+		/*set device*/
+		cudaSetDevice(_opt->_gpus[i].first);
+		CudaCheckError();
+
+		cudaStreamDestroy(_streams[i]);
+		CudaCheckError();
+
+#if defined(FLOAT_USE_TEXTURE_MEMORY) || defined(DOUBLE_USE_TEXTURE_MEMORY)
+		if (_texVectorX[i]) {
+			cudaDestroyTextureObject(_texVectorX[i]);
+		}
+		CudaCheckError();
+#endif
+	}
+}
+
+/*zero the row counter in slot 0 and run one SpMV pass with the formula selected in the options*/
+void SpMV::spmvKernel() {
+
+	/*reset the counter in slot 0 before the launch*/
+	cudaMemset(_cudaRowCounters[0], 0, sizeof(uint32_t));
+
+	/*formula 0 takes the plain path; any other value takes the BLAS path that also receives alpha and beta*/
+	if (_opt->_formula == 0) {
+		invokeKernel(0);
+	} else {
+		invokeKernelBLAS(0);
+	}
+}
+void SpMV::invokeKernel(const int32_t i) {
+	/*launches nothing at this level; SpMVFloatVector::invokeKernel does the real launch (NOTE(review): presumably a virtual override - confirm in SpMV.h)*/
+}
+void SpMV::invokeKernelBLAS(const int32_t i) {
+	/*launches nothing at this level; SpMVFloatVector::invokeKernelBLAS does the real launch (NOTE(review): presumably a virtual override - confirm in SpMV.h)*/
+}
+
+/*single-precision floating point*/
+SpMVFloatVector::SpMVFloatVector(Options* opt) :
+		SpMV(opt) {
+
+	_rowOffsets.resize(_numGPUs, NULL);
+	_colIndexValues.resize(_numGPUs, NULL);
+	_numericalValues.resize(_numGPUs, NULL);
+	_vectorY.resize(_numGPUs, NULL);
+	_vectorX.resize(_numGPUs, NULL);
+
+	_alpha = _opt->_alpha;
+	_beta = _opt->_beta;
+}
+/*free the device buffers (matrix arrays and the X/Y vectors) held for each GPU*/
+SpMVFloatVector::~SpMVFloatVector() {
+	/*each buffer must be freed with its owning device selected*/
+	for (int32_t i = 0; i < _numGPUs; ++i) {
+		/*select the device*/
+		cudaSetDevice(_opt->_gpus[i].first);
+		CudaCheckError();
+
+		/*release the resources; entries never allocated are still NULL and are skipped*/
+		if (_rowOffsets[i]) {
+			cudaFree(_rowOffsets[i]);
+		}
+		if (_colIndexValues[i]) {
+			cudaFree(_colIndexValues[i]);
+		}
+		if (_numericalValues[i]) {
+			cudaFree(_numericalValues[i]);
+		}
+		/*loadData() allocates Y on every GPU, so free it on every GPU, not only on the first*/
+		if (_vectorY[i]) {
+			cudaFree(_vectorY[i]);
+		}
+		if (_vectorX[i]) {
+			cudaFree(_vectorX[i]);
+		}
+	}
+}
+void SpMVFloatVector::loadData() {
+	size_t numBytes;
+
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+	cudaTextureDesc texDesc;
+	cudaResourceDesc resDesc;
+
+	/*specify the texture object parameters*/
+	memset(&texDesc, 0, sizeof(texDesc));
+	texDesc.addressMode[0] = cudaAddressModeClamp;
+	texDesc.addressMode[1] = cudaAddressModeClamp;
+	texDesc.filterMode = cudaFilterModePoint;
+	texDesc.readMode = cudaReadModeElementType;
+#endif
+
+	/*iterate each GPU*/
+	for (int32_t i = 0; i < _numGPUs; ++i) {
+
+		/*select the device*/
+		cudaSetDevice(_opt->_gpus[i].first);
+		CudaCheckError();
+
+		/*allocate counter buffers*/
+		cudaMalloc(&_cudaRowCounters[i], sizeof(uint32_t));
+		CudaCheckError();
+
+		cudaMemcpyToSymbol(_cudaNumRows, &_opt->_numRows, sizeof(uint32_t));
+		CudaCheckError();
+
+		cudaMemcpyToSymbol(_cudaNumCols, &_opt->_numCols, sizeof(uint32_t));
+		CudaCheckError();
+
+		/******************************************************
+		 * Load matrix data
+		 ******************************************************/
+		numBytes = (_opt->_numRows + 1) * sizeof(uint32_t);
+		cudaMalloc(&_rowOffsets[i], numBytes);
+		CudaCheckError();
+
+		cudaMemcpy(_rowOffsets[i], _opt->_rowOffsets, numBytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+		numBytes = _opt->_numValues * sizeof(uint32_t);
+		cudaMalloc(&_colIndexValues[i], numBytes);
+		CudaCheckError();
+
+		cudaMemcpy(_colIndexValues[i], _opt->_colIndexValues, numBytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+		/*load the numerical values*/
+		numBytes = _opt->_numValues * sizeof(float);
+		cudaMalloc(&_numericalValues[i], numBytes);
+		CudaCheckError();
+
+		cudaMemcpy(_numericalValues[i], _opt->_numericalValues, numBytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+		/*****************************************************
+		 * Load vector X data
+		 ******************************************************/
+		numBytes = _opt->_numCols * sizeof(float);
+		cudaMalloc(&_vectorX[i], numBytes);
+		CudaCheckError();
+
+		cudaMemcpy(_vectorX[i], _opt->_vectorX, numBytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+		/*specify texture and texture object*/
+		memset(&resDesc, 0, sizeof(resDesc));
+		resDesc.resType = cudaResourceTypeLinear;
+		resDesc.res.linear.devPtr = _vectorX[i];
+		resDesc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0,
+				cudaChannelFormatKindFloat);
+		resDesc.res.linear.sizeInBytes = numBytes;
+		cudaCreateTextureObject(&_texVectorX[i], &resDesc, &texDesc, NULL);
+		CudaCheckError();
+#endif
+
+		/*****************************************************
+		 * vector Y data
+		 ******************************************************/
+		numBytes = _opt->_numRows * sizeof(float);
+		cudaMalloc(&_vectorY[i], numBytes);
+		CudaCheckError();
+
+		/*copy the data*/
+		cudaMemcpy(_vectorY[i], _opt->_vectorY, numBytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+	}
+}
+void SpMVFloatVector::storeData() {
+	/*transfer back vector Y*/
+	uint64_t numBytes = _opt->_numRows * sizeof(float);
+
+	/*select the device*/
+	cudaSetDevice(_opt->_gpus[0].first);
+	CudaCheckError();
+
+	/*copy back the data*/
+	cudaMemcpy(_opt->_vectorY, _vectorY[0], numBytes, cudaMemcpyDeviceToHost);
+	CudaCheckError();
+
+	/*open the file*/
+	FILE* file;
+	if (_opt->_outFileName.length() == 0) {
+		return;
+	}
+
+	file = fopen(_opt->_outFileName.c_str(), "w");
+	if (!file) {
+		cerr << "Failed to open file: " << _opt->_outFileName << endl;
+		return;
+	}
+
+	/*write to the file*/
+	float* ptr = (float*) _opt->_vectorY;
+	for (uint32_t i = 0; i < _opt->_numRows; ++i) {
+		fprintf(file, "%f\n", ptr[i]);
+	}
+
+	/*close the file*/
+	if (file != stdout) {
+		fclose(file);
+	}
+}
+void SpMVFloatVector::invokeKernel(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+
+	/*get the number of threads per block*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*invoke the kernel*/
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicVector<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicVector<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicVector<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr32DynamicVector<float, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicVector<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicVector<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicVector<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr32DynamicVector<float, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	}
+
+#endif
+}
+
+void SpMVFloatVector::invokeKernelBLAS(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+
+	/*get the number of threads per block*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*invoke the kernel*/
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicVectorBLAS<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicVectorBLAS<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicVectorBLAS<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr32DynamicVectorBLAS<float, 32,
+				MAX_NUM_THREADS_PER_BLOCK / 32><<<numThreadBlocks,
+				numThreadsPerBlock>>>(_cudaRowCounters[i], _rowOffsets[i],
+				_colIndexValues[i], _numericalValues[i], _texVectorX[i],
+				_vectorY[i], _alpha, _beta);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicVectorBLAS<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicVectorBLAS<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _alpha, _beta);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicVectorBLAS<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr32DynamicVectorBLAS<float, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _alpha, _beta);
+	}
+
+#endif
+}
+
+void SpMVFloatWarp::invokeKernel(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+
+	/*get the number of threads per block*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*invoke the kernel*/
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicWarp<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicWarp<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicWarp<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr32DynamicWarp<float, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicWarp<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i],_vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicWarp<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicWarp<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr32DynamicWarp<float, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	}
+
+#endif
+}
+
+void SpMVFloatWarp::invokeKernelBLAS(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+
+	/*get the number of threads per block*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*invoke the kernel*/
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicWarpBLAS<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicWarpBLAS<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicWarpBLAS<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr32DynamicWarpBLAS<float, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _alpha, _beta);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr32DynamicWarpBLAS<float, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i],_vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr32DynamicWarpBLAS<float, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _alpha, _beta);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr32DynamicWarpBLAS<float, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr32DynamicWarpBLAS<float, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _alpha, _beta);
+	}
+
+#endif
+}
+
+/*double-precision floating point*/
+SpMVDoubleVector::SpMVDoubleVector(Options* opt) :
+		SpMV(opt) {
+
+	_rowOffsets.resize(_numGPUs, NULL);
+	_colIndexValues.resize(_numGPUs, NULL);
+	_numericalValues.resize(_numGPUs, NULL);
+	_vectorY.resize(_numGPUs, NULL);
+
+	_vectorX.resize(_numGPUs, NULL);
+
+	_alpha = _opt->_alpha;
+	_beta = _opt->_beta;
+
+}
+/*free the device buffers (matrix arrays and the X/Y vectors) held for each GPU*/
+SpMVDoubleVector::~SpMVDoubleVector() {
+	/*each buffer must be freed with its owning device selected*/
+	for (int32_t i = 0; i < _numGPUs; ++i) {
+		/*select the device*/
+		cudaSetDevice(_opt->_gpus[i].first);
+		CudaCheckError();
+
+		/*release the resources; entries never allocated are still NULL and are skipped*/
+		if (_rowOffsets[i]) {
+			cudaFree(_rowOffsets[i]);
+		}
+		if (_colIndexValues[i]) {
+			cudaFree(_colIndexValues[i]);
+		}
+		if (_numericalValues[i]) {
+			cudaFree(_numericalValues[i]);
+		}
+		/*loadData() allocates Y on every GPU, so free it on every GPU, not only on the first*/
+		if (_vectorY[i]) {
+			cudaFree(_vectorY[i]);
+		}
+		if (_vectorX[i]) {
+			cudaFree(_vectorX[i]);
+		}
+	}
+}
+void SpMVDoubleVector::loadData() {
+	size_t numBytes;
+	/*Uploads the CSR matrix arrays and the x and y vectors from _opt to every configured GPU; numBytes holds the size of the buffer currently being transferred*/
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+	cudaTextureDesc texDesc;
+	cudaResourceDesc resDesc;
+
+	/*texture parameters shared by all GPUs: clamp addressing, point filtering, element-type reads*/
+	memset(&texDesc, 0, sizeof(texDesc));
+	texDesc.addressMode[0] = cudaAddressModeClamp;
+	texDesc.addressMode[1] = cudaAddressModeClamp;
+	texDesc.filterMode = cudaFilterModePoint;
+	texDesc.readMode = cudaReadModeElementType;
+#endif
+
+	/*repeat the whole upload for each GPU*/
+	for (int32_t i = 0; i < _numGPUs; ++i) {
+
+		/*select the device that the following allocations and copies go to*/
+		cudaSetDevice(_opt->_gpus[i].first);
+		CudaCheckError();
+
+		/*allocate the row counter (a single uint32_t); it is not initialized here*/
+		cudaMalloc(&_cudaRowCounters[i], sizeof(uint32_t));
+		CudaCheckError();
+		/*publish the matrix dimensions through the _cudaNumRows and _cudaNumCols device symbols*/
+		cudaMemcpyToSymbol(_cudaNumRows, &_opt->_numRows, sizeof(uint32_t));
+		CudaCheckError();
+
+		cudaMemcpyToSymbol(_cudaNumCols, &_opt->_numCols, sizeof(uint32_t));
+		CudaCheckError();
+
+		/******************************************************
+		 * Load matrix data
+		 ******************************************************/
+		numBytes = (_opt->_numRows + 1) * sizeof(uint32_t);
+		cudaMalloc(&_rowOffsets[i], numBytes);
+		CudaCheckError();
+
+		cudaMemcpy(_rowOffsets[i], _opt->_rowOffsets, numBytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+		/*column indices: one uint32_t per stored value*/
+		numBytes = _opt->_numValues * sizeof(uint32_t);
+		cudaMalloc(&_colIndexValues[i], numBytes);
+		CudaCheckError();
+
+		cudaMemcpy(_colIndexValues[i], _opt->_colIndexValues, numBytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+		/*load the numerical values (one double per stored value)*/
+		numBytes = _opt->_numValues * sizeof(double);
+		cudaMalloc(&_numericalValues[i], numBytes);
+		CudaCheckError();
+
+		cudaMemcpy(_numericalValues[i], _opt->_numericalValues, numBytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+		/*****************************************************
+		 * Load vector X data
+		 ******************************************************/
+		numBytes = _opt->_numCols * sizeof(double);
+		cudaMalloc(&_vectorX[i], numBytes);
+		CudaCheckError();
+
+		cudaMemcpy(_vectorX[i], _opt->_vectorX, numBytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+		/*wrap the device copy of x in a linear texture with two signed 32-bit channels, so each double is read as one int2 texel*/
+		memset(&resDesc, 0, sizeof(resDesc));
+		resDesc.resType = cudaResourceTypeLinear;
+		resDesc.res.linear.devPtr = _vectorX[i];
+		resDesc.res.linear.desc = cudaCreateChannelDesc(32, 32, 0, 0,
+				cudaChannelFormatKindSigned);
+		resDesc.res.linear.sizeInBytes = numBytes;
+		cudaCreateTextureObject(&_texVectorX[i], &resDesc, &texDesc, NULL);
+		CudaCheckError();
+#endif
+		/*****************************************************
+		 * vector Y data
+		 ******************************************************/
+		numBytes = _opt->_numRows * sizeof(double);
+		/*allocate space on the current GPU (this runs for every GPU of the loop, not only the first)*/
+		cudaMalloc(&_vectorY[i], numBytes);
+		CudaCheckError();
+
+		/*copy the host-side y to the device*/
+		cudaMemcpy(_vectorY[i], _opt->_vectorY, numBytes,
+				cudaMemcpyHostToDevice);
+		CudaCheckError();
+	}
+}
+void SpMVDoubleVector::storeData() {
+	/*copy y back from the first GPU into _opt->_vectorY and, when an output file name is set, also write it to that file*/
+	uint64_t numBytes = _opt->_numRows * sizeof(double);
+
+	/*select the device (only the first configured GPU is read back)*/
+	cudaSetDevice(_opt->_gpus[0].first);
+	CudaCheckError();
+
+	/*copy back the data*/
+	cudaMemcpy(_opt->_vectorY, _vectorY[0], numBytes, cudaMemcpyDeviceToHost);
+	CudaCheckError();
+
+	/*without an output file name there is nothing more to do*/
+	FILE* file;
+	if (_opt->_outFileName.length() == 0) {
+		return;
+	}
+
+	file = fopen(_opt->_outFileName.c_str(), "w");
+	if (!file) {
+		cerr << "Failed to open file: " << _opt->_outFileName << endl;
+		return;
+	}
+
+	/*write one value per line*/
+	double* ptr = (double*) _opt->_vectorY;
+	for (uint32_t i = 0; i < _opt->_numRows; ++i) {
+		fprintf(file, "%lf\n", ptr[i]);
+	}
+
+	/*close the file; file comes from fopen above, so the stdout guard is expected to always pass*/
+	if (file != stdout) {
+		fclose(file);
+	}
+}
+void SpMVDoubleVector::invokeKernel(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+	/*Launches the double-precision y = Ax kernel csr64DynamicVector on the buffers of GPU i; the launch is not followed by an error check here*/
+	/*get the launch configuration (threads per block and number of blocks) for GPU i*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*pick the threads-per-row template argument from the mean row length: <=2 -> 2, <=4 -> 4, <=64 -> 8, otherwise 32; x is passed as a texture object or as a plain pointer*/
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicVector<double, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicVector<double, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicVector<double, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr64DynamicVector<double, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicVector<double, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicVector<double, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicVector<double, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr64DynamicVector<double, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	}
+
+#endif
+}
+
+void SpMVDoubleVector::invokeKernelBLAS(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+	/*Launches the double-precision y = alpha * Ax + beta * y kernel csr64DynamicVectorBLAS on the buffers of GPU i*/
+	/*get the launch configuration (threads per block and number of blocks) for GPU i*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*threads per row by mean row length: <=2 -> 2, <=4 -> 4, <=64 -> 8, otherwise 32; _vectorY[i] is passed twice, presumably as the input and the output vector (kernel signature not visible here - confirm)*/
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicVectorBLAS<double, 2,
+				MAX_NUM_THREADS_PER_BLOCK / 2><<<numThreadBlocks,
+				numThreadsPerBlock>>>(_cudaRowCounters[i], _rowOffsets[i],
+				_colIndexValues[i], _numericalValues[i], _texVectorX[i],
+				_vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicVectorBLAS<double, 4,
+				MAX_NUM_THREADS_PER_BLOCK / 4><<<numThreadBlocks,
+				numThreadsPerBlock>>>(_cudaRowCounters[i], _rowOffsets[i],
+				_colIndexValues[i], _numericalValues[i], _texVectorX[i],
+				_vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicVectorBLAS<double, 8,
+				MAX_NUM_THREADS_PER_BLOCK / 8><<<numThreadBlocks,
+				numThreadsPerBlock>>>(_cudaRowCounters[i], _rowOffsets[i],
+				_colIndexValues[i], _numericalValues[i], _texVectorX[i],
+				_vectorY[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr64DynamicVectorBLAS<double, 32,
+				MAX_NUM_THREADS_PER_BLOCK / 32><<<numThreadBlocks,
+				numThreadsPerBlock>>>(_cudaRowCounters[i], _rowOffsets[i],
+				_colIndexValues[i], _numericalValues[i], _texVectorX[i],
+				_vectorY[i], _vectorY[i], _alpha, _beta);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicVectorBLAS<double, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicVectorBLAS<double, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicVectorBLAS<double, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr64DynamicVectorBLAS<double, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	}
+#endif
+}
+
+void SpMVDoubleWarp::invokeKernel(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+	/*Launches the double-precision y = Ax kernel csr64DynamicWarp on the buffers of GPU i; the launch is not followed by an error check here*/
+	/*get the launch configuration (threads per block and number of blocks) for GPU i*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*pick the threads-per-row template argument from the mean row length: <=2 -> 2, <=4 -> 4, <=64 -> 8, otherwise 32; x is passed as a texture object or as a plain pointer*/
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicWarp<double, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicWarp<double, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicWarp<double, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr64DynamicWarp<double, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i]);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicWarp<double, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicWarp<double, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicWarp<double, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	} else {
+		spmv_csr::csr64DynamicWarp<double, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i]);
+	}
+
+#endif
+}
+
+void SpMVDoubleWarp::invokeKernelBLAS(const int32_t i) {
+	int32_t numThreadsPerBlock;
+	int32_t numThreadBlocks;
+	/*Launches the double-precision y = alpha * Ax + beta * y kernel csr64DynamicWarpBLAS on the buffers of GPU i*/
+	/*get the launch configuration (threads per block and number of blocks) for GPU i*/
+	getKernelGridInfo(i, numThreadsPerBlock, numThreadBlocks);
+
+	/*threads per row by mean row length: <=2 -> 2, <=4 -> 4, <=64 -> 8, otherwise 32; _vectorY[i] is passed twice, presumably as the input and the output vector (kernel signature not visible here - confirm)*/
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicWarpBLAS<double, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicWarpBLAS<double, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicWarpBLAS<double, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+				numThreadBlocks, numThreadsPerBlock>>>(_cudaRowCounters[i],
+				_rowOffsets[i], _colIndexValues[i], _numericalValues[i],
+				_texVectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr64DynamicWarpBLAS<double, 32,
+				MAX_NUM_THREADS_PER_BLOCK / 32><<<numThreadBlocks,
+				numThreadsPerBlock>>>(_cudaRowCounters[i], _rowOffsets[i],
+				_colIndexValues[i], _numericalValues[i], _texVectorX[i],
+				_vectorY[i], _vectorY[i], _alpha, _beta);
+	}
+#else
+	if (_meanElementsPerRow <= 2) {
+		spmv_csr::csr64DynamicWarpBLAS<double, 2, MAX_NUM_THREADS_PER_BLOCK / 2><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if (_meanElementsPerRow <= 4) {
+		spmv_csr::csr64DynamicWarpBLAS<double, 4, MAX_NUM_THREADS_PER_BLOCK / 4><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else if(_meanElementsPerRow <= 64) {
+		spmv_csr::csr64DynamicWarpBLAS<double, 8, MAX_NUM_THREADS_PER_BLOCK / 8><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	} else {
+		spmv_csr::csr64DynamicWarpBLAS<double, 32, MAX_NUM_THREADS_PER_BLOCK / 32><<<
+		numThreadBlocks, numThreadsPerBlock>>>(
+				_cudaRowCounters[i], _rowOffsets[i], _colIndexValues[i],
+				_numericalValues[i], _vectorX[i], _vectorY[i], _vectorY[i], _alpha, _beta);
+	}
+
+#endif
+}
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMV.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMV.h
new file mode 100644
index 000000000..55f89d3b3
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMV.h
@@ -0,0 +1,152 @@
+/*
+ * SpMV.h
+ *
+ *  Created on: Nov 21, 2014
+ *      Author: yongchao
+ */
+
+#ifndef SPMV_H_
+#define SPMV_H_
+#include "Options.h"
+#include "sys/time.h"
+
+class SpMV { /*abstract base of the SpMV drivers: holds the options, per-GPU bookkeeping and the launch-configuration/timing helpers; data transfer and kernel launches are pure virtual*/
+public:
+	SpMV(Options* opt);
+	virtual ~SpMV() = 0;
+
+	/*compute the launch configuration for GPU dev (an index into _opt->_gpus): both results are returned through the reference parameters*/
+	inline void getKernelGridInfo(const int32_t dev,
+			int32_t & numThreadsPerBlock, int32_t &numThreadBlocks) {
+
+		/*set to the maximum number of threads per block*/
+		numThreadsPerBlock = _opt->_gpus[dev].second.maxThreadsPerBlock;
+
+		/*number of multiprocessors times (maxThreadsPerMultiProcessor / block size), using integer division*/
+		numThreadBlocks = _opt->_gpus[dev].second.multiProcessorCount
+				* (_opt->_gpus[dev].second.maxThreadsPerMultiProcessor
+						/ numThreadsPerBlock);
+
+		//cerr << numThreadsPerBlock << " " << numThreadBlocks << endl;
+	}
+	/*current time of day from gettimeofday, expressed in milliseconds*/
+	inline double getSysTime() {
+		double dtime;
+		struct timeval tv;
+
+		/*get the time of the day*/
+		gettimeofday(&tv, NULL);
+
+		/*convert seconds and microseconds to milliseconds*/
+		dtime = ((double) tv.tv_sec) * 1000.0;
+		dtime += ((double) tv.tv_usec) / 1000.0;
+
+		return dtime;
+	}
+	void spmvKernel(); /*defined outside this header*/
+	virtual void loadData() = 0;
+	virtual void storeData() = 0;
+
+	/*y = Ax; i is the index of the GPU whose buffers are used*/
+	virtual void invokeKernel(const int32_t i) = 0;
+	/*y = alpha * Ax + beta * y*/
+	virtual void invokeKernelBLAS(const int32_t i) = 0;
+
+protected:
+	/*options object supplying the matrix data, the vectors and the GPU list*/
+	Options* _opt;
+
+	/*number of GPUs*/
+	int32_t _numGPUs;
+
+	/*average number of elements per row; the derived classes use it to choose the threads-per-row kernel variant*/
+	int32_t _meanElementsPerRow;
+
+	/*CUDA streams (their use is not visible in this header)*/
+	vector<cudaStream_t> _streams;
+
+	/*row counters: one device pointer per GPU, passed to the kernels as cudaRowCounter*/
+	vector<uint32_t*> _cudaRowCounters;
+	/*texture objects for vector x, one per GPU; only present when texture memory is enabled*/
+#if defined(FLOAT_USE_TEXTURE_MEMORY) || defined(DOUBLE_USE_TEXTURE_MEMORY)
+	vector<cudaTextureObject_t> _texVectorX;
+#endif
+};
+
+/*single-precision SpMV driver*/
+/*vector-based row dynamic distribution*/
+class SpMVFloatVector: public SpMV {
+public:
+	SpMVFloatVector(Options* opt);
+	virtual ~SpMVFloatVector();
+
+	void loadData();
+	void storeData();
+
+	/*y = Ax*/
+	virtual void invokeKernel(const int32_t i);
+	/*y = alpha * Ax + beta * y*/
+	virtual void invokeKernelBLAS(const int32_t i);
+
+//protected: (commented out, so the members below are public)
+	vector<uint32_t*> _rowOffsets; /*CSR row offsets; presumably one device pointer per GPU as in the double-precision class - confirm against the float loadData*/
+	vector<uint32_t*> _colIndexValues; /*CSR column indices*/
+	vector<float*> _numericalValues; /*CSR values*/
+	vector<float*> _vectorY;
+	vector<float*> _vectorX;
+	/*scalars of y = alpha * Ax + beta * y*/
+	float _alpha;
+	float _beta;
+};
+
+/*warp-based row dynamic distribution: reuses the data handling of SpMVFloatVector and overrides only the two kernel launchers*/
+class SpMVFloatWarp: public SpMVFloatVector {
+public:
+	SpMVFloatWarp(Options* opt) :
+			SpMVFloatVector(opt) {
+	}
+
+	/*y = Ax*/
+	void invokeKernel(const int32_t i);
+	/*y = alpha * Ax + beta * y*/
+	void invokeKernelBLAS(const int32_t i);
+};
+
+class SpMVDoubleVector: public SpMV { /*double-precision SpMV driver whose launchers use the vector-based csr64DynamicVector kernels*/
+public:
+	SpMVDoubleVector(Options* opt);
+	virtual ~SpMVDoubleVector();
+
+	void loadData();
+	void storeData();
+
+	/*y = Ax*/
+	virtual void invokeKernel(const int32_t i);
+
+	/*y = alpha * Ax + beta * y*/
+	virtual void invokeKernelBLAS(const int32_t i);
+
+//protected: (commented out, so the members below are public)
+	vector<uint32_t*> _rowOffsets; /*device copies of the CSR row offsets, one pointer per GPU*/
+	vector<uint32_t*> _colIndexValues; /*device copies of the CSR column indices, one pointer per GPU*/
+	vector<double*> _numericalValues; /*device copies of the CSR values, one pointer per GPU*/
+	vector<double*> _vectorY; /*device copies of y, one pointer per GPU*/
+	vector<double*> _vectorX; /*device copies of x, one pointer per GPU*/
+	/*scalars of y = alpha * Ax + beta * y*/
+	double _alpha;
+	double _beta;
+};
+
+/*warp-based row dynamic distribution: reuses the data handling of SpMVDoubleVector and overrides only the two kernel launchers*/
+class SpMVDoubleWarp: public SpMVDoubleVector {
+public:
+	SpMVDoubleWarp(Options* opt) :
+			SpMVDoubleVector(opt) {
+	}
+	/*y = Ax*/
+	void invokeKernel(const int32_t i);
+
+	/*y = alpha * Ax + beta * y*/
+	void invokeKernelBLAS(const int32_t i);
+};
+#endif /* SPMV_H_ */
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.cu b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.cu
new file mode 100644
index 000000000..74ed61627
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.cu
@@ -0,0 +1,12 @@
+/*
+ * SpMVCSR.cu
+ *
+ *  Created on: Nov 25, 2014
+ *      Author: yongchao
+ */
+#include "SpMVCSR.h"
+
+/*definitions of the constant-memory matrix dimensions that SpMVCSR.h declares extern*/
+__constant__ uint32_t _cudaNumRows;
+__constant__ uint32_t _cudaNumCols;
+
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.h
new file mode 100644
index 000000000..effa194d0
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.h
@@ -0,0 +1,696 @@
+/*
+ * SpMVCSR.h
+ *
+ *  Created on: Nov 25, 2014
+ *      Author: yongchao
+ */
+
+#ifndef SPMVCSR_H_
+#define SPMVCSR_H_
+#include "Types.h"
+
+#pragma once
+
+extern __constant__ uint32_t _cudaNumRows; /*number of matrix rows; the kernels below stop claiming rows once this bound is reached*/
+extern __constant__ uint32_t _cudaNumCols; /*number of matrix columns*/
+
+namespace spmv_csr {
+
+/*device functions*/
+template < typename T>
+__device__ inline T shfl_down_64bits(T var, int32_t srcLane,
+		int32_t width) {
+	/*Returns var of the lane srcLane positions higher (srcLane is a delta, despite its name) within width-lane segments; T must be 64 bits wide. All lanes of the caller's segment must call this together; assumes a 1-D block and width a power of two in [1, 32]; requires CUDA 9+*/
+	int2 a = *reinterpret_cast<int2*>(&var);
+	/*participation mask for the *_sync shuffles: the width lanes of this thread's segment (the legacy mask-less __shfl_down is unavailable on compute capability 7.x and newer)*/
+	const uint32_t mask = (0xffffffffu >> (32 - width)) << ((threadIdx.x & 31) / width * width);
+	a.x = __shfl_down_sync(mask, a.x, srcLane, width);
+	a.y = __shfl_down_sync(mask, a.y, srcLane, width);
+	
+	return *reinterpret_cast<T*>(&a);
+}
+
+/*overloaded device functions (not macros) returning element index of vector x, read either through a texture object or from a plain pointer*/
+__device__ inline float FLOAT_VECTOR_GET(const cudaTextureObject_t vectorX, uint32_t index){
+	return tex1Dfetch<float>(vectorX, index);
+}
+__device__ inline float FLOAT_VECTOR_GET (const float* __restrict vectorX, uint32_t index){
+	return vectorX[index];
+}
+
+__device__ inline double DOUBLE_VECTOR_GET (const cudaTextureObject_t vectorX, uint32_t index){
+	/*texture overload: element index of x is fetched as one int2 texel (two 32-bit integers)*/
+	int2 v = tex1Dfetch<int2>(vectorX, index);
+
+	/*reassemble the double: v.y supplies the high 32 bits and v.x the low 32 bits*/
+	return __hiloint2double(v.y, v.x);
+}
+__device__ inline double DOUBLE_VECTOR_GET (const double* __restrict vectorX, uint32_t index){
+	return vectorX[index];
+}
+
+
+/*32-bit y = Ax on a CSR matrix, warp-based dynamic row distribution: lane 0 of each warp atomically claims 32 / THREADS_PER_VECTOR consecutive rows from *cudaRowCounter and every THREADS_PER_VECTOR-lane vector reduces one of them into vectorY[row]. Requires 1-D blocks of full warps with at most THREADS_PER_VECTOR * MAX_NUM_VECTORS_PER_BLOCK threads, THREADS_PER_VECTOR a power of two in [2, 32] and CUDA 9+ (*_sync warp intrinsics); the row counter is not reset here*/
+template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK>
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+__global__ void csr32DynamicWarp(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const cudaTextureObject_t vectorX,  T* vectorY) {
+#else
+	__global__ void csr32DynamicWarp(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+			const T* __restrict numericalValues, const T* __restrict vectorX, T* vectorY) {
+#endif
+	uint32_t i;
+	T sum;
+	uint32_t row;
+	uint32_t rowStart, rowEnd;
+	const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+	const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the thread block*/
+	const uint32_t warpLaneId = threadIdx.x & 31;	/*lane index in the warp*/
+	const uint32_t warpVectorId = warpLaneId / THREADS_PER_VECTOR;	/*vector index in the warp*/
+	const uint32_t vectorMask = (0xffffffffu >> (32 - THREADS_PER_VECTOR)) << (warpVectorId * THREADS_PER_VECTOR);	/*warp lanes that form this thread's vector*/
+	__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];
+
+	/*lane 0 of the warp claims one row for every vector of the warp*/
+	if (warpLaneId == 0) {
+		row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+	}
+	/*broadcast the value to other threads in the same warp and compute the row index of each vector (full mask: every lane of the warp is still running here)*/
+	row = __shfl_sync(0xffffffffu, row, 0) + warpVectorId;
+
+	/*check the row range*/
+	while (row < _cudaNumRows) {
+		/*use two threads to fetch the row offset*/
+		if (laneId < 2) {
+			space[vectorId][laneId] = rowOffsets[row + laneId];
+		}
+		__syncwarp(vectorMask);	/*make both offsets visible to every lane of the vector; implicit warp synchrony cannot be relied on*/
+		rowStart = space[vectorId][0];
+		rowEnd = space[vectorId][1];
+
+		/*each lane accumulates its share of the row*/
+		sum = 0;
+		/*compute dot product*/
+		if (THREADS_PER_VECTOR == 32) {
+
+			/*ensure aligned memory access*/
+			i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+			/*process the unaligned part*/
+			if (i >= rowStart && i < rowEnd) {
+				sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+
+				/*process the aligned part*/
+			for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		} else {
+			/*regardless of the global memory access alignment*/
+			for (i = rowStart + laneId; i < rowEnd; i +=
+					THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		}
+		/*intra-vector reduction; only the lanes of this vector take part*/
+		for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+			sum += __shfl_down_sync(vectorMask, sum, i, THREADS_PER_VECTOR);
+		}
+
+		/*lane 0 of the vector holds the complete sum*/
+		if (laneId == 0) {
+			/*save the results*/
+			vectorY[row] = sum;
+		}
+
+		/*get a new row index*/
+		if(warpLaneId == 0){
+			row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+		}
+		/*broadcast the row index to the other threads in the same warp and compute the row index of each vector; lanes that already left the loop have exited, which a full mask permits*/
+		row = __shfl_sync(0xffffffffu, row, 0) + warpVectorId;
+
+	}/*while*/
+}
+
+/*32-bit y = Ax on a CSR matrix, vector-based dynamic row distribution: lane 0 of every THREADS_PER_VECTOR-lane vector atomically claims one row at a time from *cudaRowCounter and the vector reduces it into vectorY[row]. Requires 1-D blocks with at most THREADS_PER_VECTOR * MAX_NUM_VECTORS_PER_BLOCK threads, THREADS_PER_VECTOR a power of two in [2, 32] and CUDA 9+ (*_sync warp intrinsics); the row counter is not reset here*/
+template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK>
+#ifdef FLOAT_USE_TEXTURE_MEMORY
+__global__ void csr32DynamicVector(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const cudaTextureObject_t vectorX, T* vectorY) {
+#else
+	__global__ void csr32DynamicVector(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+			const T* __restrict numericalValues, const T* __restrict vectorX, T* vectorY) {
+#endif
+
+	uint32_t i;
+	T sum;
+	uint32_t row;
+	uint32_t rowStart, rowEnd;
+	const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+	const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the block*/
+	__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];
+	const uint32_t vectorMask = (0xffffffffu >> (32 - THREADS_PER_VECTOR)) << ((threadIdx.x & 31) / THREADS_PER_VECTOR * THREADS_PER_VECTOR);	/*warp lanes that form this thread's vector*/
+	/*get the row index*/
+	if (laneId == 0) {
+		row = atomicAdd(cudaRowCounter, 1);
+	}
+	/*broadcast the value to other lanes from lane 0 (only the lanes of this vector take part, vectors of one warp run independently)*/
+	row = __shfl_sync(vectorMask, row, 0, THREADS_PER_VECTOR);
+
+	/*check the row range*/
+	while (row < _cudaNumRows) {
+		/*use two threads to fetch the row offset*/
+		if (laneId < 2) {
+			space[vectorId][laneId] = rowOffsets[row + laneId];
+		}
+		__syncwarp(vectorMask);	/*make both offsets visible to every lane of the vector; implicit warp synchrony cannot be relied on*/
+		rowStart = space[vectorId][0];
+		rowEnd = space[vectorId][1];
+
+		/*each lane accumulates its share of the row*/
+		sum = 0;
+		/*compute dot product*/
+		if (THREADS_PER_VECTOR == 32) {
+
+			/*ensure aligned memory access*/
+			i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+			/*process the unaligned part*/
+			if (i >= rowStart && i < rowEnd) {
+				sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+
+				/*process the aligned part*/
+			for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		} else {
+			/*regardless of the global memory access alignment*/
+			for (i = rowStart + laneId; i < rowEnd; i +=
+					THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		}
+		/*intra-vector reduction*/
+		for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+			sum += __shfl_down_sync(vectorMask, sum, i, THREADS_PER_VECTOR);
+		}
+
+		/*save the results and get a new row*/
+		if (laneId == 0) {
+			/*save the results*/
+			vectorY[row] = sum;
+
+			/*get a new row index*/
+			row = atomicAdd(cudaRowCounter, 1);
+		}
+		row = __shfl_sync(vectorMask, row, 0, THREADS_PER_VECTOR);
+	}/*while*/
+}
+
+	/*32-bit y = alpha * Ax + beta * y on a CSR matrix, warp-based dynamic row distribution: lane 0 of each warp atomically claims 32 / THREADS_PER_VECTOR consecutive rows from *cudaRowCounter and every THREADS_PER_VECTOR-lane vector updates vectorY[row] in place. Requires 1-D blocks of full warps with at most THREADS_PER_VECTOR * MAX_NUM_VECTORS_PER_BLOCK threads, THREADS_PER_VECTOR a power of two in [2, 32] and CUDA 9+ (*_sync warp intrinsics); the row counter is not reset here*/
+	template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK>
+	#ifdef FLOAT_USE_TEXTURE_MEMORY
+	__global__ void csr32DynamicWarpBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+			const T* __restrict numericalValues, const cudaTextureObject_t vectorX,  T* vectorY, const T alpha, const T beta) {
+	#else
+		__global__ void csr32DynamicWarpBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+				const T* __restrict numericalValues, const T* __restrict vectorX, T* vectorY, const T alpha, const T beta) {
+	#endif
+		uint32_t i;
+		T sum;
+		uint32_t row;
+		uint32_t rowStart, rowEnd;
+		const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+		const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the thread block*/
+		const uint32_t warpLaneId = threadIdx.x & 31;	/*lane index in the warp*/
+		const uint32_t warpVectorId = warpLaneId / THREADS_PER_VECTOR;	/*vector index in the warp*/
+		const uint32_t vectorMask = (0xffffffffu >> (32 - THREADS_PER_VECTOR)) << (warpVectorId * THREADS_PER_VECTOR);	/*warp lanes that form this thread's vector*/
+		__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];
+
+		/*lane 0 of the warp claims one row for every vector of the warp*/
+		if (warpLaneId == 0) {
+			row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+		}
+		/*broadcast the value to other threads in the same warp and compute the row index of each vector (full mask: every lane of the warp is still running here)*/
+		row = __shfl_sync(0xffffffffu, row, 0) + warpVectorId;
+
+		/*check the row range*/
+		while (row < _cudaNumRows) {
+			/*use two threads to fetch the row offset*/
+			if (laneId < 2) {
+				space[vectorId][laneId] = rowOffsets[row + laneId];
+			}
+			__syncwarp(vectorMask);	/*make both offsets visible to every lane of the vector; implicit warp synchrony cannot be relied on*/
+			rowStart = space[vectorId][0];
+			rowEnd = space[vectorId][1];
+
+			/*each lane accumulates its share of the row*/
+			sum = 0;
+			/*compute dot product*/
+			if (THREADS_PER_VECTOR == 32) {
+
+				/*ensure aligned memory access*/
+				i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+				/*process the unaligned part*/
+				if (i >= rowStart && i < rowEnd) {
+					sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+				}
+
+					/*process the aligned part*/
+				for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+					sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+				}
+			} else {
+				/*regardless of the global memory access alignment*/
+				for (i = rowStart + laneId; i < rowEnd; i +=
+						THREADS_PER_VECTOR) {
+					sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+				}
+			}
+			/*scale the partial sum by alpha, then reduce within the vector; only the lanes of this vector take part*/
+			sum *= alpha;
+			for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+				sum += __shfl_down_sync(vectorMask, sum, i, THREADS_PER_VECTOR);
+			}
+
+			/*lane 0 of the vector holds the complete sum*/
+			if (laneId == 0) {
+				/*save the results*/
+				vectorY[row] = sum + beta * vectorY[row];
+			}
+
+			/*get a new row index*/
+			if(warpLaneId == 0){
+				row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+			}
+			/*broadcast the row index to the other threads in the same warp and compute the row index of each vector; lanes that already left the loop have exited, which a full mask permits*/
+			row = __shfl_sync(0xffffffffu, row, 0) + warpVectorId;
+
+		}/*while*/
+	}
+
+	/*32-bit y = alpha * Ax + beta * y on a CSR matrix, vector-based dynamic row distribution: lane 0 of every THREADS_PER_VECTOR-lane vector atomically claims one row at a time from *cudaRowCounter and the vector updates vectorY[row] in place. Requires 1-D blocks with at most THREADS_PER_VECTOR * MAX_NUM_VECTORS_PER_BLOCK threads, THREADS_PER_VECTOR a power of two in [2, 32] and CUDA 9+ (*_sync warp intrinsics); the row counter is not reset here*/
+	template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK>
+	#ifdef FLOAT_USE_TEXTURE_MEMORY
+	__global__ void csr32DynamicVectorBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+			const T* __restrict numericalValues, const cudaTextureObject_t vectorX, T* vectorY, const T alpha, const T beta) {
+	#else
+		__global__ void csr32DynamicVectorBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+				const T* __restrict numericalValues, const T* __restrict vectorX, T* vectorY, const T alpha, const T beta) {
+	#endif
+
+		uint32_t i;
+		T sum;
+		uint32_t row;
+		uint32_t rowStart, rowEnd;
+		const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+		const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the block*/
+		__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];
+		const uint32_t vectorMask = (0xffffffffu >> (32 - THREADS_PER_VECTOR)) << ((threadIdx.x & 31) / THREADS_PER_VECTOR * THREADS_PER_VECTOR);	/*warp lanes that form this thread's vector*/
+		/*get the row index*/
+		if (laneId == 0) {
+			row = atomicAdd(cudaRowCounter, 1);
+		}
+		/*broadcast the value to other lanes from lane 0 (only the lanes of this vector take part, vectors of one warp run independently)*/
+		row = __shfl_sync(vectorMask, row, 0, THREADS_PER_VECTOR);
+
+		/*check the row range*/
+		while (row < _cudaNumRows) {
+			/*use two threads to fetch the row offset*/
+			if (laneId < 2) {
+				space[vectorId][laneId] = rowOffsets[row + laneId];
+			}
+			__syncwarp(vectorMask);	/*make both offsets visible to every lane of the vector; implicit warp synchrony cannot be relied on*/
+			rowStart = space[vectorId][0];
+			rowEnd = space[vectorId][1];
+
+			/*each lane accumulates its share of the row*/
+			sum = 0;
+			/*compute dot product*/
+			if (THREADS_PER_VECTOR == 32) {
+
+				/*ensure aligned memory access*/
+				i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+				/*process the unaligned part*/
+				if (i >= rowStart && i < rowEnd) {
+					sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+				}
+
+					/*process the aligned part*/
+				for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+					sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+				}
+			} else {
+				/*regardless of the global memory access alignment*/
+				for (i = rowStart + laneId; i < rowEnd; i +=
+						THREADS_PER_VECTOR) {
+					sum += numericalValues[i] * FLOAT_VECTOR_GET(vectorX, colIndexValues[i]);
+				}
+			}
+			/*scale the partial sum by alpha, then reduce within the vector*/
+			sum *= alpha;
+			for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+				sum += __shfl_down_sync(vectorMask, sum, i, THREADS_PER_VECTOR);
+			}
+
+			/*save the results and get a new row*/
+			if (laneId == 0) {
+				/*save the results*/
+				vectorY[row] = sum + beta * vectorY[row];
+
+				/*get a new row index*/
+				row = atomicAdd(cudaRowCounter, 1);
+			}
+			row = __shfl_sync(vectorMask, row, 0, THREADS_PER_VECTOR);
+		}/*while*/
+	}
+
+/*64-bit y = Ax on a CSR matrix, vector-based dynamic row distribution: lane 0 of every THREADS_PER_VECTOR-lane vector atomically claims one row at a time from *cudaRowCounter and the vector reduces it into vectorY[row]. Requires 1-D blocks with at most THREADS_PER_VECTOR * MAX_NUM_VECTORS_PER_BLOCK threads, THREADS_PER_VECTOR a power of two in [2, 32] and CUDA 9+ (*_sync warp intrinsics); the row counter is not reset here*/
+template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK>
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+__global__ void csr64DynamicVector(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const cudaTextureObject_t vectorX, T* vectorY)
+#else
+__global__ void csr64DynamicVector(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const T* __restrict vectorX, T* vectorY)
+#endif
+{
+	uint32_t i;
+	T sum;
+	uint32_t row;
+	uint32_t rowStart, rowEnd;
+	const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+	const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the block*/
+	const uint32_t vectorMask = (0xffffffffu >> (32 - THREADS_PER_VECTOR)) << ((threadIdx.x & 31) / THREADS_PER_VECTOR * THREADS_PER_VECTOR);	/*warp lanes that form this thread's vector*/
+	__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];
+
+	/*get the row index*/
+	if (laneId == 0) {
+		row = atomicAdd(cudaRowCounter, 1);
+	}
+	/*broadcast the value to other lanes from lane 0 (only the lanes of this vector take part, vectors of one warp run independently)*/
+	row = __shfl_sync(vectorMask, row, 0, THREADS_PER_VECTOR);
+
+	/*check the row range*/
+	while (row < _cudaNumRows) {
+		/*use two threads to fetch the row offset*/
+		if (laneId < 2) {
+			space[vectorId][laneId] = rowOffsets[row + laneId];
+		}
+		__syncwarp(vectorMask);	/*make both offsets visible to every lane of the vector; implicit warp synchrony cannot be relied on*/
+		rowStart = space[vectorId][0];
+		rowEnd = space[vectorId][1];
+
+		/*each lane accumulates its share of the row*/
+		sum = 0;
+		/*compute dot product*/
+		if (THREADS_PER_VECTOR == 32) {
+
+			/*ensure aligned memory access*/
+			i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+			/*process the unaligned part*/
+			if (i >= rowStart && i < rowEnd) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+
+				/*process the aligned part*/
+			for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		} else {
+			/*regardless of the global memory access alignment*/
+			for (i = rowStart + laneId; i < rowEnd; i +=
+					THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		}
+		/*intra-vector reduction; __shfl_down_sync moves 64-bit types such as double directly, so no split into two 32-bit shuffles is needed*/
+		for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+			sum += __shfl_down_sync(vectorMask, sum, i, THREADS_PER_VECTOR);
+		}
+
+		/*save the results and get a new row*/
+		if (laneId == 0) {
+			/*save the results*/
+			vectorY[row] = sum;
+
+			/*get a new row index*/
+			row = atomicAdd(cudaRowCounter, 1);
+		}
+		row = __shfl_sync(vectorMask, row, 0, THREADS_PER_VECTOR);
+	}/*while*/
+}
+
+template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK> /*CSR SpMV kernel, vectorY = A * vectorX: each warp claims 32 / THREADS_PER_VECTOR consecutive rows per fetch, one THREADS_PER_VECTOR-lane vector per row*/
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+__global__ void csr64DynamicWarp(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const cudaTextureObject_t vectorX, T* vectorY)
+#else
+__global__ void csr64DynamicWarp(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const T* __restrict vectorX, T* vectorY)
+#endif
+{
+	uint32_t i;
+	T sum;
+	uint32_t row;
+	uint32_t rowStart, rowEnd;
+	const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+	const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the thread block*/
+	const uint32_t warpLaneId = threadIdx.x & 31;	/*lane index in the warp*/
+	const uint32_t warpVectorId = warpLaneId / THREADS_PER_VECTOR;	/*vector index in the warp*/
+	/*one [rowStart, rowEnd] slot per vector; indexed by vectorId, so blockDim.x / THREADS_PER_VECTOR must not exceed MAX_NUM_VECTORS_PER_BLOCK*/
+	__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];
+
+	/*get the row index: warp lane 0 claims a batch of rows from the global counter*/
+	if (warpLaneId == 0) {
+		row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+	}
+	/*broadcast the value to other threads in the same warp; each vector takes the row at offset warpVectorId in the batch*/
+	row = __shfl(row, 0) + warpVectorId;
+	/*NOTE(review): __shfl is the legacy mask-less shuffle (removed for Volta+); porting needs __shfl_sync with a correct participant mask*/
+	/*check the row range (_cudaNumRows is declared outside this view)*/
+	while (row < _cudaNumRows) {
+		/*NOTE(review): with THREADS_PER_VECTOR == 1 no lane has laneId == 1, so space[vectorId][1] would never be written*/
+		/*use two threads to fetch the row offsets: lane 0 loads rowOffsets[row], lane 1 loads rowOffsets[row + 1]*/
+		if (laneId < 2) {
+			space[vectorId][laneId] = rowOffsets[row + laneId];
+		}
+		rowStart = space[vectorId][0];
+		rowEnd = space[vectorId][1];
+		/*NOTE(review): no barrier separates the shared-memory store from the loads above; presumably relies on warp-synchronous execution - confirm*/
+		/*reset the per-lane partial sum; no emptiness test is made, the loops below just do not execute for an empty row*/
+		sum = 0;
+		/*compute dot product*/
+		if (THREADS_PER_VECTOR == 32) {
+
+			/*ensure aligned memory access: round rowStart down to a multiple of THREADS_PER_VECTOR, then offset by laneId*/
+			i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+			/*process the unaligned part: lanes whose index falls before rowStart skip this first step*/
+			if (i >= rowStart && i < rowEnd) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+
+				/*process the aligned part*/
+			for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		} else {
+			/*strided walk from rowStart, regardless of the global memory access alignment*/
+			for (i = rowStart + laneId; i < rowEnd; i +=
+					THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		}
+
+		/*intra-vector reduction; lane 0's sum is the one stored below (shfl_down_64bits is defined outside this view)*/
+		for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+			sum += shfl_down_64bits<T>(sum, i, THREADS_PER_VECTOR);
+		}
+
+		/*lane 0 of each vector saves the result for its row*/
+		if (laneId == 0) {
+			/*save the results*/
+			vectorY[row] = sum;
+		}
+
+		/*get a new row index: warp lane 0 claims the next batch of 32 / THREADS_PER_VECTOR rows*/
+		if(warpLaneId == 0){
+			row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+		}
+		/*broadcast the value to other threads in the same warp*/
+		row = __shfl(row, 0) + warpVectorId;
+
+	}/*while*/
+}
+
+/*64-bit functions*/
+template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK> /*BLAS-style CSR SpMV kernel, vectorY = alpha * A * vectorX + beta * inVectorY: each THREADS_PER_VECTOR-lane vector claims one row at a time*/
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+__global__ void csr64DynamicVectorBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const cudaTextureObject_t vectorX, const T* __restrict inVectorY, T* vectorY, const T alpha, const T beta)
+#else
+__global__ void csr64DynamicVectorBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const T* __restrict vectorX, const T* __restrict inVectorY, T* vectorY, const T alpha, const T beta)
+#endif
+{
+	uint32_t i;
+	T sum;
+	uint32_t row;
+	uint32_t rowStart, rowEnd;
+	const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+	const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the block*/
+	/*one [rowStart, rowEnd] slot per vector; indexed by vectorId, so blockDim.x / THREADS_PER_VECTOR must not exceed MAX_NUM_VECTORS_PER_BLOCK*/
+	__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];
+
+	/*get the row index: lane 0 of each vector claims one row from the global counter*/
+	if (laneId == 0) {
+		row = atomicAdd(cudaRowCounter, 1);
+	}
+	/*broadcast the value to other lanes from lane 0*/
+	row = __shfl(row, 0, THREADS_PER_VECTOR);
+	/*NOTE(review): __shfl is the legacy mask-less shuffle (removed for Volta+); porting needs __shfl_sync with a correct participant mask*/
+	/*check the row range (_cudaNumRows is declared outside this view)*/
+	while (row < _cudaNumRows) {
+		/*NOTE(review): with THREADS_PER_VECTOR == 1 no lane has laneId == 1, so space[vectorId][1] would never be written*/
+		/*use two threads to fetch the row offsets: lane 0 loads rowOffsets[row], lane 1 loads rowOffsets[row + 1]*/
+		if (laneId < 2) {
+			space[vectorId][laneId] = rowOffsets[row + laneId];
+		}
+		rowStart = space[vectorId][0];
+		rowEnd = space[vectorId][1];
+		/*NOTE(review): no barrier separates the shared-memory store from the loads above; presumably relies on warp-synchronous execution - confirm*/
+		/*reset the per-lane partial sum; no emptiness test is made, the loops below just do not execute for an empty row*/
+		sum = 0;
+		/*compute dot product*/
+		if (THREADS_PER_VECTOR == 32) {
+
+			/*ensure aligned memory access: round rowStart down to a multiple of THREADS_PER_VECTOR, then offset by laneId*/
+			i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+			/*process the unaligned part: lanes whose index falls before rowStart skip this first step*/
+			if (i >= rowStart && i < rowEnd) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+
+				/*process the aligned part*/
+			for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		} else {
+			/*strided walk from rowStart, regardless of the global memory access alignment*/
+			for (i = rowStart + laneId; i < rowEnd; i +=
+					THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		}
+		/*scale each lane's partial sum by alpha, then reduce within the vector (shfl_down_64bits is defined outside this view)*/
+		sum *= alpha;
+		for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+			sum += shfl_down_64bits<T>(sum, i, THREADS_PER_VECTOR);
+		}
+
+		/*save the results and get a new row*/
+		if (laneId == 0) {
+			/*save the results, adding beta times the matching inVectorY entry*/
+			vectorY[row] = sum + beta * DOUBLE_VECTOR_GET(inVectorY, row);
+			/*NOTE(review): DOUBLE_VECTOR_GET is applied to the plain pointer inVectorY here and to vectorX (a texture object under DOUBLE_USE_TEXTURE_MEMORY) above - confirm the macro accepts both*/
+			/*get a new row index*/
+			row = atomicAdd(cudaRowCounter, 1);
+		}
+		row = __shfl(row, 0, THREADS_PER_VECTOR);
+	}/*while*/
+}
+
+template < typename T, uint32_t THREADS_PER_VECTOR, uint32_t MAX_NUM_VECTORS_PER_BLOCK> /*BLAS-style CSR SpMV kernel, vectorY = alpha * A * vectorX + beta * inVectorY: each warp claims 32 / THREADS_PER_VECTOR consecutive rows per fetch*/
+#ifdef DOUBLE_USE_TEXTURE_MEMORY
+__global__ void csr64DynamicWarpBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const cudaTextureObject_t vectorX, const T* __restrict inVectorY, T* vectorY, const T alpha, const T beta)
+#else
+__global__ void csr64DynamicWarpBLAS(uint32_t* __restrict cudaRowCounter, const uint32_t* __restrict rowOffsets, const uint32_t* __restrict colIndexValues,
+		const T* __restrict numericalValues, const T* __restrict vectorX, const T* __restrict inVectorY, T* vectorY, const T alpha, const T beta)
+#endif
+{
+	uint32_t i;
+	T sum;
+	uint32_t row;
+	uint32_t rowStart, rowEnd;
+	const uint32_t laneId = threadIdx.x % THREADS_PER_VECTOR; /*lane index in the vector*/
+	const uint32_t vectorId = threadIdx.x / THREADS_PER_VECTOR; /*vector index in the thread block*/
+	const uint32_t warpLaneId = threadIdx.x & 31;	/*lane index in the warp*/
+	const uint32_t warpVectorId = warpLaneId / THREADS_PER_VECTOR;	/*vector index in the warp*/
+	/*one [rowStart, rowEnd] slot per vector; indexed by vectorId, so blockDim.x / THREADS_PER_VECTOR must not exceed MAX_NUM_VECTORS_PER_BLOCK*/
+	__shared__ volatile uint32_t space[MAX_NUM_VECTORS_PER_BLOCK][2];
+
+	/*get the row index: warp lane 0 claims a batch of rows from the global counter*/
+	if (warpLaneId == 0) {
+		row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+	}
+	/*broadcast the value to other threads in the same warp; each vector takes the row at offset warpVectorId in the batch*/
+	row = __shfl(row, 0) + warpVectorId;
+	/*NOTE(review): __shfl is the legacy mask-less shuffle (removed for Volta+); porting needs __shfl_sync with a correct participant mask*/
+	/*check the row range (_cudaNumRows is declared outside this view)*/
+	while (row < _cudaNumRows) {
+		/*NOTE(review): with THREADS_PER_VECTOR == 1 no lane has laneId == 1, so space[vectorId][1] would never be written*/
+		/*use two threads to fetch the row offsets: lane 0 loads rowOffsets[row], lane 1 loads rowOffsets[row + 1]*/
+		if (laneId < 2) {
+			space[vectorId][laneId] = rowOffsets[row + laneId];
+		}
+		rowStart = space[vectorId][0];
+		rowEnd = space[vectorId][1];
+		/*NOTE(review): no barrier separates the shared-memory store from the loads above; presumably relies on warp-synchronous execution - confirm*/
+		/*reset the per-lane partial sum; no emptiness test is made, the loops below just do not execute for an empty row*/
+		sum = 0;
+		/*compute dot product*/
+		if (THREADS_PER_VECTOR == 32) {
+
+			/*ensure aligned memory access: round rowStart down to a multiple of THREADS_PER_VECTOR, then offset by laneId*/
+			i = rowStart - (rowStart & (THREADS_PER_VECTOR - 1)) + laneId;
+
+			/*process the unaligned part: lanes whose index falls before rowStart skip this first step*/
+			if (i >= rowStart && i < rowEnd) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+
+				/*process the aligned part*/
+			for (i += THREADS_PER_VECTOR; i < rowEnd; i += THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		} else {
+			/*strided walk from rowStart, regardless of the global memory access alignment*/
+			for (i = rowStart + laneId; i < rowEnd; i +=
+					THREADS_PER_VECTOR) {
+				sum += numericalValues[i] * DOUBLE_VECTOR_GET(vectorX, colIndexValues[i]);
+			}
+		}
+
+		/*scale each lane's partial sum by alpha, then reduce within the vector (shfl_down_64bits is defined outside this view)*/
+		sum *= alpha;
+		for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
+			sum += shfl_down_64bits<T>(sum, i, THREADS_PER_VECTOR);
+		}
+
+		/*lane 0 of each vector saves the result for its row*/
+		if (laneId == 0) {
+			/*save the results, adding beta times the matching inVectorY entry*/
+			vectorY[row] = sum + beta * DOUBLE_VECTOR_GET(inVectorY, row);
+		}
+		/*NOTE(review): DOUBLE_VECTOR_GET is applied to the plain pointer inVectorY and to vectorX (a texture object under DOUBLE_USE_TEXTURE_MEMORY) - confirm the macro accepts both*/
+		/*get a new row index: warp lane 0 claims the next batch of 32 / THREADS_PER_VECTOR rows*/
+		if(warpLaneId == 0){
+			row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
+		}
+		/*broadcast the value to other threads in the same warp*/
+		row = __shfl(row, 0) + warpVectorId;
+
+	}/*while*/
+}
+
+
+}/*namespace*/
+
+#endif /* SPMVCSR_H_ */
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Types.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Types.h
new file mode 100644
index 000000000..6aca384ff
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/Types.h
@@ -0,0 +1,59 @@
+/*
+ * Types.h
+ *
+ *  Created on: Nov 21, 2014
+ *      Author: yongchao
+ */
+
+#ifndef TYPES_H_
+#define TYPES_H_
+
+#include <cuda.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <vector>
+#include <iostream>
+using namespace std;
+
+/*program version*/
+#define VERSION "v1.0"
+
+/*macros for cuda array*/
+#if !defined(SPMV_CUDA_ARRAY_WIDTH_SHIFT) || SPMV_CUDA_ARRAY_WIDTH_SHIFT < 10 || SPMV_CUDA_ARRAY_WIDTH_SHIFT > 16
+#define SPMV_CUDA_ARRAY_WIDTH_SHIFT		15
+#endif
+#define SPMV_CUDA_ARRAY_WIDTH_MASK		((1 << SPMV_CUDA_ARRAY_WIDTH_SHIFT) - 1)
+#define SPMV_CUDA_ARRAY_WIDTH 			(1 << SPMV_CUDA_ARRAY_WIDTH_SHIFT)
+
+/*texture memory*/
+#ifdef NO_FLOAT_TEXTURE_MEMORY
+#undef FLOAT_USE_TEXTURE_MEMORY
+#else
+#define FLOAT_USE_TEXTURE_MEMORY
+#endif
+
+#ifdef NO_DOUBLE_TEXTURE_MEMORY
+#undef DOUBLE_USE_TEXTURE_MEMORY
+#else
+#define DOUBLE_USE_TEXTURE_MEMORY
+#endif
+
+/*maximum number of threads per block*/
+#define MAX_NUM_THREADS_PER_BLOCK			1024
+
+/*error check*/
+#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
+inline void __cudaCheckError(const char* file, const int32_t line) {
+	const cudaError status = cudaGetLastError();
+	if (status == cudaSuccess) return;	/*no pending error: nothing to report*/
+	/*report where the check was issued, then terminate the process*/
+	cerr << "cudaCheckError() failed at " << file << ":" << line << " : "
+			<< cudaGetErrorString(status) << endl;
+	exit(-1);
+}
+
+#endif /* TYPES_H_ */
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/main.cu b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/main.cu
new file mode 100644
index 000000000..0345d0b39
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/main.cu
@@ -0,0 +1,99 @@
+/*
+ * main.cu
+ *
+ *  Created on: Nov 21, 2014
+ *      Author: yongchao
+ */
+#include "Options.h"
+#include "SpMV.h"
+
+int32_t main(int32_t argc, char* argv[]) {
+	Options opt;
+	float runtime;
+	double gflops;
+	int32_t numIters;
+	/*benchmark driver: parse options, build the selected SpMV object, time numIters kernel launches, report runtime and GFLOPS*/
+	/*parse the parameters*/
+	if (!opt.parseArgs(argc, argv)) {
+		return -1;
+	}
+	numIters = opt._numIters;
+
+	/*select the SpMV implementation by precision and routine number (0: vector, 1: warp)*/
+	SpMV* spmv;
+	if (opt._singlePrecision) {
+		switch (opt._routine) {
+		case 0:
+			spmv = new SpMVFloatVector(&opt);
+			break;
+		case 1:
+			spmv = new SpMVFloatWarp(&opt);
+			break;
+		default:
+			cerr << "Error: unsupported routine number for FLOAT" << endl;
+			return -1;
+		}
+	} else {
+		switch (opt._routine) {
+		case 0:
+			spmv = new SpMVDoubleVector(&opt);
+			break;
+		case 1:
+			spmv = new SpMVDoubleWarp(&opt);
+			break;
+		default:
+			cerr << "Error: unsupported routine number for DOUBLE" << endl;
+			return -1;
+		}
+	}
+
+	/*set device cache; NOTE(review): _routine is 0 or 1 past the switches above, so the PreferShared branch looks unreachable*/
+	if (opt._routine == 2) {
+		cudaDeviceSetCacheConfig (cudaFuncCachePreferShared);
+	} else {
+		cudaDeviceSetCacheConfig (cudaFuncCachePreferL1);
+	}
+	/*report the precision in use*/
+	if (opt._singlePrecision) {
+		cerr << "Use single-precision floating point" << endl;
+	} else {
+		cerr << "Use double-precision floating point" << endl;
+	}
+
+	/*print out the statistical information of the sparse matrix*/
+	opt.getRowSizeVariance();
+
+	/*load the data*/
+	spmv->loadData();
+
+	/*run the kernel numIters times between the two timestamps*/
+	double stime = spmv->getSysTime();
+	for (int32_t i = 0; i < numIters; ++i) {
+		spmv->spmvKernel();
+	}
+	/*synchronize all kernels (the return value is not checked)*/
+	cudaDeviceSynchronize();
+	double etime = spmv->getSysTime();
+	/*average time per iteration; the 1000.0 divisor presumably converts milliseconds to seconds - confirm against getSysTime()*/
+	runtime = etime - stime;
+	runtime /= 1000.0 * (float) numIters;
+	cerr << "Average runtime: " << runtime << " seconds (in " << numIters
+			<< " iterations)" << endl;
+
+	/*compute the GFLOPS: gflops first holds the FLOP count chosen by opt._formula, then is divided by runtime * 1e9*/
+	gflops =
+			opt._formula == 0 ?
+					2 * opt._numValues - 1 :
+					2 * (opt._numValues + opt._numRows);
+	cerr << "Total FLOPs: " << (uint64_t) gflops << endl;
+	gflops /= runtime * 1000000000;
+	cerr << "GFLOPS: " << gflops << endl;
+
+	/*store the data*/
+	spmv->storeData();
+
+	/*release the SpMV object*/
+	delete spmv;
+
+	return 0;
+}
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h
new file mode 100644
index 000000000..9daebfe38
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h
@@ -0,0 +1,157 @@
+/***************************************************************************
+                          LightSpMVBenchmark.h  -  description
+                             -------------------
+    begin                : Apr 23, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+/***
+ * Wrapper of original LightSpMV kernels for TNL benchmarks.
+ */
+
+#include <stdexcept>
+#ifdef HAVE_CUDA
+#pragma push
+#pragma diag_suppress = 1444
+#include "LightSpMV-1.0/SpMV.h"
+#include "LightSpMV-1.0/SpMV.cu"
+#include "LightSpMV-1.0/SpMVCSR.cu"
+#pragma pop
+#endif
+#include <TNL/Matrices/SparseMatrix.h>
+
+namespace TNL {
+
+enum LightSpMVBenchmarkKernelType { LightSpMVBenchmarkKernelVector, LightSpMVBenchmarkKernelWarp };
+
+template< typename Real1, typename Real2 > // primary template: the view's element type and the raw pointer's type differ
+struct LightSpMVVectorsBinder
+{
+   template< typename Index >
+   static void bind( TNL::Containers::VectorView< Real1, TNL::Devices::Cuda, Index >& vectorView, Real2* data, Index size ){}; // intentionally a no-op
+};
+
+template< typename Real > // specialization: element types match, so the view can be bound
+struct LightSpMVVectorsBinder< Real, Real >
+{
+   template< typename Index >
+   static void bind( TNL::Containers::VectorView< Real, TNL::Devices::Cuda, Index >& vectorView, Real* data, Index size )
+   {
+      vectorView.bind( data, size ); // forwards to VectorView::bind with the given pointer and size
+   }
+};
+
+template< typename Real > // Runs the original LightSpMV kernels on a host CSR matrix for the TNL SpMV benchmark.
+struct LightSpMVBenchmark
+{
+   using RealType = Real;
+   using DeviceType = TNL::Devices::Host;
+   using IndexType = uint32_t;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+   using CudaVectorView = TNL::Containers::VectorView< RealType, TNL::Devices::Cuda, IndexType >;
+   // Builds the LightSpMV object for Real / kernelType, calls loadData() and binds the CUDA views to its _vectorX[ 0 ] / _vectorY[ 0 ].
+   template< typename Matrix >
+   LightSpMVBenchmark( Matrix& matrix, LightSpMVBenchmarkKernelType kernelType )
+   : inVector( matrix.getColumns(), 1.0 ),
+     outVector( matrix.getRows(), 0.0 ),
+     kernelType( kernelType )
+   {
+      static_assert( std::is_same< typename Matrix::DeviceType, TNL::Devices::Host >::value, "The only device type accepted here is TNL::Devices::Host." );
+#ifdef HAVE_CUDA
+      Options opt;
+      opt._numRows = matrix.getRows();
+      opt._numCols = matrix.getColumns();
+      opt._rowOffsets = matrix.getRowPointers().getData();
+      opt._numValues = matrix.getValues().getSize();
+      opt._colIndexValues = matrix.getColumnIndexes().getData();
+      opt._numericalValues = matrix.getValues().getData();;
+      opt._alpha = 1.0; // matrix multiplicator
+      opt._beta = 0.0;  // output vector multiplicator
+      opt._vectorX = inVector.getData();
+      opt._vectorY = outVector.getData();
+      if( std::is_same< Real, float >::value )
+      {
+         if( kernelType == LightSpMVBenchmarkKernelVector )
+            this->spmv = new SpMVFloatVector( &opt );
+         else
+            this->spmv = new SpMVFloatWarp( &opt );
+      }
+      else if( std::is_same< Real, double >::value )
+      {
+         if( kernelType == LightSpMVBenchmarkKernelVector )
+            this->spmv = new SpMVDoubleVector( &opt );
+         else
+            this->spmv = new SpMVDoubleWarp( &opt );
+      }
+      else throw std::runtime_error( "Unknown real type for LightSpMV." );
+      this->spmv->loadData();
+      if( std::is_same< Real, float >::value ) // both branches are compiled for every Real; the binder is a no-op when its two types differ
+      {
+         if( kernelType == LightSpMVBenchmarkKernelVector )
+         {
+            SpMVFloatVector* floatSpMV = dynamic_cast< SpMVFloatVector* >( this->spmv );
+            LightSpMVVectorsBinder< Real, float >::bind( this->inVectorView, floatSpMV->_vectorX[ 0 ], matrix.getColumns() );
+            LightSpMVVectorsBinder< Real, float >::bind( this->outVectorView, floatSpMV->_vectorY[ 0 ], matrix.getRows() );
+         }
+         else
+         {
+            SpMVFloatVector* floatSpMV = dynamic_cast< SpMVFloatWarp* >( this->spmv );
+            LightSpMVVectorsBinder< Real, float >::bind( this->inVectorView, floatSpMV->_vectorX[ 0 ], matrix.getColumns() );
+            LightSpMVVectorsBinder< Real, float >::bind( this->outVectorView, floatSpMV->_vectorY[ 0 ], matrix.getRows() );
+         }
+      }
+      else if( std::is_same< Real, double >::value )
+      {
+         if( kernelType == LightSpMVBenchmarkKernelVector )
+         {
+            SpMVDoubleVector* doubleSpMV = dynamic_cast< SpMVDoubleVector* >( this->spmv );
+            LightSpMVVectorsBinder< Real, double >::bind( this->inVectorView, doubleSpMV->_vectorX[ 0 ], matrix.getColumns() );
+            LightSpMVVectorsBinder< Real, double >::bind( this->outVectorView, doubleSpMV->_vectorY[ 0 ], matrix.getRows() );
+         }
+         else
+         {
+            SpMVDoubleVector* doubleSpMV = dynamic_cast< SpMVDoubleWarp* >( this->spmv );
+            LightSpMVVectorsBinder< Real, double >::bind( this->inVectorView, doubleSpMV->_vectorX[ 0 ], matrix.getColumns() );
+            LightSpMVVectorsBinder< Real, double >::bind( this->outVectorView, doubleSpMV->_vectorY[ 0 ], matrix.getRows() );
+         }
+      }
+      else throw std::runtime_error( "Unknown real type for LightSpMV." ); // guard only: the identical check above already throws
+#endif
+   }
+   // Refills the bound CUDA views: input with ones, output with zeros.
+   void resetVectors()
+   {
+      this->inVectorView = 1.0;
+      this->outVectorView = 0.0;
+   }
+
+   void vectorProduct()
+   {
+      this->spmv->invokeKernel( 0 );
+   }
+
+   const CudaVectorView& getCudaOutVector()
+   {
+      return this->outVectorView;
+   }
+   // Releases the LightSpMV object created in the constructor.
+   ~LightSpMVBenchmark()
+   {
+#ifdef HAVE_CUDA
+      if( spmv ) delete spmv;
+#endif
+   }
+
+   protected:
+#ifdef HAVE_CUDA
+      SpMV* spmv = nullptr;
+#endif
+      VectorType  inVector, outVector;
+      CudaVectorView inVectorView, outVectorView; // bound in the constructor when HAVE_CUDA is defined
+      LightSpMVBenchmarkKernelType kernelType = LightSpMVBenchmarkKernelVector;
+};
+
+} // namespace TNL
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 49dbf4264..da0d795d5 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <cstdint>
+
 #include "../Benchmarks.h"
 #include "SpmvBenchmarkResult.h"
 
@@ -46,6 +48,7 @@ using namespace TNL::Matrices;
 
 #include <Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrix.h>
 #include <Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrixLegacy.h>
+#include <Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h>
 
 namespace TNL {
    namespace Benchmarks {
@@ -383,6 +386,7 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    using CSRHostMatrix = SpMV::ReferenceFormats::Legacy::CSR< Real, Devices::Host, int >;
    using CSRCudaMatrix = SpMV::ReferenceFormats::Legacy::CSR< Real, Devices::Cuda, int >;
    using CusparseMatrix = TNL::CusparseCSRLegacy< Real >;
+   using LightSpMVCSRHostMatrix = SpMV::ReferenceFormats::Legacy::CSR< Real, Devices::Host, uint32_t >;
 #else
    // Here we use 'int' instead of 'Index' because of compatibility with cusparse.
    using CSRHostMatrix = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host, int >;
@@ -428,10 +432,10 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    SpmvBenchmarkResult< Real, Devices::Host, int > csrBenchmarkResults( hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Cuda >( resetHostVectors, "CPU", spmvCSRHost, csrBenchmarkResults );
 
+#ifdef HAVE_CUDA
    ////
    // Perform benchmark on CUDA device with cuSparse as a reference GPU format
    //
-#ifdef HAVE_CUDA
    benchmark.setMetadataColumns( Benchmark::MetadataColumns({
          { "matrix name", convertToString( inputFileName ) },
          { "rows", convertToString( csrHostMatrix.getRows() ) },
@@ -445,26 +449,45 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    CSRCudaMatrix csrCudaMatrix;
    csrCudaMatrix = csrHostMatrix;
 
-   // Delete the CSRhostMatrix, so it doesn't take up unnecessary space
-   csrHostMatrix.reset();
-
    CusparseMatrix cusparseMatrix;
    cusparseMatrix.init( csrCudaMatrix, &cusparseHandle );
 
-   CudaVector cusparseInVector( csrCudaMatrix.getColumns() ), cusparseOutVector( csrCudaMatrix.getRows() );
+   CudaVector cudaInVector( csrCudaMatrix.getColumns() ), cudaOutVector( csrCudaMatrix.getRows() );
 
    auto resetCusparseVectors = [&]() {
-      cusparseInVector = 1.0;
-      cusparseOutVector = 0.0;
+      cudaInVector = 1.0;
+      cudaOutVector = 0.0;
    };
 
    auto spmvCusparse = [&]() {
-       cusparseMatrix.vectorProduct( cusparseInVector, cusparseOutVector );
+       cusparseMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
 
    SpmvBenchmarkResult< Real, Devices::Host, int > cusparseBenchmarkResults( hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, cusparseBenchmarkResults );
    csrCudaMatrix.reset();
+
+   ////
+   // Perform benchmark on CUDA device with LightSpMV as a reference GPU format
+   //
+   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      { "matrix name", convertToString( inputFileName ) },
+      { "rows", convertToString( csrHostMatrix.getRows() ) },
+      { "columns", convertToString( csrHostMatrix.getColumns() ) },
+      { "matrix format", String( "LightSpMV" ) }
+   } ));
+
+   LightSpMVCSRHostMatrix lightSpMVCSRHostMatrix;
+   lightSpMVCSRHostMatrix = csrHostMatrix;
+   LightSpMVBenchmark< Real > lightSpMVBenchmark( lightSpMVCSRHostMatrix, LightSpMVBenchmarkKernelVector );
+   auto resetLightSpMVVectors = [&]() {
+      lightSpMVBenchmark.resetVectors();
+   };
+
+   auto spmvLightSpMV = [&]() {
+       lightSpMVBenchmark.vectorProduct();
+   };
+   benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cusparseBenchmarkResults );
 #endif
    csrHostMatrix.reset();
 
-- 
GitLab


From 1d894e615107cf2eb67c048de972c90899d78355 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 27 Apr 2021 15:44:48 +0200
Subject: [PATCH 042/117] Fixed LightSpMV benchmark.

---
 .../ReferenceFormats/LightSpMVBenchmark.h     | 20 +++++++++++++++---
 src/Benchmarks/SpMV/spmv.h                    | 21 ++++++++++++++-----
 src/Benchmarks/scripts/run-tnl-benchmark-spmv | 18 ++++++++--------
 3 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h
index 9daebfe38..221fc6274 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h
@@ -61,17 +61,21 @@ struct LightSpMVBenchmark
    {
       static_assert( std::is_same< typename Matrix::DeviceType, TNL::Devices::Host >::value, "The only device type accepted here is TNL::Devices::Host." );
 #ifdef HAVE_CUDA
-      Options opt;
+      cudaDeviceProp prop;
+      cudaGetDeviceProperties(&prop, 0);
+      opt._gpus.push_back(make_pair(0, prop));
+      opt._numGPUs = 1;
       opt._numRows = matrix.getRows();
       opt._numCols = matrix.getColumns();
       opt._rowOffsets = matrix.getRowPointers().getData();
       opt._numValues = matrix.getValues().getSize();
       opt._colIndexValues = matrix.getColumnIndexes().getData();
-      opt._numericalValues = matrix.getValues().getData();;
+      opt._numericalValues = matrix.getValues().getData();
       opt._alpha = 1.0; // matrix multiplicator
       opt._beta = 0.0;  // output vector multiplicator
       opt._vectorX = inVector.getData();
       opt._vectorY = outVector.getData();
+      opt._formula = 0;
       if( std::is_same< Real, float >::value )
       {
          if( kernelType == LightSpMVBenchmarkKernelVector )
@@ -130,7 +134,11 @@ struct LightSpMVBenchmark
 
    void vectorProduct()
    {
-      this->spmv->invokeKernel( 0 );
+#ifdef HAVE_CUDA
+      this->spmv->spmvKernel();
+      cudaDeviceSynchronize();
+#endif
+
    }
 
    const CudaVectorView& getCudaOutVector()
@@ -142,11 +150,17 @@ struct LightSpMVBenchmark
    {
 #ifdef HAVE_CUDA
       if( spmv ) delete spmv;
+      opt._rowOffsets = nullptr;
+      opt._colIndexValues = nullptr;
+      opt._numericalValues = nullptr;
+      opt._vectorX = nullptr;
+      opt._vectorY = nullptr;
 #endif
    }
 
    protected:
 #ifdef HAVE_CUDA
+      Options opt;
       SpMV* spmv = nullptr;
 #endif
       VectorType  inVector, outVector;
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index da0d795d5..113c1c504 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -38,9 +38,14 @@
 #include <TNL/Algorithms/Segments/ChunkedEllpack.h>
 #include <TNL/Algorithms/Segments/BiEllpack.h>
 
+// Comment the following to turn off some groups of SpMV benchmarks and speed-up the compilation
+#define WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES
+#define WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
+#define WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
+
 // Uncomment the following line to enable benchmarking the sandbox sparse matrix.
-//#define WITH_SANDBOX_MATRIX_BENCHMARK
-#ifdef WITH_SANDBOX_MATRIX_BENCHMARK
+//#define WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
+#ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
 #include <TNL/Matrices/Sandbox/SparseSandboxMatrix.h>
 #endif
 
@@ -132,7 +137,7 @@ using BiEllpackSegments = Algorithms::Segments::BiEllpack< Device, Index, IndexA
 template< typename Real, typename Device, typename Index >
 using SymmetricSparseMatrix_BiEllpack = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, BiEllpackSegments >;
 
-#ifdef WITH_SANDBOX_MATRIX_BENCHMARK
+#ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
 template< typename Real, typename Device, typename Index >
 using SparseSandboxMatrix = Matrices::Sandbox::SparseSandboxMatrix< Real, Device, Index, Matrices::GeneralMatrix >;
 #endif
@@ -491,6 +496,7 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
 #endif
    csrHostMatrix.reset();
 
+#ifdef WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
    /////
    // Benchmarking of TNL legacy formats
    //
@@ -515,7 +521,9 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    }
    // AdEllpack is broken
    //benchmarkSpMV< Real, Matrices::AdEllpack              >( benchmark, hostOutVector, inputFileName, verboseMR );
+#endif
 
+#ifdef WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES
    /////
    // Benchmarking TNL formats
    //
@@ -530,11 +538,13 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack               >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_BiEllpack                    >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-#ifdef WITH_SANDBOX_MATRIX_BENCHMARK
+#ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
    benchmarkSpMV< Real, HostMatrixType, SparseSandboxMatrix                       >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
 #endif
    hostMatrix.reset();
+#endif
 
+#ifdef WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
    /////
    // Benchmarking symmetric sparse matrices
    //
@@ -556,7 +566,7 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
       TNL::Matrices::MatrixReader< InputMatrix >::readMtx( inputFileName, hostMatrix, verboseMR );
       if( hostMatrix != symmetricHostMatrix )
       {
-         std::cerr << "ERROR !!!!!! " << std::endl;
+         std::cerr << "ERROR: Symmetric matrices do not match !!!" << std::endl;
       }
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
@@ -567,6 +577,7 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack               >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack                    >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
    }
+#endif
 }
 
 } // namespace SpMVLegacy
diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
index 88b4d70d0..adccbb0aa 100755
--- a/src/Benchmarks/scripts/run-tnl-benchmark-spmv
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-                
+
 DEBUG="no"
 STOP_TIME="1"
 export CUDA_PROFILE=0
@@ -15,7 +15,7 @@ PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl"
 #source matrix-market
 source florida-matrix-market
 
-# !!!Matrices in MatrixMarket2 don't load properly, formatting issues with every file. MatrixReader fails. 
+# !!!Matrices in MatrixMarket2 don't load properly, formatting issues with every file. MatrixReader fails.
 #for link in $MM_MATRICES;
 #do
 #   echo "======================================================================================================"
@@ -24,9 +24,9 @@ source florida-matrix-market
 #   if test ! -e $matrix;
 #   then
 #      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
-#      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log            
+#      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log
 #   else
-#      gunzip -c ${matrix} > ${unzipped_matrix}      
+#      gunzip -c ${matrix} > ${unzipped_matrix}
 #      echo "Benchmarking with the matrix $unzipped_matrix ..."
 #      export CUDA_PROFILE_LOG=$unzipped_matrix.float.log
 #      if test x$DEBUG = xyes;
@@ -35,7 +35,7 @@ source florida-matrix-market
 #      else
 #         $BENCHMARK --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
 #      fi
-#      #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log          
+#      #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log
 #   fi
 #done
 
@@ -43,7 +43,7 @@ for link in $FLORIDA_MM_MATRICES;
 do
    matrix=matrices`echo $link | sed 's/http:\/\/www.cise.ufl.edu\/research\/sparse//'`
    if test ! -e $matrix;
-   then      
+   then
       echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
       #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log
    else
@@ -59,14 +59,14 @@ do
      SUBDIRNAME=`echo $FILENAME | sed 's/.tar.gz//'`
      rm -f $DIRNAME/$SUBDIRNAME/*_b.mtx # these are usualy in array format
      for file in $DIRNAME/$SUBDIRNAME/*.mtx;
-     do        
+     do
          echo "======================================================================================================"
          echo "Benchmarking with the matrix $file ..."
 	 mtx_file_name=`basename $file`
-	 mtx_file_name=${mtx_file_name%.mtx}	 
+	 mtx_file_name=${mtx_file_name%.mtx}
          if test x$DEBUG = xyes;
          then
-            gdb --args $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark.log --output-mode append --verbose 1
+            gdb --args ${BENCHMARK_DBG} --input-file $file --log-file log-files/sparse-matrix-benchmark.log --output-mode append --verbose 1
          else
             $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark.log --output-mode append --verbose 1
          fi
-- 
GitLab


From 751c3d1cd3a3f29f794b82606e1c2380b804dfa2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 29 Apr 2021 15:23:47 +0200
Subject: [PATCH 043/117] Added CSR5 to SpMV benchmark - it does not work yet.

---
 src/Benchmarks/SpMV/CMakeLists.txt            |  16 ++-
 .../SpMV/ReferenceFormats/CSR5Benchmark.h     | 136 ++++++++++++++++++
 src/Benchmarks/SpMV/cmake/BuildCSR5.cmake     |  28 ++++
 src/Benchmarks/SpMV/cmake/CSR5.cmake.in       |  24 ++++
 src/Benchmarks/SpMV/spmv.h                    |  28 +++-
 5 files changed, 228 insertions(+), 4 deletions(-)
 create mode 100644 src/Benchmarks/SpMV/ReferenceFormats/CSR5Benchmark.h
 create mode 100644 src/Benchmarks/SpMV/cmake/BuildCSR5.cmake
 create mode 100644 src/Benchmarks/SpMV/cmake/CSR5.cmake.in

diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt
index 6af696534..9976c19c8 100644
--- a/src/Benchmarks/SpMV/CMakeLists.txt
+++ b/src/Benchmarks/SpMV/CMakeLists.txt
@@ -1,8 +1,22 @@
+# CSR5 does not work properly yet:
+#
+# https://github.com/weifengliu-ssslab/Benchmark_SpMV_using_CSR5/issues/9
+# https://github.com/weifengliu-ssslab/Benchmark_SpMV_using_CSR5/issues/10
+#
+# We can build it with TNL but it crashes with many CUDA errors. We should first check it
+# with the original build.
+#
+#include( cmake/BuildCSR5.cmake )
+
 if( BUILD_CUDA )
-    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu )
+    cuda_include_directories( ${CXX_BENCHMARKS_INCLUDE_DIRS} )
+    message( STATUS ${CXX_BENCHMARKS_FLAGS} )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu OPTIONS ${CXX_BENCHMARKS_FLAGS} )
     TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} )
 else()
     ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp )
+    target_compile_options( tnl-benchmark-spmv  PRIVATE ${CXX_BENCHMARKS_FLAGS} )
+    target_include_directories( tnl-benchmark-spmv PRIVATE ${CXX_BENCHMARKS_INCLUDE_DIRS} )
 endif()
 
 install( TARGETS tnl-benchmark-spmv RUNTIME DESTINATION bin )
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/CSR5Benchmark.h b/src/Benchmarks/SpMV/ReferenceFormats/CSR5Benchmark.h
new file mode 100644
index 000000000..8cfd8f453
--- /dev/null
+++ b/src/Benchmarks/SpMV/ReferenceFormats/CSR5Benchmark.h
@@ -0,0 +1,136 @@
+/***************************************************************************
+                          CSR5Benchmark.h  -  description
+                             -------------------
+    begin                : Apr 23, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+/***
+ * Wrapper of original CSR5 kernels for TNL benchmarks.
+ */
+
+#include <stdexcept>
+
+
+namespace TNL {
+/////
+// Currently CSR5 for CUDA cannot be built because of a conflict of atomicAdd for the `double` type:
+//   https://github.com/weifengliu-ssslab/Benchmark_SpMV_using_CSR5/issues/9
+// The solution is to wrap the whole benchmark in a separate namespace. In this case, however,
+// CSR5 does not work with `float`. So far, this seems to be the best solution.
+namespace CSR5Benchmark {
+
+#ifdef HAVE_CSR5
+#include <CSR5_cuda/anonymouslib_cuda.h>
+#endif
+
+#ifdef HAVE_CSR5
+template< typename CsrMatrix,
+          typename Real = typename CsrMatrix::RealType >
+struct CSR5SpMVCaller // Generic case: forwards a single SpMV call to the CSR5 handle.
+{
+   static_assert( std::is_same< typename CsrMatrix::DeviceType, TNL::Devices::Cuda >::value, "Only CUDA device is allowed for CSR matrix for CSR5 benchmark." );
+   using RealType = typename CsrMatrix::RealType;
+   using DeviceType = TNL::Devices::Cuda;
+   using IndexType = typename CsrMatrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+   using VectorView = typename VectorType::ViewType;
+   using CSR5Type = anonymouslibHandle< IndexType, typename std::make_unsigned< IndexType >::type, RealType >;
+   // Calls csr5.spmv() with the data pointer of outVector as the output buffer.
+   static void spmv( CSR5Type& csr5, VectorView& outVector ) {
+      csr5.spmv( ( RealType ) 1.0, outVector.getData() ); // 1.0 is presumably the scaling factor of the product - TODO confirm against the CSR5 API
+   };
+};
+
+template< typename CsrMatrix >
+struct CSR5SpMVCaller< CsrMatrix, float > // Specialization selected when Real is float: spmv() does nothing.
+{
+   static_assert( std::is_same< typename CsrMatrix::DeviceType, TNL::Devices::Cuda >::value, "Only CUDA device is allowed for CSR matrix for CSR5 benchmark." );
+   using RealType = typename CsrMatrix::RealType;
+   using DeviceType = TNL::Devices::Cuda;
+   using IndexType = typename CsrMatrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+   using VectorView = typename VectorType::ViewType;
+   using CSR5Type = anonymouslibHandle< IndexType, typename std::make_unsigned< IndexType >::type, RealType >;
+   // Same signature as the generic caller, but outVector is left untouched.
+   static void spmv( CSR5Type& csr5, VectorView& outVector )
+   {
+      // Intentionally a no-op, the CSR5 call is disabled for float: csr5.spmv( ( RealType ) 1.0, outVector.getData() );
+   };
+};
+#endif
+
+
+template< typename CsrMatrix >
+struct CSR5Benchmark // Adapter that lets the TNL SpMV benchmark drive the CSR5 library; without HAVE_CSR5 every CSR5 call is compiled out.
+{
+   static_assert( std::is_same< typename CsrMatrix::DeviceType, TNL::Devices::Cuda >::value, "Only CUDA device is allowed for CSR matrix for CSR5 benchmark." );
+   using RealType = typename CsrMatrix::RealType;
+   using DeviceType = TNL::Devices::Cuda;
+   using IndexType = typename CsrMatrix::IndexType;
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+   using VectorView = typename VectorType::ViewType;
+#ifdef HAVE_CSR5
+   using CSR5Type = anonymouslibHandle< IndexType, typename std::make_unsigned< IndexType >::type, RealType >;
+#endif
+   // Stores views of both vectors; with HAVE_CSR5 it also creates the CSR5 handle and feeds it the CSR data of the matrix.
+   CSR5Benchmark( CsrMatrix& matrix, VectorType& inVector, VectorType& outVector )
+   :
+#ifdef HAVE_CSR5
+   csr5( matrix.getRows(), matrix.getColumns() ),
+#endif
+     inVectorView( inVector ), outVectorView( outVector )
+   {
+#ifdef HAVE_CSR5
+      // Pass the CSR arrays of the matrix to CSR5: number of stored values, row pointers, column indexes, values.
+      // NOTE(review): the original CSR5 sample stores the result of inputCSR() in `err` and prints it; it is not checked here.
+      this->csr5.inputCSR( matrix.getValues().getSize(),
+                           matrix.getRowPointers().getData(),
+                           matrix.getColumnIndexes().getData(),
+                           matrix.getValues().getData() );
+
+      // Register the input vector with CSR5 (the original sample notes: "you only need to do it once!").
+      // NOTE(review): as above, the result of setX() is not checked.
+      this->csr5.setX( inVector.getData() );
+
+      this->csr5.setSigma(ANONYMOUSLIB_AUTO_TUNED_SIGMA);
+
+      // Warm up the device.
+      this->csr5.warmup();
+
+      // Presumably converts the stored CSR data to the CSR5 format - TODO confirm against the CSR5 library.
+      this->csr5.asCSR5();
+#endif
+   }
+   // Runs one SpMV through CSR5SpMVCaller (a no-op when RealType is float); does nothing without HAVE_CSR5.
+   void vectorProduct()
+   {
+#ifdef HAVE_CSR5
+      CSR5SpMVCaller< CsrMatrix >::spmv( this->csr5, outVectorView );
+#endif
+   }
+   // Returns the view of the output vector that was passed to the constructor.
+   const VectorView& getCudaOutVector()
+   {
+      return this->outVectorView;
+   }
+   // Calls destroy() on the CSR5 handle (only with HAVE_CSR5).
+   ~CSR5Benchmark()
+   {
+#ifdef HAVE_CSR5
+      this->csr5.destroy();
+#endif
+   }
+
+   protected:
+#ifdef HAVE_CSR5
+      CSR5Type csr5;
+#endif
+      VectorView inVectorView, outVectorView; // views of the vectors given to the constructor
+};
+
+   } // namespace CSR5Benchmark
+} // namespace TNL
diff --git a/src/Benchmarks/SpMV/cmake/BuildCSR5.cmake b/src/Benchmarks/SpMV/cmake/BuildCSR5.cmake
new file mode 100644
index 000000000..51a4a0be3
--- /dev/null
+++ b/src/Benchmarks/SpMV/cmake/BuildCSR5.cmake
@@ -0,0 +1,28 @@
+# compatibility with the CSR5 package
+
+set( CUDA_SAMPLES_DIR $ENV{CUDA_SAMPLES_DIR} )
+if( NOT DEFINED CUDA_SAMPLES_DIR )
+    message( WARNING "CUDA_SAMPLES_DIR variable was not set and it is required by CSR5 benchmark - CSR5 benchmark is disabled.")
+else()
+    # Download and unpack CSR5 at configure time
+    message( STATUS "CUDA_SAMPLES_DIR set to ${CUDA_SAMPLES_DIR}")
+    configure_file(cmake/CSR5.cmake.in csr5-download/CMakeLists.txt)
+    execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+    RESULT_VARIABLE result
+    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/src/Benchmarks/SpMV/csr5-download )
+    if(result)
+        message(WARNING "CMake step for CSR5 failed: ${result}")
+    else()
+        execute_process(COMMAND ${CMAKE_COMMAND} --build .
+            RESULT_VARIABLE result
+            WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/src/Benchmarks/SpMV/csr5-download )
+        if(result)
+            message( ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR}/src/Benchmarks/SpMV/csr5-download )
+            message(WARNING "Build step for CSR5 failed: ${result}")
+        else()
+            set( CXX_BENCHMARKS_FLAGS ${CXX_BENCHMARKS_FLAGS} "-DHAVE_CSR5" )
+            set( CXX_BENCHMARKS_INCLUDE_DIRS ${CXX_BENCHMARKS_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/Benchmarks/SpMV/csr5-src ${CUDA_SAMPLES_DIR}/common/inc)
+            message( STATUS "CSR5 build was succesfull.")
+        endif()
+    endif()
+endif()
diff --git a/src/Benchmarks/SpMV/cmake/CSR5.cmake.in b/src/Benchmarks/SpMV/cmake/CSR5.cmake.in
new file mode 100644
index 000000000..14a0c61ae
--- /dev/null
+++ b/src/Benchmarks/SpMV/cmake/CSR5.cmake.in
@@ -0,0 +1,24 @@
+# vim: ft=cmake
+
+# This is a separate template for CMakeLists.txt to download CSR5 as a separate project
+
+cmake_minimum_required(VERSION 2.8.2)
+
+project(csr5-download NONE)
+
+include(ExternalProject)
+ExternalProject_Add(csr5
+  GIT_REPOSITORY    https://github.com/weifengliu-ssslab/Benchmark_SpMV_using_CSR5.git
+  #GIT_TAG           master
+  # build from a stable branch instead of master (which gets broken pretty often)
+  #GIT_TAG           v1.10.x
+  SOURCE_DIR        "${CMAKE_BINARY_DIR}/src/Benchmarks/SpMV/csr5-src"
+  BINARY_DIR        "${CMAKE_BINARY_DIR}/src/Benchmarks/SpMV/csr5-build"
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+  TEST_COMMAND      ""
+  # Disable update of the external project in an offline build
+  # reference: https://stackoverflow.com/a/40423683
+  UPDATE_DISCONNECTED ${OFFLINE_BUILD}
+)
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 113c1c504..d07a0f6bc 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -39,9 +39,9 @@
 #include <TNL/Algorithms/Segments/BiEllpack.h>
 
 // Comment the following to turn off some groups of SpMV benchmarks and speed-up the compilation
-#define WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES
-#define WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
-#define WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
+//#define WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES
+//#define WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
+//#define WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
 
 // Uncomment the following line to enable benchmarking the sandbox sparse matrix.
 //#define WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
@@ -54,6 +54,7 @@ using namespace TNL::Matrices;
 #include <Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrix.h>
 #include <Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrixLegacy.h>
 #include <Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h>
+#include <Benchmarks/SpMV/ReferenceFormats/CSR5Benchmark.h>
 
 namespace TNL {
    namespace Benchmarks {
@@ -470,7 +471,28 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
 
    SpmvBenchmarkResult< Real, Devices::Host, int > cusparseBenchmarkResults( hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, cusparseBenchmarkResults );
+
+#ifdef HAVE_CSR5
+   ////
+   // Perform benchmark on CUDA device with CSR5 as a reference GPU format
+   //
+   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      { "matrix name", convertToString( inputFileName ) },
+      { "rows", convertToString( csrHostMatrix.getRows() ) },
+      { "columns", convertToString( csrHostMatrix.getColumns() ) },
+      { "matrix format", String( "CSR5" ) }
+   } ));
+
+   CudaVector cudaOutVector2( cudaOutVector );
+   CSR5Benchmark::CSR5Benchmark< CSRCudaMatrix > csr5Benchmark( csrCudaMatrix, cudaInVector, cudaOutVector );
+
+   auto csr5SpMV = [&]() {
+       csr5Benchmark.vectorProduct();
+   };
+   benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", csr5SpMV, cusparseBenchmarkResults );
+   std::cerr << "CSR5 error = " << max( abs( cudaOutVector - cudaOutVector2 ) ) << std::endl;
    csrCudaMatrix.reset();
+#endif
 
    ////
    // Perform benchmark on CUDA device with LightSpMV as a reference GPU format
-- 
GitLab


From 92bb20c095d2a501fdbb4e1331e40a7234416f73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 29 Apr 2021 16:55:58 +0200
Subject: [PATCH 044/117] Added LightSpMV Warp kernel benchmark.

---
 .../ReferenceFormats/LightSpMVBenchmark.h     |  5 +++
 src/Benchmarks/SpMV/spmv.h                    | 33 ++++++++++++-------
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h      |  2 +-
 3 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h
index 221fc6274..7d6ffde49 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMVBenchmark.h
@@ -126,6 +126,11 @@ struct LightSpMVBenchmark
 #endif
    }
 
+   void setKernelType( LightSpMVBenchmarkKernelType type )
+   {
+      this->kernelType = type;
+   }
+
    void resetVectors()
    {
       this->inVectorView = 1.0;
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index d07a0f6bc..991c6b56c 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -39,9 +39,9 @@
 #include <TNL/Algorithms/Segments/BiEllpack.h>
 
 // Comment the following to turn off some groups of SpMV benchmarks and speed-up the compilation
-//#define WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES
-//#define WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
-//#define WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
+#define WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES
+#define WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
+#define WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
 
 // Uncomment the following line to enable benchmarking the sandbox sparse matrix.
 //#define WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
@@ -374,10 +374,10 @@ benchmarkSpMV( Benchmark& benchmark,
 template< typename Real = double,
           typename Index = int >
 void
-benchmarkSpmvSynthetic( Benchmark& benchmark,
-                        const String& inputFileName,
-                        const Config::ParameterContainer& parameters,
-                        bool verboseMR )
+benchmarkSpmv( Benchmark& benchmark,
+               const String& inputFileName,
+               const Config::ParameterContainer& parameters,
+               bool verboseMR )
 {
    // The following is another workaround because of a bug in nvcc versions 10 and 11.
    // If we use the current matrix formats, not the legacy ones, we get
@@ -469,8 +469,8 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
        cusparseMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
 
-   SpmvBenchmarkResult< Real, Devices::Host, int > cusparseBenchmarkResults( hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
-   benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, cusparseBenchmarkResults );
+   SpmvBenchmarkResult< Real, Devices::Host, int > cudaBenchmarkResults( hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
+   benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, cudaBenchmarkResults );
 
 #ifdef HAVE_CSR5
    ////
@@ -489,7 +489,7 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    auto csr5SpMV = [&]() {
        csr5Benchmark.vectorProduct();
    };
-   benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", csr5SpMV, cusparseBenchmarkResults );
+   benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", csr5SpMV, cudaBenchmarkResults );
    std::cerr << "CSR5 error = " << max( abs( cudaOutVector - cudaOutVector2 ) ) << std::endl;
    csrCudaMatrix.reset();
 #endif
@@ -501,7 +501,7 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
       { "matrix name", convertToString( inputFileName ) },
       { "rows", convertToString( csrHostMatrix.getRows() ) },
       { "columns", convertToString( csrHostMatrix.getColumns() ) },
-      { "matrix format", String( "LightSpMV" ) }
+      { "matrix format", String( "LightSpMV Vector" ) }
    } ));
 
    LightSpMVCSRHostMatrix lightSpMVCSRHostMatrix;
@@ -514,7 +514,16 @@ benchmarkSpmvSynthetic( Benchmark& benchmark,
    auto spmvLightSpMV = [&]() {
        lightSpMVBenchmark.vectorProduct();
    };
-   benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cusparseBenchmarkResults );
+   benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cudaBenchmarkResults );
+
+   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      { "matrix name", convertToString( inputFileName ) },
+      { "rows", convertToString( csrHostMatrix.getRows() ) },
+      { "columns", convertToString( csrHostMatrix.getColumns() ) },
+      { "matrix format", String( "LightSpMV Warp" ) }
+   } ));
+   lightSpMVBenchmark.setKernelType( LightSpMVBenchmarkKernelWarp );
+   benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cudaBenchmarkResults );
 #endif
    csrHostMatrix.reset();
 
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 9a5005de7..026ed356d 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -45,7 +45,7 @@ runSpMVBenchmarks( Benchmark & benchmark,
                            metadata );
    // Start the actual benchmark in spmv.h
    try {
-      SpMVLegacy::benchmarkSpmvSynthetic< Real >( benchmark, inputFileName, parameters, verboseMR );
+      SpMVLegacy::benchmarkSpmv< Real >( benchmark, inputFileName, parameters, verboseMR );
    }
    catch( const std::exception& ex ) {
       std::cerr << ex.what() << std::endl;
-- 
GitLab


From f3d92a665b72b0955a34821e9f4b104681b5b258 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 30 Apr 2021 16:50:38 +0200
Subject: [PATCH 045/117] Fixing cmake lists for examples.

---
 Documentation/Examples/Algorithms/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt
index c200642f5..339ab5754 100644
--- a/Documentation/Examples/Algorithms/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
@@ -21,7 +21,7 @@ if( BUILD_CUDA )
       set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
    endforeach()
 else()
-   foreach( target IN ITEMS "${COMMON_EXAMPLES} ${HOST_EXAMPLES}")
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} ${HOST_EXAMPLES})
       add_executable( ${target} ${target}.cpp )
       add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
       set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )
-- 
GitLab


From e61d838b31fa044acd9b9935358af8308d8e206b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 30 Apr 2021 16:50:59 +0200
Subject: [PATCH 046/117] Fixing matrix documentation.

---
 src/TNL/Matrices/DenseMatrix.h                | 129 ++++++++++---
 src/TNL/Matrices/DenseMatrixElement.h         |  46 ++++-
 src/TNL/Matrices/DenseMatrixView.h            | 140 ++++++++++----
 src/TNL/Matrices/LambdaMatrix.h               |  70 +++++--
 src/TNL/Matrices/LambdaMatrixElement.h        |  46 ++++-
 src/TNL/Matrices/MatrixRowViewIterator.h      |  12 ++
 src/TNL/Matrices/MultidiagonalMatrix.h        | 140 ++++++++++----
 src/TNL/Matrices/MultidiagonalMatrixElement.h |  50 +++++
 src/TNL/Matrices/MultidiagonalMatrixView.h    | 160 ++++++++++------
 src/TNL/Matrices/SparseMatrix.h               | 136 ++++++++++----
 src/TNL/Matrices/SparseMatrixElement.h        |  50 +++++
 src/TNL/Matrices/SparseMatrixView.h           | 156 +++++++++++-----
 src/TNL/Matrices/TridiagonalMatrix.h          | 174 +++++++++++------
 src/TNL/Matrices/TridiagonalMatrixView.h      | 175 ++++++++++++------
 14 files changed, 1100 insertions(+), 384 deletions(-)

diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
index 0036ba240..faf0d9649 100644
--- a/src/TNL/Matrices/DenseMatrix.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -457,7 +457,11 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value ) { ... };
+       * ```
+       *
        *  The column index repeats twice only for compatibility with sparse matrices.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -477,7 +481,11 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) { ... };
+       * ```
+       *
        *  The column index repeats twice only for compatibility with sparse matrices.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -537,10 +545,10 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) mutable { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_forRows.cpp
@@ -563,10 +571,10 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_forRows.cpp
@@ -587,10 +595,10 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) mutable { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_forRows.cpp
@@ -611,10 +619,10 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_forRows.cpp
@@ -629,8 +637,12 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowView.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -644,8 +656,12 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowView.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -680,12 +696,25 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *  It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -709,12 +738,25 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *          It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -738,12 +780,24 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on ALL matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *      The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *          It is declared as
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -765,12 +819,25 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on ALL matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
        *          The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *          It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -793,7 +860,9 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
diff --git a/src/TNL/Matrices/DenseMatrixElement.h b/src/TNL/Matrices/DenseMatrixElement.h
index e35235fd9..c9dcc3c86 100644
--- a/src/TNL/Matrices/DenseMatrixElement.h
+++ b/src/TNL/Matrices/DenseMatrixElement.h
@@ -17,17 +17,36 @@
 namespace TNL {
 namespace Matrices {
 
-
+/**
+ * \brief Accessor for dense matrix elements.
+ *
+ * \tparam Real is a type of matrix elements values.
+ * \tparam Index is a type of matrix elements column indexes.
+ */
 template< typename Real,
           typename Index >
 class DenseMatrixElement
 {
    public:
 
+      /**
+       * \brief Type of matrix elements values.
+       */
       using RealType = Real;
 
+      /**
+       * \brief Type of matrix elements column indexes.
+       */
       using IndexType = Index;
 
+      /**
+       * \brief Constructor.
+       *
+       * \param value is matrix element value.
+       * \param rowIdx is row index of the matrix element.
+       * \param columnIdx is a column index of the matrix element.
+       * \param localIdx is the column index of the matrix element as well.
+       */
       __cuda_callable__
       DenseMatrixElement( RealType& value,
                           const IndexType& rowIdx,
@@ -35,18 +54,43 @@ class DenseMatrixElement
                           const IndexType& localIdx )  // localIdx is here only for compatibility with SparseMatrixElement
       : value_( value ), rowIdx( rowIdx ), columnIdx( columnIdx ) {};
 
+      /**
+       * \brief Returns reference on matrix element value.
+       *
+       * \return reference on matrix element value.
+       */
       __cuda_callable__
       RealType& value() { return value_; };
 
+      /**
+       * \brief Returns constant reference on matrix element value.
+       *
+       * \return constant reference on matrix element value.
+       */
       __cuda_callable__
       const RealType& value() const { return value_; };
 
+      /**
+       * \brief Returns constant reference on matrix element row index.
+       *
+       * \return constant reference on matrix element row index.
+       */
       __cuda_callable__
       const IndexType& rowIndex() const { return rowIdx; };
 
+      /**
+       * \brief Returns constant reference on matrix element column index.
+       *
+       * \return constant reference on matrix element column index.
+       */
       __cuda_callable__
       const IndexType& columnIndex() const { return columnIdx; };
 
+      /**
+       * \brief Returns constant reference on matrix element column index.
+       *
+       * \return constant reference on matrix element column index.
+       */
       __cuda_callable__
       const IndexType& localIndex() const { return columnIdx; };
 
diff --git a/src/TNL/Matrices/DenseMatrixView.h b/src/TNL/Matrices/DenseMatrixView.h
index 5fd6cda8e..6e181d698 100644
--- a/src/TNL/Matrices/DenseMatrixView.h
+++ b/src/TNL/Matrices/DenseMatrixView.h
@@ -405,12 +405,25 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *   The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *          It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -434,12 +447,25 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *          It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -463,12 +489,25 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on ALL matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *   It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -490,12 +529,25 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on ALL matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *  It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -518,10 +570,12 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx, const RealType& value )`.
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value ) { ... };
+       * ```
+       *
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -540,10 +594,12 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx, RealType& value )`.
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) { ... };
+       * ```
+       *
        *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -602,10 +658,10 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) mutable { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrix::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_forRows.cpp
@@ -628,10 +684,10 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixViewExample_forRows.cpp
@@ -652,10 +708,10 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) mutable { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) mutable { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixViewExample_forRows.cpp
@@ -676,10 +732,10 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param function is an instance of the lambda function to be called for each row.
        *
        * ```
-       * auto function = [] __cuda_callable__ ( RowViewType& row ) { ... };
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
        * ```
        *
-       * \e RowViewType represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowViewType.
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowView.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixViewExample_forRows.cpp
@@ -694,10 +750,12 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowView.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -711,10 +769,12 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::DenseMatrixView::RowView.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -750,7 +810,9 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
diff --git a/src/TNL/Matrices/LambdaMatrix.h b/src/TNL/Matrices/LambdaMatrix.h
index cfdd77d2e..2b788ed52 100644
--- a/src/TNL/Matrices/LambdaMatrix.h
+++ b/src/TNL/Matrices/LambdaMatrix.h
@@ -260,7 +260,11 @@ class LambdaMatrix
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value ) { ... };
+       * ```
+       *
        *  The column index repeats twice only for compatibility with sparse matrices.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -273,7 +277,7 @@ class LambdaMatrix
        * \include LambdaMatrixExample_forRows.out
        */
       template< typename Function >
-      void forElements( IndexType first, IndexType last, Function& function ) const;
+      void forElements( IndexType begin, IndexType end, Function& function ) const;
 
       /**
        * \brief This method calls \e forElements for all matrix rows (for constant instances).
@@ -346,10 +350,12 @@ class LambdaMatrix
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::LambdaMatrix::RowView.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -373,16 +379,29 @@ class LambdaMatrix
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *    It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value )
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [\e begin, \e end) of rows to be processed.
+       * \param end defines ending of the range [\e begin,\e end) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -396,18 +415,31 @@ class LambdaMatrix
        * \include LambdaMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       /**
        * \brief Method for performing general reduction on ALL matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *   It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value )
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -430,7 +462,9 @@ class LambdaMatrix
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
diff --git a/src/TNL/Matrices/LambdaMatrixElement.h b/src/TNL/Matrices/LambdaMatrixElement.h
index 57ba698f3..b094eb004 100644
--- a/src/TNL/Matrices/LambdaMatrixElement.h
+++ b/src/TNL/Matrices/LambdaMatrixElement.h
@@ -17,17 +17,36 @@
 namespace TNL {
 namespace Matrices {
 
-
+/**
+ * \brief Accessor for elements of lambda matrix.
+ *
+ * \tparam Real is type of matrix elements values.
+ * \tparam Index is a type of matrix elements column indexes.
+ */
 template< typename Real,
           typename Index >
 class LambdaMatrixElement
 {
    public:
 
+      /**
+       * \brief Type of matrix elements values.
+       */
       using RealType = Real;
 
+      /**
+       * \brief Type of matrix elements column indexes.
+       */
       using IndexType = Index;
 
+      /**
+       * \brief Constructor.
+       *
+       * \param value is matrix element value.
+       * \param rowIdx is row index of the matrix element.
+       * \param columnIdx is a column index of the matrix element.
+       * \param localIdx is the rank of the non-zero elements in the matrix row.
+       */
       __cuda_callable__
       LambdaMatrixElement( const RealType& value,
                            const IndexType& rowIdx,
@@ -35,18 +54,43 @@ class LambdaMatrixElement
                            const IndexType& localIdx )
       : value_( value ), rowIdx( rowIdx ), columnIdx( columnIdx ), localIdx( localIdx ) {};
 
+      /**
+       * \brief Copy constructor.
+       *
+       * \param el is the source matrix element.
+       */
       __cuda_callable__
       LambdaMatrixElement( const LambdaMatrixElement& el ) = default;
 
+      /**
+       * \brief Returns constant reference on matrix element value.
+       *
+       * \return constant reference on matrix element value.
+       */
       __cuda_callable__
       const RealType& value() const { return value_; };
 
+      /**
+       * \brief Returns constant reference on matrix element row index.
+       *
+       * \return constant reference on matrix element row index.
+       */
       __cuda_callable__
       const IndexType& rowIndex() const { return rowIdx; };
 
+      /**
+       * \brief Returns constant reference on matrix element column index.
+       *
+       * \return constant reference on matrix element column index.
+       */
       __cuda_callable__
       const IndexType& columnIndex() const { return columnIdx; };
 
+      /**
+       * \brief Returns constant reference on the rank of the non-zero matrix element in the row.
+       *
+       * \return constant reference on the rank of the non-zero matrix element in the row.
+       */
       __cuda_callable__
       const IndexType& localIndex() const { return localIdx; };
 
diff --git a/src/TNL/Matrices/MatrixRowViewIterator.h b/src/TNL/Matrices/MatrixRowViewIterator.h
index cf99bea29..463ac3ca5 100644
--- a/src/TNL/Matrices/MatrixRowViewIterator.h
+++ b/src/TNL/Matrices/MatrixRowViewIterator.h
@@ -72,15 +72,27 @@ class MatrixRowViewIterator
       __cuda_callable__
       bool operator!=( const MatrixRowViewIterator& other ) const;
 
+      /**
+       * \brief Increment operator.
+       */
       __cuda_callable__
       MatrixRowViewIterator& operator++();
 
+      /**
+       * \brief Decrement operator.
+       */
       __cuda_callable__
       MatrixRowViewIterator& operator--();
 
+      /**
+       * \brief Dereference operator.
+       */
       __cuda_callable__
       MatrixElementType operator*();
 
+      /**
+       * \brief Dereference operator for constant instances.
+       */
       __cuda_callable__
       const MatrixElementType operator*() const;
 
diff --git a/src/TNL/Matrices/MultidiagonalMatrix.h b/src/TNL/Matrices/MultidiagonalMatrix.h
index 622b79da4..3a74b0b44 100644
--- a/src/TNL/Matrices/MultidiagonalMatrix.h
+++ b/src/TNL/Matrices/MultidiagonalMatrix.h
@@ -245,7 +245,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       MultidiagonalMatrix( MultidiagonalMatrix&& matrix ) = default;
 
       /**
-       * \brief Returns a modifiable view of the mutlidiagonal matrix.
+       * \brief Returns a modifiable view of the multidiagonal matrix.
        *
        * See \ref MultidiagonalMatrixView.
        *
@@ -601,12 +601,24 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -624,22 +636,34 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include MultidiagonalMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
 
       /**
        * \brief Method for performing general reduction on matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [\e begin,\e end) of rows to be processed.
+       * \param end defines ending of the range [\e begin,\e end) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -653,18 +677,30 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include MultidiagonalMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       /**
        * \brief Method for performing general reduction on all matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -686,12 +722,24 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on all matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -712,10 +760,11 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for iteration over matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have the form
        *
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`,
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
        *
        * where
        *
@@ -724,7 +773,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact,
        *  index of the matrix subdiagonal.
        *
-       * \e columnIdx is a column index of the matrx element.
+       * \e columnIdx is a column index of the matrix element.
        *
        * \e value is the matrix element value.
        *
@@ -743,10 +792,11 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for iteration over matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have the form
        *
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`,
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
        *
        * where
        *
@@ -906,9 +956,12 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for sequential iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have the form
+       *
+       * ```
+       * auto function = [=] ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value ) { ... };
+       * ```
+       *
        *  The column index repeats twice only for compatibility with sparse matrices.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -921,13 +974,16 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for sequential iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have the form
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) { ... };
+       * ```
+       *
        *  The column index repeats twice only for compatibility with sparse matrices.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [\e begin,\e end) of rows to be processed.
+       * \param end defines ending of the range [\e begin,\e end) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -960,7 +1016,9 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
diff --git a/src/TNL/Matrices/MultidiagonalMatrixElement.h b/src/TNL/Matrices/MultidiagonalMatrixElement.h
index 3672526ea..2c8c37027 100644
--- a/src/TNL/Matrices/MultidiagonalMatrixElement.h
+++ b/src/TNL/Matrices/MultidiagonalMatrixElement.h
@@ -18,16 +18,36 @@ namespace TNL {
 namespace Matrices {
 
 
+/**
+ * \brief Accessor for multidiagonal matrix elements.
+ *
+ * \tparam Real is a type of matrix elements values.
+ * \tparam Index is a type of matrix elements column indexes.
+ */
 template< typename Real,
           typename Index >
 class MultidiagonalMatrixElement
 {
    public:
 
+      /**
+       * \brief Type of matrix elements values.
+       */
       using RealType = Real;
 
+      /**
+       * \brief Type of matrix elements column indexes.
+       */
       using IndexType = Index;
 
+      /**
+       * \brief Constructor.
+       *
+       * \param value is matrix element value.
+       * \param rowIdx is row index of the matrix element.
+       * \param columnIdx is a column index of the matrix element.
+       * \param localIdx is the rank of the non-zero elements in the matrix row.
+       */
       __cuda_callable__
       MultidiagonalMatrixElement( RealType& value,
                                   const IndexType& rowIdx,
@@ -35,21 +55,51 @@ class MultidiagonalMatrixElement
                                   const IndexType& localIdx )
       : value_( value ), rowIdx( rowIdx ), columnIdx( columnIdx ), localIdx( localIdx ) {};
 
+      /**
+       * \brief Returns reference on matrix element value.
+       *
+       * \return reference on matrix element value.
+       */
       __cuda_callable__
       RealType& value() { return value_; };
 
+      /**
+       * \brief Returns constant reference on matrix element value.
+       *
+       * \return constant reference on matrix element value.
+       */
       __cuda_callable__
       const RealType& value() const { return value_; };
 
+      /**
+       * \brief Returns constant reference on matrix element row index.
+       *
+       * \return constant reference on matrix element row index.
+       */
       __cuda_callable__
       const IndexType& rowIndex() const { return rowIdx; };
 
+      /**
+       * \brief Returns reference on matrix element column index.
+       *
+       * \return reference on matrix element column index.
+       */
       __cuda_callable__
       IndexType& columnIndex() { return columnIdx; };
 
+      /**
+       * \brief Returns constant reference on matrix element column index.
+       *
+       * \return constant reference on matrix element column index.
+       */
       __cuda_callable__
       const IndexType& columnIndex() const { return columnIdx; };
 
+      /**
+       * \brief Returns constant reference on the rank of the non-zero matrix element in the row.
+       *
+       * \return constant reference on the rank of the non-zero matrix element in the row.
+       */
       __cuda_callable__
       const IndexType& localIndex() const { return localIdx; };
 
diff --git a/src/TNL/Matrices/MultidiagonalMatrixView.h b/src/TNL/Matrices/MultidiagonalMatrixView.h
index fc5799fa6..869ddb973 100644
--- a/src/TNL/Matrices/MultidiagonalMatrixView.h
+++ b/src/TNL/Matrices/MultidiagonalMatrixView.h
@@ -363,16 +363,28 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -386,22 +398,34 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       /**
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -415,18 +439,30 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
 
       /**
        * \brief Method for performing general reduction on all matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -448,10 +484,18 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on all matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
        * \tparam Keep is a type of lambda function for storing results of reduction in each row.
        *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
        * \tparam FetchValue is type returned by the Fetch lambda function.
@@ -474,10 +518,11 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have the form
        *
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`,
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
        *
        * where
        *
@@ -486,12 +531,12 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact,
        *  index of the matrix subdiagonal.
        *
-       * \e columnIdx is a column index of the matrx element.
+       * \e columnIdx is a column index of the matrix element.
        *
        * \e value is the matrix element value.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -500,15 +545,16 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_forRows.out
        */
       template< typename Function >
-      void forElements( IndexType first, IndexType last, Function& function ) const;
+      void forElements( IndexType begin, IndexType end, Function& function ) const;
 
       /**
        * \brief Method for iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have the form
        *
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`,
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
        *
        * where
        *
@@ -517,12 +563,12 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact,
        *  index of the matrix subdiagonal.
        *
-       * \e columnIdx is a column index of the matrx element.
+       * \e columnIdx is a column index of the matrix element.
        *
        * \e value is a reference to the matrix element value. It can be used even for changing the matrix element value.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -531,7 +577,7 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_forRows.out
        */
       template< typename Function >
-      void forElements( IndexType first, IndexType last, Function& function );
+      void forElements( IndexType begin, IndexType end, Function& function );
 
       /**
        * \brief This method calls \e forElements for all matrix rows (for constant instances).
@@ -668,13 +714,16 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for sequential iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have the form
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * ```
+       * auto function = [] __cuda_callable__ ( const RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::MultidiagonalMatrixView::RowView.
+       *
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -683,13 +732,16 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for sequential iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have the form
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::MultidiagonalMatrixView::RowView.
+       *
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -722,7 +774,9 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
@@ -745,8 +799,8 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
                           OutVector& outVector,
                           const RealType matrixMultiplicator = 1.0,
                           const RealType outVectorMultiplicator = 0.0,
-                          const IndexType firstRow = 0,
-                          IndexType lastRow = 0 ) const;
+                          const IndexType begin = 0,
+                          IndexType end = 0 ) const;
 
       template< typename Real_, typename Device_, typename Index_, ElementsOrganization Organization_ >
       void addMatrix( const MultidiagonalMatrixView< Real_, Device_, Index_, Organization_ >& matrix,
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index ef7fe0580..3b139d941 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -604,16 +604,28 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -633,16 +645,28 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -662,12 +686,24 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on all matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -689,12 +725,24 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on all matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -717,7 +765,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * \tparam Function is type of lambda function that will operate on matrix elements.
        *
-       * \param begin defines beginning of the range [ \e begin,\e end ) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
        * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called for element of given rows.
        *
@@ -897,13 +945,16 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for sequential iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have a form like
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrix::RowView.
+       *
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -912,13 +963,16 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for sequential iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have a form like
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrix::RowView.
+       *
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -951,7 +1005,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
@@ -974,8 +1030,8 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
                           OutVector& outVector,
                           const ComputeRealType& matrixMultiplicator = 1.0,
                           const ComputeRealType& outVectorMultiplicator = 0.0,
-                          const IndexType firstRow = 0,
-                          const IndexType lastRow = 0 ) const;
+                          const IndexType begin = 0,
+                          const IndexType end = 0 ) const;
 
       /*template< typename Real2, typename Index2 >
       void addMatrix( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
diff --git a/src/TNL/Matrices/SparseMatrixElement.h b/src/TNL/Matrices/SparseMatrixElement.h
index 485fb919b..39c7b61b4 100644
--- a/src/TNL/Matrices/SparseMatrixElement.h
+++ b/src/TNL/Matrices/SparseMatrixElement.h
@@ -18,6 +18,12 @@ namespace TNL {
 namespace Matrices {
 
 
+/**
+ * \brief Accessor for sparse matrix elements.
+ *
+ * \tparam Real is a type of matrix elements values.
+ * \tparam Index is a type of matrix elements column indexes.
+ */
 template< typename Real,
           typename Index,
           bool isBinary_ = false >
@@ -25,10 +31,24 @@ class SparseMatrixElement
 {
    public:
 
+      /**
+       * \brief Type of matrix elements values.
+       */
       using RealType = Real;
 
+      /**
+       * \brief Type of matrix elements column indexes.
+       */
       using IndexType = Index;
 
+      /**
+       * \brief Constructor.
+       *
+       * \param value is matrix element value.
+       * \param rowIdx is row index of the matrix element.
+       * \param columnIdx is a column index of the matrix element.
+       * \param localIdx is the rank of the non-zero element in the matrix row.
+       */
       __cuda_callable__
       SparseMatrixElement( RealType& value,
                            const IndexType& rowIdx,
@@ -36,21 +56,51 @@ class SparseMatrixElement
                            const IndexType& localIdx )
       : value_( value ), rowIdx( rowIdx ), columnIdx( columnIdx ), localIdx( localIdx ) {};
 
+      /**
+       * \brief Returns reference on matrix element value.
+       *
+       * \return reference on matrix element value.
+       */
       __cuda_callable__
       RealType& value() { return value_; };
 
+      /**
+       * \brief Returns constant reference on matrix element value.
+       *
+       * \return constant reference on matrix element value.
+       */
       __cuda_callable__
       const RealType& value() const { return value_; };
 
+      /**
+       * \brief Returns constant reference on matrix element row index.
+       *
+       * \return constant reference on matrix element row index.
+       */
       __cuda_callable__
       const IndexType& rowIndex() const { return rowIdx; };
 
+      /**
+       * \brief Returns reference on matrix element column index.
+       *
+       * \return reference on matrix element column index.
+       */
       __cuda_callable__
       IndexType& columnIndex() { return columnIdx; };
 
+      /**
+       * \brief Returns constant reference on matrix element column index.
+       *
+       * \return constant reference on matrix element column index.
+       */
       __cuda_callable__
       const IndexType& columnIndex() const { return columnIdx; };
 
+      /**
+       * \brief Returns constant reference on the rank of the non-zero matrix element in the row.
+       *
+       * \return constant reference on the rank of the non-zero matrix element in the row.
+       */
       __cuda_callable__
       const IndexType& localIndex() const { return localIdx; };
 
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index 10310d802..ba73ea651 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -395,16 +395,28 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -424,16 +436,28 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -453,12 +477,24 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on all matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -480,12 +516,24 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on all matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -506,13 +554,16 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have a form like
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -526,13 +577,16 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have a form like
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -679,13 +733,16 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for sequential iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have a form like
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrixView::RowView.
+       *
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -694,13 +751,16 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for sequential iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
+       * \tparam Function is type of lambda function that will operate on matrix elements. It should have a form like
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::SparseMatrixView::RowView.
+       *
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -733,7 +793,9 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
diff --git a/src/TNL/Matrices/TridiagonalMatrix.h b/src/TNL/Matrices/TridiagonalMatrix.h
index 4789a079f..45e6d132f 100644
--- a/src/TNL/Matrices/TridiagonalMatrix.h
+++ b/src/TNL/Matrices/TridiagonalMatrix.h
@@ -493,16 +493,28 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -522,16 +534,28 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on matrix rows of constant matrix instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -551,16 +575,28 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on all matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row.  It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -580,16 +616,28 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \brief Method for performing general reduction on all matrix rows of constant matrix instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -608,13 +656,16 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for iteration over matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the following form:
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -628,9 +679,12 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for iteration over matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the following form:
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -648,13 +702,16 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the following form:
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -668,9 +725,12 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the following form:
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
@@ -788,10 +848,13 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for sequential iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
+       * \tparam Function is a type of lambda function that will operate on matrix rows. It should have the following form:
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::TridiagonalMatrix::RowView.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -803,10 +866,13 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
       /**
        * \brief Method for sequential iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
+       * \tparam Function is a type of lambda function that will operate on matrix rows. It should have the following form:
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::TridiagonalMatrix::RowView.
        *
        * \param begin defines beginning of the range [begin,end) of rows to be processed.
        * \param end defines ending of the range [begin,end) of rows to be processed.
@@ -842,7 +908,9 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixTriplicator * ( * this ) * inVector + outVectorTriplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
diff --git a/src/TNL/Matrices/TridiagonalMatrixView.h b/src/TNL/Matrices/TridiagonalMatrixView.h
index 6b68f4197..c8e0ecdca 100644
--- a/src/TNL/Matrices/TridiagonalMatrixView.h
+++ b/src/TNL/Matrices/TridiagonalMatrixView.h
@@ -179,11 +179,6 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
       template< typename Vector >
       void getCompressedRowLengths( Vector& rowLengths ) const;
 
-      //[[deprecated]]
-      //IndexType getRowLength( const IndexType row ) const;
-
-      //IndexType getMaxRowLength() const;
-
       /**
        * \brief Returns number of non-zero matrix elements.
        *
@@ -350,16 +345,28 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -373,22 +380,34 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
 
       /**
        * \brief Method for performing general reduction on matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
@@ -402,18 +421,30 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
 
       /**
        * \brief Method for performing general reduction on all matrix rows for constant instances.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       * The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -435,12 +466,24 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \brief Method for performing general reduction on all matrix rows.
        *
        * \tparam Fetch is a type of lambda function for data fetch declared as
-       *          `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`.
-       *          The return type of this lambda can be any non void.
+       *
+       * ```
+       * auto fetch = [=] __cuda_callable__ ( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue { ... };
+       * ```
+       *
+       *  The return type of this lambda can be any non void.
        * \tparam Reduce is a type of lambda function for reduction declared as
-       *          `reduce( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue`.
-       * \tparam Keep is a type of lambda function for storing results of reduction in each row.
-       *          It is declared as `keep( const IndexType rowIdx, const double& value )`.
+       *
+       * ```
+       * auto reduce = [=] __cuda_callable__ ( const FetchValue& v1, const FetchValue& v2 ) -> FetchValue { ... };
+       * ```
+       *
+       * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
+       *
+       * ```
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * ```
+       *
        * \tparam FetchValue is type returned by the Fetch lambda function.
        *
        * \param fetch is an instance of lambda function for data fetch.
@@ -461,13 +504,16 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
-       *  The \e localIdx parameter is a rank of the non-zero element in given row.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the following form:
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
+       * The \e localIdx parameter is a rank of the non-zero element in given row.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -476,18 +522,21 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_forRows.out
        */
       template< typename Function >
-      void forElements( IndexType first, IndexType last, Function& function ) const;
+      void forElements( IndexType begin, IndexType end, Function& function ) const;
 
       /**
        * \brief Method for iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value )`.
+       * \tparam Function is a type of lambda function that will operate on matrix elements. It should have the following form:
+       *
+       * ```
+       * auto function = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) { ... };
+       * ```
+       *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        *
        * \par Example
@@ -496,7 +545,7 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_forRows.out
        */
       template< typename Function >
-      void forElements( IndexType first, IndexType last, Function& function );
+      void forElements( IndexType begin, IndexType end, Function& function );
 
       /**
        * \brief This method calls \e forElements for all matrix rows (for constant instances).
@@ -633,15 +682,16 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for sequential iteration over all matrix rows for constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       * \tparam Function is a type of lambda function that will operate on matrix rows. It should have the following form:
+       *
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::TridiagonalMatrixView::RowView.
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -650,15 +700,16 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Method for sequential iteration over all matrix rows for non-constant instances.
        *
-       * \tparam Function is type of lambda function that will operate on matrix elements.
-       *    It is should have form like
-       *  `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value )`.
-       *  The column index repeats twice only for compatibility with sparse matrices.
-       *  If the 'compute' variable is set to false the iteration over the row can
-       *  be interrupted.
+       * \tparam Function is a type of lambda function that will operate on matrix rows. It should have the following form:
        *
-       * \param begin defines beginning of the range [begin,end) of rows to be processed.
-       * \param end defines ending of the range [begin,end) of rows to be processed.
+       * ```
+       * auto function = [] __cuda_callable__ ( RowView& row ) { ... };
+       * ```
+       *
+       * \e RowView represents matrix row - see \ref TNL::Matrices::TridiagonalMatrixView::RowView.
+       *
+       * \param begin defines beginning of the range [ \e begin, \e end ) of rows to be processed.
+       * \param end defines ending of the range [ \e begin, \e end ) of rows to be processed.
        * \param function is an instance of the lambda function to be called in each row.
        */
       template< typename Function >
@@ -691,7 +742,9 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        *
        * More precisely, it computes:
        *
-       * `outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector`
+       * ```
+       * outVector = matrixMultiplicator * ( * this ) * inVector + outVectorMultiplicator * outVector
+       * ```
        *
        * \tparam InVector is type of input vector.  It can be \ref Vector,
        *     \ref VectorView, \ref Array, \ref ArraView or similar container.
-- 
GitLab


From 117283d0dc5d6979a7ab06b420be129d225e01fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 4 May 2021 14:40:32 +0200
Subject: [PATCH 047/117] Added functions for matrix wrapping.

---
 .../Matrices/DenseMatrix/CMakeLists.txt       |   1 +
 .../DenseMatrixViewExample_wrap.cpp           |  34 ++++
 .../DenseMatrixViewExample_wrap.cu            |   1 +
 .../Matrices/SparseMatrix/CMakeLists.txt      |   2 +
 .../SparseMatrixViewExample_wrapCSR.cpp       |  45 +++++
 .../SparseMatrixViewExample_wrapCSR.cu        |   1 +
 .../SparseMatrixViewExample_wrapEllpack.cpp   |  43 +++++
 .../SparseMatrixViewExample_wrapEllpack.cu    |   1 +
 .../Tutorials/Matrices/tutorial_Matrices.md   |  54 ++++--
 src/TNL/Matrices/MatrixWrapping.h             | 155 ++++++++++++++++++
 src/UnitTests/Matrices/CMakeLists.txt         |   1 +
 src/UnitTests/Matrices/MatrixWrappingTest.cpp |  11 ++
 src/UnitTests/Matrices/MatrixWrappingTest.cu  |  11 ++
 src/UnitTests/Matrices/MatrixWrappingTest.h   | 113 +++++++++++++
 14 files changed, 463 insertions(+), 10 deletions(-)
 create mode 100644 Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cpp
 create mode 120000 Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cu
 create mode 100644 Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cpp
 create mode 120000 Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cu
 create mode 100644 Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cpp
 create mode 120000 Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cu
 create mode 100644 src/TNL/Matrices/MatrixWrapping.h
 create mode 100644 src/UnitTests/Matrices/MatrixWrappingTest.cpp
 create mode 100644 src/UnitTests/Matrices/MatrixWrappingTest.cu
 create mode 100644 src/UnitTests/Matrices/MatrixWrappingTest.h

diff --git a/Documentation/Examples/Matrices/DenseMatrix/CMakeLists.txt b/Documentation/Examples/Matrices/DenseMatrix/CMakeLists.txt
index e28145776..d88862afb 100644
--- a/Documentation/Examples/Matrices/DenseMatrix/CMakeLists.txt
+++ b/Documentation/Examples/Matrices/DenseMatrix/CMakeLists.txt
@@ -26,6 +26,7 @@ set( COMMON_EXAMPLES
     DenseMatrixViewExample_forElements
     DenseMatrixViewExample_forRows
     DenseMatrixViewExample_forAllElements
+    DenseMatrixViewExample_wrap
 )
 
 if( BUILD_CUDA )
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cpp
new file mode 100644
index 000000000..819191468
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cpp
@@ -0,0 +1,34 @@
+#include <iostream>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Matrices/MatrixWrapping.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void wrapMatrixView()   // Example: expose a plain array of doubles as a 3x4 dense matrix view and print it.
+{
+   const int rows( 3 ), columns( 4 );
+   TNL::Containers::Vector< double, Device > valuesVector {   // rows * columns = 12 matrix elements
+      1,  2,  3,  4,
+      5,  6,  7,  8,
+      9, 10, 11, 12 };
+   double* values = valuesVector.getData();   // raw pointer passed to the wrapper below
+
+   /***
+    * Wrap the raw array `values` into a dense matrix view with `rows` rows and `columns` columns.
+    */
+   auto matrix = TNL::Matrices::wrapDenseMatrix< Device >( rows, columns, values );
+   std::cout << "Matrix reads as: " << std::endl << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )   // Runs the example for the host device and, if HAVE_CUDA is defined, for CUDA; argc/argv are not used.
+{
+   std::cout << "Wraping matrix view on host: " << std::endl;   // NOTE(review): "Wraping" is a typo for "Wrapping" in the printed text
+   wrapMatrixView< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA   // the CUDA run is compiled in only when HAVE_CUDA is defined
+   std::cout << "Wraping matrix view on CUDA device: " << std::endl;
+   wrapMatrixView< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cu b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cu
new file mode 120000
index 000000000..fbdc1d8bb
--- /dev/null
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cu
@@ -0,0 +1 @@
+DenseMatrixViewExample_wrap.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt b/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt
index c2db3879e..641534bae 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt
+++ b/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt
@@ -30,6 +30,8 @@ set( COMMON_EXAMPLES
    SparseMatrixViewExample_forElements
    SparseMatrixViewExample_forRows
    SparseMatrixViewExample_forAllElements
+   SparseMatrixViewExample_wrapCSR
+   SparseMatrixViewExample_wrapEllpack
 )
 
 if( BUILD_CUDA )
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cpp
new file mode 100644
index 000000000..0c574bb7a
--- /dev/null
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cpp
@@ -0,0 +1,45 @@
+#include <iostream>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Matrices/MatrixWrapping.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void wrapMatrixView()   // Example: expose three plain CSR arrays as a 4x4 sparse matrix view and print it.
+{
+   /***
+    * Encode the following matrix in the CSR format:
+    *
+    * /  1  2  0  0 \.
+    * |  0  6  0  0 |
+    * |  9  0  0  0 |
+    * \  0  0 15 16 /
+    */
+   const int rows( 4 ), columns( 4 );
+   TNL::Containers::Vector< double, Device > valuesVector     { 1, 2, 6, 9, 15, 16 };   // nonzero values listed row by row
+   TNL::Containers::Vector< int, Device > columnIndexesVector { 0, 1, 1, 0,  2,  3 };   // column index of each value above
+   TNL::Containers::Vector< int, Device > rowPointersVector   { 0, 2, 3, 4, 6 };        // row i occupies entries [ rowPointers[i], rowPointers[i+1] )
+
+   double* values = valuesVector.getData();
+   int* columnIndexes = columnIndexesVector.getData();
+   int* rowPointers = rowPointersVector.getData();
+
+   /***
+    * Wrap the arrays `rowPointers`, `values` and `columnIndexes` into a sparse matrix view.
+    */
+   auto matrix = TNL::Matrices::wrapCSRMatrix< Device >( rows, columns, rowPointers, values, columnIndexes );
+
+   std::cout << "Matrix reads as: " << std::endl << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )   // Runs the example for the host device and, if HAVE_CUDA is defined, for CUDA; argc/argv are not used.
+{
+   std::cout << "Wraping matrix view on host: " << std::endl;   // NOTE(review): "Wraping" is a typo for "Wrapping" in the printed text
+   wrapMatrixView< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA   // the CUDA run is compiled in only when HAVE_CUDA is defined
+   std::cout << "Wraping matrix view on CUDA device: " << std::endl;
+   wrapMatrixView< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cu b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cu
new file mode 120000
index 000000000..6581b62dc
--- /dev/null
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cu
@@ -0,0 +1 @@
+SparseMatrixViewExample_wrapCSR.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cpp
new file mode 100644
index 000000000..9f36df57e
--- /dev/null
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cpp
@@ -0,0 +1,43 @@
+#include <iostream>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Matrices/MatrixWrapping.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+template< typename Device >
+void wrapMatrixView()
+{
+   /***
+    * Encode the following matrix to Ellpack format...
+    *
+    * /  1  2  0  0 \.
+    * |  0  6  0  0 |
+    * |  9  0  0  0 |
+    * \  0  0 15 16 /
+    */
+   const int rows( 4 ), columns( 4 );
+   TNL::Containers::Vector< double, Device > valuesVector     { 1,  2,  6,  0,  9,  0, 15, 16 };
+   TNL::Containers::Vector< int, Device > columnIndexesVector { 0,  1,  1, -1,  0, -1,  2,  3 };
+
+   double* values = valuesVector.getData();
+   int* columnIndexes = columnIndexesVector.getData();
+
+   /***
+    * Wrap the arrays `values` and `columnIndexes` to sparse matrix view
+    */
+   auto matrix = TNL::Matrices::wrapEllpackMatrix< Device >( rows, columns, 2, values, columnIndexes );
+
+   std::cout << "Matrix reads as: " << std::endl << matrix << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Wraping matrix view on host: " << std::endl;
+   wrapMatrixView< TNL::Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Wraping matrix view on CUDA device: " << std::endl;
+   wrapMatrixView< TNL::Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cu b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cu
new file mode 120000
index 000000000..3d0a09594
--- /dev/null
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cu
@@ -0,0 +1 @@
+SparseMatrixViewExample_wrapEllpack.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md
index 5c60bece9..1014c9103 100644
--- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md
+++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md
@@ -432,6 +432,16 @@ The result looks as follows:
 
 \include DenseMatrixExample_forElements.out
 
+#### Wrapping existing data to dense matrix view
+
+If you have already allocated data for a dense matrix (for example in some other library), you may wrap it into a dense matrix view with the function \ref TNL::Matrices::wrapDenseMatrix . See the following example:
+
+\includelineno DenseMatrixViewExample_wrap.cpp
+
+Here we create dense matrix having three rows and four columns. We use TNL vector (\ref TNL::Containers::Vector) only for allocation of the matrix elements (lines 12-15) and we get a pointer to the allocated array immediately (line 16). Next we use just the array to get dense matrix view with proper matrix dimensions (line 21). Note that we must explicitly state the device type as a template parameter of the function `wrapDenseMatrix` (\ref TNL::Matrices::wrapDenseMatrix). Finally, we print the matrix to see if it is correct (line 22). The result looks as follows:
+
+\include DenseMatrixViewExample_wrap.out
+
 ### Sparse matrices
 
 [Sparse matrices](https://en.wikipedia.org/wiki/Sparse_matrix) are extremely important in a lot of numerical algorithms. They are used at situations when we need to operate with matrices having majority of the matrix elements equal to zero. In this case, only the non-zero matrix elements are stored with possibly some *padding zeros* used for memory alignment. This is necessary mainly on GPUs. See the [Overview of matrix types](#overview_of_matrix_types) for the differences in memory requirements.
@@ -647,6 +657,30 @@ would not make sense. If we pass through this test, the matrix element lies in t
 
 \include SparseMatrixExample_forElements.out
 
+#### Wrapping existing data to sparse matrix view
+
+Standard sparse matrix formats like [CSR](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) and [Ellpack](https://people.math.sc.edu/Burkardt/data/sparse_ellpack/sparse_ellpack.html) store the matrix elements in specifically defined arrays. If you have already allocated these arrays (for example in some other library), they can be wrapped into a sparse matrix view with the given matrix format. This can be done by means of functions \ref TNL::Matrices::wrapCSRMatrix and \ref TNL::Matrices::wrapEllpackMatrix . See the following example for demonstration of the CSR format:
+
+\includelineno SparseMatrixViewExample_wrapCSR.cpp
+
+We create sparse matrix having four rows and four columns (line 19). We use TNL vector (\ref TNL::Containers::Vector) to allocate arrays necessary for the CSR format:
+
+1. `valuesVector` (line 20) - contains values of the nonzero matrix elements.
+2. `columnIndexesVector` (line 21) - contains column indexes of the nonzero matrix elements.
+3. `rowPointersVector` (line 22) - contains positions of the first nonzero matrix elements in each row within `valuesVector` and `columnIndexesVector`. The size of this array equals number of matrix rows plus one.
+
+Next we turn the vectors into C style pointers (lines 24-26) to wrap them into sparse matrix view (line 31). Note that we must explicitly state the device on which the arrays are allocated. Finally, we print the matrix to check the correctness (line 33). The result looks as follows:
+
+\include SparseMatrixViewExample_wrapCSR.out
+
+Wrapping data corresponding with the Ellpack format is very similar as we can see in the following example:
+
+\includelineno SparseMatrixViewExample_wrapEllpack.cpp
+
+We encode the same sparse matrix as in the previous example. The essence of the Ellpack format is that we allocate the same number of matrix elements for each row which is two in our example. For some matrix rows we use the padding zeros for which we set the column index to -1 (line 21). Therefore the size of `valuesVector` and `columnIndexesVector` equals number of matrix rows times number of matrix elements allocated in each row. As before, we turn the vectors into C style pointers (lines 23-24) and wrap them into sparse matrix view with Ellpack format (line 29). Note that we must state the device on which the arrays are allocated explicitly. The result looks as follows:
+
+\include SparseMatrixViewExample_wrapEllpack.out
+
 #### Symmetric sparse matrices
 
 For sparse [symmetric matrices](https://en.wikipedia.org/wiki/Symmetric_matrix), TNL offers a format storing only a half of the matrix elements. More precisely, ony the matrix diagonal and the elements bellow are stored in the memory. The matrix elements above the diagonal are deduced from those bellow. If such a symmetric format is used on GPU, atomic operations must be used in some matrix operations. For this reason, symmetric matrices can be combined only with matrix elements values expressed in `float` or `double` type. An advantage of the symmetric formats is lower memory consumption. Since less data need to be transferred from the memory, better performance might be observed. In some cases, however, the use of atomic operations on GPU may cause performance drop. Mostly we can see approximately the same performance compared to general formats but we can profit from lower memory requirements which is appreciated especially on GPU. The following example shows how to create symmetric sparse matrix.
@@ -833,7 +867,7 @@ The output of the example looks as:
 
 \include TridiagonalMatrixExample_Constructor_init_list_1.out
 
-#### Methods `setElement` and `addElement`
+#### Methods setElement and addElement
 
 Similar way of the tridiagonal matrix setup is offered by the method `setElements` (\ref TNL::Matrices::TridiagonalMatrix::setElements) as the following example demonstrates:
 
@@ -851,7 +885,7 @@ The result looks as follows:
 
 \include TridiagonalMatrixExample_setElement.out
 
-#### Method `getRow`
+#### Method getRow
 
  A bit different way of setting up the matrix, is the use of tridiagonal matrix view and the method `getRow` (\ref TNL::Matrices::TridiagonalMatrixView::getRow) as the following example demonstrates:
 
@@ -863,7 +897,7 @@ The result looks as follows:
 
 \include TridiagonalMatrixViewExample_getRow.out
 
-### Method `forRows`
+#### Method forRows
 
 As in the case of other matrix types, the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) calls the method `getRow` (\ref TNL::Matrices::TridiagonalMatrix::getRow) in parallel. It is demonstrated by the following example which we may directly compare with the previous one:
 
@@ -881,7 +915,7 @@ The result looks as follows:
 
 \include TridiagonalMatrixExample_forRows.out
 
-#### Method `forElements`
+#### Method forElements
 
 Finally, even a bit more simple way of matrix elements manipulation with the method `forElements` (\ref TNL::Matrices::TridiagonalMatrix::forElements) is demonstrated in the following example:
 
@@ -1043,7 +1077,7 @@ On the lines 25-46, we call the constructor which, in addition to matrix dimensi
 
 \include MultidiagonalMatrixExample_Constructor_init_list_2.out
 
-#### Methods `setElement` and `addElement`
+#### Methods setElement and addElement
 
 Another and more efficient way of setting the matrix elements is by means of the method `setElement` (\ref TNL::Matrices::MultidiagonalMatrix::setElement). It is demonstrated in the following example:
 
@@ -1053,7 +1087,7 @@ This examples shows that the method `setElement` can be used both on the host (C
 
 \include MultidiagonalMatrixViewExample_setElement.out
 
-#### Method `getRow`
+#### Method getRow
 
 Slightly more efficient way of the multidiagonal matrix setup is offered by the method `getRow` (\ref TNL::Matrices::MultidiagonalMatrix::getRow). We will use it to create a matrix of the following form:
 
@@ -1137,7 +1171,7 @@ We use `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D) to iterate over all
 
 \include MultidiagonalMatrixExample_Constructor.out
 
-### Method `forRows`
+#### Method forRows
 
 As in the case of other matrix types, the method `forRows` (\ref TNL::Matrices::MultidiagonalMatrix::forRows) calls the method `getRow` (\ref TNL::Matrices::MultidiagonalMatrix::getRow) in parallel. It is demonstrated by the following example:
 
@@ -1151,7 +1185,7 @@ The result looks as follows:
 
 \include MultidiagonalMatrixExample_forRows.out
 
-#### Method `forElements`
+#### Method forElements
 
 Similar and even a bit simpler way of setting the matrix elements is offered by the method `forElements` (\ref TNL::Matrices::MultidiagonalMatrix::forElements, \ref TNL::Matrices::MultidiagonalMatrixView::forElements) as demonstrated in the following example:
 
@@ -1220,7 +1254,7 @@ The result looks as follows:
 
 \include LambdaMatrixExample_Constructor.out
 
-#### Method `forRows`
+#### Method forRows
 
 Method `forRows` (\ref TNL::Matrices::LambdaMatrix::forRows, \ref TNL::Matrices::LambdaMatrix::forAllRows) iterates in parallel over all matrix rows. In the case of lambda matrices, it cannot be used for changing the matrix elements since they cannot be changed. In the following example, we show how to use this method to copy the matrix elements values to the dense matrix:
 
@@ -1238,7 +1272,7 @@ The result looks as follows:
 
 \include LambdaMatrixExample_forRows.out
 
-#### Method `forElements`
+#### Method forElements
 
 The lambda matrix has the same interface as other matrix types except of the method `getRow`. The following example demonstrates the use of the method `forElements` (\ref TNL::Matrices::LambdaMatrix::forElements) to copy the lambda matrix into the dense matrix:
 
diff --git a/src/TNL/Matrices/MatrixWrapping.h b/src/TNL/Matrices/MatrixWrapping.h
new file mode 100644
index 000000000..fa91de831
--- /dev/null
+++ b/src/TNL/Matrices/MatrixWrapping.h
@@ -0,0 +1,155 @@
+/***************************************************************************
+                          MatrixWrapping.h -  description
+                             -------------------
+    begin                : May 3, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <TNL/Matrices/SparseMatrixView.h>
+#include <TNL/Matrices/DenseMatrixView.h>
+
+namespace TNL {
+namespace Matrices {
+
+/**
+ * \brief Function for wrapping an array of values into a dense matrix view.
+ *
+ * \tparam Device is a device on which the array is allocated.
+ * \tparam Real is a type of array elements.
+ * \tparam Index is a type for indexing of matrix elements.
+ * \tparam Organization is matrix elements organization - see \ref TNL::Algorithms::Segments::ElementsOrganization.
+ * \param rows is a number of matrix rows.
+ * \param columns is a number of matrix columns.
+ * \param values is the array with matrix elements values.
+ * \return instance of DenseMatrixView wrapping the array.
+ *
+ * The array size must be equal to product of `rows` and `columns`. The dense matrix view does not deallocate the input
+ * array at the end of its lifespan.
+ *
+ * \par Example
+ * \include Matrices/DenseMatrix/DenseMatrixViewExample_wrap.cpp
+ * \par Output
+ * \include DenseMatrixViewExample_wrap.out
+ */
+template< typename Device,
+          typename Real,
+          typename Index,
+          ElementsOrganization Organization = Algorithms::Segments::DefaultElementsOrganization< Device >::getOrganization() >
+DenseMatrixView< Real, Device, Index, Organization >
+wrapDenseMatrix( const Index& rows, const Index& columns, Real* values )
+{
+   using MatrixView = DenseMatrixView< Real, Device, Index, Organization >;
+   using ValuesViewType = typename MatrixView::ValuesViewType;
+   return MatrixView( rows, columns, ValuesViewType( values, rows * columns ) );
+}
+
+/**
+ * \brief Function for wrapping of arrays defining [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) into a sparse matrix view.
+ *
+ * \tparam Device  is a device on which the arrays are allocated.
+ * \tparam Real is a type of matrix elements values.
+ * \tparam Index is a type for matrix elements indexing.
+ * \param rows is a number of matrix rows.
+ * \param columns is a number of matrix columns.
+ * \param rowPointers is an array holding row pointers of the CSR format ( `ROW_INDEX` [here](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)))
+ * \param values is an array with values of matrix elements ( `V` [here](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)))
+ * \param columnIndexes is an array with column indexes of matrix elements  ( `COL_INDEX` [here](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)))
+ * \return instance of SparseMatrixView with CSR format.
+ *
+ * The size of array \e rowPointers must be equal to number of `rows + 1`. The last element of the array equals to the number of all nonzero matrix elements. The sizes of arrays `values` and
+ * `columnIndexes` must be equal to this number.
+ *
+ * \par Example
+ * \include Matrices/SparseMatrix/SparseMatrixViewExample_wrapCSR.cpp
+ * \par Output
+ * \include SparseMatrixViewExample_wrapCSR.out
+ */
+template< typename Device,
+          typename Real,
+          typename Index >
+SparseMatrixView< Real, Device, Index, GeneralMatrix, Algorithms::Segments::CSRViewDefault >
+wrapCSRMatrix( const Index& rows, const Index& columns, Index* rowPointers, Real* values, Index* columnIndexes )
+{
+   using MatrixView = SparseMatrixView< Real, Device, Index, GeneralMatrix, Algorithms::Segments::CSRViewDefault >;
+   using ValuesViewType = typename MatrixView::ValuesViewType;
+   using ColumnIndexesView = typename MatrixView::ColumnsIndexesViewType;
+   using SegmentsView = typename MatrixView::SegmentsViewType;
+   using KernelView = typename SegmentsView::KernelView;
+   using RowPointersView = typename SegmentsView::OffsetsView;
+   RowPointersView rowPointersView( rowPointers, rows + 1 );
+   Index elementsCount = rowPointersView.getElement( rows );
+   SegmentsView segments( rowPointersView, KernelView() );
+   ValuesViewType valuesView( values, elementsCount );
+   ColumnIndexesView columnIndexesView( columnIndexes, elementsCount );
+   return MatrixView( rows, columns, valuesView, columnIndexesView, segments );
+}
+
+/// This prevents the helper struct below from appearing in Doxygen documentation.
+/// \cond HIDDEN_CLASS
+template< typename Device,
+          typename Real,
+          typename Index,
+          ElementsOrganization Organization = Algorithms::Segments::DefaultElementsOrganization< Device >::getOrganization(),
+          int Alignment = 1 >
+struct EllpackMatrixWrapper
+{
+   template< typename Device_, typename Index_ >
+   using EllpackSegments = Algorithms::Segments::EllpackView< Device_, Index_, Organization, Alignment >;
+   using MatrixView = SparseMatrixView< Real, Device, Index, GeneralMatrix, EllpackSegments >;
+
+   static MatrixView wrap( const Index& rows, const Index& columns, const Index& nonzerosPerRow, Real* values, Index* columnIndexes )
+   {
+      using ValuesViewType = typename MatrixView::ValuesViewType;
+      using ColumnIndexesView = typename MatrixView::ColumnsIndexesViewType;
+      using SegmentsView = Algorithms::Segments::EllpackView< Device, Index, Organization, Alignment >;
+      SegmentsView segments( rows, nonzerosPerRow );
+      Index elementsCount = segments.getStorageSize();
+      ValuesViewType valuesView( values, elementsCount );
+      ColumnIndexesView columnIndexesView( columnIndexes, elementsCount );
+      return MatrixView( rows, columns, valuesView, columnIndexesView, segments );
+   }
+};
+/// \endcond
+
+/**
+ * \brief Function for wrapping of arrays defining [Ellpack format](https://people.math.sc.edu/Burkardt/data/sparse_ellpack/sparse_ellpack.html) into a sparse matrix view.
+ *
+ * \tparam Device  is a device on which the arrays are allocated.
+ * \tparam Real is a type of matrix elements values.
+ * \tparam Index is a type for matrix elements indexing.
+ * \tparam Alignment defines alignment of data. The number of matrix rows is rounded to a multiple of this number. It is useful mainly for GPUs.
+ * \param rows is a number of matrix rows.
+ * \param columns is a number of matrix columns.
+ * \param nonzerosPerRow is number of nonzero matrix elements in each row.
+ * \param values is an array with values of matrix elements.
+ * \param columnIndexes is an array with column indexes of matrix elements.
+ * \return instance of SparseMatrixView with Ellpack format.
+ *
+ *  The sizes of arrays `values` and `columnIndexes` must be equal to `rows * nonzerosPerRow`. Use `-1` as a column index for padding zeros.
+ *
+ * \par Example
+ * \include Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cpp
+ * \par Output
+ * \include SparseMatrixViewExample_wrapEllpack.out
+ */
+template< typename Device,
+          typename Real,
+          typename Index,
+          ElementsOrganization Organization = Algorithms::Segments::DefaultElementsOrganization< Device >::getOrganization(),
+          int Alignment = 1 >
+auto
+wrapEllpackMatrix( const Index rows, const Index columns, const Index nonzerosPerRow, Real* values, Index* columnIndexes )
+-> decltype( EllpackMatrixWrapper< Device, Real, Index, Organization, Alignment >::wrap( rows, columns, nonzerosPerRow, values, columnIndexes ) )
+{
+   return EllpackMatrixWrapper< Device, Real, Index, Organization, Alignment >::wrap( rows, columns, nonzerosPerRow, values, columnIndexes );
+}
+
+   } //namespace Matrices
+} //namespace TNL
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index 3210920f5..fa8876993 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -30,6 +30,7 @@ set( COMMON_TESTS
             LambdaMatrixTest
             SparseMatrixTest_SandboxMatrix
             SparseMatrixVectorProductTest_SandboxMatrix
+            MatrixWrappingTest
 )
 
 set( CPP_TESTS
diff --git a/src/UnitTests/Matrices/MatrixWrappingTest.cpp b/src/UnitTests/Matrices/MatrixWrappingTest.cpp
new file mode 100644
index 000000000..db87ce482
--- /dev/null
+++ b/src/UnitTests/Matrices/MatrixWrappingTest.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          MatrixWrappingTest.cpp -  description
+                             -------------------
+    begin                : Mar 4, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "MatrixWrappingTest.h"
diff --git a/src/UnitTests/Matrices/MatrixWrappingTest.cu b/src/UnitTests/Matrices/MatrixWrappingTest.cu
new file mode 100644
index 000000000..8dd0849dc
--- /dev/null
+++ b/src/UnitTests/Matrices/MatrixWrappingTest.cu
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          MatrixWrappingTest.cu -  description
+                             -------------------
+    begin                : Mar 4, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "MatrixWrappingTest.h"
diff --git a/src/UnitTests/Matrices/MatrixWrappingTest.h b/src/UnitTests/Matrices/MatrixWrappingTest.h
new file mode 100644
index 000000000..af3fb00b3
--- /dev/null
+++ b/src/UnitTests/Matrices/MatrixWrappingTest.h
@@ -0,0 +1,113 @@
+/***************************************************************************
+                          MatrixWrappingTest.h -  description
+                             -------------------
+    begin                : Mar 21, 2020
+    copyright            : (C) 2020 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Math.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Matrices/SparseMatrix.h>
+#include <TNL/Matrices/MatrixWrapping.h>
+#include <TNL/Algorithms/Segments/Ellpack.h>
+#include <iostream>
+#include <sstream>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+template< typename Device_, typename Index_, typename IndexAllocator_ >
+using RowMajorEllpack = TNL::Algorithms::Segments::Ellpack< Device_, Index_, IndexAllocator_, TNL::Algorithms::Segments::RowMajorOrder, 1 >;
+
+// test fixture for typed tests
+template< typename Matrix >
+class MatrixTest : public ::testing::Test
+{
+protected:
+   using MatrixType = Matrix;
+};
+
+
+// types for which MatrixTest is instantiated
+// (dense matrices of int/long/float/double values with short/int/long indexes, on host and, with HAVE_CUDA, on CUDA)
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Host, short >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Host, int >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Host, long >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Host, long >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Host, long >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Host, long >
+#ifdef HAVE_CUDA
+    ,TNL::Matrices::DenseMatrix< int,    TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Cuda, short >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Cuda, int >,
+    TNL::Matrices::DenseMatrix< int,    TNL::Devices::Cuda, long >,
+    TNL::Matrices::DenseMatrix< long,   TNL::Devices::Cuda, long >,
+    TNL::Matrices::DenseMatrix< float,  TNL::Devices::Cuda, long >,
+    TNL::Matrices::DenseMatrix< double, TNL::Devices::Cuda, long >
+#endif
+>;
+
+
+TYPED_TEST_SUITE( MatrixTest, MatrixTypes);
+
+TYPED_TEST( MatrixTest, WrapMatrix )
+{
+   using DenseMatrix = typename TestFixture::MatrixType;
+   using RealType  = typename DenseMatrix::RealType;
+   using DeviceType  = typename DenseMatrix::DeviceType;
+   using IndexType  = typename DenseMatrix::IndexType;
+   using CSRMatrix = TNL::Matrices::SparseMatrix< RealType, DeviceType, IndexType,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >;
+   using EllpackMatrix = TNL::Matrices::SparseMatrix< RealType, DeviceType, IndexType,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >;
+
+   DenseMatrix denseMatrix{
+    { 1,  2,  0,  0 },
+    { 0,  6,  0,  0 },
+    { 9,  0,  0,  0 },
+    { 0,  0, 15, 16 } };
+   IndexType rows( 4 ), columns( 4 );
+   CSRMatrix csrMatrix;
+   EllpackMatrix ellpackMatrix;
+   csrMatrix = ellpackMatrix = denseMatrix;
+
+   auto denseMatrixValues  = denseMatrix.getValues().getData();
+
+   auto csrMatrixValues = csrMatrix.getValues().getData();
+   auto csrMatrixColumnIndexes = csrMatrix.getColumnIndexes().getData();
+   auto csrMatrixRowPointers = csrMatrix.getSegments().getOffsets().getData();
+
+   auto ellpackMatrixValues = ellpackMatrix.getValues().getData();
+   auto ellpackMatrixColumnIndexes = ellpackMatrix.getColumnIndexes().getData();
+
+   auto wrappedDenseMatrix   = TNL::Matrices::wrapDenseMatrix< DeviceType >( rows, columns, denseMatrixValues );
+   auto wrappedCSRMatrix     = TNL::Matrices::wrapCSRMatrix< DeviceType >( rows, columns, csrMatrixRowPointers, csrMatrixValues, csrMatrixColumnIndexes );
+   auto wrappedEllpackMatrix = TNL::Matrices::wrapEllpackMatrix< DeviceType >( rows, columns, ( IndexType ) 2, ellpackMatrixValues, ellpackMatrixColumnIndexes );
+
+   EXPECT_EQ( denseMatrix, wrappedDenseMatrix );
+   EXPECT_EQ( csrMatrix, wrappedCSRMatrix );
+   EXPECT_EQ( ellpackMatrix, wrappedEllpackMatrix );
+}
+
+
+#include "../main.h"
+
+#endif
-- 
GitLab


From 3190b11ca51a3b34020a4d3a29c14e0c0939c39e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 4 May 2021 14:41:31 +0200
Subject: [PATCH 048/117] Fixing EllpackView::getView method.

---
 src/TNL/Algorithms/Segments/EllpackView.h   | 1 -
 src/TNL/Algorithms/Segments/EllpackView.hpp | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h
index 6e4995e1d..b8066f635 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.h
+++ b/src/TNL/Algorithms/Segments/EllpackView.h
@@ -137,7 +137,6 @@ template< typename Device,
           int Alignment >
 std::ostream& operator<<( std::ostream& str, const EllpackView< Device, Index, Organization, Alignment >& ellpack ) { return printSegments( str, ellpack ); }
 
-
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index 18f1cde7b..7abf2caed 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -110,7 +110,7 @@ typename EllpackView< Device, Index, Organization, Alignment >::ViewType
 EllpackView< Device, Index, Organization, Alignment >::
 getView()
 {
-   return ViewType( segmentSize, segmentsCount, alignedSize );
+   return ViewType( segmentsCount, segmentSize, alignedSize );
 }
 
 template< typename Device,
-- 
GitLab


From debacf8f0d7c84e0db78464dfc1018b492640b75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 4 May 2021 14:42:32 +0200
Subject: [PATCH 049/117] Fixing move constructors of CSRView.

---
 src/TNL/Algorithms/Segments/CSRView.h   | 4 ++--
 src/TNL/Algorithms/Segments/CSRView.hpp | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index dee96ba5a..8770f8ca8 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -52,13 +52,13 @@ class CSRView
       CSRView( const OffsetsView& offsets, const KernelView& kernel );
 
       __cuda_callable__
-      CSRView( const OffsetsView&& offsets, const KernelView&& kernel );
+      CSRView( OffsetsView&& offsets, KernelView&& kernel );
 
       __cuda_callable__
       CSRView( const CSRView& csr_view );
 
       __cuda_callable__
-      CSRView( const CSRView&& csr_view );
+      CSRView( CSRView&& csr_view );
 
       static String getSerializationType();
 
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index b69e61e5a..4343e672b 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -46,8 +46,8 @@ template< typename Device,
           typename Kernel >
 __cuda_callable__
 CSRView< Device, Index, Kernel >::
-CSRView( const OffsetsView&& offsets_view,
-         const KernelView&& kernel_view )
+CSRView( OffsetsView&& offsets_view,
+         KernelView&& kernel_view )
    : offsets( std::move( offsets_view ) ), kernel( std::move( kernel_view ) )
 {
 }
@@ -67,7 +67,7 @@ template< typename Device,
           typename Kernel >
 __cuda_callable__
 CSRView< Device, Index, Kernel >::
-CSRView( const CSRView&& csr_view )
+CSRView( CSRView&& csr_view )
    : offsets( std::move( csr_view.offsets ) ), kernel( std::move( csr_view.kernel ) )
 {
 }
-- 
GitLab


From c67f1184ed8fc4108de991ae0aaa48243be7c089 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 4 May 2021 14:44:28 +0200
Subject: [PATCH 050/117] Fixed RealType definition in SparseMatrixView -
 removing const.

---
 src/TNL/Matrices/SparseMatrixView.h   |  4 ++--
 src/TNL/Matrices/SparseMatrixView.hpp | 12 +++++-------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index ba73ea651..29371c802 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -98,7 +98,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief The type of matrix elements.
        */
-      using RealType = Real;
+      using RealType = std::remove_const_t< Real >;
 
       using ComputeRealType = ComputeReal;
 
@@ -942,7 +942,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       }
 };
 
-} // namespace Conatiners
+   } // namespace Matrices
 } // namespace TNL
 
 #include <TNL/Matrices/SparseMatrixView.hpp>
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 72377f847..63220888a 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -80,8 +80,8 @@ getConstView() const -> ConstViewType
    return ConstViewType( this->getRows(),
                          this->getColumns(),
                          this->getValues().getConstView(),
-                         this->getColumnsIndexes().getConstView(),
-                         this->segments.getConstView() );
+                         this->getColumnIndexes().getConstView(),
+                         const_cast< SparseMatrixView* >( this )->segments.getView() );
 }
 
 template< typename Real,
@@ -862,14 +862,12 @@ SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >::
 operator==( const Matrix& m ) const
 {
    const auto& view1 = *this;
-   // FIXME: getConstView does not work
-   //const auto view2 = m.getConstView();
-   const auto view2 = m.getView();
+   const auto view2 = m.getConstView();
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> bool
    {
       return view1.getRow( i ) == view2.getRow( i );
    };
-   return Algorithms::reduce< DeviceType >( (IndexType) 0, this->getRows(), fetch, std::logical_and<>{}, true );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->getRows(), fetch, std::logical_and<>{}, true );
 }
 
 template< typename Real,
@@ -896,7 +894,7 @@ void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >::
 save( File& file ) const
 {
-   MatrixView< RealType, DeviceType, IndexType >::save( file );
+   MatrixView< Real, Device, Index >::save( file );
    file << this->columnIndexes;
    this->segments.save( file );
 }
-- 
GitLab


From db588b4f20027a3431a1847a58c36544458e3c43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 4 May 2021 14:44:50 +0200
Subject: [PATCH 051/117] Added more comparison operators for dense matrices.

---
 src/TNL/Matrices/DenseMatrix.h       | 67 +++++++++++++++++++++++-
 src/TNL/Matrices/DenseMatrix.hpp     | 76 +++++++++++++++++++++++++++-
 src/TNL/Matrices/DenseMatrixView.h   | 36 +++++++++++++
 src/TNL/Matrices/DenseMatrixView.hpp | 57 +++++++++++++++++++++
 4 files changed, 232 insertions(+), 4 deletions(-)

diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
index faf0d9649..3d0e2d62d 100644
--- a/src/TNL/Matrices/DenseMatrix.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -974,7 +974,7 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \return \e true if the RHS matrix is equal, \e false otherwise.
        */
       template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
-      bool operator==( const DenseMatrix< Real_, Device_, Index_, Organization >& matrix ) const;
+      bool operator==( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator_ >& matrix ) const;
 
       /**
        * \brief Comparison operator with another dense matrix.
@@ -983,7 +983,43 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \return \e false if the RHS matrix is equal, \e true otherwise.
        */
       template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
-      bool operator!=( const DenseMatrix< Real_, Device_, Index_, Organization >& matrix ) const;
+      bool operator!=( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator_ >& matrix ) const;
+
+      /**
+       * \brief Comparison operator with another dense matrix view.
+       *
+       * \param matrix is the right-hand side matrix view.
+       * \return \e true if the RHS matrix view is equal, \e false otherwise.
+       */
+      template< typename Real_, typename Device_, typename Index_ >
+      bool operator==( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const;
+
+      /**
+       * \brief Comparison operator with another dense matrix view.
+       *
+       * \param matrix is the right-hand side matrix view.
+       * \return \e false if the RHS matrix view is equal, \e true otherwise.
+       */
+      template< typename Real_, typename Device_, typename Index_ >
+      bool operator!=( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const;
+
+      /**
+       * \brief Comparison operator with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e true if the RHS matrix is equal, \e false otherwise.
+       */
+      template< typename Matrix >
+      bool operator==( const Matrix& m ) const;
+
+      /**
+       * \brief Comparison operator with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e false if the RHS matrix is equal, \e true otherwise.
+       */
+      template< typename Matrix >
+      bool operator!=( const Matrix& m ) const;
 
       /**
        * \brief Method for saving the matrix to the file with given filename.
@@ -1045,6 +1081,33 @@ template< typename Real,
           typename RealAllocator >
 std::ostream& operator<< ( std::ostream& str, const DenseMatrix< Real, Device, Index, Organization, RealAllocator >& matrix );
 
+/**
+ * \brief Comparison operator with another dense matrix view.
+ *
+ * \param leftMatrix is the left-hand side matrix view.
+ * \param rightMatrix is the right-hand side matrix.
+ * \return \e true if both matrices are equal, \e false otherwise.
+ */
+template< typename Real, typename Device, typename Index,
+          typename Real_, typename Device_, typename Index_,
+          ElementsOrganization Organization, typename RealAllocator >
+bool operator==( const DenseMatrixView< Real, Device, Index, Organization >& leftMatrix,
+                 const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator >& rightMatrix );
+
+/**
+ * \brief Comparison operator with another dense matrix view.
+ *
+ * \param leftMatrix is the left-hand side matrix view.
+ * \param rightMatrix is the right-hand side matrix.
+ * \return \e false if both matrices are equal, \e true otherwise.
+ */
+template< typename Real, typename Device, typename Index,
+          typename Real_, typename Device_, typename Index_,
+          ElementsOrganization Organization, typename RealAllocator >
+bool operator!=( const DenseMatrixView< Real, Device, Index, Organization >& leftMatrix,
+                 const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator >& rightMatrix );
+
+
 } // namespace Matrices
 } // namespace TNL
 
diff --git a/src/TNL/Matrices/DenseMatrix.hpp b/src/TNL/Matrices/DenseMatrix.hpp
index 46c85a373..e265f6f15 100644
--- a/src/TNL/Matrices/DenseMatrix.hpp
+++ b/src/TNL/Matrices/DenseMatrix.hpp
@@ -1284,7 +1284,7 @@ template< typename Real,
    template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
 bool
 DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
-operator==( const DenseMatrix< Real_, Device_, Index_, Organization >& matrix ) const
+operator==( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator_ >& matrix ) const
 {
    return( this->getRows() == matrix.getRows() &&
            this->getColumns() == matrix.getColumns() &&
@@ -1299,11 +1299,65 @@ template< typename Real,
    template< typename Real_, typename Device_, typename Index_, typename RealAllocator_ >
 bool
 DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
-operator!=( const DenseMatrix< Real_, Device_, Index_, Organization >& matrix ) const
+operator!=( const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator_ >& matrix ) const
 {
    return ! ( *this == matrix );
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization,
+          typename RealAllocator >
+   template< typename Real_, typename Device_, typename Index_ >
+bool
+DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
+operator==( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const
+{
+   return( this->getRows() == matrix.getRows() &&
+           this->getColumns() == matrix.getColumns() &&
+           this->getValues() == matrix.getValues() );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization,
+          typename RealAllocator >
+   template< typename Real_, typename Device_, typename Index_ >
+bool
+DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
+operator!=( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const
+{
+   return ! ( *this == matrix );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization,
+          typename RealAllocator >
+   template< typename Matrix >
+bool
+DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
+operator==( const Matrix& m ) const
+{
+   return ( this->view == m );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization,
+          typename RealAllocator >
+   template< typename Matrix >
+bool
+DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
+operator!=( const Matrix& m ) const
+{
+   return ( this->view != m );
+}
+
 template< typename Real,
           typename Device,
           typename Index,
@@ -1380,5 +1434,23 @@ std::ostream& operator<< ( std::ostream& str, const DenseMatrix< Real, Device, I
    return str;
 }
 
+template< typename Real, typename Device, typename Index,
+          typename Real_, typename Device_, typename Index_,
+          ElementsOrganization Organization, typename RealAllocator >
+bool operator==( const DenseMatrixView< Real, Device, Index, Organization >& leftMatrix,
+                 const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator >& rightMatrix )
+{
+   return rightMatrix == leftMatrix;
+}
+
+template< typename Real, typename Device, typename Index,
+          typename Real_, typename Device_, typename Index_,
+          ElementsOrganization Organization, typename RealAllocator >
+bool operator!=( const DenseMatrixView< Real, Device, Index, Organization >& leftMatrix,
+                 const DenseMatrix< Real_, Device_, Index_, Organization, RealAllocator >& rightMatrix )
+{
+   return rightMatrix != leftMatrix;
+}
+
 } // namespace Matrices
 } // namespace TNL
diff --git a/src/TNL/Matrices/DenseMatrixView.h b/src/TNL/Matrices/DenseMatrixView.h
index 6e181d698..89a2219b3 100644
--- a/src/TNL/Matrices/DenseMatrixView.h
+++ b/src/TNL/Matrices/DenseMatrixView.h
@@ -870,6 +870,42 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        */
       DenseMatrixView& operator=( const DenseMatrixView& matrix );
 
+      /**
+       * \brief Comparison operator with another dense matrix view.
+       *
+       * \param matrix is the right-hand side matrix view.
+       * \return \e true if the RHS matrix view is equal, \e false otherwise.
+       */
+      template< typename Real_, typename Device_, typename Index_ >
+      bool operator==( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const;
+
+      /**
+       * \brief Comparison operator with another dense matrix view.
+       *
+       * \param matrix is the right-hand side matrix view.
+       * \return \e false if the RHS matrix view is equal, \e true otherwise.
+       */
+      template< typename Real_, typename Device_, typename Index_ >
+      bool operator!=( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const;
+
+      /**
+       * \brief Comparison operator with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e true if the RHS matrix is equal, \e false otherwise.
+       */
+      template< typename Matrix >
+      bool operator==( const Matrix& m ) const;
+
+      /**
+       * \brief Comparison operator with another arbitrary matrix type.
+       *
+       * \param m is the right-hand side matrix.
+       * \return \e false if the RHS matrix is equal, \e true otherwise.
+       */
+      template< typename Matrix >
+      bool operator!=( const Matrix& m ) const;
+
       /**
        * \brief Method for saving the matrix view to the file with given filename.
        *
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 6371832ee..4faa25aa0 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -808,6 +808,63 @@ operator=( const DenseMatrixView& matrix )
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization >
+   template< typename Real_, typename Device_, typename Index_ >
+bool
+DenseMatrixView< Real, Device, Index, Organization >::
+operator==( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const
+{
+   return( this->getRows() == matrix.getRows() &&
+           this->getColumns() == matrix.getColumns() &&
+           this->getValues() == matrix.getValues() );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization >
+   template< typename Real_, typename Device_, typename Index_ >
+bool
+DenseMatrixView< Real, Device, Index, Organization >::
+operator!=( const DenseMatrixView< Real_, Device_, Index_, Organization >& matrix ) const
+{
+   return ! ( *this == matrix );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization >
+   template< typename Matrix >
+bool
+DenseMatrixView< Real, Device, Index, Organization >::
+operator==( const Matrix& m ) const
+{
+   const auto& view1 = *this;
+   const auto view2 = m.getConstView();
+   auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> bool
+   {
+      return view1.getRow( i ) == view2.getRow( i );
+   };
+   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->getRows(), fetch, std::logical_and<>{}, true );
+}
+
+template< typename Real,
+          typename Device,
+          typename Index,
+          ElementsOrganization Organization >
+   template< typename Matrix >
+bool
+DenseMatrixView< Real, Device, Index, Organization >::
+operator!=( const Matrix& m ) const
+{
+   return ! ( *this == m );
+}
+
+
 template< typename Real,
           typename Device,
           typename Index,
-- 
GitLab


From 2ea8dc337af6d84b1b62f72975ffcd8dde698cb2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 4 May 2021 17:21:50 +0200
Subject: [PATCH 052/117] Added SparseMatrixRowViewValueGetter.

---
 .../details/SparseMatrixRowViewValueGetter.h  | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 src/TNL/Matrices/details/SparseMatrixRowViewValueGetter.h

diff --git a/src/TNL/Matrices/details/SparseMatrixRowViewValueGetter.h b/src/TNL/Matrices/details/SparseMatrixRowViewValueGetter.h
new file mode 100644
index 000000000..d47a931d7
--- /dev/null
+++ b/src/TNL/Matrices/details/SparseMatrixRowViewValueGetter.h
@@ -0,0 +1,76 @@
+
+
+/***************************************************************************
+                          SparseMatrixRowViewValueGetter.h  -  description
+                             -------------------
+    begin                : May 4, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+namespace TNL {
+   namespace Matrices {
+      namespace details {
+
+
+template< typename SegmentView,
+          typename ValuesView,
+          typename ColumnsIndexesView,
+          bool isBinary_ >
+struct SparseMatrixRowViewValueGetter {};
+
+template< typename SegmentView,
+          typename ValuesView,
+          typename ColumnsIndexesView >
+struct SparseMatrixRowViewValueGetter< SegmentView, ValuesView, ColumnsIndexesView, true >
+{
+   using RealType = typename ValuesView::RealType;
+
+   using IndexType = typename ColumnsIndexesView::IndexType;
+
+   using ResultType = bool;
+
+   using ConstResultType = bool;
+
+   __cuda_callable__
+   static bool getValue( const IndexType& globalIdx, const ValuesView& values, const ColumnsIndexesView& columnIndexes, const IndexType& paddingIndex )
+   {
+      if( columnIndexes[ globalIdx ] != paddingIndex )
+         return true;
+      return false;
+   };
+};
+
+template< typename SegmentView,
+          typename ValuesView,
+          typename ColumnsIndexesView >
+struct SparseMatrixRowViewValueGetter< SegmentView, ValuesView, ColumnsIndexesView, false >
+{
+   using RealType = typename ValuesView::RealType;
+
+   using IndexType = typename ColumnsIndexesView::IndexType;
+
+   using ResultType = RealType&;
+
+   using ConstResultType = const RealType&;
+
+   __cuda_callable__
+   static const RealType& getValue( const IndexType& globalIdx, const ValuesView& values, const ColumnsIndexesView& columnIndexes, const IndexType& paddingIndex )
+   {
+      return values[ globalIdx ];
+   };
+
+   __cuda_callable__
+   static RealType& getValue( const IndexType& globalIdx, ValuesView& values, ColumnsIndexesView& columnIndexes, const IndexType& paddingIndex )
+   {
+      return values[ globalIdx ];
+   };
+};
+
+      } //namespace details
+   } //namespace Matrices
+} //namespace TNL
-- 
GitLab


From 01ce56d7ee611fd4e6422609f6770b9a6c6a164e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 4 May 2021 17:22:42 +0200
Subject: [PATCH 053/117] Refactoring CMakeLists for smart pointers examples.

---
 .../Examples/Pointers/CMakeLists.txt          | 35 ++++++++++++-------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/Documentation/Examples/Pointers/CMakeLists.txt b/Documentation/Examples/Pointers/CMakeLists.txt
index ef7a5f615..2b08ac329 100644
--- a/Documentation/Examples/Pointers/CMakeLists.txt
+++ b/Documentation/Examples/Pointers/CMakeLists.txt
@@ -1,15 +1,26 @@
-IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE(UniquePointerExampleCuda UniquePointerExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
-   CUDA_ADD_EXECUTABLE(SharedPointerExampleCuda SharedPointerExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND SharedPointerExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SharedPointerExample.out OUTPUT SharedPointerExample.out )
-   CUDA_ADD_EXECUTABLE(DevicePointerExampleCuda DevicePointerExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND DevicePointerExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DevicePointerExample.out OUTPUT DevicePointerExample.out )
+set( COMMON_EXAMPLES
+   UniquePointerExample
+   SharedPointerExample
+   DevicePointerExample
+)
 
-ADD_CUSTOM_TARGET( RunPointersExamples ALL DEPENDS
-   UniquePointerExample.out
-   SharedPointerExample.out
-   DevicePointerExample.out
- )
+if( BUILD_CUDA )
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
+   endforeach()
+else()
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      add_executable( ${target} ${target}.cpp )
+      add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )
+   endforeach()
+endif()
 
+IF( BUILD_CUDA )
+   ADD_CUSTOM_TARGET( RunPointersExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
+ELSE()
+   ADD_CUSTOM_TARGET( RunPointersExamples ALL DEPENDS ${HOST_OUTPUTS} )
 ENDIF()
+
-- 
GitLab


From 362ecdd4fa3bc17a4ad339b4c1024953084178ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 4 May 2021 17:23:12 +0200
Subject: [PATCH 054/117] Fixing matrix elements organization for ellpack
 wrapping.

---
 Documentation/Examples/CMakeLists.txt         | 112 ++++++------------
 .../SparseMatrixViewExample_wrapEllpack.cpp   |   2 +-
 .../Tutorials/Matrices/tutorial_Matrices.md   |   2 +-
 src/TNL/Matrices/MatrixWrapping.h             |   8 +-
 src/TNL/Matrices/SparseMatrix.hpp             |   2 +-
 src/TNL/Matrices/SparseMatrixRowView.h        |  10 +-
 src/TNL/Matrices/SparseMatrixRowView.hpp      |  18 ++-
 src/UnitTests/Matrices/MatrixWrappingTest.h   |   4 +-
 8 files changed, 66 insertions(+), 92 deletions(-)

diff --git a/Documentation/Examples/CMakeLists.txt b/Documentation/Examples/CMakeLists.txt
index 29ba5a5df..e984d2f1f 100644
--- a/Documentation/Examples/CMakeLists.txt
+++ b/Documentation/Examples/CMakeLists.txt
@@ -3,80 +3,42 @@ ADD_SUBDIRECTORY( Containers )
 ADD_SUBDIRECTORY( Pointers )
 ADD_SUBDIRECTORY( Matrices )
 
-ADD_EXECUTABLE( FileExample FileExample.cpp )
-ADD_CUSTOM_COMMAND( COMMAND FileExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileExample.out OUTPUT FileExample.out )
-
-IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE(FileExampleCuda FileExampleCuda.cu)
-   ADD_CUSTOM_COMMAND( COMMAND FileExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileExampleCuda.out OUTPUT FileExampleCuda.out )
-ENDIF()
-
-ADD_EXECUTABLE( FileExampleSaveAndLoad FileExampleSaveAndLoad.cpp )
-ADD_CUSTOM_COMMAND( COMMAND FileExampleSaveAndLoad > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileExampleSaveAndLoad.out OUTPUT FileExampleSaveAndLoad.out )
-
-ADD_EXECUTABLE( FileNameExample FileNameExample.cpp )
-ADD_CUSTOM_COMMAND( COMMAND FileNameExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileNameExample.out OUTPUT FileNameExample.out )
-
-ADD_EXECUTABLE( FileNameExampleDistributedSystemNodeCoordinates FileNameExampleDistributedSystemNodeCoordinates.cpp )
-ADD_CUSTOM_COMMAND( COMMAND FileNameExampleDistributedSystemNodeCoordinates > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileNameExampleDistributedSystemNodeCoordinates.out OUTPUT FileNameExampleDistributedSystemNodeCoordinates.out )
-
-
-ADD_EXECUTABLE( FileNameExampleDistributedSystemNodeId FileNameExampleDistributedSystemNodeId.cpp )
-ADD_CUSTOM_COMMAND( COMMAND FileNameExampleDistributedSystemNodeId > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileNameExampleDistributedSystemNodeId.out OUTPUT FileNameExampleDistributedSystemNodeId.out )
-
-ADD_EXECUTABLE( ObjectExample_getType ObjectExample_getType.cpp )
-ADD_CUSTOM_COMMAND( COMMAND ObjectExample_getType > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ObjectExample_getType.out OUTPUT ObjectExample_getType.out )
-
-ADD_EXECUTABLE( ParameterContainerExample ParameterContainerExample.cpp )
-ADD_EXECUTABLE( ConfigDescriptionExample ConfigDescriptionExample.cpp )
-ADD_EXECUTABLE( LoggerExample LoggerExample.cpp )
-ADD_EXECUTABLE( MathExample MathExample.cpp )
-
-ADD_EXECUTABLE( ParseObjectTypeExample ParseObjectTypeExample.cpp )
-ADD_CUSTOM_COMMAND( COMMAND ParseObjectTypeExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParseObjectTypeExample.out OUTPUT ParseObjectTypeExample.out )
-
-ADD_EXECUTABLE( StringExample StringExample.cpp )
-ADD_CUSTOM_COMMAND( COMMAND StringExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StringExample.out OUTPUT StringExample.out )
-
-ADD_EXECUTABLE( StringExampleGetAllocatedSize StringExampleGetAllocatedSize.cpp )
-ADD_CUSTOM_COMMAND( COMMAND StringExampleGetAllocatedSize > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StringExampleGetAllocatedSize.out OUTPUT StringExampleGetAllocatedSize.out )
-
-ADD_EXECUTABLE( StringExampleReplace StringExampleReplace.cpp )
-ADD_CUSTOM_COMMAND( COMMAND StringExampleReplace > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StringExampleReplace.out OUTPUT StringExampleReplace.out )
-
-ADD_EXECUTABLE( StringExampleSetSize StringExampleSetSize.cpp )
-ADD_CUSTOM_COMMAND( COMMAND StringExampleSetSize > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StringExampleSetSize.out OUTPUT StringExampleSetSize.out )
-
-ADD_EXECUTABLE( StringExampleSplit StringExampleSplit.cpp )
-ADD_CUSTOM_COMMAND( COMMAND StringExampleSplit > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StringExampleSplit.out OUTPUT StringExampleSplit.out )
-
-ADD_EXECUTABLE( StringExampleStrip StringExampleStrip.cpp )
-ADD_CUSTOM_COMMAND( COMMAND StringExampleStrip > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StringExampleStrip.out OUTPUT StringExampleStrip.out )
-
-ADD_EXECUTABLE( TimerExample TimerExample.cpp )
-ADD_CUSTOM_COMMAND( COMMAND TimerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/TimerExample.out OUTPUT TimerExample.out )
-
-ADD_EXECUTABLE( TimerExampleLogger TimerExampleLogger.cpp )
-ADD_CUSTOM_COMMAND( COMMAND TimerExampleLogger > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/TimerExampleLogger.out OUTPUT TimerExampleLogger.out )
-
-
-ADD_CUSTOM_TARGET( RunExamples ALL DEPENDS
-   FileExample.out
-   FileExampleSaveAndLoad.out
-   FileNameExample.out
-   FileNameExampleDistributedSystemNodeCoordinates.out
-   FileNameExampleDistributedSystemNodeId.out
-   ObjectExample_getType.out
-   ParseObjectTypeExample.out
-   StringExample.out
-   StringExampleGetAllocatedSize.out
-   StringExampleReplace.out
-   StringExampleSplit.out
-   StringExampleStrip.out
-   TimerExample.out
-   TimerExampleLogger.out )
+set( COMMON_EXAMPLES
+   FileExampleCuda
+)
+
+set( HOST_EXAMPLES
+   FileExample
+   FileExampleSaveAndLoad
+   FileNameExample
+   FileNameExampleDistributedSystemNodeCoordinates
+   FileNameExampleDistributedSystemNodeId
+   ObjectExample_getType
+   ParseObjectTypeExample
+   StringExample
+   StringExampleGetAllocatedSize
+   StringExampleReplace
+   StringExampleSplit
+   StringExampleStrip
+   TimerExample
+   TimerExampleLogger )
 
 if( BUILD_CUDA )
-   ADD_CUSTOM_TARGET( RunExamples-cuda ALL DEPENDS
-      FileExampleCuda.out )
-ENDIF()
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
+   endforeach()
+else()
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} ${HOST_EXAMPLES})
+      add_executable( ${target} ${target}.cpp )
+      add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )
+   endforeach()
+endif()
+
+IF( BUILD_CUDA )
+   ADD_CUSTOM_TARGET( RunExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
+ELSE()
+   ADD_CUSTOM_TARGET( RunExamples ALL DEPENDS ${HOST_OUTPUTS} )
+ENDIF()
\ No newline at end of file
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cpp
index 9f36df57e..67df09891 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cpp
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_wrapEllpack.cpp
@@ -26,7 +26,7 @@ void wrapMatrixView()
    /***
     * Wrap the arrays `values` and `columnIndexes` to sparse matrix view
     */
-   auto matrix = TNL::Matrices::wrapEllpackMatrix< Device >( rows, columns, 2, values, columnIndexes );
+   auto matrix = TNL::Matrices::wrapEllpackMatrix< Device, TNL::Algorithms::Segments::RowMajorOrder >( rows, columns, 2, values, columnIndexes );
 
    std::cout << "Matrix reads as: " << std::endl << matrix << std::endl;
 }
diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md
index 1014c9103..efcf7a4bf 100644
--- a/Documentation/Tutorials/Matrices/tutorial_Matrices.md
+++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md
@@ -677,7 +677,7 @@ Wrapping data corresponding with the Ellpack format is very similar as we can se
 
 \includelineno SparseMatrixViewExample_wrapEllpack.cpp
 
-We encode the same sparse matrix as in the previous example. The essence of the Ellpack format is that we allocate the same number of matrix elements for each row which is two in our example. For some matrix rows we use the padding zeros for which we set the column index to -1 (line 21). Therefore the size of `valuesVector` and `columnIndexesVector` equals number of matrix rows times number of matrix elements allocated in each row. As before, we turn the vectors into C style pointers (lines 23-24) and wrap them into sparse matrix view with Ellpack format (line 29). Note that we must state the device on which the arrays are allocated explicitly. The result looks as follows:
+We encode the same sparse matrix as in the previous example. The essence of the Ellpack format is that we allocate the same number of matrix elements for each row, which is two in our example. For some matrix rows we use padding zeros for which we set the column index to -1 (line 21). Therefore the size of `valuesVector` and `columnIndexesVector` equals the number of matrix rows times the number of matrix elements allocated in each row. As before, we turn the vectors into C-style pointers (lines 23-24) and wrap them into a sparse matrix view with Ellpack format (line 29). Note that we must explicitly state both the device on which the arrays are allocated and the matrix elements organization, which is \ref TNL::Algorithms::Segments::RowMajorOrder in this case. For an Ellpack matrix stored on the GPU, \ref TNL::Algorithms::Segments::ColumnMajorOrder is preferred. The result looks as follows:
 
 \include SparseMatrixViewExample_wrapEllpack.out
 
diff --git a/src/TNL/Matrices/MatrixWrapping.h b/src/TNL/Matrices/MatrixWrapping.h
index fa91de831..6c0c6bb8c 100644
--- a/src/TNL/Matrices/MatrixWrapping.h
+++ b/src/TNL/Matrices/MatrixWrapping.h
@@ -94,9 +94,9 @@ wrapCSRMatrix( const Index& rows, const Index& columns, Index* rowPointers, Real
 /// This is to prevent from appearing in Doxygen documentation.
 /// \cond HIDDEN_CLASS
 template< typename Device,
+          ElementsOrganization Organization,
           typename Real,
           typename Index,
-          ElementsOrganization Organization = Algorithms::Segments::DefaultElementsOrganization< Device >::getOrganization(),
           int Alignment = 1 >
 struct EllpackMatrixWrapper
 {
@@ -140,15 +140,15 @@ struct EllpackMatrixWrapper
  * \include SparseMatrixViewExample_wrapEllpack.out
  */
 template< typename Device,
+          ElementsOrganization Organization,
           typename Real,
           typename Index,
-          ElementsOrganization Organization = Algorithms::Segments::DefaultElementsOrganization< Device >::getOrganization(),
           int Alignment = 1 >
 auto
 wrapEllpackMatrix( const Index rows, const Index columns, const Index nonzerosPerRow, Real* values, Index* columnIndexes )
--> decltype( EllpackMatrixWrapper< Device, Real, Index, Organization, Alignment >::wrap( rows, columns, nonzerosPerRow, values, columnIndexes ) )
+-> decltype( EllpackMatrixWrapper< Device, Organization, Real, Index, Alignment >::wrap( rows, columns, nonzerosPerRow, values, columnIndexes ) )
 {
-   return EllpackMatrixWrapper< Device, Real, Index, Organization, Alignment >::wrap( rows, columns, nonzerosPerRow, values, columnIndexes );
+   return EllpackMatrixWrapper< Device, Organization, Real, Index, Alignment >::wrap( rows, columns, nonzerosPerRow, values, columnIndexes );
 }
 
    } //namespace Matrices
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index b58620c2d..ac8688425 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -165,7 +165,7 @@ getConstView() const -> ConstViewType
                          this->getColumns(),
                          this->getValues().getConstView(),
                          this->columnIndexes.getConstView(),
-                         this->segments.getConstView() );
+                         const_cast< SparseMatrix* >( this )->segments.getView() );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SparseMatrixRowView.h b/src/TNL/Matrices/SparseMatrixRowView.h
index 4976a420e..03a90af7a 100644
--- a/src/TNL/Matrices/SparseMatrixRowView.h
+++ b/src/TNL/Matrices/SparseMatrixRowView.h
@@ -14,6 +14,7 @@
 
 #include <TNL/Cuda/CudaCallable.h>
 #include <TNL/Matrices/MatrixRowViewIterator.h>
+#include <TNL/Matrices/details/SparseMatrixRowViewValueGetter.h>
 
 namespace TNL {
 namespace Matrices {
@@ -101,6 +102,8 @@ class SparseMatrixRowView
        */
       using IteratorType = MatrixRowViewIterator< RowView >;
 
+      using ValueGetterType = details::SparseMatrixRowViewValueGetter< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >;
+
       /**
        * \brief Tells whether the parent matrix is a binary matrix.
        * @return `true` if the matrix is binary.
@@ -163,7 +166,7 @@ class SparseMatrixRowView
        * \return constant reference to the matrix element value.
        */
       __cuda_callable__
-      const RealType& getValue( const IndexType localIdx ) const;
+      auto getValue( const IndexType localIdx ) const -> typename ValueGetterType::ConstResultType;
 
       /**
        * \brief Returns non-constants reference to value of an element with given rank in the row.
@@ -173,7 +176,7 @@ class SparseMatrixRowView
        * \return non-constant reference to the matrix element value.
        */
       __cuda_callable__
-      RealType& getValue( const IndexType localIdx );
+      auto getValue( const IndexType localIdx ) -> typename ValueGetterType::ResultType;
 
       /**
        * \brief Sets a value of matrix element with given rank in the matrix row.
@@ -254,6 +257,9 @@ class SparseMatrixRowView
       __cuda_callable__
       const IteratorType cend() const;
 
+      __cuda_callable__
+      IndexType getPaddingIndex() const { return -1; };
+
    protected:
 
       SegmentViewType segmentView;
diff --git a/src/TNL/Matrices/SparseMatrixRowView.hpp b/src/TNL/Matrices/SparseMatrixRowView.hpp
index 82ae9b870..75cba117a 100644
--- a/src/TNL/Matrices/SparseMatrixRowView.hpp
+++ b/src/TNL/Matrices/SparseMatrixRowView.hpp
@@ -82,11 +82,14 @@ template< typename SegmentView,
           bool isBinary_ >
 __cuda_callable__ auto
 SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
-getValue( const IndexType localIdx ) const -> const RealType&
+getValue( const IndexType localIdx ) const -> typename ValueGetterType::ConstResultType
 {
    TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
-   TNL_ASSERT_FALSE( isBinary(), "Cannot call this method for binary matrix row." );
-   return values[ segmentView.getGlobalIndex( localIdx ) ];
+   //TNL_ASSERT_FALSE( isBinary(), "Cannot call this method for binary matrix row." );
+   return ValueGetterType::getValue( segmentView.getGlobalIndex( localIdx ),
+                                     values,
+                                     columnIndexes,
+                                     this->getPaddingIndex() );
 }
 
 template< typename SegmentView,
@@ -95,11 +98,14 @@ template< typename SegmentView,
           bool isBinary_ >
 __cuda_callable__ auto
 SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
-getValue( const IndexType localIdx ) -> RealType&
+getValue( const IndexType localIdx ) -> typename ValueGetterType::ResultType
 {
    TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
-   TNL_ASSERT_FALSE( isBinary(), "Cannot call this method for binary matrix row." );
-   return values[ segmentView.getGlobalIndex( localIdx ) ];
+   //TNL_ASSERT_FALSE( isBinary(), "Cannot call this method for binary matrix row." );
+   return ValueGetterType::getValue( segmentView.getGlobalIndex( localIdx ),
+                                     values,
+                                     columnIndexes,
+                                     this->getPaddingIndex() );
 }
 
 template< typename SegmentView,
diff --git a/src/UnitTests/Matrices/MatrixWrappingTest.h b/src/UnitTests/Matrices/MatrixWrappingTest.h
index af3fb00b3..9da8421d5 100644
--- a/src/UnitTests/Matrices/MatrixWrappingTest.h
+++ b/src/UnitTests/Matrices/MatrixWrappingTest.h
@@ -77,7 +77,7 @@ TYPED_TEST( MatrixTest, WrapMatrix )
    using DeviceType  = typename DenseMatrix::DeviceType;
    using IndexType  = typename DenseMatrix::IndexType;
    using CSRMatrix = TNL::Matrices::SparseMatrix< RealType, DeviceType, IndexType,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >;
-   using EllpackMatrix = TNL::Matrices::SparseMatrix< RealType, DeviceType, IndexType,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >;
+   using EllpackMatrix = TNL::Matrices::SparseMatrix< RealType, DeviceType, IndexType, TNL::Matrices::GeneralMatrix, RowMajorEllpack >;
 
    DenseMatrix denseMatrix{
     { 1,  2,  0,  0 },
@@ -100,7 +100,7 @@ TYPED_TEST( MatrixTest, WrapMatrix )
 
    auto wrappedDenseMatrix   = TNL::Matrices::wrapDenseMatrix< DeviceType >( rows, columns, denseMatrixValues );
    auto wrappedCSRMatrix     = TNL::Matrices::wrapCSRMatrix< DeviceType >( rows, columns, csrMatrixRowPointers, csrMatrixValues, csrMatrixColumnIndexes );
-   auto wrappedEllpackMatrix = TNL::Matrices::wrapEllpackMatrix< DeviceType >( rows, columns, ( IndexType ) 2, ellpackMatrixValues, ellpackMatrixColumnIndexes );
+   auto wrappedEllpackMatrix = TNL::Matrices::wrapEllpackMatrix< DeviceType, TNL::Algorithms::Segments::RowMajorOrder >( rows, columns, ( IndexType ) 2, ellpackMatrixValues, ellpackMatrixColumnIndexes );
 
    EXPECT_EQ( denseMatrix, wrappedDenseMatrix );
    EXPECT_EQ( csrMatrix, wrappedCSRMatrix );
-- 
GitLab


From 0e118aae4a1c2b8aa1a1a65e7184bf988d1b4889 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 4 May 2021 19:23:59 +0200
Subject: [PATCH 055/117] Fixing SparseMatrixRowViewValueGetter for binary
 matrices.

---
 src/TNL/Matrices/SparseMatrixElement.h        | 10 +-
 src/TNL/Matrices/SparseMatrixRowView.h        | 32 +++----
 src/TNL/Matrices/SparseMatrixRowView.hpp      | 93 ++++++++-----------
 src/TNL/Matrices/SparseMatrixView.h           |  4 +-
 .../details/SparseMatrixRowViewValueGetter.h  | 13 ++-
 5 files changed, 69 insertions(+), 83 deletions(-)

diff --git a/src/TNL/Matrices/SparseMatrixElement.h b/src/TNL/Matrices/SparseMatrixElement.h
index 39c7b61b4..3dcb74379 100644
--- a/src/TNL/Matrices/SparseMatrixElement.h
+++ b/src/TNL/Matrices/SparseMatrixElement.h
@@ -25,12 +25,18 @@ namespace Matrices {
  * \tparam Index is a type of matrix elements column indexes.
  */
 template< typename Real,
-          typename Index,
-          bool isBinary_ = false >
+          typename Index >
 class SparseMatrixElement
 {
    public:
 
+      /**
+       * \brief Test of binary matrix type.
+       *
+       * \return \e true if the matrix is stored as binary and \e false otherwise.
+       */
+      static constexpr bool isBinary() { return std::is_same< std::remove_const_t< Real >, bool >::value; };
+
       /**
        * \brief Type of matrix elements values.
        */
diff --git a/src/TNL/Matrices/SparseMatrixRowView.h b/src/TNL/Matrices/SparseMatrixRowView.h
index 03a90af7a..10236d94d 100644
--- a/src/TNL/Matrices/SparseMatrixRowView.h
+++ b/src/TNL/Matrices/SparseMatrixRowView.h
@@ -25,7 +25,6 @@ namespace Matrices {
  * \tparam SegmentView is a segment view of segments representing the matrix format.
  * \tparam ValuesView is a vector view storing the matrix elements values.
  * \tparam ColumnsIndexesView is a vector view storing the column indexes of the matrix element.
- * \tparam isBinary tells if the the parent matrix is a binary matrix.
  *
  * See \ref SparseMatrix and \ref SparseMatrixView.
  *
@@ -41,12 +40,17 @@ namespace Matrices {
  */
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 class SparseMatrixRowView
 {
    public:
 
+      /**
+       * \brief Tells whether the parent matrix is a binary matrix.
+       * @return `true` if the matrix is binary.
+       */
+      static constexpr bool isBinary() { return std::is_same< std::remove_const_t< RealType >, bool >::value; };
+
       /**
        * \brief The type of matrix elements.
        */
@@ -85,12 +89,12 @@ class SparseMatrixRowView
       /**
        * \brief Type of sparse matrix row view.
        */
-      using RowView = SparseMatrixRowView< SegmentView, ValuesViewType, ColumnsIndexesViewType, isBinary_ >;
+      using RowView = SparseMatrixRowView< SegmentView, ValuesViewType, ColumnsIndexesViewType >;
 
       /**
        * \brief Type of constant sparse matrix row view.
        */
-      using ConstView = SparseMatrixRowView< SegmentView, ConstValuesViewType, ConstColumnsIndexesViewType, isBinary_ >;
+      using ConstView = SparseMatrixRowView< SegmentView, ConstValuesViewType, ConstColumnsIndexesViewType >;
 
       /**
        * \brief The type of related matrix element.
@@ -102,13 +106,7 @@ class SparseMatrixRowView
        */
       using IteratorType = MatrixRowViewIterator< RowView >;
 
-      using ValueGetterType = details::SparseMatrixRowViewValueGetter< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >;
-
-      /**
-       * \brief Tells whether the parent matrix is a binary matrix.
-       * @return `true` if the matrix is binary.
-       */
-      static constexpr bool isBinary() { return isBinary_; };
+      using ValueGetterType = details::SparseMatrixRowViewValueGetter< SegmentView, ValuesView, ColumnsIndexesView >;
 
       /**
        * \brief Constructor with \e segmentView, \e values and \e columnIndexes.
@@ -220,10 +218,9 @@ class SparseMatrixRowView
        */
       template< typename _SegmentView,
                 typename _ValuesView,
-                typename _ColumnsIndexesView,
-                bool _isBinary >
+                typename _ColumnsIndexesView >
       __cuda_callable__
-      bool operator==( const SparseMatrixRowView< _SegmentView, _ValuesView, _ColumnsIndexesView, _isBinary >& other ) const;
+      bool operator==( const SparseMatrixRowView< _SegmentView, _ValuesView, _ColumnsIndexesView >& other ) const;
 
       /**
        * \brief Returns iterator pointing at the beginning of the matrix row.
@@ -278,9 +275,8 @@ class SparseMatrixRowView
  */
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
-std::ostream& operator<<( std::ostream& str, const SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >& row );
+          typename ColumnsIndexesView >
+std::ostream& operator<<( std::ostream& str, const SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >& row );
 
 } // namespace Matrices
 } // namespace TNL
diff --git a/src/TNL/Matrices/SparseMatrixRowView.hpp b/src/TNL/Matrices/SparseMatrixRowView.hpp
index 75cba117a..2f14774df 100644
--- a/src/TNL/Matrices/SparseMatrixRowView.hpp
+++ b/src/TNL/Matrices/SparseMatrixRowView.hpp
@@ -18,10 +18,9 @@ namespace Matrices {
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 SparseMatrixRowView( const SegmentViewType& segmentView,
                      const ValuesViewType& values,
                      const ColumnsIndexesViewType& columnIndexes )
@@ -31,10 +30,9 @@ SparseMatrixRowView( const SegmentViewType& segmentView,
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 getSize() const -> IndexType
 {
    return segmentView.getSize();
@@ -42,11 +40,10 @@ getSize() const -> IndexType
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__
 auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 getRowIndex() const -> const IndexType&
 {
    return segmentView.getSegmentIndex();
@@ -54,10 +51,9 @@ getRowIndex() const -> const IndexType&
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 getColumnIndex( const IndexType localIdx ) const -> const IndexType&
 {
    TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
@@ -66,10 +62,9 @@ getColumnIndex( const IndexType localIdx ) const -> const IndexType&
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 getColumnIndex( const IndexType localIdx ) -> IndexType&
 {
    TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
@@ -78,14 +73,12 @@ getColumnIndex( const IndexType localIdx ) -> IndexType&
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 getValue( const IndexType localIdx ) const -> typename ValueGetterType::ConstResultType
 {
    TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
-   //TNL_ASSERT_FALSE( isBinary(), "Cannot call this method for binary matrix row." );
    return ValueGetterType::getValue( segmentView.getGlobalIndex( localIdx ),
                                      values,
                                      columnIndexes,
@@ -94,14 +87,12 @@ getValue( const IndexType localIdx ) const -> typename ValueGetterType::ConstRes
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 getValue( const IndexType localIdx ) -> typename ValueGetterType::ResultType
 {
    TNL_ASSERT_LT( localIdx, this->getSize(), "Local index exceeds matrix row capacity." );
-   //TNL_ASSERT_FALSE( isBinary(), "Cannot call this method for binary matrix row." );
    return ValueGetterType::getValue( segmentView.getGlobalIndex( localIdx ),
                                      values,
                                      columnIndexes,
@@ -110,10 +101,9 @@ getValue( const IndexType localIdx ) -> typename ValueGetterType::ResultType
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ void
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 setValue( const IndexType localIdx,
           const RealType& value )
 {
@@ -126,10 +116,9 @@ setValue( const IndexType localIdx,
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ void
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 setColumnIndex( const IndexType localIdx,
                 const IndexType& columnIndex )
 {
@@ -140,10 +129,9 @@ setColumnIndex( const IndexType localIdx,
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ void
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 setElement( const IndexType localIdx,
             const IndexType column,
             const RealType& value )
@@ -157,22 +145,20 @@ setElement( const IndexType localIdx,
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
    template< typename _SegmentView,
              typename _ValuesView,
-             typename _ColumnsIndexesView,
-             bool _isBinary >
+             typename _ColumnsIndexesView >
 __cuda_callable__
 bool
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
-operator==( const SparseMatrixRowView< _SegmentView, _ValuesView, _ColumnsIndexesView, _isBinary >& other ) const
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
+operator==( const SparseMatrixRowView< _SegmentView, _ValuesView, _ColumnsIndexesView >& other ) const
 {
    IndexType i = 0;
    while( i < getSize() && i < other.getSize() ) {
       if( getColumnIndex( i ) != other.getColumnIndex( i ) )
          return false;
-      if( ! _isBinary && getValue( i ) != other.getValue( i ) )
+      if( ! isBinary() && getValue( i ) != other.getValue( i ) )
          return false;
       ++i;
    }
@@ -189,10 +175,9 @@ operator==( const SparseMatrixRowView< _SegmentView, _ValuesView, _ColumnsIndexe
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 begin() -> IteratorType
 {
    return IteratorType( *this, 0 );
@@ -200,10 +185,9 @@ begin() -> IteratorType
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 end() -> IteratorType
 {
    return IteratorType( *this, this->getSize() );
@@ -211,10 +195,9 @@ end() -> IteratorType
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 cbegin() const -> const IteratorType
 {
    return IteratorType( *this, 0 );
@@ -222,10 +205,9 @@ cbegin() const -> const IteratorType
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename ColumnsIndexesView >
 __cuda_callable__ auto
-SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::
+SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::
 cend() const -> const IteratorType
 {
    return IteratorType( *this, this->getSize() );
@@ -233,13 +215,12 @@ cend() const -> const IteratorType
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView,
-          bool isBinary_ >
-std::ostream& operator<<( std::ostream& str, const SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >& row )
+          typename ColumnsIndexesView >
+std::ostream& operator<<( std::ostream& str, const SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >& row )
 {
-   using NonConstIndex = std::remove_const_t< typename SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView, isBinary_ >::IndexType >;
+   using NonConstIndex = std::remove_const_t< typename SparseMatrixRowView< SegmentView, ValuesView, ColumnsIndexesView >::IndexType >;
    for( NonConstIndex i = 0; i < row.getSize(); i++ )
-      if( isBinary_ )
+      if( row.isBinary() )
          // TODO: check getPaddingIndex(), print only the column indices of non-zeros but not the values
          str << " [ " << row.getColumnIndex( i ) << " ] = " << (row.getColumnIndex( i ) >= 0) << ", ";
       else
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index 29371c802..c59a79690 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -136,12 +136,12 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
       /**
        * \brief Type for accessing matrix rows.
        */
-      using RowView = SparseMatrixRowView< typename SegmentsViewType::SegmentViewType, ValuesViewType, ColumnsIndexesViewType, isBinary() >;
+      using RowView = SparseMatrixRowView< typename SegmentsViewType::SegmentViewType, ValuesViewType, ColumnsIndexesViewType >;
 
       /**
        * \brief Type for accessing constant matrix rows.
        */
-      using ConstRowView = SparseMatrixRowView< typename SegmentsViewType::SegmentViewType, ConstValuesViewType, ConstColumnsIndexesViewType, isBinary() >;;
+      using ConstRowView = SparseMatrixRowView< typename SegmentsViewType::SegmentViewType, ConstValuesViewType, ConstColumnsIndexesViewType >;;
 
       /**
        * \brief Helper type for getting self type or its modifications.
diff --git a/src/TNL/Matrices/details/SparseMatrixRowViewValueGetter.h b/src/TNL/Matrices/details/SparseMatrixRowViewValueGetter.h
index d47a931d7..ce696e56e 100644
--- a/src/TNL/Matrices/details/SparseMatrixRowViewValueGetter.h
+++ b/src/TNL/Matrices/details/SparseMatrixRowViewValueGetter.h
@@ -20,13 +20,15 @@ namespace TNL {
 template< typename SegmentView,
           typename ValuesView,
           typename ColumnsIndexesView,
-          bool isBinary_ >
+          typename Real = std::remove_const_t<typename ValuesView::RealType >,
+          bool isBinary_ = std::is_same< std::remove_const_t<typename ValuesView::RealType >, bool >::value >
 struct SparseMatrixRowViewValueGetter {};
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView >
-struct SparseMatrixRowViewValueGetter< SegmentView, ValuesView, ColumnsIndexesView, true >
+          typename ColumnsIndexesView,
+          typename Real >
+struct SparseMatrixRowViewValueGetter< SegmentView, ValuesView, ColumnsIndexesView, Real, true >
 {
    using RealType = typename ValuesView::RealType;
 
@@ -47,8 +49,9 @@ struct SparseMatrixRowViewValueGetter< SegmentView, ValuesView, ColumnsIndexesVi
 
 template< typename SegmentView,
           typename ValuesView,
-          typename ColumnsIndexesView >
-struct SparseMatrixRowViewValueGetter< SegmentView, ValuesView, ColumnsIndexesView, false >
+          typename ColumnsIndexesView,
+          typename Real >
+struct SparseMatrixRowViewValueGetter< SegmentView, ValuesView, ColumnsIndexesView, Real, false >
 {
    using RealType = typename ValuesView::RealType;
 
-- 
GitLab


From 527ad5cc685bffd492e0b9e728ba21c4fc0fb4c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 4 May 2021 21:02:55 +0200
Subject: [PATCH 056/117] Fixed method forElements in SparseMatrixView for the
 case when segments manage more elements than matrix columns.

---
 src/TNL/Matrices/SparseMatrixView.hpp         | 30 ++++++++------
 .../SparseMatrixVectorProductTest.hpp         | 39 +++++++------------
 2 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 63220888a..2d9a06a8c 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -594,12 +594,16 @@ forElements( IndexType begin, IndexType end, Function& function ) const
    const auto columns_view = this->columnIndexes.getConstView();
    const auto values_view = this->values.getConstView();
    //const IndexType paddingIndex_ = this->getPaddingIndex();
-   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx ) mutable -> bool {
-      if( isBinary() )
-         function( rowIdx, localIdx, columns_view[ globalIdx ], 1 );
-      else
-         function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ] );
-      return true;
+   auto columns = this->getColumns();
+   auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx ) mutable {
+      if( localIdx < columns )
+      {
+         if( isBinary() )
+            function( rowIdx, localIdx, columns_view[ globalIdx ], 1 );
+         else
+            function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ] );
+      }
+      //return true;
    };
    this->segments.forElements( begin, end, f );
 }
@@ -618,14 +622,18 @@ forElements( IndexType begin, IndexType end, Function& function )
    auto columns_view = this->columnIndexes.getView();
    auto values_view = this->values.getView();
    const IndexType paddingIndex_ = this->getPaddingIndex();
+   auto columns = this->getColumns();
    auto f = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx ) mutable {
-      if( isBinary() )
+      if( localIdx < columns )
       {
-         RealType one( columns_view[ globalIdx ] != paddingIndex_ );
-         function( rowIdx, localIdx, columns_view[ globalIdx ], one );
+         if( isBinary() )
+         {
+            RealType one( columns_view[ globalIdx ] != paddingIndex_ );
+            function( rowIdx, localIdx, columns_view[ globalIdx ], one );
+         }
+         else
+            function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ] );
       }
-      else
-         function( rowIdx, localIdx, columns_view[ globalIdx ], values_view[ globalIdx ] );
    };
    this->segments.forElements( begin, end, f );
 }
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp
index d39593e23..dadecff56 100644
--- a/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest.hpp
@@ -424,31 +424,22 @@ void test_VectorProduct_longRowsMatrix()
    using MatrixSegmentsType = typename Matrix::SegmentsType;
    constexpr TNL::Algorithms::Segments::ElementsOrganization organization = MatrixSegmentsType::getOrganization();
    using ChunkedEllpackView_ = TNL::Algorithms::Segments::ChunkedEllpackView< DeviceType, IndexType, organization >;
-   if( ! std::is_same< typename Matrix::SegmentsViewType, ChunkedEllpackView_ >::value )
+   for( auto columns : { 64, 65, 128, 129, 256, 257, 512, 513, 1024, 1025, 2048, 2049, 3000 } )
    {
-      // TODO: Fix ChunkedEllpack for this test - seems that it allocates too much memory
-      for( auto columns : { 64, 65, 128, 129, 256, 257, 512, 513, 1024, 1025, 2048, 2049, 3000 } )
-      {
-         //std::cerr << "Long-rows-matrix-test: columns = " << columns << std::endl;
-         //const int columns = 3000;
-         const int rows = 33;
-         Matrix m3( rows, columns );
-         TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowsCapacities( rows );
-         rowsCapacities = columns;
-         m3.setRowCapacities( rowsCapacities );
-         auto f = [] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value ) {
-            column = localIdx;
-            value = localIdx + row;
-         };
-         m3.forAllElements( f );
-         TNL::Containers::Vector< double, DeviceType, IndexType > in( columns, 1.0 ), out( rows, 0.0 );
-         m3.vectorProduct( in, out );
-         for( IndexType rowIdx = 0; rowIdx < rows; rowIdx++ )
-         {
-            //std::cerr << "Long-rows-matrix-test: rowIndex = " << rowIdx << std::endl;
-            EXPECT_EQ( out.getElement( rowIdx ), ( double ) columns * ( double ) (columns - 1 ) / 2.0 + columns * rowIdx );
-         }
-      }
+      const int rows = 33;
+      Matrix m3( rows, columns );
+      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowsCapacities( rows );
+      rowsCapacities = columns;
+      m3.setRowCapacities( rowsCapacities );
+      auto f = [] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value ) {
+         column = localIdx;
+         value = localIdx + row;
+      };
+      m3.forAllElements( f );
+      TNL::Containers::Vector< double, DeviceType, IndexType > in( columns, 1.0 ), out( rows, 0.0 );
+      m3.vectorProduct( in, out );
+      for( IndexType rowIdx = 0; rowIdx < rows; rowIdx++ )
+         EXPECT_EQ( out.getElement( rowIdx ), ( double ) columns * ( double ) (columns - 1 ) / 2.0 + columns * rowIdx );
    }
 }
 
-- 
GitLab


From 4273faac5c839343ec6623f1e6ab92f0499d4be7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 11 May 2021 14:41:20 +0200
Subject: [PATCH 057/117] Preparing Benchmark for logging into JSON.

---
 src/Benchmarks/BLAS/array-operations.h        |   2 +-
 src/Benchmarks/BLAS/spmv.h                    |   4 +-
 src/Benchmarks/BLAS/tnl-benchmark-blas.h      |  16 +-
 src/Benchmarks/BLAS/triad.h                   |   2 +-
 src/Benchmarks/BLAS/vector-operations.h       |   2 +-
 src/Benchmarks/Benchmarks.h                   |  77 ++---
 .../DistSpMV/tnl-benchmark-distributed-spmv.h |  26 +-
 src/Benchmarks/JsonLogging.h                  | 285 ++++++++++++++++++
 src/Benchmarks/LinearSolvers/benchmarks.h     |   4 +-
 .../tnl-benchmark-linear-solvers.h            |  20 +-
 src/Benchmarks/Logging.h                      |   2 +-
 .../NDArray/tnl-benchmark-ndarray-boundary.h  |  16 +-
 .../NDArray/tnl-benchmark-ndarray.h           |  30 +-
 src/Benchmarks/ODESolvers/benchmarks.h        |   2 +-
 .../ODESolvers/tnl-benchmark-ode-solvers.h    |  30 +-
 .../ReferenceFormats/LightSpMV-1.0/SpMVCSR.h  |  44 +--
 src/Benchmarks/SpMV/spmv.h                    |  12 +-
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h      |   8 +-
 18 files changed, 434 insertions(+), 148 deletions(-)
 create mode 100644 src/Benchmarks/JsonLogging.h

diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h
index a8c606d38..38a58c431 100644
--- a/src/Benchmarks/BLAS/array-operations.h
+++ b/src/Benchmarks/BLAS/array-operations.h
@@ -26,7 +26,7 @@ template< typename Real = double,
           template<typename> class HostAllocator = Allocators::Default< Devices::Host >::Allocator,
           template<typename> class CudaAllocator = Allocators::Default< Devices::Cuda >::Allocator >
 void
-benchmarkArrayOperations( Benchmark & benchmark,
+benchmarkArrayOperations( Benchmark<> & benchmark,
                           const long & size )
 {
    using HostArray = Containers::Array< Real, Devices::Host, Index, HostAllocator< Real > >;
diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
index 587794f35..6cd669dc0 100644
--- a/src/Benchmarks/BLAS/spmv.h
+++ b/src/Benchmarks/BLAS/spmv.h
@@ -97,7 +97,7 @@ void setCudaTestMatrix( Matrix& matrix,
 template< typename Real,
           template< typename, typename, typename > class Matrix >
 void
-benchmarkSpMV( Benchmark & benchmark,
+benchmarkSpMV( Benchmark<> & benchmark,
                const int & size,
                const int elementsPerRow = 5 )
 {
@@ -173,7 +173,7 @@ benchmarkSpMV( Benchmark & benchmark,
 template< typename Real = double,
           typename Index = int >
 void
-benchmarkSpmvSynthetic( Benchmark & benchmark,
+benchmarkSpmvSynthetic( Benchmark<> & benchmark,
                         const int & size,
                         const int & elementsPerRow )
 {
diff --git a/src/Benchmarks/BLAS/tnl-benchmark-blas.h b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
index 3e05da630..8db1a6e33 100644
--- a/src/Benchmarks/BLAS/tnl-benchmark-blas.h
+++ b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
@@ -29,8 +29,8 @@ using namespace TNL::Benchmarks;
 
 template< typename Real >
 void
-runBlasBenchmarks( Benchmark & benchmark,
-                   Benchmark::MetadataMap metadata,
+runBlasBenchmarks( Benchmark<> & benchmark,
+                   Benchmark<>::MetadataMap metadata,
                    const std::size_t & minSize,
                    const std::size_t & maxSize,
                    const double & sizeStepFactor,
@@ -43,7 +43,7 @@ runBlasBenchmarks( Benchmark & benchmark,
    benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = Host)",
                            metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "size", convertToString( size ) },
       } ));
       benchmarkArrayOperations< Real >( benchmark, size );
@@ -71,7 +71,7 @@ runBlasBenchmarks( Benchmark & benchmark,
    benchmark.newBenchmark( String("Vector operations (") + precision + ")",
                            metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) {
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "size", convertToString( size ) },
       } ));
       benchmarkVectorOperations< Real >( benchmark, size );
@@ -82,7 +82,7 @@ runBlasBenchmarks( Benchmark & benchmark,
    benchmark.newBenchmark( String("Triad benchmark (") + precision + ")",
                            metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "size", convertToString( size ) },
       } ));
       benchmarkTriad< Real >( benchmark, size );
@@ -93,7 +93,7 @@ runBlasBenchmarks( Benchmark & benchmark,
    benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")",
                            metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "rows", convertToString( size ) },
          { "columns", convertToString( size ) },
          { "elements per row", convertToString( elementsPerRow ) },
@@ -168,10 +168,10 @@ main( int argc, char* argv[] )
    std::ofstream logFile( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark benchmark( loops, verbose );
+   Benchmark<> benchmark( loops, verbose );
 
    // prepare global metadata
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    if( precision == "all" || precision == "float" )
       runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, elementsPerRow );
diff --git a/src/Benchmarks/BLAS/triad.h b/src/Benchmarks/BLAS/triad.h
index 3ac747fba..d2bdf12cf 100644
--- a/src/Benchmarks/BLAS/triad.h
+++ b/src/Benchmarks/BLAS/triad.h
@@ -24,7 +24,7 @@ namespace Benchmarks {
 template< typename Real = double,
           typename Index = int >
 void
-benchmarkTriad( Benchmark & benchmark,
+benchmarkTriad( Benchmark<> & benchmark,
                 const long & size )
 {
    using HostAllocator = Allocators::Host< Real >;
diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index 3391f23fa..c2a3ceab3 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -36,7 +36,7 @@ namespace Benchmarks {
 template< typename Real = double,
           typename Index = int >
 void
-benchmarkVectorOperations( Benchmark & benchmark,
+benchmarkVectorOperations( Benchmark<> & benchmark,
                            const long & size )
 {
    using HostVector = Containers::Vector< Real, Devices::Host, Index >;
diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 770d38a3e..ab6c9f522 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -61,19 +61,19 @@ struct BenchmarkResult
    }
 };
 
-
+template< typename Logger = Logging >
 class Benchmark
-: protected Logging
+: protected Logger
 {
 public:
-   using Logging::MetadataElement;
-   using Logging::MetadataMap;
-   using Logging::MetadataColumns;
+   using typename Logger::MetadataElement;
+   using typename Logger::MetadataMap;
+   using typename Logger::MetadataColumns;
    using SolverMonitorType = Solvers::IterativeSolverMonitor< double, int >;
 
    Benchmark( int loops = 10,
               bool verbose = true )
-   : Logging(verbose), loops(loops)
+   : Logger(verbose), loops(loops)
    {}
 
    static void configSetup( Config::ConfigDescription& config )
@@ -90,7 +90,7 @@ public:
       this->reset = parameters.getParameter< bool >( "reset" );
       this->minTime = parameters.getParameter< double >( "min-time" );
       const int verbose = parameters.getParameter< int >( "verbose" );
-      Logging::setVerbose( verbose );
+      Logger::setVerbose( verbose );
    }
    // TODO: ensure that this is not called in the middle of the benchmark
    // (or just remove it completely?)
@@ -109,8 +109,8 @@ public:
    void
    newBenchmark( const String & title )
    {
-      closeTable();
-      writeTitle( title );
+      Logger::closeTable();
+      Logger::writeTitle( title );
    }
 
    // Marks the start of a new benchmark (with custom metadata)
@@ -118,13 +118,13 @@ public:
    newBenchmark( const String & title,
                  MetadataMap metadata )
    {
-      closeTable();
-      writeTitle( title );
+      Logger::closeTable();
+      Logger::writeTitle( title );
       // add loops and reset flag to metadata
       metadata["loops"] = convertToString(loops);
       metadata["reset"] = convertToString( reset );
       metadata["minimal test time"] = convertToString( minTime );
-      writeMetadata( metadata );
+      Logger::writeMetadata( metadata );
    }
 
    // Sets metadata columns -- values used for all subsequent rows until
@@ -132,9 +132,9 @@ public:
    void
    setMetadataColumns( const MetadataColumns & metadata )
    {
-      if( metadataColumns != metadata )
-         header_changed = true;
-      metadataColumns = metadata;
+      if( Logger::metadataColumns != metadata )
+         Logger::header_changed = true;
+      Logger::metadataColumns = metadata;
    }
 
    // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark"
@@ -149,14 +149,14 @@ public:
                  const double baseTime = 0.0 )
    {
       monitor.setStage( operation.getString() );
-      if( metadataColumns.size() > 0 && String(metadataColumns[ 0 ].first) == "operation" ) {
-         metadataColumns[ 0 ].second = operation;
+      if( Logger::metadataColumns.size() > 0 && String(Logger::metadataColumns[ 0 ].first) == "operation" ) {
+         Logger::metadataColumns[ 0 ].second = operation;
       }
       else {
-         metadataColumns.insert( metadataColumns.begin(), {"operation", operation} );
+         Logger::metadataColumns.insert( Logger::metadataColumns.begin(), {"operation", operation} );
       }
       setOperation( datasetSize, baseTime );
-      header_changed = true;
+      Logger::header_changed = true;
    }
 
    void
@@ -174,13 +174,13 @@ public:
    createHorizontalGroup( const String & name,
                           int subcolumns )
    {
-      if( horizontalGroups.size() == 0 ) {
-         horizontalGroups.push_back( {name, subcolumns} );
+      if( Logger::horizontalGroups.size() == 0 ) {
+         Logger::horizontalGroups.push_back( {name, subcolumns} );
       }
       else {
-         auto & last = horizontalGroups.back();
+         auto & last = Logger::horizontalGroups.back();
          if( last.first != name && last.second > 0 ) {
-            horizontalGroups.push_back( {name, subcolumns} );
+            Logger::horizontalGroups.push_back( {name, subcolumns} );
          }
          else {
             last.first = name;
@@ -208,19 +208,19 @@ public:
       result.stddev = std::numeric_limits<double>::quiet_NaN();
       FunctionTimer< Device > functionTimer;
       try {
-         if( verbose > 1 ) {
+         if( Logger::verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
             if( this->reset )
-               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, Logger::verbose, monitor );
             else
-               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
+               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
          }
          else {
             if( this->reset )
-               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor );
+               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, Logger::verbose, monitor );
             else
-               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
+               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
          }
          this->performedLoops = functionTimer.getPerformedLoops();
       }
@@ -233,8 +233,8 @@ public:
       if( this->baseTime == 0.0 )
          this->baseTime = result.time;
 
-      writeTableHeader( performer, result.getTableHeader() );
-      writeTableRow( performer, result.getRowElements() );
+      Logger::writeTableHeader( performer, result.getTableHeader() );
+      Logger::writeTableRow( performer, result.getRowElements() );
 
       return this->baseTime;
    }
@@ -265,13 +265,13 @@ public:
       result.stddev = std::numeric_limits<double>::quiet_NaN();
       FunctionTimer< Device > functionTimer;
       try {
-         if( verbose > 1 ) {
+         if( Logger::verbose > 1 ) {
             // run the monitor main loop
             Solvers::SolverMonitorThread monitor_thread( monitor );
-            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
+            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
          }
          else {
-            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor );
+            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
          }
       }
       catch ( const std::exception& e ) {
@@ -283,8 +283,8 @@ public:
       if( this->baseTime == 0.0 )
          this->baseTime = result.time;
 
-      writeTableHeader( performer, result.getTableHeader() );
-      writeTableRow( performer, result.getRowElements() );
+      Logger::writeTableHeader( performer, result.getTableHeader() );
+      Logger::writeTableRow( performer, result.getRowElements() );
 
       return this->baseTime;
    }
@@ -306,7 +306,7 @@ public:
                     int numberOfComputations = 1 ) {
       // each computation has 3 subcolumns
       const int colspan = 3 * numberOfComputations;
-      writeErrorMessage( msg, colspan );
+      Logger::writeErrorMessage( msg, colspan );
       std::cerr << msg << std::endl;
    }
 
@@ -334,7 +334,8 @@ protected:
 };
 
 
-inline Benchmark::MetadataMap getHardwareMetadata()
+template< typename Logger >
+inline typename Benchmark< Logger >::MetadataMap getHardwareMetadata()
 {
    const int cpu_id = 0;
    const CacheSizes cacheSizes = SystemInfo::getCPUCacheSizes( cpu_id );
@@ -356,7 +357,7 @@ inline Benchmark::MetadataMap getHardwareMetadata()
       nproc = TNL::MPI::GetSize();
 #endif
 
-   Benchmark::MetadataMap metadata {
+   typename Benchmark< Logger >::MetadataMap metadata {
        { "host name", SystemInfo::getHostname() },
        { "architecture", SystemInfo::getArchitecture() },
        { "system", SystemInfo::getSystemName() },
diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
index e17878696..b79d80ebf 100644
--- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
+++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h
@@ -42,7 +42,7 @@ using namespace TNL::Benchmarks;
 
 template< typename Matrix, typename Vector >
 void
-benchmarkSpmv( Benchmark& benchmark,
+benchmarkSpmv( Benchmark<>& benchmark,
                const Matrix& matrix,
                const Vector& x,
                const char* performer = "CPU" )
@@ -65,7 +65,7 @@ benchmarkSpmv( Benchmark& benchmark,
 
 template< typename Matrix, typename Vector >
 void
-benchmarkSpmvCuda( Benchmark& benchmark,
+benchmarkSpmvCuda( Benchmark<>& benchmark,
                    const Matrix& matrix,
                    const Vector& x )
 {
@@ -91,7 +91,7 @@ benchmarkSpmvCuda( Benchmark& benchmark,
 
 template< typename Matrix, typename Vector >
 void
-benchmarkDistributedSpmv( Benchmark& benchmark,
+benchmarkDistributedSpmv( Benchmark<>& benchmark,
                           // TODO: cannot be const due to internal buffering
 //                          const Matrix& matrix,
                           Matrix& matrix,
@@ -117,7 +117,7 @@ benchmarkDistributedSpmv( Benchmark& benchmark,
 
 template< typename Matrix, typename Vector >
 void
-benchmarkDistributedSpmvCuda( Benchmark& benchmark,
+benchmarkDistributedSpmvCuda( Benchmark<>& benchmark,
                               const Matrix& matrix,
                               const Vector& x )
 {
@@ -156,8 +156,8 @@ struct SpmvBenchmark
    using DistributedRowLengths = typename DistributedMatrix::RowsCapacitiesType;
 
    static bool
-   run( Benchmark& benchmark,
-        Benchmark::MetadataMap metadata,
+   run( Benchmark<>& benchmark,
+        Benchmark<>::MetadataMap metadata,
         const Config::ParameterContainer& parameters )
    {
       MatrixType matrix;
@@ -172,7 +172,7 @@ struct SpmvBenchmark
       const String name = String( (TNL::MPI::GetSize() > 1) ? "DistSpMV" : "SpMV" )
                           + " (" + parameters.getParameter< String >( "name" ) + "): ";
       benchmark.newBenchmark( name, metadata );
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          // TODO: strip the device
 //         { "matrix type", matrix.getType() },
          { "rows", convertToString( matrix.getRows() ) },
@@ -205,8 +205,8 @@ struct SpmvBenchmark
    }
 
    static void
-   runNonDistributed( Benchmark& benchmark,
-                      Benchmark::MetadataMap metadata,
+   runNonDistributed( Benchmark<>& benchmark,
+                      Benchmark<>::MetadataMap metadata,
                       const Config::ParameterContainer& parameters,
                       MatrixType& matrix,
                       VectorType& vector )
@@ -218,8 +218,8 @@ struct SpmvBenchmark
    }
 
    static void
-   runDistributed( Benchmark& benchmark,
-                   Benchmark::MetadataMap metadata,
+   runDistributed( Benchmark<>& benchmark,
+                   Benchmark<>::MetadataMap metadata,
                    const Config::ParameterContainer& parameters,
                    MatrixType& matrix,
                    VectorType& vector )
@@ -334,10 +334,10 @@ main( int argc, char* argv[] )
       logFile.open( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark benchmark( loops, verbose );
+   Benchmark<> benchmark( loops, verbose );
 
    // prepare global metadata
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    // TODO: implement resolveMatrixType
 //   return ! Matrices::resolveMatrixType< MainConfig,
diff --git a/src/Benchmarks/JsonLogging.h b/src/Benchmarks/JsonLogging.h
new file mode 100644
index 000000000..58d6558cf
--- /dev/null
+++ b/src/Benchmarks/JsonLogging.h
@@ -0,0 +1,285 @@
+/***************************************************************************
+                          JsonLogging.h  -  description
+                             -------------------
+    begin                : May 11, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <sstream>
+
+#include <TNL/String.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+class JsonLoggingRowElements   // one table row: every streamed value is stored as a separate string element
+{
+   public:
+      // values are formatted in fixed notation with precision 6 until a manipulator changes it
+      JsonLoggingRowElements()
+      {
+         stream << std::setprecision( 6 ) << std::fixed;
+      }
+      // formats b with the current stream settings and appends the text as a new element
+      template< typename T >
+      JsonLoggingRowElements& operator << ( const T& b )
+      {
+         stream << b;
+         elements.push_back( stream.str() );
+         stream.str( std::string() );
+         return *this;
+      }
+      // const reference: std::setprecision returns a temporary, which must bind here and not to the generic overload
+      JsonLoggingRowElements& operator << ( const decltype( std::setprecision( 2 ) )& setprec )
+      {
+         stream << setprec;
+         return *this;
+      }
+
+      JsonLoggingRowElements& operator << ( decltype( std::fixed )& setfixed ) // the same works also for std::scientific
+      {
+         stream << setfixed;
+         return *this;
+      }
+
+      // iterators
+      auto begin() noexcept { return elements.begin(); }
+
+      auto begin() const noexcept { return elements.begin(); }
+
+      auto cbegin() const noexcept { return elements.cbegin(); }
+
+      auto end() noexcept { return elements.end(); }
+
+      auto end() const noexcept { return elements.end(); }
+
+      auto cend() const noexcept { return elements.cend(); }
+
+   protected:
+      std::vector< String > elements;   // std::vector because <vector> is included by this header while <list> is not
+
+      std::stringstream stream;
+};
+
+class JsonLogging   // buffers benchmark output in an in-memory log and mirrors it to std::cout when verbose
+{
+public:
+   using MetadataElement = std::pair< const char*, String >;
+   using MetadataMap = std::map< const char*, String >;
+   using MetadataColumns = std::vector<MetadataElement>;
+
+   using HeaderElements = std::vector< String >;
+   using RowElements = LoggingRowElements;   // NOTE(review): declared in Logging.h, which this header does not include -- confirm include order
+   // verbose: any non-zero value enables mirroring of the output to std::cout
+   JsonLogging( int verbose = true )
+   : verbose(verbose)
+   {}
+
+   void
+   setVerbose( int verbose)
+   {
+      this->verbose = verbose;
+   }
+   // Appends ": title = <title>" to the log; also prints the title to std::cout when verbose.
+   void
+   writeTitle( const String & title )
+   {
+      if( verbose )
+         std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
+      log << ": title = " << title << std::endl;
+   }
+   // Appends one ": key = value" line per metadata entry to the log; echoes the entries to std::cout when verbose.
+   void
+   writeMetadata( const MetadataMap & metadata )
+   {
+      if( verbose )
+         std::cout << "properties:" << std::endl;
+
+      for( auto & it : metadata ) {
+         if( verbose )
+            std::cout << "   " << it.first << " = " << it.second << std::endl;
+         log << ": " << it.first << " = " << it.second << std::endl;
+      }
+      if( verbose )
+         std::cout << std::endl;
+   }
+   // Logs the header of the next row: metadata column names, active horizontal groups, spanningElement and subElements.
+   void
+   writeTableHeader( const String & spanningElement,
+                     const HeaderElements & subElements )
+   {
+      if( verbose && header_changed ) {
+         for( auto & it : metadataColumns ) {
+            std::cout << std::setw( 20 ) << it.first;
+         }
+
+         // spanning element is printed as usual column to stdout,
+         // but is excluded from header
+         std::cout << std::setw( 15 ) << "";
+
+         for( auto & it : subElements ) {
+            std::cout << std::setw( 15 ) << it;
+         }
+         std::cout << std::endl;
+
+         header_changed = false;
+      }
+
+      // initial indent string (one more '!' is appended for every active horizontal group)
+      header_indent = "!";
+      log << std::endl;
+      for( auto & it : metadataColumns ) {
+         log << header_indent << " " << it.first << std::endl;
+      }
+
+      // dump stacked spanning columns; both emptiness checks are needed because
+      // back()/pop_back() on an empty container is undefined behaviour
+      while( ! horizontalGroups.empty() && horizontalGroups.back().second <= 0 ) {
+         horizontalGroups.pop_back();
+         if( ! header_indent.empty() ) header_indent.pop_back();
+      }
+      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
+         if( horizontalGroups[ i ].second > 0 ) {
+            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
+            header_indent += "!";
+         }
+      }
+
+      log << header_indent << " " << spanningElement << std::endl;
+      for( auto & it : subElements ) {
+         log << header_indent << "! " << it << std::endl;
+      }
+
+      if( horizontalGroups.size() > 0 ) {
+         horizontalGroups.back().second--;
+         header_indent.pop_back();
+      }
+   }
+   // Logs one row: the metadata column values followed by the indented subElements; mirrors the row to std::cout when verbose.
+   void
+   writeTableRow( const String & spanningElement,
+                  const RowElements & subElements )
+   {
+      if( verbose ) {
+         for( auto & it : metadataColumns ) {
+            std::cout << std::setw( 20 ) << it.second;
+         }
+         // spanning element is printed as usual column to stdout
+         std::cout << std::setw( 15 ) << spanningElement;
+         for( auto & it : subElements ) {
+            std::cout << std::setw( 15 ) << it;
+         }
+         std::cout << std::endl;
+      }
+
+      // metadata values are written for every row,
+      // each element on a separate line
+      for( auto & it : metadataColumns ) {
+         log << it.second << std::endl;
+      }
+
+      // benchmark data are indented
+      const String indent = "    ";
+      for( auto & it : subElements ) {
+         log << indent << it << std::endl;
+      }
+   }
+   // Logs msg in place of a result row; colspan is subtracted from the remaining column count of the last horizontal group.
+   void
+   writeErrorMessage( const char* msg,
+                      int colspan = 1 )
+   {
+      // initial indent string
+      header_indent = "!";
+      log << std::endl;
+      for( auto & it : metadataColumns ) {
+         log << header_indent << " " << it.first << std::endl;
+      }
+
+      // dump stacked spanning columns (guarded: back()/pop_back() on an empty container is undefined behaviour)
+      while( ! horizontalGroups.empty() && horizontalGroups.back().second <= 0 ) {
+         horizontalGroups.pop_back();
+         if( ! header_indent.empty() ) header_indent.pop_back();
+      }
+
+      // make sure there is a header column for the message (also when all groups have just been dumped)
+      if( horizontalGroups.empty() )
+         horizontalGroups.push_back( {"", 1} );
+      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
+         if( horizontalGroups[ i ].second > 0 ) {
+            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
+            header_indent += "!";
+         }
+      }
+      if( horizontalGroups.size() > 0 ) {
+         horizontalGroups.back().second -= colspan;
+         header_indent.pop_back();
+      }
+
+      // metadata values are written first,
+      // each element on a separate line, followed by the message
+      for( auto & it : metadataColumns ) {
+         log << it.second << std::endl;
+      }
+      log << msg << std::endl;
+   }
+   // Terminates the current table in the log and resets the indentation, header flag and horizontal groups.
+   void
+   closeTable()
+   {
+      log << std::endl;
+      header_indent = body_indent = "";
+      header_changed = true;
+      horizontalGroups.clear();
+   }
+   // Closes the table and writes the buffered log to logFile; returns false and keeps the buffer if the stream is not good.
+   bool save( std::ostream & logFile )
+   {
+      closeTable();
+      logFile << log.str();
+      if( logFile.good() ) {
+         log.str( "" );   // str() returns a copy, so assigning to its result would leave the buffer untouched
+         return true;
+      }
+      return false;
+   }
+
+protected:
+   // manual double -> String conversion with fixed precision
+   static String
+   _to_string( double num, int precision = 0, bool fixed = false )
+   {
+      std::stringstream str;
+      if( fixed )
+         str << std::fixed;
+      if( precision )
+         str << std::setprecision( precision );
+      str << num;
+      return String( str.str().data() );
+   }
+
+   std::stringstream log;       // buffered log text, written out by save()
+   std::string header_indent;   // current run of '!' used as prefix of header lines
+   std::string body_indent;
+
+   int verbose;
+   MetadataColumns metadataColumns;
+   bool header_changed = true;
+   std::vector< std::pair< String, int > > horizontalGroups;   // (group name, remaining columns)
+};
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h
index b9e130c39..cf05bb0d6 100644
--- a/src/Benchmarks/LinearSolvers/benchmarks.h
+++ b/src/Benchmarks/LinearSolvers/benchmarks.h
@@ -54,7 +54,7 @@ bool checkDevice( const Config::ParameterContainer& parameters )
 
 template< template<typename> class Preconditioner, typename Matrix >
 void
-benchmarkPreconditionerUpdate( Benchmark& benchmark,
+benchmarkPreconditionerUpdate( Benchmark<>& benchmark,
                                const Config::ParameterContainer& parameters,
                                const SharedPointer< Matrix >& matrix )
 {
@@ -78,7 +78,7 @@ benchmarkPreconditionerUpdate( Benchmark& benchmark,
 
 template< template<typename> class Solver, template<typename> class Preconditioner, typename Matrix, typename Vector >
 void
-benchmarkSolver( Benchmark& benchmark,
+benchmarkSolver( Benchmark<>& benchmark,
                  const Config::ParameterContainer& parameters,
                  const SharedPointer< Matrix >& matrix,
                  const Vector& x0,
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index 393fafb49..0c1651320 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -145,7 +145,7 @@ void set_random_vector( Vector& v, typename Vector::RealType a, typename Vector:
 
 template< typename Matrix, typename Vector >
 void
-benchmarkIterativeSolvers( Benchmark& benchmark,
+benchmarkIterativeSolvers( Benchmark<>& benchmark,
                            Config::ParameterContainer parameters,
                            const SharedPointer< Matrix >& matrixPointer,
                            const Vector& x0,
@@ -337,8 +337,8 @@ struct LinearSolversBenchmark
    using DistributedRowLengths = typename DistributedMatrix::RowsCapacitiesType;
 
    static bool
-   run( Benchmark& benchmark,
-        Benchmark::MetadataMap metadata,
+   run( Benchmark<>& benchmark,
+        Benchmark<>::MetadataMap metadata,
         const Config::ParameterContainer& parameters )
    {
       const String file_matrix = parameters.getParameter< String >( "input-matrix" );
@@ -384,7 +384,7 @@ struct LinearSolversBenchmark
       const String name = String( (TNL::MPI::GetSize() > 1) ? "Distributed linear solvers" : "Linear solvers" )
                           + " (" + parameters.getParameter< String >( "name" ) + "): ";
       benchmark.newBenchmark( name, metadata );
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          // TODO: strip the device
 //         { "matrix type", matrixPointer->getType() },
          { "rows", convertToString( matrixPointer->getRows() ) },
@@ -422,8 +422,8 @@ struct LinearSolversBenchmark
    }
 
    static void
-   runDistributed( Benchmark& benchmark,
-                   Benchmark::MetadataMap metadata,
+   runDistributed( Benchmark<>& benchmark,
+                   Benchmark<>::MetadataMap metadata,
                    const Config::ParameterContainer& parameters,
                    const SharedPointer< MatrixType >& matrixPointer,
                    const VectorType& x0,
@@ -466,8 +466,8 @@ struct LinearSolversBenchmark
    }
 
    static void
-   runNonDistributed( Benchmark& benchmark,
-                      Benchmark::MetadataMap metadata,
+   runNonDistributed( Benchmark<>& benchmark,
+                      Benchmark<>::MetadataMap metadata,
                       const Config::ParameterContainer& parameters,
                       const SharedPointer< MatrixType >& matrixPointer,
                       const VectorType& x0,
@@ -614,10 +614,10 @@ main( int argc, char* argv[] )
       logFile.open( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark benchmark( loops, verbose );
+   Benchmark<> benchmark( loops, verbose );
 
    // prepare global metadata
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    // TODO: implement resolveMatrixType
 //   return ! Matrices::resolveMatrixType< MainConfig,
diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h
index fb4426bb1..343cc2cda 100644
--- a/src/Benchmarks/Logging.h
+++ b/src/Benchmarks/Logging.h
@@ -28,7 +28,7 @@ namespace Benchmarks {
 class LoggingRowElements
 {
    public:
-   
+
       LoggingRowElements()
       {
          stream << std::setprecision( 6 ) << std::fixed;
diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h
index 29445234c..f7a485aa1 100644
--- a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h
+++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h
@@ -81,7 +81,7 @@ void reset() {}
 // as "compile-time constants" and thus e.g. optimizing the 1D iterations with memcpy
 
 template< typename Device >
-void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 )
+void benchmark_1D( Benchmark<>& benchmark, index_type size = 500000000 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0 >,
@@ -108,7 +108,7 @@ void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 )
 }
 
 template< typename Device >
-void benchmark_2D( Benchmark& benchmark, index_type size = 22333 )
+void benchmark_2D( Benchmark<>& benchmark, index_type size = 22333 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0 >,
@@ -135,7 +135,7 @@ void benchmark_2D( Benchmark& benchmark, index_type size = 22333 )
 }
 
 template< typename Device >
-void benchmark_3D( Benchmark& benchmark, index_type size = 800 )
+void benchmark_3D( Benchmark<>& benchmark, index_type size = 800 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0 >,
@@ -245,7 +245,7 @@ void benchmark_3D( Benchmark& benchmark, index_type size = 800 )
 
 
 template< typename Device >
-void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 )
+void benchmark_2D_perm( Benchmark<>& benchmark, index_type size = 22333 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0 >,
@@ -272,7 +272,7 @@ void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 )
 }
 
 template< typename Device >
-void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 )
+void benchmark_3D_perm( Benchmark<>& benchmark, index_type size = 800 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0 >,
@@ -381,7 +381,7 @@ void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 )
 //}
 
 template< typename Device >
-void run_benchmarks( Benchmark& benchmark )
+void run_benchmarks( Benchmark<>& benchmark )
 {
    benchmark_1D< Device >( benchmark );
    benchmark_2D< Device >( benchmark );
@@ -443,10 +443,10 @@ int main( int argc, char* argv[] )
    std::ofstream logFile( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark benchmark( loops, verbose );
+   Benchmark<> benchmark( loops, verbose );
 
    // prepare global metadata
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    const String devices = parameters.getParameter< String >( "devices" );
    if( devices == "all" || devices == "host" )
diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h
index 9f17b8b5c..8d4ac8e7a 100644
--- a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h
+++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h
@@ -83,7 +83,7 @@ void reset() {}
 // as "compile-time constants" and thus e.g. optimizing the 1D iterations with memcpy
 
 template< typename Device >
-void benchmark_array( Benchmark& benchmark, index_type size = 500000000 )
+void benchmark_array( Benchmark<>& benchmark, index_type size = 500000000 )
 {
    Array< value_type, Device > a, b;
    a.setSize( size );
@@ -114,7 +114,7 @@ void benchmark_array( Benchmark& benchmark, index_type size = 500000000 )
 }
 
 template< typename Device >
-void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 )
+void benchmark_1D( Benchmark<>& benchmark, index_type size = 500000000 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0 >,
@@ -137,7 +137,7 @@ void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 )
 }
 
 template< typename Device >
-void benchmark_2D( Benchmark& benchmark, index_type size = 22333 )
+void benchmark_2D( Benchmark<>& benchmark, index_type size = 22333 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0 >,
@@ -160,7 +160,7 @@ void benchmark_2D( Benchmark& benchmark, index_type size = 22333 )
 }
 
 template< typename Device >
-void benchmark_3D( Benchmark& benchmark, index_type size = 800 )
+void benchmark_3D( Benchmark<>& benchmark, index_type size = 800 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0 >,
@@ -183,7 +183,7 @@ void benchmark_3D( Benchmark& benchmark, index_type size = 800 )
 }
 
 template< typename Device >
-void benchmark_4D( Benchmark& benchmark, index_type size = 150 )
+void benchmark_4D( Benchmark<>& benchmark, index_type size = 150 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0, 0 >,
@@ -206,7 +206,7 @@ void benchmark_4D( Benchmark& benchmark, index_type size = 150 )
 }
 
 template< typename Device >
-void benchmark_5D( Benchmark& benchmark, index_type size = 56 )
+void benchmark_5D( Benchmark<>& benchmark, index_type size = 56 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0, 0, 0 >,
@@ -229,7 +229,7 @@ void benchmark_5D( Benchmark& benchmark, index_type size = 56 )
 }
 
 template< typename Device >
-void benchmark_6D( Benchmark& benchmark, index_type size = 28 )
+void benchmark_6D( Benchmark<>& benchmark, index_type size = 28 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >,
@@ -253,7 +253,7 @@ void benchmark_6D( Benchmark& benchmark, index_type size = 28 )
 
 
 template< typename Device >
-void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 )
+void benchmark_2D_perm( Benchmark<>& benchmark, index_type size = 22333 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0 >,
@@ -276,7 +276,7 @@ void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 )
 }
 
 template< typename Device >
-void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 )
+void benchmark_3D_perm( Benchmark<>& benchmark, index_type size = 800 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0 >,
@@ -299,7 +299,7 @@ void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 )
 }
 
 template< typename Device >
-void benchmark_4D_perm( Benchmark& benchmark, index_type size = 150 )
+void benchmark_4D_perm( Benchmark<>& benchmark, index_type size = 150 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0, 0 >,
@@ -322,7 +322,7 @@ void benchmark_4D_perm( Benchmark& benchmark, index_type size = 150 )
 }
 
 template< typename Device >
-void benchmark_5D_perm( Benchmark& benchmark, index_type size = 56 )
+void benchmark_5D_perm( Benchmark<>& benchmark, index_type size = 56 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0, 0, 0 >,
@@ -345,7 +345,7 @@ void benchmark_5D_perm( Benchmark& benchmark, index_type size = 56 )
 }
 
 template< typename Device >
-void benchmark_6D_perm( Benchmark& benchmark, index_type size = 28 )
+void benchmark_6D_perm( Benchmark<>& benchmark, index_type size = 28 )
 {
    NDArray< value_type,
             SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >,
@@ -368,7 +368,7 @@ void benchmark_6D_perm( Benchmark& benchmark, index_type size = 28 )
 }
 
 template< typename Device >
-void run_benchmarks( Benchmark& benchmark )
+void run_benchmarks( Benchmark<>& benchmark )
 {
    benchmark_array< Device >( benchmark );
    benchmark_1D< Device >( benchmark );
@@ -431,10 +431,10 @@ int main( int argc, char* argv[] )
    std::ofstream logFile( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark benchmark( loops, verbose );
+   Benchmark<> benchmark( loops, verbose );
 
    // prepare global metadata
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    const String devices = parameters.getParameter< String >( "devices" );
    if( devices == "all" || devices == "host" )
diff --git a/src/Benchmarks/ODESolvers/benchmarks.h b/src/Benchmarks/ODESolvers/benchmarks.h
index a6ee67a62..f27d6962e 100644
--- a/src/Benchmarks/ODESolvers/benchmarks.h
+++ b/src/Benchmarks/ODESolvers/benchmarks.h
@@ -35,7 +35,7 @@ getPerformer()
 
 template< typename Solver, typename VectorPointer >
 void
-benchmarkSolver( Benchmark& benchmark,
+benchmarkSolver( Benchmark<>& benchmark,
                  const Config::ParameterContainer& parameters,
                  VectorPointer& u )
 {
diff --git a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
index 4def52d52..afdf33d3a 100644
--- a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
+++ b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h
@@ -41,7 +41,7 @@ using namespace TNL::Pointers;
 
 template< typename Real, typename Index >
 void
-benchmarkODESolvers( Benchmark& benchmark,
+benchmarkODESolvers( Benchmark<>& benchmark,
                      const Config::ParameterContainer& parameters,
                      size_t dofs )
 {
@@ -51,7 +51,7 @@ benchmarkODESolvers( Benchmark& benchmark,
    using CudaVectorPointer = Pointers::SharedPointer< CudaVectorType >;
    using HostProblem = SimpleProblem< Real, Devices::Host, Index >;
    using CudaProblem = SimpleProblem< Real, Devices::Cuda, Index >;
-   using SolverMonitorType = typename Benchmark::SolverMonitorType;
+   using SolverMonitorType = typename Benchmark<>::SolverMonitorType;
 
    const auto& solvers = parameters.getList< String >( "solvers" );
    for( auto&& solver : solvers )
@@ -107,15 +107,15 @@ struct ODESolversBenchmark
    using VectorPointer = Pointers::SharedPointer< VectorType >;
 
    static bool
-   run( Benchmark& benchmark,
-        Benchmark::MetadataMap metadata,
+   run( Benchmark<>& benchmark,
+        Benchmark<>::MetadataMap metadata,
         const Config::ParameterContainer& parameters )
    {
       const String name = String( (TNL::MPI::GetSize() > 1) ? "Distributed ODE solvers" : "ODE solvers" );
                           //+ " (" + parameters.getParameter< String >( "name" ) + "): ";
       benchmark.newBenchmark( name, metadata );
       for( size_t dofs = 25; dofs <= 10000000; dofs *= 2 ) {
-         benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+         benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
             // TODO: strip the device
             { "DOFs", convertToString( dofs ) },
          } ));
@@ -129,8 +129,8 @@ struct ODESolversBenchmark
    }
 
    static void
-   runDistributed( Benchmark& benchmark,
-                   Benchmark::MetadataMap metadata,
+   runDistributed( Benchmark<>& benchmark,
+                   Benchmark<>::MetadataMap metadata,
                    const Config::ParameterContainer& parameters,
                    size_t dofs )
    {
@@ -139,8 +139,8 @@ struct ODESolversBenchmark
    }
 
    static void
-   runNonDistributed( Benchmark& benchmark,
-                      Benchmark::MetadataMap metadata,
+   runNonDistributed( Benchmark<>& benchmark,
+                      Benchmark<>::MetadataMap metadata,
                       const Config::ParameterContainer& parameters,
                       size_t dofs )
    {
@@ -150,8 +150,8 @@ struct ODESolversBenchmark
 };
 
 template< typename Real >
-bool resolveIndexType( Benchmark& benchmark,
-   Benchmark::MetadataMap& metadata,
+bool resolveIndexType( Benchmark<>& benchmark,
+   Benchmark<>::MetadataMap& metadata,
    Config::ParameterContainer& parameters )
 {
    const String& index = parameters.getParameter< String >( "index-type" );
@@ -159,8 +159,8 @@ bool resolveIndexType( Benchmark& benchmark,
    return ODESolversBenchmark< Real, long int >::run( benchmark, metadata, parameters );
 }
 
-bool resolveRealTypes( Benchmark& benchmark,
-   Benchmark::MetadataMap& metadata,
+bool resolveRealTypes( Benchmark<>& benchmark,
+   Benchmark<>::MetadataMap& metadata,
    Config::ParameterContainer& parameters )
 {
    const String& realType = parameters.getParameter< String >( "real-type" );
@@ -245,10 +245,10 @@ main( int argc, char* argv[] )
       logFile.open( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark benchmark( loops, verbose );
+   Benchmark<> benchmark( loops, verbose );
 
    // prepare global metadata
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    const bool status = resolveRealTypes( benchmark, metadata, parameters );
 
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.h b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.h
index effa194d0..fb004308f 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/LightSpMV-1.0/SpMVCSR.h
@@ -24,8 +24,8 @@ __device__ inline T shfl_down_64bits(T var, int32_t srcLane,
 	int2 a = *reinterpret_cast<int2*>(&var);
 
 	/*exchange the data*/
-	a.x = __shfl_down(a.x, srcLane, width);
-	a.y = __shfl_down(a.y, srcLane, width);
+	a.x = __shfl_down_sync(0xffffffff,a.x, srcLane, width);
+	a.y = __shfl_down_sync(0xffffffff,a.y, srcLane, width);
 	
 	return *reinterpret_cast<T*>(&a);
 }
@@ -75,7 +75,7 @@ __global__ void csr32DynamicWarp(uint32_t* __restrict cudaRowCounter, const uint
 		row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
 	}
 	/*broadcast the value to other threads in the same warp and compute the row index of each vector*/
-	row = __shfl(row, 0) + warpVectorId;
+	row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
 
 	/*check the row range*/
 	while (row < _cudaNumRows) {
@@ -113,7 +113,7 @@ __global__ void csr32DynamicWarp(uint32_t* __restrict cudaRowCounter, const uint
 		}
 		/*intra-vector reduction*/
 		for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
-			sum += __shfl_down(sum, i, THREADS_PER_VECTOR);
+			sum += __shfl_down_sync(0xffffffff,sum, i, THREADS_PER_VECTOR);
 		}
 
 		/*save the results and get a new row*/
@@ -127,7 +127,7 @@ __global__ void csr32DynamicWarp(uint32_t* __restrict cudaRowCounter, const uint
 			row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
 		}
 		/*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/
-		row = __shfl(row, 0) + warpVectorId;
+		row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
 
 	}/*while*/
 }
@@ -155,7 +155,7 @@ __global__ void csr32DynamicVector(uint32_t* __restrict cudaRowCounter, const ui
 		row = atomicAdd(cudaRowCounter, 1);
 	}
 	/*broadcast the value to other lanes from lane 0*/
-	row = __shfl(row, 0, THREADS_PER_VECTOR);
+	row = __shfl_sync(0xffffffff,row, 0, THREADS_PER_VECTOR);
 
 	/*check the row range*/
 	while (row < _cudaNumRows) {
@@ -193,7 +193,7 @@ __global__ void csr32DynamicVector(uint32_t* __restrict cudaRowCounter, const ui
 		}
 		/*intra-vector reduction*/
 		for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
-			sum += __shfl_down(sum, i, THREADS_PER_VECTOR);
+			sum += __shfl_down_sync(0xffffffff,sum, i, THREADS_PER_VECTOR);
 		}
 
 		/*save the results and get a new row*/
@@ -204,7 +204,7 @@ __global__ void csr32DynamicVector(uint32_t* __restrict cudaRowCounter, const ui
 			/*get a new row index*/
 			row = atomicAdd(cudaRowCounter, 1);
 		}
-		row = __shfl(row, 0, THREADS_PER_VECTOR);
+		row = __shfl_sync(0xffffffff,row, 0, THREADS_PER_VECTOR);
 	}/*while*/
 }
 
@@ -233,7 +233,7 @@ __global__ void csr32DynamicVector(uint32_t* __restrict cudaRowCounter, const ui
 			row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
 		}
 		/*broadcast the value to other threads in the same warp and compute the row index of each vector*/
-		row = __shfl(row, 0) + warpVectorId;
+		row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
 
 		/*check the row range*/
 		while (row < _cudaNumRows) {
@@ -272,7 +272,7 @@ __global__ void csr32DynamicVector(uint32_t* __restrict cudaRowCounter, const ui
 			/*intra-vector reduction*/
 			sum *= alpha;
 			for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
-				sum += __shfl_down(sum, i, THREADS_PER_VECTOR);
+				sum += __shfl_down_sync(0xffffffff,sum, i, THREADS_PER_VECTOR);
 			}
 
 			/*save the results and get a new row*/
@@ -286,7 +286,7 @@ __global__ void csr32DynamicVector(uint32_t* __restrict cudaRowCounter, const ui
 				row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
 			}
 			/*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/
-			row = __shfl(row, 0) + warpVectorId;
+			row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
 
 		}/*while*/
 	}
@@ -314,7 +314,7 @@ __global__ void csr32DynamicVector(uint32_t* __restrict cudaRowCounter, const ui
 			row = atomicAdd(cudaRowCounter, 1);
 		}
 		/*broadcast the value to other lanes from lane 0*/
-		row = __shfl(row, 0, THREADS_PER_VECTOR);
+		row = __shfl_sync(0xffffffff,row, 0, THREADS_PER_VECTOR);
 
 		/*check the row range*/
 		while (row < _cudaNumRows) {
@@ -353,7 +353,7 @@ __global__ void csr32DynamicVector(uint32_t* __restrict cudaRowCounter, const ui
 			/*intra-vector reduction*/
 			sum *= alpha;
 			for (i = THREADS_PER_VECTOR >> 1; i > 0; i >>= 1) {
-				sum += __shfl_down(sum, i, THREADS_PER_VECTOR);
+				sum += __shfl_down_sync(0xffffffff,sum, i, THREADS_PER_VECTOR);
 			}
 
 			/*save the results and get a new row*/
@@ -364,7 +364,7 @@ __global__ void csr32DynamicVector(uint32_t* __restrict cudaRowCounter, const ui
 				/*get a new row index*/
 				row = atomicAdd(cudaRowCounter, 1);
 			}
-			row = __shfl(row, 0, THREADS_PER_VECTOR);
+			row = __shfl_sync(0xffffffff,row, 0, THREADS_PER_VECTOR);
 		}/*while*/
 	}
 
@@ -392,7 +392,7 @@ __global__ void csr64DynamicVector(uint32_t* __restrict cudaRowCounter, const ui
 		row = atomicAdd(cudaRowCounter, 1);
 	}
 	/*broadcast the value to other lanes from lane 0*/
-	row = __shfl(row, 0, THREADS_PER_VECTOR);
+	row = __shfl_sync(0xffffffff,row, 0, THREADS_PER_VECTOR);
 
 	/*check the row range*/
 	while (row < _cudaNumRows) {
@@ -441,7 +441,7 @@ __global__ void csr64DynamicVector(uint32_t* __restrict cudaRowCounter, const ui
 			/*get a new row index*/
 			row = atomicAdd(cudaRowCounter, 1);
 		}
-		row = __shfl(row, 0, THREADS_PER_VECTOR);
+		row = __shfl_sync( 0xffffffff, row, 0, THREADS_PER_VECTOR);
 	}/*while*/
 }
 
@@ -470,7 +470,7 @@ __global__ void csr64DynamicWarp(uint32_t* __restrict cudaRowCounter, const uint
 		row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
 	}
 	/*broadcast the value to other threads in the same warp*/
-	row = __shfl(row, 0) + warpVectorId;
+	row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
 
 	/*check the row range*/
 	while (row < _cudaNumRows) {
@@ -523,7 +523,7 @@ __global__ void csr64DynamicWarp(uint32_t* __restrict cudaRowCounter, const uint
 			row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
 		}
 		/*broadcast the value to other threads in the same warp*/
-		row = __shfl(row, 0) + warpVectorId;
+		row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
 
 	}/*while*/
 }
@@ -552,7 +552,7 @@ __global__ void csr64DynamicVectorBLAS(uint32_t* __restrict cudaRowCounter, cons
 		row = atomicAdd(cudaRowCounter, 1);
 	}
 	/*broadcast the value to other lanes from lane 0*/
-	row = __shfl(row, 0, THREADS_PER_VECTOR);
+	row = __shfl_sync(0xffffffff,row, 0, THREADS_PER_VECTOR);
 
 	/*check the row range*/
 	while (row < _cudaNumRows) {
@@ -602,7 +602,7 @@ __global__ void csr64DynamicVectorBLAS(uint32_t* __restrict cudaRowCounter, cons
 			/*get a new row index*/
 			row = atomicAdd(cudaRowCounter, 1);
 		}
-		row = __shfl(row, 0, THREADS_PER_VECTOR);
+		row = __shfl_sync(0xffffffff,row, 0, THREADS_PER_VECTOR);
 	}/*while*/
 }
 
@@ -631,7 +631,7 @@ __global__ void csr64DynamicWarpBLAS(uint32_t* __restrict cudaRowCounter, const
 		row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
 	}
 	/*broadcast the value to other threads in the same warp*/
-	row = __shfl(row, 0) + warpVectorId;
+	row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
 
 	/*check the row range*/
 	while (row < _cudaNumRows) {
@@ -685,7 +685,7 @@ __global__ void csr64DynamicWarpBLAS(uint32_t* __restrict cudaRowCounter, const
 			row = atomicAdd(cudaRowCounter, 32 / THREADS_PER_VECTOR);
 		}
 		/*broadcast the value to other threads in the same warp*/
-		row = __shfl(row, 0) + warpVectorId;
+		row = __shfl_sync(0xffffffff,row, 0) + warpVectorId;
 
 	}/*while*/
 }
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 991c6b56c..089ef6e12 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -232,7 +232,7 @@ template< typename Real,
           template< typename, typename, typename > class Matrix,
           template< typename, typename, typename, typename > class Vector = Containers::Vector >
 void
-benchmarkSpMVLegacy( Benchmark& benchmark,
+benchmarkSpMVLegacy( Benchmark<>& benchmark,
                      const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
                      const String& inputFileName,
                      bool verboseMR )
@@ -247,7 +247,7 @@ benchmarkSpMVLegacy( Benchmark& benchmark,
 
    SpMV::ReferenceFormats::Legacy::LegacyMatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );
 
-   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+   benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "matrix name", convertToString( inputFileName ) },
          { "rows", convertToString( hostMatrix.getRows() ) },
          { "columns", convertToString( hostMatrix.getColumns() ) },
@@ -300,7 +300,7 @@ template< typename Real,
           template< typename, typename, typename > class Matrix,
           template< typename, typename, typename, typename > class Vector = Containers::Vector >
 void
-benchmarkSpMV( Benchmark& benchmark,
+benchmarkSpMV( Benchmark<>& benchmark,
                const InputMatrix& inputMatrix,
                const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
                const String& inputFileName,
@@ -322,7 +322,7 @@ benchmarkSpMV( Benchmark& benchmark,
       return;
    }
 
-   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+   benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "matrix name", convertToString( inputFileName ) },
          { "rows", convertToString( hostMatrix.getRows() ) },
          { "columns", convertToString( hostMatrix.getColumns() ) },
@@ -374,7 +374,7 @@ benchmarkSpMV( Benchmark& benchmark,
 template< typename Real = double,
           typename Index = int >
 void
-benchmarkSpmv( Benchmark& benchmark,
+benchmarkSpmv( Benchmark<>& benchmark,
                const String& inputFileName,
                const Config::ParameterContainer& parameters,
                bool verboseMR )
@@ -417,7 +417,7 @@ benchmarkSpmv( Benchmark& benchmark,
    ////
    // Perform benchmark on host with CSR as a reference CPU format
    //
-   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+   benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "matrix name", convertToString( inputFileName ) },
          { "rows", convertToString( csrHostMatrix.getRows() ) },
          { "columns", convertToString( csrHostMatrix.getColumns() ) },
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 026ed356d..9b9770ea1 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -31,8 +31,8 @@ using namespace TNL::Benchmarks;
 
 template< typename Real >
 void
-runSpMVBenchmarks( Benchmark & benchmark,
-                   Benchmark::MetadataMap metadata,
+runSpMVBenchmarks( Benchmark<> & benchmark,
+                   Benchmark<>::MetadataMap metadata,
                    const String & inputFileName,
                    const Config::ParameterContainer& parameters,
                    bool verboseMR = false )
@@ -129,10 +129,10 @@ main( int argc, char* argv[] )
    std::ofstream logFile( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark benchmark( loops, verbose );
+   Benchmark<> benchmark( loops, verbose );
 
    // prepare global metadata
-   Benchmark::MetadataMap metadata = getHardwareMetadata();
+   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    // Initiate setup of benchmarks
    if( precision == "all" || precision == "float" )
-- 
GitLab


From 0179d4a09b8ee6dadcd174660b4d73e28c7657bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 14 May 2021 14:53:06 +0200
Subject: [PATCH 058/117] Working on JSON SpMV benchmark.

---
 src/Benchmarks/Benchmarks.h               | 20 ++++---
 src/Benchmarks/JsonLogging.h              | 47 +++++++++++++++-
 src/Benchmarks/Logging.h                  | 15 ++++++
 src/Benchmarks/SpMV/SpmvBenchmarkResult.h | 28 +++++++---
 src/Benchmarks/SpMV/spmv.h                | 66 ++++++++++++++---------
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h  | 10 ++--
 6 files changed, 139 insertions(+), 47 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index ab6c9f522..f5fc8dcfc 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -34,10 +34,11 @@ namespace Benchmarks {
 const double oneGB = 1024.0 * 1024.0 * 1024.0;
 
 
+template< typename Logger = Logging >
 struct BenchmarkResult
 {
-   using HeaderElements = Logging::HeaderElements;
-   using RowElements = Logging::RowElements;
+   using HeaderElements = typename Logger::HeaderElements;
+   using RowElements = typename Logger::RowElements;
 
    double time = std::numeric_limits<double>::quiet_NaN();
    double stddev = std::numeric_limits<double>::quiet_NaN();
@@ -71,6 +72,11 @@ public:
    using typename Logger::MetadataColumns;
    using SolverMonitorType = Solvers::IterativeSolverMonitor< double, int >;
 
+   using typename Logger::CommonLogs;
+   using Logger::addCommonLogs;
+   using Logger::addLogsMetadata;
+   using Logger::writeHeader;
+
    Benchmark( int loops = 10,
               bool verbose = true )
    : Logger(verbose), loops(loops)
@@ -202,7 +208,7 @@ public:
    time( ResetFunction reset,
          const String & performer,
          ComputeFunction & compute,
-         BenchmarkResult & result )
+         BenchmarkResult< Logger > & result )
    {
       result.time = std::numeric_limits<double>::quiet_NaN();
       result.stddev = std::numeric_limits<double>::quiet_NaN();
@@ -247,7 +253,7 @@ public:
          const String & performer,
          ComputeFunction & compute )
    {
-      BenchmarkResult result;
+      BenchmarkResult< Logger > result;
       return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
    }
 
@@ -259,7 +265,7 @@ public:
    double
    time( const String & performer,
          ComputeFunction & compute,
-         BenchmarkResult & result )
+         BenchmarkResult< Logger > & result )
    {
       result.time = std::numeric_limits<double>::quiet_NaN();
       result.stddev = std::numeric_limits<double>::quiet_NaN();
@@ -295,7 +301,7 @@ public:
    time( const String & performer,
          ComputeFunction & compute )
    {
-      BenchmarkResult result;
+      BenchmarkResult< Logger > result;
       return time< Device, ComputeFunction >( performer, compute, result );
    }
 
@@ -310,7 +316,7 @@ public:
       std::cerr << msg << std::endl;
    }
 
-   using Logging::save;
+   using Logger::save;
 
    SolverMonitorType& getMonitor() {
       return monitor;
diff --git a/src/Benchmarks/JsonLogging.h b/src/Benchmarks/JsonLogging.h
index 58d6558cf..f97ecf640 100644
--- a/src/Benchmarks/JsonLogging.h
+++ b/src/Benchmarks/JsonLogging.h
@@ -68,6 +68,7 @@ class JsonLoggingRowElements
 
       auto cend() const noexcept { return elements.cend(); }
 
+      size_t size() const noexcept { return this->elements.size(); }; // number of entries currently held in `elements`
    protected:
       std::list< String > elements;
 
@@ -81,8 +82,11 @@ public:
    using MetadataMap = std::map< const char*, String >;
    using MetadataColumns = std::vector<MetadataElement>;
 
+   using CommonLogs = std::vector< std::pair< const char*, String > >;
+   using LogsMetadata = std::vector< String >;
+
    using HeaderElements = std::vector< String >;
-   using RowElements = LoggingRowElements;
+   using RowElements = JsonLoggingRowElements;
 
    JsonLogging( int verbose = true )
    : verbose(verbose)
@@ -94,6 +98,42 @@ public:
       this->verbose = verbose;
    }
 
+   void addCommonLogs( const CommonLogs& logs ) // echo each (name, value) pair to stdout when verbose and append it to the log stream
+   {
+      for( const auto& lg : logs ) // by reference: no per-iteration copy of the pair and its String
+      {
+         if( verbose )
+            std::cout << lg.first << " = " << lg.second << std::endl;
+         log << "\"" << lg.first << "\" = \"" << lg.second << "\"" << std::endl; // close the quote opened in front of the value
+      }
+   };
+
+   void resetLogsMetadat() { this->logsMetadata.clear(); }; // forget all collected column labels; NOTE(review): name looks like a typo of "resetLogsMetadata" -- a rename must update every caller
+
+   void addLogsMetadata( const std::vector< String >& md ) // append the labels in `md`, in order, after the labels already stored
+   {
+      this->logsMetadata.insert( this->logsMetadata.end(), md.begin(), md.end() );
+   }
+
+   void writeHeader() // print the stored column labels as a single tab-separated line on stdout
+   {
+      for( auto md : this->logsMetadata ) // NOTE(review): printed even when `verbose` is off and nothing goes to the log stream -- confirm this is intended
+         std::cout << md << "\t";
+      std::cout << std::endl; // terminate the header line
+   }
+
+   void writeRow( const RowElements& rowEls ) // write one result row: every element is paired with the column label at the same position
+   {
+      TNL_ASSERT_EQ( rowEls.size(), this->logsMetadata.size(), "" ); // labels and values are matched by position, so the counts must agree
+      auto md = this->logsMetadata.begin();
+      for( const auto& el : rowEls ) // by reference: no copy of each element
+      {
+         if( verbose )
+            std::cout << el << "\t";
+         log << "    \"" << *md++ << "\" = \"" << el << "\"," << std::endl; // close the value's quote before the separating comma
+      }
+   }
+
    void
    writeTitle( const String & title )
    {
@@ -178,7 +218,7 @@ public:
             std::cout << std::setw( 20 ) << it.second;
          }
          // spanning element is printed as usual column to stdout
-         std::cout << std::setw( 15 ) << spanningElement;
+         //std::cout << std::setw( 15 ) << spanningElement;
          for( auto & it : subElements ) {
             std::cout << std::setw( 15 ) << it;
          }
@@ -279,6 +319,9 @@ protected:
    MetadataColumns metadataColumns;
    bool header_changed = true;
    std::vector< std::pair< String, int > > horizontalGroups;
+
+   // new JSON implementation
+   LogsMetadata logsMetadata;
 };
 
 } // namespace Benchmarks
diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h
index 343cc2cda..70f1c173c 100644
--- a/src/Benchmarks/Logging.h
+++ b/src/Benchmarks/Logging.h
@@ -81,6 +81,8 @@ public:
    using MetadataMap = std::map< const char*, String >;
    using MetadataColumns = std::vector<MetadataElement>;
 
+   using CommonLogs = std::vector< std::pair< const char*, String > >;
+
    using HeaderElements = std::vector< String >;
    using RowElements = LoggingRowElements;
 
@@ -102,6 +104,19 @@ public:
       log << ": title = " << title << std::endl;
    }
 
+   void addCommonLogs( const CommonLogs& logs ) // echo each (name, value) pair to stdout when verbose; nothing is written to the log stream here
+   {
+      for( const auto& entry : logs ) // named `entry` so it no longer shadows the member stream `log`; by reference to avoid copies
+      {
+         if( verbose )
+            std::cout << entry.first << " = " << entry.second << std::endl;
+      }
+   };
+
+   void addLogsMetadata( const std::vector< String >& md ){}; // no-op: `md` is ignored; presumably present only so this logger offers the same calls as JsonLogging -- confirm
+
+   void writeHeader(){}; // no-op; presumably present only so this logger offers the same calls as JsonLogging -- confirm
+
    void
    writeMetadata( const MetadataMap & metadata )
    {
diff --git a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
index 7f688b7cb..251e8873b 100644
--- a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
+++ b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
@@ -17,9 +17,10 @@ namespace Benchmarks {
 
 template< typename Real,
           typename Device,
-          typename Index >
+          typename Index,
+          typename Logger = JsonLogging >
 struct SpmvBenchmarkResult
-: public BenchmarkResult
+: public BenchmarkResult< Logger >
 {
    using RealType = Real;
    using DeviceType = Device;
@@ -27,23 +28,35 @@ struct SpmvBenchmarkResult
    using HostVector = Containers::Vector< Real, Devices::Host, Index >;
    using BenchmarkVector = Containers::Vector< Real, Device, Index >;
 
-   SpmvBenchmarkResult( const HostVector& csrResult,
+   using typename BenchmarkResult< Logger >::HeaderElements; // Logger is not a base class here, so the alias is taken from the actual base
+   using typename BenchmarkResult< Logger >::RowElements;
+   using BenchmarkResult< Logger >::time; // members of a dependent base are not found by unqualified lookup; getRowElements() uses `time`
+   using BenchmarkResult< Logger >::stddev;
+   using BenchmarkResult< Logger >::bandwidth;
+   using BenchmarkResult< Logger >::speedup;
+
+   SpmvBenchmarkResult( const String& format,
+                        const HostVector& csrResult,
                         const BenchmarkVector& benchmarkResult,
                         const IndexType nonzeros )
-   : csrResult( csrResult ), benchmarkResult( benchmarkResult ), nonzeros( nonzeros ){};
+   : format( format ), csrResult( csrResult ), benchmarkResult( benchmarkResult ), nonzeros( nonzeros ){};
 
    virtual HeaderElements getTableHeader() const override
    {
-      return HeaderElements( {"non-zeros", "time", "stddev", "stddev/time", "bandwidth", "speedup", "CSR Diff.Max", "CSR Diff.L2"} );
+      return HeaderElements( {"format", "device", "non-zeros", "time", "stddev", "stddev/time", "bandwidth", "speedup", "CSR Diff.Max", "CSR Diff.L2"} );
    }
 
+   void setFormat( const String& format ) { this->format = format; };
+
    virtual RowElements getRowElements() const override
    {
       HostVector benchmarkResultCopy;
       benchmarkResultCopy = benchmarkResult;
       auto diff = csrResult - benchmarkResultCopy;
       RowElements elements;
-      elements << nonzeros << time << stddev << stddev/time << bandwidth;
+      elements << format
+               << ( std::is_same< Device, Devices::Host >::value ? "CPU" : "GPU" )
+               << nonzeros << time << stddev << stddev/time << bandwidth;
       if( speedup != 0.0 )
          elements << speedup;
       else elements << "N/A";
@@ -51,10 +64,11 @@ struct SpmvBenchmarkResult
       return elements;
    }
 
+   String format;
    const HostVector& csrResult;
    const BenchmarkVector& benchmarkResult;
    const IndexType nonzeros;
 };
-   
+
 } //namespace Benchmarks
 } //namespace TNL
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 089ef6e12..eded0321a 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -17,6 +17,7 @@
 #include <cstdint>
 
 #include "../Benchmarks.h"
+#include "../JsonLogging.h"
 #include "SpmvBenchmarkResult.h"
 
 #include <TNL/Pointers/DevicePointer.h>
@@ -58,7 +59,9 @@ using namespace TNL::Matrices;
 
 namespace TNL {
    namespace Benchmarks {
-      namespace SpMVLegacy {
+      namespace SpMV {
+
+using BenchmarkType = TNL::Benchmarks::Benchmark< JsonLogging >;
 
 /////
 // General sparse matrix aliases
@@ -218,7 +221,7 @@ std::string getFormatShort( const Matrix& matrix )
 }
 
 // Print information about the matrix.
-template< typename Matrix >
+/*template< typename Matrix >
 void printMatrixInfo( const Matrix& matrix,
                       std::ostream& str )
 {
@@ -226,13 +229,13 @@ void printMatrixInfo( const Matrix& matrix,
     str << " Rows: " << matrix.getRows() << std::endl;
     str << " Cols: " << matrix.getColumns() << std::endl;
     str << " Nonzero Elements: " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
-}
+}*/
 
 template< typename Real,
           template< typename, typename, typename > class Matrix,
           template< typename, typename, typename, typename > class Vector = Containers::Vector >
 void
-benchmarkSpMVLegacy( Benchmark<>& benchmark,
+benchmarkSpMVLegacy( BenchmarkType& benchmark,
                      const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
                      const String& inputFileName,
                      bool verboseMR )
@@ -247,12 +250,12 @@ benchmarkSpMVLegacy( Benchmark<>& benchmark,
 
    SpMV::ReferenceFormats::Legacy::LegacyMatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );
 
-   benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+   /*benchmark.setMetadataColumns( BenchmarkType::MetadataColumns({
          { "matrix name", convertToString( inputFileName ) },
          { "rows", convertToString( hostMatrix.getRows() ) },
          { "columns", convertToString( hostMatrix.getColumns() ) },
          { "matrix format", MatrixInfo< HostMatrix >::getFormat() }
-      } ));
+      } ));*/
    const int elements = hostMatrix.getNonzeroElementsCount();
    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
    benchmark.setOperation( datasetSize );
@@ -271,7 +274,7 @@ benchmarkSpMVLegacy( Benchmark<>& benchmark,
       hostMatrix.vectorProduct( hostInVector, hostOutVector );
 
    };
-   SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+   SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
 
    /////
@@ -289,7 +292,7 @@ benchmarkSpMVLegacy( Benchmark<>& benchmark,
    auto spmvCuda = [&]() {
       cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
-   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
  #endif
     std::cout << std::endl;
@@ -300,7 +303,7 @@ template< typename Real,
           template< typename, typename, typename > class Matrix,
           template< typename, typename, typename, typename > class Vector = Containers::Vector >
 void
-benchmarkSpMV( Benchmark<>& benchmark,
+benchmarkSpMV( BenchmarkType& benchmark,
                const InputMatrix& inputMatrix,
                const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
                const String& inputFileName,
@@ -322,7 +325,7 @@ benchmarkSpMV( Benchmark<>& benchmark,
       return;
    }
 
-   benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+   benchmark.setMetadataColumns( BenchmarkType::MetadataColumns({
          { "matrix name", convertToString( inputFileName ) },
          { "rows", convertToString( hostMatrix.getRows() ) },
          { "columns", convertToString( hostMatrix.getColumns() ) },
@@ -346,7 +349,7 @@ benchmarkSpMV( Benchmark<>& benchmark,
       hostMatrix.vectorProduct( hostInVector, hostOutVector );
 
    };
-   SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+   SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
 
    /////
@@ -365,7 +368,7 @@ benchmarkSpMV( Benchmark<>& benchmark,
    auto spmvCuda = [&]() {
       cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
-   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
  #endif
     std::cout << std::endl;
@@ -374,7 +377,7 @@ benchmarkSpMV( Benchmark<>& benchmark,
 template< typename Real = double,
           typename Index = int >
 void
-benchmarkSpmv( Benchmark<>& benchmark,
+benchmarkSpmv( BenchmarkType& benchmark,
                const String& inputFileName,
                const Config::ParameterContainer& parameters,
                bool verboseMR )
@@ -417,12 +420,17 @@ benchmarkSpmv( Benchmark<>& benchmark,
    ////
    // Perform benchmark on host with CSR as a reference CPU format
    //
-   benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+   benchmark.addCommonLogs( BenchmarkType::CommonLogs( {
+      { "matrix name", convertToString( inputFileName ) },
+      { "rows", convertToString( csrHostMatrix.getRows() ) },
+      { "columns", convertToString( csrHostMatrix.getColumns() ) } } ) );
+
+   /*benchmark.setMetadataColumns( BenchmarkType::MetadataColumns({
          { "matrix name", convertToString( inputFileName ) },
          { "rows", convertToString( csrHostMatrix.getRows() ) },
          { "columns", convertToString( csrHostMatrix.getColumns() ) },
          { "matrix format", String( "CSR" ) }
-      } ));
+      } ));*/
 
    HostVector hostInVector( csrHostMatrix.getRows() ), hostOutVector( csrHostMatrix.getRows() );
 
@@ -435,19 +443,21 @@ benchmarkSpmv( Benchmark<>& benchmark,
        csrHostMatrix.vectorProduct( hostInVector, hostOutVector );
    };
 
-   SpmvBenchmarkResult< Real, Devices::Host, int > csrBenchmarkResults( hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
-   benchmark.time< Devices::Cuda >( resetHostVectors, "CPU", spmvCSRHost, csrBenchmarkResults );
+   SpmvBenchmarkResult< Real, Devices::Host, int > csrBenchmarkResults( String( "CSR" ), hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
+   benchmark.addLogsMetadata( csrBenchmarkResults.getTableHeader() );
+   benchmark.writeHeader();
+   benchmark.time< Devices::Host >( resetHostVectors, "", spmvCSRHost, csrBenchmarkResults );
 
 #ifdef HAVE_CUDA
    ////
    // Perform benchmark on CUDA device with cuSparse as a reference GPU format
    //
-   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+   /*benchmark.setMetadataColumns( Benchmark::MetadataColumns({
          { "matrix name", convertToString( inputFileName ) },
          { "rows", convertToString( csrHostMatrix.getRows() ) },
          { "columns", convertToString( csrHostMatrix.getColumns() ) },
          { "matrix format", String( "cuSparse" ) }
-      } ));
+      } ));*/
 
    cusparseHandle_t cusparseHandle;
    cusparseCreate( &cusparseHandle );
@@ -469,19 +479,20 @@ benchmarkSpmv( Benchmark<>& benchmark,
        cusparseMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
 
-   SpmvBenchmarkResult< Real, Devices::Host, int > cudaBenchmarkResults( hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( String( "cusprase" ), hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, cudaBenchmarkResults );
 
 #ifdef HAVE_CSR5
    ////
    // Perform benchmark on CUDA device with CSR5 as a reference GPU format
    //
-   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+   cudaBenchmarkResults.setFormat( String( "CSR5" ) );
+   /*benchmark.setMetadataColumns( Benchmark::MetadataColumns({
       { "matrix name", convertToString( inputFileName ) },
       { "rows", convertToString( csrHostMatrix.getRows() ) },
       { "columns", convertToString( csrHostMatrix.getColumns() ) },
       { "matrix format", String( "CSR5" ) }
-   } ));
+   } ));*/
 
    CudaVector cudaOutVector2( cudaOutVector );
    CSR5Benchmark::CSR5Benchmark< CSRCudaMatrix > csr5Benchmark( csrCudaMatrix, cudaInVector, cudaOutVector );
@@ -489,6 +500,7 @@ benchmarkSpmv( Benchmark<>& benchmark,
    auto csr5SpMV = [&]() {
        csr5Benchmark.vectorProduct();
    };
+
    benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", csr5SpMV, cudaBenchmarkResults );
    std::cerr << "CSR5 error = " << max( abs( cudaOutVector - cudaOutVector2 ) ) << std::endl;
    csrCudaMatrix.reset();
@@ -497,12 +509,13 @@ benchmarkSpmv( Benchmark<>& benchmark,
    ////
    // Perform benchmark on CUDA device with LightSpMV as a reference GPU format
    //
-   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+   cudaBenchmarkResults.setFormat( String( "LightSpMV Vector" ) );
+   /*benchmark.setMetadataColumns( Benchmark::MetadataColumns({
       { "matrix name", convertToString( inputFileName ) },
       { "rows", convertToString( csrHostMatrix.getRows() ) },
       { "columns", convertToString( csrHostMatrix.getColumns() ) },
       { "matrix format", String( "LightSpMV Vector" ) }
-   } ));
+   } ));*/
 
    LightSpMVCSRHostMatrix lightSpMVCSRHostMatrix;
    lightSpMVCSRHostMatrix = csrHostMatrix;
@@ -516,12 +529,13 @@ benchmarkSpmv( Benchmark<>& benchmark,
    };
    benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cudaBenchmarkResults );
 
-   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+   cudaBenchmarkResults.setFormat( String( "LightSpMV Warp" ) );
+   /*benchmark.setMetadataColumns( Benchmark::MetadataColumns({
       { "matrix name", convertToString( inputFileName ) },
       { "rows", convertToString( csrHostMatrix.getRows() ) },
       { "columns", convertToString( csrHostMatrix.getColumns() ) },
       { "matrix format", String( "LightSpMV Warp" ) }
-   } ));
+   } ));*/
    lightSpMVBenchmark.setKernelType( LightSpMVBenchmarkKernelWarp );
    benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cudaBenchmarkResults );
 #endif
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 9b9770ea1..ef3a8b038 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -31,8 +31,8 @@ using namespace TNL::Benchmarks;
 
 template< typename Real >
 void
-runSpMVBenchmarks( Benchmark<> & benchmark,
-                   Benchmark<>::MetadataMap metadata,
+runSpMVBenchmarks( SpMV::BenchmarkType & benchmark,
+                   SpMV::BenchmarkType::MetadataMap metadata,
                    const String & inputFileName,
                    const Config::ParameterContainer& parameters,
                    bool verboseMR = false )
@@ -45,7 +45,7 @@ runSpMVBenchmarks( Benchmark<> & benchmark,
                            metadata );
    // Start the actual benchmark in spmv.h
    try {
-      SpMVLegacy::benchmarkSpmv< Real >( benchmark, inputFileName, parameters, verboseMR );
+      SpMV::benchmarkSpmv< Real >( benchmark, inputFileName, parameters, verboseMR );
    }
    catch( const std::exception& ex ) {
       std::cerr << ex.what() << std::endl;
@@ -129,10 +129,10 @@ main( int argc, char* argv[] )
    std::ofstream logFile( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   Benchmark<> benchmark( loops, verbose );
+   SpMV::BenchmarkType benchmark( loops, verbose );
 
    // prepare global metadata
-   Benchmark<>::MetadataMap metadata = getHardwareMetadata< Logging >();
+   SpMV::BenchmarkType::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    // Initiate setup of benchmarks
    if( precision == "all" || precision == "float" )
-- 
GitLab


From a1982e6b3c02ec9372c35ac130fe3735cfe0a4e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 14 May 2021 15:32:04 +0200
Subject: [PATCH 059/117] Working on JSON SpMV benchmark - cout is working.

---
 src/Benchmarks/JsonLogging.h              | 20 +++++++++++++-------
 src/Benchmarks/LinearSolvers/benchmarks.h |  2 +-
 src/Benchmarks/SpMV/SpmvBenchmarkResult.h |  5 +++--
 src/Benchmarks/SpMV/spmv.h                |  8 ++++----
 4 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/src/Benchmarks/JsonLogging.h b/src/Benchmarks/JsonLogging.h
index f97ecf640..7070e8263 100644
--- a/src/Benchmarks/JsonLogging.h
+++ b/src/Benchmarks/JsonLogging.h
@@ -117,9 +117,12 @@ public:
 
    void writeHeader()
    {
-      for( auto md : this->logsMetadata )
-         std::cout << md << "\t";
-      std::cout << std::endl;
+      if( verbose )
+      {
+         for( auto md : this->logsMetadata )
+            std::cout << md << "\t";
+         std::cout << std::endl;
+      }
    }
 
    void writeRow( const RowElements& rowEls )
@@ -132,6 +135,8 @@ public:
             std::cout << el << "\t";
          log << "    \"" << *md++ << "\" = \"" << el << "," << std::endl;
       }
+      if( verbose )
+         std::cout << std::endl;
    }
 
    void
@@ -161,7 +166,7 @@ public:
    writeTableHeader( const String & spanningElement,
                      const HeaderElements & subElements )
    {
-      if( verbose && header_changed ) {
+      /*if( verbose && header_changed ) {
          for( auto & it : metadataColumns ) {
             std::cout << std::setw( 20 ) << it.first;
          }
@@ -206,14 +211,15 @@ public:
       if( horizontalGroups.size() > 0 ) {
          horizontalGroups.back().second--;
          header_indent.pop_back();
-      }
+      }*/
    }
 
    void
    writeTableRow( const String & spanningElement,
                   const RowElements & subElements )
    {
-      if( verbose ) {
+      writeRow( subElements );
+      /*if( verbose ) {
          for( auto & it : metadataColumns ) {
             std::cout << std::setw( 20 ) << it.second;
          }
@@ -235,7 +241,7 @@ public:
       const String indent = "    ";
       for( auto & it : subElements ) {
          log << indent << it << std::endl;
-      }
+      }*/
    }
 
    void
diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h
index cf05bb0d6..b7f4fded6 100644
--- a/src/Benchmarks/LinearSolvers/benchmarks.h
+++ b/src/Benchmarks/LinearSolvers/benchmarks.h
@@ -126,7 +126,7 @@ benchmarkSolver( Benchmark<>& benchmark,
 
    // subclass BenchmarkResult to add extra columns to the benchmark
    // (iterations, preconditioned residue, true residue)
-   struct MyBenchmarkResult : public BenchmarkResult
+   struct MyBenchmarkResult : public BenchmarkResult<>
    {
       using HeaderElements = BenchmarkResult::HeaderElements;
       using RowElements = BenchmarkResult::RowElements;
diff --git a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
index 251e8873b..6e7d3c77d 100644
--- a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
+++ b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
@@ -28,11 +28,12 @@ struct SpmvBenchmarkResult
    using HostVector = Containers::Vector< Real, Devices::Host, Index >;
    using BenchmarkVector = Containers::Vector< Real, Device, Index >;
 
-   using typename Logger::HeaderElements;
-   using typename Logger::RowElements;
+   using typename BenchmarkResult< Logger >::HeaderElements;
+   using typename BenchmarkResult< Logger >::RowElements;
    using BenchmarkResult< Logger >::stddev;
    using BenchmarkResult< Logger >::bandwidth;
    using BenchmarkResult< Logger >::speedup;
+   using BenchmarkResult< Logger >::time;
 
 
    SpmvBenchmarkResult( const String& format,
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index eded0321a..1b45502cb 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -295,7 +295,7 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
    SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
  #endif
-    std::cout << std::endl;
+   // std::cout << std::endl;
 }
 
 template< typename Real,
@@ -325,12 +325,12 @@ benchmarkSpMV( BenchmarkType& benchmark,
       return;
    }
 
-   benchmark.setMetadataColumns( BenchmarkType::MetadataColumns({
+   /*benchmark.setMetadataColumns( BenchmarkType::MetadataColumns({
          { "matrix name", convertToString( inputFileName ) },
          { "rows", convertToString( hostMatrix.getRows() ) },
          { "columns", convertToString( hostMatrix.getColumns() ) },
          { "matrix format", MatrixInfo< HostMatrix >::getFormat() }
-      } ));
+      } ));*/
    const int elements = hostMatrix.getNonzeroElementsCount();
    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
    benchmark.setOperation( datasetSize );
@@ -371,7 +371,7 @@ benchmarkSpMV( BenchmarkType& benchmark,
    SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
  #endif
-    std::cout << std::endl;
+   // std::cout << std::endl;
 }
 
 template< typename Real = double,
-- 
GitLab


From ebc76a2cba9c9c6f80dd02e5613f7402de6379ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 14 May 2021 16:14:01 +0200
Subject: [PATCH 060/117] Working on JSON SpMV benchmark - JSON is almost
 working.

---
 src/Benchmarks/JsonLogging.h | 105 +++++++++++------------------------
 1 file changed, 32 insertions(+), 73 deletions(-)

diff --git a/src/Benchmarks/JsonLogging.h b/src/Benchmarks/JsonLogging.h
index 7070e8263..6e1df534f 100644
--- a/src/Benchmarks/JsonLogging.h
+++ b/src/Benchmarks/JsonLogging.h
@@ -100,11 +100,18 @@ public:
 
    void addCommonLogs( const CommonLogs& logs )
    {
+      if( this->lineStarted )
+         log << "," << std::endl;
+      log << "   \"benchmarks\" : [" << std::endl;
+      int idx( 0 );
       for( auto lg : logs )
       {
          if( verbose )
             std::cout << lg.first << " = " << lg.second << std::endl;
-         log << "\"" << lg.first << "\" = \"" << lg.second << std::endl;
+         if( idx++ > 0 )
+            log << "," << std::endl;
+         log << "      \"" << lg.first << "\" : \"" << lg.second;
+         this->lineStarted = true;
       }
    };
 
@@ -128,13 +135,26 @@ public:
    void writeRow( const RowElements& rowEls )
    {
       TNL_ASSERT_EQ( rowEls.size(), this->logsMetadata.size(), "" );
+      if( this->lineStarted )
+         log << "," << std::endl;
+      if( ! this->resultsStarted )
+      {
+         log << "      \"results\" : [ " << std::endl;
+         this->resultsStarted = true;
+      }
+      log << "         {" << std::endl;
       auto md = this->logsMetadata.begin();
+      int idx( 0 );
       for( auto el : rowEls )
       {
          if( verbose )
             std::cout << el << "\t";
-         log << "    \"" << *md++ << "\" = \"" << el << "," << std::endl;
+         if( idx++ > 0 )
+            log << "," << std::endl;
+         log << "          \"" << *md++ << "\" : \"" << el << "\"";
       }
+      log << std::endl << "         }";
+      this->lineStarted = true;
       if( verbose )
          std::cout << std::endl;
    }
@@ -144,7 +164,8 @@ public:
    {
       if( verbose )
          std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
-      log << ": title = " << title << std::endl;
+      log << "   \"title\" : \"" << title << "\"";
+      this->lineStarted = true;
    }
 
    void
@@ -153,10 +174,14 @@ public:
       if( verbose )
          std::cout << "properties:" << std::endl;
 
+      int idx( this->lineStarted );
       for( auto & it : metadata ) {
          if( verbose )
             std::cout << "   " << it.first << " = " << it.second << std::endl;
-         log << ": " << it.first << " = " << it.second << std::endl;
+         if( idx++ > 0 )
+            log << "," << std::endl;
+         log << "   \"" << it.first << "\" : \"" << it.second << "\"";
+         this->lineStarted = true;
       }
       if( verbose )
          std::cout << std::endl;
@@ -166,52 +191,6 @@ public:
    writeTableHeader( const String & spanningElement,
                      const HeaderElements & subElements )
    {
-      /*if( verbose && header_changed ) {
-         for( auto & it : metadataColumns ) {
-            std::cout << std::setw( 20 ) << it.first;
-         }
-
-         // spanning element is printed as usual column to stdout,
-         // but is excluded from header
-         std::cout << std::setw( 15 ) << "";
-
-         for( auto & it : subElements ) {
-            std::cout << std::setw( 15 ) << it;
-         }
-         std::cout << std::endl;
-
-         header_changed = false;
-      }
-
-      // initial indent string
-      header_indent = "!";
-      log << std::endl;
-      for( auto & it : metadataColumns ) {
-         log << header_indent << " " << it.first << std::endl;
-      }
-
-      // dump stacked spanning columns
-      if( horizontalGroups.size() > 0 )
-         while( horizontalGroups.back().second <= 0 ) {
-            horizontalGroups.pop_back();
-            header_indent.pop_back();
-         }
-      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
-         if( horizontalGroups[ i ].second > 0 ) {
-            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
-            header_indent += "!";
-         }
-      }
-
-      log << header_indent << " " << spanningElement << std::endl;
-      for( auto & it : subElements ) {
-         log << header_indent << "! " << it << std::endl;
-      }
-
-      if( horizontalGroups.size() > 0 ) {
-         horizontalGroups.back().second--;
-         header_indent.pop_back();
-      }*/
    }
 
    void
@@ -219,29 +198,6 @@ public:
                   const RowElements & subElements )
    {
       writeRow( subElements );
-      /*if( verbose ) {
-         for( auto & it : metadataColumns ) {
-            std::cout << std::setw( 20 ) << it.second;
-         }
-         // spanning element is printed as usual column to stdout
-         //std::cout << std::setw( 15 ) << spanningElement;
-         for( auto & it : subElements ) {
-            std::cout << std::setw( 15 ) << it;
-         }
-         std::cout << std::endl;
-      }
-
-      // only when changed (the header has been already adjusted)
-      // print each element on separate line
-      for( auto & it : metadataColumns ) {
-         log << it.second << std::endl;
-      }
-
-      // benchmark data are indented
-      const String indent = "    ";
-      for( auto & it : subElements ) {
-         log << indent << it << std::endl;
-      }*/
    }
 
    void
@@ -328,6 +284,9 @@ protected:
 
    // new JSON implementation
    LogsMetadata logsMetadata;
+
+   bool lineStarted = false;
+   bool resultsStarted = false;
 };
 
 } // namespace Benchmarks
-- 
GitLab


From 2b69cfe1a0342f5e3196f60442f391fa27c9ff6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 17 May 2021 17:02:11 +0200
Subject: [PATCH 061/117] Modifying SpMV JSON benchmark log - can be parsed by
 Pandas now.

---
 src/Benchmarks/Benchmarks.h              |  5 +-
 src/Benchmarks/JsonLogging.h             | 65 ++++++++++++++++--------
 src/Benchmarks/Logging.h                 |  7 ++-
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h |  8 ++-
 4 files changed, 58 insertions(+), 27 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index f5fc8dcfc..d9ef1f12a 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -78,8 +78,9 @@ public:
    using Logger::writeHeader;
 
    Benchmark( int loops = 10,
-              bool verbose = true )
-   : Logger(verbose), loops(loops)
+              bool verbose = true,
+              String outputMode = "" )
+   : Logger(verbose, outputMode), loops(loops)
    {}
 
    static void configSetup( Config::ConfigDescription& config )
diff --git a/src/Benchmarks/JsonLogging.h b/src/Benchmarks/JsonLogging.h
index 6e1df534f..551a18475 100644
--- a/src/Benchmarks/JsonLogging.h
+++ b/src/Benchmarks/JsonLogging.h
@@ -88,8 +88,9 @@ public:
    using HeaderElements = std::vector< String >;
    using RowElements = JsonLoggingRowElements;
 
-   JsonLogging( int verbose = true )
-   : verbose(verbose)
+   JsonLogging( int verbose = true,
+                String outputMode = "" )
+   : verbose(verbose), outputMode( outputMode )
    {}
 
    void
@@ -100,18 +101,19 @@ public:
 
    void addCommonLogs( const CommonLogs& logs )
    {
-      if( this->lineStarted )
-         log << "," << std::endl;
-      log << "   \"benchmarks\" : [" << std::endl;
-      int idx( 0 );
+      //if( this->lineStarted )
+      //   log << "," << std::endl;
+      //log << "   \"benchmarks\" : [" << std::endl;
+      this->commonLogs = logs;
+      //int idx( 0 );
       for( auto lg : logs )
       {
          if( verbose )
             std::cout << lg.first << " = " << lg.second << std::endl;
-         if( idx++ > 0 )
-            log << "," << std::endl;
-         log << "      \"" << lg.first << "\" : \"" << lg.second;
-         this->lineStarted = true;
+         //if( idx++ > 0 )
+         //   log << "," << std::endl;
+         //log << "      \"" << lg.first << "\" : \"" << lg.second << "\"";
+         //this->lineStarted = true;
       }
    };
 
@@ -137,14 +139,21 @@ public:
       TNL_ASSERT_EQ( rowEls.size(), this->logsMetadata.size(), "" );
       if( this->lineStarted )
          log << "," << std::endl;
-      if( ! this->resultsStarted )
+
+      log << "         {" << std::endl;
+
+      // write common logs
+      int idx( 0 );
+      for( auto lg : this->commonLogs )
       {
-         log << "      \"results\" : [ " << std::endl;
-         this->resultsStarted = true;
+         //if( verbose )
+         //   std::cout << lg.first << " = " << lg.second << std::endl;
+         if( idx++ > 0 )
+            log << "," << std::endl;
+         log << "      \"" << lg.first << "\" : \"" << lg.second << "\"";
       }
-      log << "         {" << std::endl;
+
       auto md = this->logsMetadata.begin();
-      int idx( 0 );
       for( auto el : rowEls )
       {
          if( verbose )
@@ -162,6 +171,9 @@ public:
    void
    writeTitle( const String & title )
    {
+      if( outputMode == "append" )
+         return;
+
       if( verbose )
          std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
       log << "   \"title\" : \"" << title << "\"";
@@ -171,6 +183,9 @@ public:
    void
    writeMetadata( const MetadataMap & metadata )
    {
+      if( outputMode == "append" )
+         return;
+
       if( verbose )
          std::cout << "properties:" << std::endl;
 
@@ -181,8 +196,11 @@ public:
          if( idx++ > 0 )
             log << "," << std::endl;
          log << "   \"" << it.first << "\" : \"" << it.second << "\"";
-         this->lineStarted = true;
+         //this->lineStarted = true;
       }
+      log << "," << std::endl << "      \"results\" : [ " << std::endl;
+      this->lineStarted = false;
+
       if( verbose )
          std::cout << std::endl;
    }
@@ -204,8 +222,9 @@ public:
    writeErrorMessage( const char* msg,
                       int colspan = 1 )
    {
+      log << "\"error\" : \"" << msg << "\"" << std::endl;
       // initial indent string
-      header_indent = "!";
+      /*header_indent = "!";
       log << std::endl;
       for( auto & it : metadataColumns ) {
          log << header_indent << " " << it.first << std::endl;
@@ -237,15 +256,17 @@ public:
          log << it.second << std::endl;
       }
       log << msg << std::endl;
+      */
    }
 
    void
    closeTable()
    {
-      log << std::endl;
-      header_indent = body_indent = "";
-      header_changed = true;
-      horizontalGroups.clear();
+      //log << std::endl << "   ]" << std::endl;
+      log << "," << std::endl;
+      //header_indent = body_indent = "";
+      //header_changed = true;
+      //horizontalGroups.clear();
    }
 
    bool save( std::ostream & logFile )
@@ -284,6 +305,8 @@ protected:
 
    // new JSON implementation
    LogsMetadata logsMetadata;
+   CommonLogs commonLogs;
+   String outputMode;
 
    bool lineStarted = false;
    bool resultsStarted = false;
diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h
index 70f1c173c..75e843c9f 100644
--- a/src/Benchmarks/Logging.h
+++ b/src/Benchmarks/Logging.h
@@ -86,8 +86,9 @@ public:
    using HeaderElements = std::vector< String >;
    using RowElements = LoggingRowElements;
 
-   Logging( int verbose = true )
-   : verbose(verbose)
+   Logging( int verbose = true,
+            String outputMode = "" )
+   : verbose(verbose), outputMode( outputMode )
    {}
 
    void
@@ -294,6 +295,8 @@ protected:
    MetadataColumns metadataColumns;
    bool header_changed = true;
    std::vector< std::pair< String, int > > horizontalGroups;
+
+   String outputMode;
 };
 
 } // namespace Benchmarks
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index ef3a8b038..46a87befc 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -25,6 +25,7 @@ using namespace TNL::Matrices;
 
 #include <exception>
 #include <ctime> // Used for file naming, so logs don't get overwritten.
+#include <experimental/filesystem> // check file existence
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
@@ -116,20 +117,23 @@ main( int argc, char* argv[] )
 
    const String & inputFileName = parameters.getParameter< String >( "input-file" );
    const String & logFileName = parameters.getParameter< String >( "log-file" );
-   const String & outputMode = parameters.getParameter< String >( "output-mode" );
+   String outputMode = parameters.getParameter< String >( "output-mode" );
    const String & precision = parameters.getParameter< String >( "precision" );
    const int loops = parameters.getParameter< int >( "loops" );
    const int verbose = parameters.getParameter< int >( "verbose" );
    const int verboseMR = parameters.getParameter< int >( "verbose-MReader" );
 
    // open log file
+   bool exist = std::experimental::filesystem::exists(logFileName.getString());
+   if( ! exist )
+      outputMode = "";
    auto mode = std::ios::out;
    if( outputMode == "append" )
        mode |= std::ios::app;
    std::ofstream logFile( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   SpMV::BenchmarkType benchmark( loops, verbose );
+   SpMV::BenchmarkType benchmark( loops, verbose, outputMode );
 
    // prepare global metadata
    SpMV::BenchmarkType::MetadataMap metadata = getHardwareMetadata< Logging >();
-- 
GitLab


From 345e290c39ad8704f5f0e09bac6ed4559237253a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 17 May 2021 21:03:05 +0200
Subject: [PATCH 062/117] Fixing SpMV benchmark with CUDA.

---
 src/Benchmarks/BLAS/tnl-benchmark-blas.h |  4 ++--
 src/Benchmarks/SpMV/spmv.h               |  2 +-
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h | 10 +++++-----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/Benchmarks/BLAS/tnl-benchmark-blas.h b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
index 8db1a6e33..bc58aeb2c 100644
--- a/src/Benchmarks/BLAS/tnl-benchmark-blas.h
+++ b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
@@ -52,7 +52,7 @@ runBlasBenchmarks( Benchmark<> & benchmark,
    benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = CudaHost)",
                            metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "size", convertToString( size ) },
       } ));
       benchmarkArrayOperations< Real, int, Allocators::CudaHost >( benchmark, size );
@@ -60,7 +60,7 @@ runBlasBenchmarks( Benchmark<> & benchmark,
    benchmark.newBenchmark( String("Array operations (") + precision + ", host allocator = CudaManaged)",
                            metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 ) {
-      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
          { "size", convertToString( size ) },
       } ));
       benchmarkArrayOperations< Real, int, Allocators::CudaManaged >( benchmark, size );
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 1b45502cb..d0d79c4f1 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -479,7 +479,7 @@ benchmarkSpmv( BenchmarkType& benchmark,
        cusparseMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
 
-   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( String( "cusprase" ), hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( String( "cusparse" ), hostOutVector, cudaOutVector, csrHostMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, cudaBenchmarkResults );
 
 #ifdef HAVE_CSR5
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 46a87befc..de805536c 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -32,8 +32,8 @@ using namespace TNL::Benchmarks;
 
 template< typename Real >
 void
-runSpMVBenchmarks( SpMV::BenchmarkType & benchmark,
-                   SpMV::BenchmarkType::MetadataMap metadata,
+runSpMVBenchmarks( TNL::Benchmarks::SpMV::BenchmarkType & benchmark,
+                   TNL::Benchmarks::SpMV::BenchmarkType::MetadataMap metadata,
                    const String & inputFileName,
                    const Config::ParameterContainer& parameters,
                    bool verboseMR = false )
@@ -46,7 +46,7 @@ runSpMVBenchmarks( SpMV::BenchmarkType & benchmark,
                            metadata );
    // Start the actual benchmark in spmv.h
    try {
-      SpMV::benchmarkSpmv< Real >( benchmark, inputFileName, parameters, verboseMR );
+      TNL::Benchmarks::SpMV::benchmarkSpmv< Real >( benchmark, inputFileName, parameters, verboseMR );
    }
    catch( const std::exception& ex ) {
       std::cerr << ex.what() << std::endl;
@@ -133,10 +133,10 @@ main( int argc, char* argv[] )
    std::ofstream logFile( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   SpMV::BenchmarkType benchmark( loops, verbose, outputMode );
+   TNL::Benchmarks::SpMV::BenchmarkType benchmark( loops, verbose, outputMode );
 
    // prepare global metadata
-   SpMV::BenchmarkType::MetadataMap metadata = getHardwareMetadata< Logging >();
+   TNL::Benchmarks::SpMV::BenchmarkType::MetadataMap metadata = getHardwareMetadata< Logging >();
 
    // Initiate setup of benchmarks
    if( precision == "all" || precision == "float" )
-- 
GitLab


From 8c319b5d1ef9fa182213cb547172fa97709d3c8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 19 May 2021 20:42:43 +0200
Subject: [PATCH 063/117] Fixing JSON logging of SpMV benchmark.

---
 src/Benchmarks/JsonLogging.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/Benchmarks/JsonLogging.h b/src/Benchmarks/JsonLogging.h
index 551a18475..964a8bbc7 100644
--- a/src/Benchmarks/JsonLogging.h
+++ b/src/Benchmarks/JsonLogging.h
@@ -184,7 +184,10 @@ public:
    writeMetadata( const MetadataMap & metadata )
    {
       if( outputMode == "append" )
+      {
+         this->lineStarted = true;
          return;
+      }
 
       if( verbose )
          std::cout << "properties:" << std::endl;
@@ -263,7 +266,7 @@ public:
    closeTable()
    {
       //log << std::endl << "   ]" << std::endl;
-      log << "," << std::endl;
+      //log << "," << std::endl;
       //header_indent = body_indent = "";
       //header_changed = true;
       //horizontalGroups.clear();
-- 
GitLab


From 35a017fa3a904fbead72ba7539e5fd7bae7f46dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 28 May 2021 16:35:01 +0200
Subject: [PATCH 064/117] Working on Python scripts for SpMV benchmark results
 processing.

---
 .../tnl-spmv-benchmark-make-tables-json.py    | 99 +++++++++++++++++++
 .../scripts/tnl-spmv-benchmark-make-tables.py |  4 +-
 2 files changed, 101 insertions(+), 2 deletions(-)
 create mode 100755 src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
new file mode 100755
index 000000000..2be5fc6ac
--- /dev/null
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
@@ -0,0 +1,99 @@
+#!/usr/bin/python3
+
+import json
+import pandas as pd
+from pandas.io.json import json_normalize
+
+
+def slugify(s):
+   s = str(s).strip().replace(' ', '_')
+   return re.sub(r'(?u)[^-\w.]', '', s)
+
+####
+# Parse input file
+print( "Parsing input file...." )
+with open('sparse-matrix-benchmark.log') as f:
+    d = json.load(f)
+input_df = json_normalize( d, record_path=['results'] )
+#input_df.to_html( "orig-pandas.html" )
+
+####
+# Create multiindex for columns
+
+# Get format names - TODO: the first benchmark might not have all of them
+matrixName = input_df.iloc[0]['matrix name']
+df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
+formats = df_matrix.loc[:,'format']
+level1 = [ 'Matrix name', 'rows', 'columns' ]
+level2 = [ '',            '',     ''        ]
+level3 = [ '',            '',     ''        ]
+df_data = [[ ' ',' ',' ']]
+for format in formats:
+   for device in ['CPU','GPU']:
+      for data in ['bandwidth' ]: #,'time','speed-up','non-zeros','stddev','stddev/time','diff.max','diff.l2']:
+         level1.append( format )
+         level2.append( device )
+         level3.append( data )
+         df_data[ 0 ].append( ' ' )
+multiColumns = pd.MultiIndex.from_arrays([ level1, level2, level3 ] )
+frames = []
+
+in_idx = 0
+out_idx = 0
+max_out_idx = 50
+print( "Converting data..." )
+while in_idx < len(input_df.index) and out_idx < max_out_idx:
+   matrixName = input_df.iloc[in_idx]['matrix name']
+   df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
+   print( out_idx, ":", in_idx, "/", len(input_df.index), ":", matrixName )
+   aux_df = pd.DataFrame( df_data, columns = multiColumns, index = [out_idx] )
+   for index,row in df_matrix.iterrows():
+      aux_df.iloc[0]['Matrix name'] = row['matrix name']
+      aux_df.iloc[0]['rows']        = row['rows']
+      aux_df.iloc[0]['columns']     = row['columns']
+      current_format = row['format']
+      current_device = row['device']
+      #print( current_format + " / " + current_device )
+      aux_df.iloc[0][(current_format,current_device,'bandwidth')]   = row['bandwidth']
+      #aux_df.iloc[0][(current_format,current_device,'time')]        = row['time']
+      #aux_df.iloc[0][(current_format,current_device,'speed-up')]    = row['speedup']
+      #aux_df.iloc[0][(current_format,current_device,'non-zeros')]   = row['non-zeros']
+      #aux_df.iloc[0][(current_format,current_device,'stddev')]      = row['stddev']
+      #aux_df.iloc[0][(current_format,current_device,'stddev/time')] = row['stddev/time']
+      #aux_df.iloc[0][(current_format,current_device,'diff.max')]    = row['CSR Diff.Max']
+      #aux_df.iloc[0][(current_format,current_device,'diff.l2')]    = row['CSR Diff.L2']
+
+   frames.append( aux_df )
+   out_idx = out_idx + 1
+   in_idx = in_idx + len(df_matrix.index)
+
+print( "Merging data into one frame..." )
+result = pd.concat( frames )
+
+print( "Setting data types..." )
+for format in formats:
+   for device in ['CPU','GPU']:
+      #df['eps'] = pd.to_numeric(df['eps'], errors='coerce')
+      print(result[(format,device,'bandwidth')].toList())
+      result[(format,device,'bandwidth')] = pd.to_numeric( result[(format,device,'bandwidth')], errors='coerce' )
+      #result[(format,device,'time')].astype('float64')
+      #result[(format,device,'speed-up')].astype('float64')
+      #result[(format,device,'non-zeros')].astype('int64')
+      #result[(format,device,'stddev')].astype('float64')
+      #result[(format,device,'stddev/time')].astype('float64')
+      #result[(format,device,'diff.max')].astype('float64')
+      #result[(format,device,'diff.l2')].astype('float64')
+
+print( "Writting to HTML file..." )
+result.to_html( 'output.html' )
+
+
+
+#result.sort_values(by=[('cusparse','GPU','bandwidth')],inplace=True,ascending=False)
+#for format in formats:
+#   cusparse_bw = result[('cusparse','GPU','bandwidth')].toList()
+#   format_bw = result[(format,'GPU','bandwidth')].toList()
+#
+
+#for format in formats:
+#   result.sort_values(by=[(format,'GPU','bandwidth')],inplace=True,ascending=False)
diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
index 8899dc9eb..3459643fd 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py
@@ -154,8 +154,8 @@ Sort by comparison formats
 formats_comparison = defaultdict( list )
 for format in gpu_comparison_formats:
    df.sort_values(by=[f"{format} Bandwidth"],inplace=True,ascending=False)
-   formats_comparison[ format ] = df[format, "GPU", "bandwidth"].tolist();
-   formats_comparison[ gpu_comparison_formats[ format ] ] = df[gpu_comparison_formats[ format ], "GPU", "bandwidth"].tolist();
+   formats_comparison[ format ] = df[format, "GPU", "bandwidth"].tolist()
+   formats_comparison[ gpu_comparison_formats[ format ] ] = df[gpu_comparison_formats[ format ], "GPU", "bandwidth"].tolist()
 
 """
 Writting gnuplot source files
-- 
GitLab


From fc488c4d99778b74e27e820b06c9ea244f34ebde Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 4 Jun 2021 18:57:35 +0200
Subject: [PATCH 065/117] Implementing Python script to analyze SpMV benchmarks
 results.

---
 .../tnl-spmv-benchmark-make-tables-json.py    | 180 +++++++++++++++---
 1 file changed, 156 insertions(+), 24 deletions(-)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
index 2be5fc6ac..e5fafe92f 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
@@ -1,14 +1,94 @@
 #!/usr/bin/python3
 
+import os
 import json
 import pandas as pd
 from pandas.io.json import json_normalize
+import matplotlib.pyplot as plt
+import numpy as np
 
-
+####
+# Helper function
 def slugify(s):
    s = str(s).strip().replace(' ', '_')
    return re.sub(r'(?u)[^-\w.]', '', s)
 
+####
+# Comparison with Cusparse
+def cusparse_comparison( df, formats ):
+   if not os.path.exists("Cusparse-bw"):
+      os.mkdir("Cusparse-bw")
+   df.sort_values(by=[('cusparse','GPU','bandwidth')],inplace=True,ascending=False)
+   for format in formats:
+      if not format in ['cusparse','CSR']:
+         print( f"Writing comparison of {format} and Cusparse" )
+         t = np.arange(df[(format,'GPU','bandwidth')].size )
+         fig, axs = plt.subplots( 2, 1 )
+         axs[0].plot( t, df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[0].plot( t, df[('cusparse','GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[0].legend( [ format, 'Cusparse' ], loc='upper right' )
+         axs[0].set_ylabel( 'Bandwidth in GB/sec' )
+         axs[1].set_yscale( 'log' )
+         axs[1].plot( t, df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[1].plot( t, df[('cusparse','GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[1].legend( [ format, 'Cusparse' ], loc='upper right' )
+         axs[1].set_xlabel( 'Matrix ID - sorted w.r.t. Cusparse' )
+         axs[1].set_ylabel( 'Bandwidth in GB/sec' )
+         plt.savefig( f"Cusparse-bw/{format}.pdf" )
+         plt.close(fig)
+
+####
+# Comparison with CSR on CPU
+def csr_comparison( df, formats ):
+   if not os.path.exists("CSR-bw"):
+      os.mkdir("CSR-bw")
+   for format in formats:
+      if not format in ['cusparse','CSR']:
+         print( f"Writing comparison of {format} and CSR on CPU" )
+         result.sort_values(by=[(format,'GPU','bandwidth')],inplace=True,ascending=False)
+         fig, axs = plt.subplots( 2, 1 )
+         t = np.arange(result[(format,'GPU','bandwidth')].size )
+         axs[0].plot( t, result[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[0].plot( t, result[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[0].legend( [ format, 'CSR on CPU' ], loc='upper right' )
+         axs[1].set_yscale( 'log' )
+         axs[1].plot( t, result[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[1].plot( t, result[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[1].legend( [ format, 'CSR on CPU' ], loc='upper right' )
+         axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {format}" )
+         axs[1].set_ylabel( 'Bandwidth in GB/sec' )
+         plt.savefig( f"CSR-bw/{format}.pdf")
+         plt.close(fig)
+
+####
+# Comparison of Legacy formats
+def legacy_formats_comparison( df, formats ):
+   if not os.path.exists("Legacy-bw"):
+      os.mkdir("Legacy-bw")
+   for ref_format, legacy_format in [ ('Ellpack', 'Ellpack Legacy'),
+                                    ('SlicedEllpack', 'SlicedEllpack Legacy'),
+                                    ('ChunkedEllpack', 'ChunkedEllpack Legacy'),
+                                    ('BiEllpack', 'BiEllpack Legacy'),
+                                    ('CSR< Adaptive >', 'CSR Legacy Adaptive'),
+                                    ('CSR< Scalar >', 'CSR Legacy Scalar'),
+                                    ('CSR< Vector >', 'CSR Legacy Vector') ]:
+      if ref_format in formats and legacy_format in formats:
+         print( f"Writing comparison of {ref_format} and {legacy_format}" )
+         result.sort_values(by=[(ref_format,'GPU','bandwidth')],inplace=True,ascending=False)
+         fig, axs = plt.subplots( 2, 1 )
+         t = np.arange(result[(ref_format,'GPU','bandwidth')].size )
+         axs[0].plot( t, result[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[0].plot( t, result[(legacy_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[0].legend( [ ref_format, legacy_format ], loc='upper right' )
+         axs[1].set_yscale( 'log' )
+         axs[1].plot( t, result[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[1].plot( t, result[(legacy_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[1].legend( [ ref_format, legacy_format ], loc='upper right' )
+         axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {ref_format}" )
+         axs[1].set_ylabel( 'Bandwidth in GB/sec' )
+         plt.savefig( f"Legacy-bw/{ref_format}.pdf")
+         plt.close(fig)
+
 ####
 # Parse input file
 print( "Parsing input file...." )
@@ -17,16 +97,19 @@ with open('sparse-matrix-benchmark.log') as f:
 input_df = json_normalize( d, record_path=['results'] )
 #input_df.to_html( "orig-pandas.html" )
 
+
 ####
 # Create multiindex for columns
 
 # Get format names - TODO: the first benchmark might not have all of them
 matrixName = input_df.iloc[0]['matrix name']
 df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
-formats = df_matrix.loc[:,'format']
+formats = df_matrix.loc[:,'format'].values.tolist()
+formats = list(dict.fromkeys(formats)) # remove duplicates
 level1 = [ 'Matrix name', 'rows', 'columns' ]
 level2 = [ '',            '',     ''        ]
 level3 = [ '',            '',     ''        ]
+level4 = [ '',            '',     ''        ]
 df_data = [[ ' ',' ',' ']]
 for format in formats:
    for device in ['CPU','GPU']:
@@ -34,13 +117,21 @@ for format in formats:
          level1.append( format )
          level2.append( device )
          level3.append( data )
+         level4.append( '' )
+         df_data[ 0 ].append( ' ' )
+   if not format in [ 'cusparse', 'CSR' ]:
+      for speedup in [ 'cusparse', 'CSR CPU']:
+         level1.append( format )
+         level2.append( 'GPU' )
+         level3.append( 'speed-up')
+         level4.append( speedup )
          df_data[ 0 ].append( ' ' )
-multiColumns = pd.MultiIndex.from_arrays([ level1, level2, level3 ] )
+multiColumns = pd.MultiIndex.from_arrays([ level1, level2, level3, level4 ] )
 frames = []
 
 in_idx = 0
 out_idx = 0
-max_out_idx = 50
+max_out_idx = 10
 print( "Converting data..." )
 while in_idx < len(input_df.index) and out_idx < max_out_idx:
    matrixName = input_df.iloc[in_idx]['matrix name']
@@ -54,7 +145,7 @@ while in_idx < len(input_df.index) and out_idx < max_out_idx:
       current_format = row['format']
       current_device = row['device']
       #print( current_format + " / " + current_device )
-      aux_df.iloc[0][(current_format,current_device,'bandwidth')]   = row['bandwidth']
+      aux_df.iloc[0][(current_format,current_device,'bandwidth','')]   = pd.to_numeric(row['bandwidth'], errors='coerce')
       #aux_df.iloc[0][(current_format,current_device,'time')]        = row['time']
       #aux_df.iloc[0][(current_format,current_device,'speed-up')]    = row['speedup']
       #aux_df.iloc[0][(current_format,current_device,'non-zeros')]   = row['non-zeros']
@@ -70,30 +161,71 @@ while in_idx < len(input_df.index) and out_idx < max_out_idx:
 print( "Merging data into one frame..." )
 result = pd.concat( frames )
 
-print( "Setting data types..." )
 for format in formats:
-   for device in ['CPU','GPU']:
-      #df['eps'] = pd.to_numeric(df['eps'], errors='coerce')
-      print(result[(format,device,'bandwidth')].toList())
-      result[(format,device,'bandwidth')] = pd.to_numeric( result[(format,device,'bandwidth')], errors='coerce' )
-      #result[(format,device,'time')].astype('float64')
-      #result[(format,device,'speed-up')].astype('float64')
-      #result[(format,device,'non-zeros')].astype('int64')
-      #result[(format,device,'stddev')].astype('float64')
-      #result[(format,device,'stddev/time')].astype('float64')
-      #result[(format,device,'diff.max')].astype('float64')
-      #result[(format,device,'diff.l2')].astype('float64')
+   if not format in [ 'cusparse', 'CSR' ]:
+      print( 'Adding speed-up for ', format )
+      format_bdw_list = result[(format,'GPU','bandwidth')]
+      cusparse_bdw_list = result[('cusparse','GPU','bandwidth')]
+      csr_bdw_list = result[('CSR','CPU','bandwidth')]
+      cusparse_speedup_list = []
+      csr_speedup_list = []
+      for ( format_bdw, cusparse_bdw, csr_bdw ) in zip( format_bdw_list, cusparse_bdw_list,csr_bdw_list ):
+         try:
+            cusparse_speedup_list.append( format_bdw / cusparse_bdw )
+         except:
+            cusparse_speedup_list.append('')
+         try:
+            csr_speedup_list.append( format_bdw / csr_bdw )
+         except:
+            csr_speedup_list.append('')
+         #print( f'**{type(format_bdw)}** -- {type(5.2)}' )
+         #if type(format_bdw) == "<class 'numpy.float64'>":
+         #   print( f'##########{format_bdw / cusparse_bdw}' )
+         #   cusparse_speedup_list.append( format_bdw / cusparse_bdw )
+         #   csr_speedup_list.append( format_bdw / csr_bdw )
+         #else:
+         #   cusparse_speedup_list.append('')
+         #   csr_speedup_list.append('')
+
+      result[(format,'GPU','speed-up','cusparse')] = cusparse_speedup_list
+      result[(format,'GPU','speed-up','CSR CPU')] = csr_speedup_list
 
 print( "Writting to HTML file..." )
 result.to_html( 'output.html' )
 
+result.replace( to_replace=' ',value=np.nan,inplace=True)
 
+####
+# Generate report = tables and figures
 
-#result.sort_values(by=[('cusparse','GPU','bandwidth')],inplace=True,ascending=False)
-#for format in formats:
-#   cusparse_bw = result[('cusparse','GPU','bandwidth')].toList()
-#   format_bw = result[(format,'GPU','bandwidth')].toList()
-#
+#cusparse_comparison( result, formats )
+#csr_comparison( result, formats )
+#legacy_formats_comparison( result, formats )
 
-#for format in formats:
-#   result.sort_values(by=[(format,'GPU','bandwidth')],inplace=True,ascending=False)
+####
+# Comparison of speed-up w.r.t. Cusparse
+if not os.path.exists("Cusparse-speed-up"):
+   os.mkdir("Cusparse-speed-up")
+for format in formats:
+   if not format in ['cusparse','CSR']:
+      print( f"Writing comparison of speed-up of {format} compared to Cusparse" )
+      result['tmp'] = result[(format, 'GPU','bandwidth')]
+      filtered_df=result.dropna(subset=['rows'])
+      filtered_df.to_html( 'tmp.html')
+      break
+      filtered_df.sort_values(by=[(format,'GPU','speed-up','cusparse')],inplace=True,ascending=False)
+      fig, axs = plt.subplots( 2, 1 )
+      size = result[(format,'GPU','bandwidth')].size
+      t = np.arange( size )
+      bar = np.full( size, 1 )
+      axs[0].plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
+      axs[0].plot( t, bar, '-', ms=1, lw=1 )
+      axs[0].legend( [ format, 'Cusparse' ], loc='upper right' )
+      axs[1].set_yscale( 'log' )
+      axs[1].plot( t, result[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
+      axs[1].plot( t, bar, '-', ms=1, lw=1 )
+      axs[1].legend( [ format, 'Cusparse' ], loc='upper right' )
+      axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {format}" )
+      axs[1].set_ylabel( 'Bandwidth in GB/sec' )
+      plt.savefig( f"Cusparse-speed-up/{format}.pdf")
+      plt.close(fig)
-- 
GitLab


From 15930e8a77251077e1b2e5fd30604b01a1727164 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 7 Jun 2021 12:21:20 +0200
Subject: [PATCH 066/117] Python script for SpMV benchmark results processing
 is working well.

---
 .../tnl-spmv-benchmark-make-tables-json.py    | 311 ++++++++++--------
 1 file changed, 176 insertions(+), 135 deletions(-)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
index e5fafe92f..1f497647f 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
@@ -13,33 +13,141 @@ def slugify(s):
    s = str(s).strip().replace(' ', '_')
    return re.sub(r'(?u)[^-\w.]', '', s)
 
+####
+# Extract all formats
+def get_formats( input_df ):
+   matrixName = input_df.iloc[0]['matrix name']
+   df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
+   formats = df_matrix.loc[:,'format'].values.tolist() # Get format names - TODO: the first benchmark might not have all of them
+   formats = list(dict.fromkeys(formats))              # remove duplicates
+   return formats
+
+####
+# Create multiindex for columns
+def get_multiindex( input_df, formats ):
+   level1 = [ 'Matrix name', 'rows', 'columns' ]
+   level2 = [ '',            '',     ''        ]
+   level3 = [ '',            '',     ''        ]
+   level4 = [ '',            '',     ''        ]
+   df_data = [[ ' ',' ',' ']]
+   for format in formats:
+      for device in ['CPU','GPU']:
+         for data in ['bandwidth' ]: #,'time','speed-up','non-zeros','stddev','stddev/time','diff.max','diff.l2']:
+            level1.append( format )
+            level2.append( device )
+            level3.append( data )
+            level4.append( '' )
+            df_data[ 0 ].append( ' ' )
+      if not format in [ 'cusparse', 'CSR' ]:
+         for speedup in [ 'cusparse', 'CSR CPU']:
+            level1.append( format )
+            level2.append( 'GPU' )
+            level3.append( 'speed-up')
+            level4.append( speedup )
+            df_data[ 0 ].append( ' ' )
+   multiColumns = pd.MultiIndex.from_arrays([ level1, level2, level3, level4 ] )
+   return multiColumns, df_data
+
+####
+# Convert input table to better structured one
+def convert_data_frame( input_df, multicolumns, df_data, max_rows = -1 ):
+   frames = []
+   in_idx = 0
+   out_idx = 0
+   max_out_idx = max_rows
+   if max_out_idx == -1:
+      max_out_idx = len(input_df.index)
+   while in_idx < len(input_df.index) and out_idx < max_out_idx:
+      matrixName = input_df.iloc[in_idx]['matrix name']
+      df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
+      print( out_idx, ":", in_idx, "/", len(input_df.index), ":", matrixName )
+      aux_df = pd.DataFrame( df_data, columns = multicolumns, index = [out_idx] )
+      for index,row in df_matrix.iterrows():
+         aux_df.iloc[0]['Matrix name'] = row['matrix name']
+         aux_df.iloc[0]['rows']        = row['rows']
+         aux_df.iloc[0]['columns']     = row['columns']
+         current_format = row['format']
+         current_device = row['device']
+         #print( current_format + " / " + current_device )
+         aux_df.iloc[0][(current_format,current_device,'bandwidth','')]   = pd.to_numeric(row['bandwidth'], errors='coerce')
+         #aux_df.iloc[0][(current_format,current_device,'time')]        = row['time']
+         #aux_df.iloc[0][(current_format,current_device,'speed-up')]    = row['speedup']
+         #aux_df.iloc[0][(current_format,current_device,'non-zeros')]   = row['non-zeros']
+         #aux_df.iloc[0][(current_format,current_device,'stddev')]      = row['stddev']
+         #aux_df.iloc[0][(current_format,current_device,'stddev/time')] = row['stddev/time']
+         #aux_df.iloc[0][(current_format,current_device,'diff.max')]    = row['CSR Diff.Max']
+         #aux_df.iloc[0][(current_format,current_device,'diff.l2')]    = row['CSR Diff.L2']
+      frames.append( aux_df )
+      out_idx = out_idx + 1
+      in_idx = in_idx + len(df_matrix.index)
+   result = pd.concat( frames )
+   return result
+
+####
+# Compute speed-up of particular formats compared to Cusparse on GPU and CSR on CPU
+def compute_speedup( df, formats ):
+   for format in formats:
+      if not format in [ 'cusparse', 'CSR' ]:
+         print( 'Adding speed-up for ', format )
+         format_bdw_list = df[(format,'GPU','bandwidth')]
+         cusparse_bdw_list = df[('cusparse','GPU','bandwidth')]
+         csr_bdw_list = df[('CSR','CPU','bandwidth')]
+         cusparse_speedup_list = []
+         csr_speedup_list = []
+         for ( format_bdw, cusparse_bdw, csr_bdw ) in zip( format_bdw_list, cusparse_bdw_list,csr_bdw_list ):
+            try:
+               cusparse_speedup_list.append( format_bdw / cusparse_bdw )
+            except:
+               cusparse_speedup_list.append('')
+            try:
+               csr_speedup_list.append( format_bdw / csr_bdw )
+            except:
+               csr_speedup_list.append('')
+            #print( f'**{type(format_bdw)}** -- {type(5.2)}' )
+            #if type(format_bdw) == "<class 'numpy.float64'>":
+            #   print( f'##########{format_bdw / cusparse_bdw}' )
+            #   cusparse_speedup_list.append( format_bdw / cusparse_bdw )
+            #   csr_speedup_list.append( format_bdw / csr_bdw )
+            #else:
+            #   cusparse_speedup_list.append('')
+            #   csr_speedup_list.append('')
+         df[(format,'GPU','speed-up','cusparse')] = cusparse_speedup_list
+         df[(format,'GPU','speed-up','CSR CPU')] = csr_speedup_list
+
 ####
 # Comparison with Cusparse
-def cusparse_comparison( df, formats ):
+def cusparse_comparison( df, formats, head_size=10 ):
    if not os.path.exists("Cusparse-bw"):
       os.mkdir("Cusparse-bw")
    df.sort_values(by=[('cusparse','GPU','bandwidth')],inplace=True,ascending=False)
    for format in formats:
       if not format in ['cusparse','CSR']:
          print( f"Writing comparison of {format} and Cusparse" )
-         t = np.arange(df[(format,'GPU','bandwidth')].size )
+         filtered_df = df.dropna( subset=[(format,'GPU','bandwidth','')] )
+         t = np.arange(filtered_df[(format,'GPU','bandwidth')].size )
          fig, axs = plt.subplots( 2, 1 )
-         axs[0].plot( t, df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[0].plot( t, df[('cusparse','GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[0].plot( t, filtered_df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[0].plot( t, filtered_df[('cusparse','GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[0].legend( [ format, 'Cusparse' ], loc='upper right' )
          axs[0].set_ylabel( 'Bandwidth in GB/sec' )
          axs[1].set_yscale( 'log' )
-         axs[1].plot( t, df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[1].plot( t, df[('cusparse','GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[1].plot( t, filtered_df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[1].plot( t, filtered_df[('cusparse','GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[1].legend( [ format, 'Cusparse' ], loc='upper right' )
          axs[1].set_xlabel( 'Matrix ID - sorted w.r.t. Cusparse' )
          axs[1].set_ylabel( 'Bandwidth in GB/sec' )
          plt.savefig( f"Cusparse-bw/{format}.pdf" )
          plt.close(fig)
+         head_df = filtered_df.head( head_size )
+         for f in formats:
+            if not f in ['cusparse','CSR',format]:
+               print( f"Droping {f}..." )
+               head_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         head_df.to_html( f"Cusparse-bw/{format}-head.html" )
 
 ####
 # Comparison with CSR on CPU
-def csr_comparison( df, formats ):
+def csr_comparison( df, formats, head_size=10 ):
    if not os.path.exists("CSR-bw"):
       os.mkdir("CSR-bw")
    for format in formats:
@@ -59,10 +167,16 @@ def csr_comparison( df, formats ):
          axs[1].set_ylabel( 'Bandwidth in GB/sec' )
          plt.savefig( f"CSR-bw/{format}.pdf")
          plt.close(fig)
+         head_df = filtered_df.head( head_size )
+         for f in formats:
+            if not f in ['cusparse','CSR',format]:
+               print( f"Droping {f}..." )
+               head_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         head_df.to_html( f"CSR-bw/{format}-head.html" )
 
 ####
 # Comparison of Legacy formats
-def legacy_formats_comparison( df, formats ):
+def legacy_formats_comparison( df, formats, head_size=10 ):
    if not os.path.exists("Legacy-bw"):
       os.mkdir("Legacy-bw")
    for ref_format, legacy_format in [ ('Ellpack', 'Ellpack Legacy'),
@@ -74,20 +188,59 @@ def legacy_formats_comparison( df, formats ):
                                     ('CSR< Vector >', 'CSR Legacy Vector') ]:
       if ref_format in formats and legacy_format in formats:
          print( f"Writing comparison of {ref_format} and {legacy_format}" )
-         result.sort_values(by=[(ref_format,'GPU','bandwidth')],inplace=True,ascending=False)
+         df.sort_values(by=[(ref_format,'GPU','bandwidth')],inplace=True,ascending=False)
          fig, axs = plt.subplots( 2, 1 )
-         t = np.arange(result[(ref_format,'GPU','bandwidth')].size )
-         axs[0].plot( t, result[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[0].plot( t, result[(legacy_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         t = np.arange(df[(ref_format,'GPU','bandwidth')].size )
+         axs[0].plot( t, df[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[0].plot( t, df[(legacy_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[0].legend( [ ref_format, legacy_format ], loc='upper right' )
          axs[1].set_yscale( 'log' )
-         axs[1].plot( t, result[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[1].plot( t, result[(legacy_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[1].plot( t, df[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[1].plot( t, df[(legacy_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[1].legend( [ ref_format, legacy_format ], loc='upper right' )
          axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {ref_format}" )
          axs[1].set_ylabel( 'Bandwidth in GB/sec' )
          plt.savefig( f"Legacy-bw/{ref_format}.pdf")
          plt.close(fig)
+         head_df = df.head( head_size ).copy() # df is already sorted by ref_format; copy so the drop below cannot touch df
+         for f in formats:
+            if not f in ['cusparse','CSR',ref_format,legacy_format]:
+               print( f"Droping {f}..." )
+               head_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         head_df.to_html( f"Legacy-bw/{ref_format}-head.html" )
+
+####
+# Comparison of speed-up w.r.t. Cusparse
+def cusparse_speedup_comparison( df, formats, head_size=10 ):
+   # For each format except cusparse/CSR: plot its GPU speed-up w.r.t. Cusparse and export the top head_size rows to HTML
+   os.makedirs( "Cusparse-speed-up", exist_ok=True )
+   for format in formats:
+      if not format in ['cusparse','CSR']:
+         print( f"Writing comparison of speed-up of {format} compared to Cusparse" )
+         # keep only matrices with a GPU bandwidth for this format; no helper column is added to the caller's df
+         filtered_df = df.dropna( subset=[(format,'GPU','bandwidth','')] )
+         filtered_df = filtered_df.sort_values(by=[(format,'GPU','speed-up','cusparse')],ascending=False)
+         fig, axs = plt.subplots( 2, 1 )
+         size = len(filtered_df[(format,'GPU','speed-up','cusparse')].index)
+         t = np.arange( size )
+         bar = np.full( size, 1 ) # constant line at speed-up 1, plotted as the 'Cusparse' reference
+         axs[0].plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
+         axs[0].plot( t, bar, '-', ms=1, lw=1 )
+         axs[0].legend( [ format, 'Cusparse' ], loc='upper right' )
+         axs[1].set_yscale( 'log' )
+         axs[1].plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
+         axs[1].plot( t, bar, '-', ms=1, lw=1 )
+         axs[1].legend( [ format, 'Cusparse' ], loc='upper right' )
+         axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {format}" )
+         axs[1].set_ylabel( 'Speed-up w.r.t. Cusparse' )
+         plt.savefig( f"Cusparse-speed-up/{format}.pdf")
+         plt.close(fig)
+         head_df = filtered_df.head( head_size ).copy() # copy so the in-place drops below act on an independent frame
+         for f in formats:
+            if not f in ['cusparse','CSR',format]:
+               print( f"Droping {f}..." )
+               head_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         head_df.to_html( f"Cusparse-speed-up/{format}-head.html" )
 
 ####
 # Parse input file
@@ -97,98 +250,12 @@ with open('sparse-matrix-benchmark.log') as f:
 input_df = json_normalize( d, record_path=['results'] )
 #input_df.to_html( "orig-pandas.html" )
 
+formats = get_formats( input_df )
+multicolumns, df_data = get_multiindex( input_df, formats )
 
-####
-# Create multiindex for columns
-
-# Get format names - TODO: the first benchmark might not have all of them
-matrixName = input_df.iloc[0]['matrix name']
-df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
-formats = df_matrix.loc[:,'format'].values.tolist()
-formats = list(dict.fromkeys(formats)) # remove duplicates
-level1 = [ 'Matrix name', 'rows', 'columns' ]
-level2 = [ '',            '',     ''        ]
-level3 = [ '',            '',     ''        ]
-level4 = [ '',            '',     ''        ]
-df_data = [[ ' ',' ',' ']]
-for format in formats:
-   for device in ['CPU','GPU']:
-      for data in ['bandwidth' ]: #,'time','speed-up','non-zeros','stddev','stddev/time','diff.max','diff.l2']:
-         level1.append( format )
-         level2.append( device )
-         level3.append( data )
-         level4.append( '' )
-         df_data[ 0 ].append( ' ' )
-   if not format in [ 'cusparse', 'CSR' ]:
-      for speedup in [ 'cusparse', 'CSR CPU']:
-         level1.append( format )
-         level2.append( 'GPU' )
-         level3.append( 'speed-up')
-         level4.append( speedup )
-         df_data[ 0 ].append( ' ' )
-multiColumns = pd.MultiIndex.from_arrays([ level1, level2, level3, level4 ] )
-frames = []
-
-in_idx = 0
-out_idx = 0
-max_out_idx = 10
 print( "Converting data..." )
-while in_idx < len(input_df.index) and out_idx < max_out_idx:
-   matrixName = input_df.iloc[in_idx]['matrix name']
-   df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
-   print( out_idx, ":", in_idx, "/", len(input_df.index), ":", matrixName )
-   aux_df = pd.DataFrame( df_data, columns = multiColumns, index = [out_idx] )
-   for index,row in df_matrix.iterrows():
-      aux_df.iloc[0]['Matrix name'] = row['matrix name']
-      aux_df.iloc[0]['rows']        = row['rows']
-      aux_df.iloc[0]['columns']     = row['columns']
-      current_format = row['format']
-      current_device = row['device']
-      #print( current_format + " / " + current_device )
-      aux_df.iloc[0][(current_format,current_device,'bandwidth','')]   = pd.to_numeric(row['bandwidth'], errors='coerce')
-      #aux_df.iloc[0][(current_format,current_device,'time')]        = row['time']
-      #aux_df.iloc[0][(current_format,current_device,'speed-up')]    = row['speedup']
-      #aux_df.iloc[0][(current_format,current_device,'non-zeros')]   = row['non-zeros']
-      #aux_df.iloc[0][(current_format,current_device,'stddev')]      = row['stddev']
-      #aux_df.iloc[0][(current_format,current_device,'stddev/time')] = row['stddev/time']
-      #aux_df.iloc[0][(current_format,current_device,'diff.max')]    = row['CSR Diff.Max']
-      #aux_df.iloc[0][(current_format,current_device,'diff.l2')]    = row['CSR Diff.L2']
-
-   frames.append( aux_df )
-   out_idx = out_idx + 1
-   in_idx = in_idx + len(df_matrix.index)
-
-print( "Merging data into one frame..." )
-result = pd.concat( frames )
-
-for format in formats:
-   if not format in [ 'cusparse', 'CSR' ]:
-      print( 'Adding speed-up for ', format )
-      format_bdw_list = result[(format,'GPU','bandwidth')]
-      cusparse_bdw_list = result[('cusparse','GPU','bandwidth')]
-      csr_bdw_list = result[('CSR','CPU','bandwidth')]
-      cusparse_speedup_list = []
-      csr_speedup_list = []
-      for ( format_bdw, cusparse_bdw, csr_bdw ) in zip( format_bdw_list, cusparse_bdw_list,csr_bdw_list ):
-         try:
-            cusparse_speedup_list.append( format_bdw / cusparse_bdw )
-         except:
-            cusparse_speedup_list.append('')
-         try:
-            csr_speedup_list.append( format_bdw / csr_bdw )
-         except:
-            csr_speedup_list.append('')
-         #print( f'**{type(format_bdw)}** -- {type(5.2)}' )
-         #if type(format_bdw) == "<class 'numpy.float64'>":
-         #   print( f'##########{format_bdw / cusparse_bdw}' )
-         #   cusparse_speedup_list.append( format_bdw / cusparse_bdw )
-         #   csr_speedup_list.append( format_bdw / csr_bdw )
-         #else:
-         #   cusparse_speedup_list.append('')
-         #   csr_speedup_list.append('')
-
-      result[(format,'GPU','speed-up','cusparse')] = cusparse_speedup_list
-      result[(format,'GPU','speed-up','CSR CPU')] = csr_speedup_list
+result = convert_data_frame( input_df, multicolumns, df_data, 200 )
+compute_speedup( result, formats )
 
 print( "Writting to HTML file..." )
 result.to_html( 'output.html' )
@@ -197,35 +264,9 @@ result.replace( to_replace=' ',value=np.nan,inplace=True)
 
 ####
 # Generate report = tables and figures
+head_size = 10
+cusparse_comparison( result, formats, head_size )
+csr_comparison( result, formats, head_size )
+legacy_formats_comparison( result, formats, head_size )
+cusparse_speedup_comparison( result, formats, head_size )
 
-#cusparse_comparison( result, formats )
-#csr_comparison( result, formats )
-#legacy_formats_comparison( result, formats )
-
-####
-# Comparison of speed-up w.r.t. Cusparse
-if not os.path.exists("Cusparse-speed-up"):
-   os.mkdir("Cusparse-speed-up")
-for format in formats:
-   if not format in ['cusparse','CSR']:
-      print( f"Writing comparison of speed-up of {format} compared to Cusparse" )
-      result['tmp'] = result[(format, 'GPU','bandwidth')]
-      filtered_df=result.dropna(subset=['rows'])
-      filtered_df.to_html( 'tmp.html')
-      break
-      filtered_df.sort_values(by=[(format,'GPU','speed-up','cusparse')],inplace=True,ascending=False)
-      fig, axs = plt.subplots( 2, 1 )
-      size = result[(format,'GPU','bandwidth')].size
-      t = np.arange( size )
-      bar = np.full( size, 1 )
-      axs[0].plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
-      axs[0].plot( t, bar, '-', ms=1, lw=1 )
-      axs[0].legend( [ format, 'Cusparse' ], loc='upper right' )
-      axs[1].set_yscale( 'log' )
-      axs[1].plot( t, result[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
-      axs[1].plot( t, bar, '-', ms=1, lw=1 )
-      axs[1].legend( [ format, 'Cusparse' ], loc='upper right' )
-      axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {format}" )
-      axs[1].set_ylabel( 'Bandwidth in GB/sec' )
-      plt.savefig( f"Cusparse-speed-up/{format}.pdf")
-      plt.close(fig)
-- 
GitLab


From d4ea98b34def22bdf7c4bd352c1005b6912c9dbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 7 Jun 2021 13:10:26 +0200
Subject: [PATCH 067/117] Splitting source code of Benchmark into two files,
 added test for existence of the log file.

---
 src/Benchmarks/Benchmark.hpp             | 312 ++++++++++++++++++
 src/Benchmarks/Benchmarks.h              | 399 +++++++----------------
 src/Benchmarks/JsonLogging.h             |   6 +-
 src/Benchmarks/Logging.h                 |   3 +-
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h |  20 +-
 5 files changed, 458 insertions(+), 282 deletions(-)
 create mode 100644 src/Benchmarks/Benchmark.hpp

diff --git a/src/Benchmarks/Benchmark.hpp b/src/Benchmarks/Benchmark.hpp
new file mode 100644
index 000000000..e2357990a
--- /dev/null
+++ b/src/Benchmarks/Benchmark.hpp
@@ -0,0 +1,312 @@
+/***************************************************************************
+                          Benchmark.hpp  -  description
+                             -------------------
+    begin                : Jun 7, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky,
+//                 Tomas Oberhuber
+
+#pragma once
+
+#include "FunctionTimer.h"
+#include "Logging.h"
+
+#include <iostream>
+#include <exception>
+#include <limits>
+
+#include <TNL/String.h>
+
+#include <TNL/Devices/Host.h>
+#include <TNL/SystemInfo.h>
+#include <TNL/Cuda/DeviceInfo.h>
+#include <TNL/Config/ConfigDescription.h>
+#include <TNL/MPI/Wrappers.h>
+
+namespace TNL {
+namespace Benchmarks {
+
+
+template< typename Logger >
+Benchmark< Logger >::
+Benchmark( int loops,
+           bool verbose,
+           String outputMode,
+           bool logFileAppend )
+: Logger(verbose, outputMode, logFileAppend), loops(loops)
+{}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+configSetup( Config::ConfigDescription& config )
+{
+   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+   config.addEntry< bool >( "reset", "Call reset function between loops.", true );
+   config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 );
+   config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+setup( const Config::ParameterContainer& parameters )
+{
+   this->loops = parameters.getParameter< int >( "loops" );
+   this->reset = parameters.getParameter< bool >( "reset" );
+   this->minTime = parameters.getParameter< double >( "min-time" );
+   const int verbose = parameters.getParameter< int >( "verbose" );
+   Logger::setVerbose( verbose );
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+setLoops( int loops )
+{
+   this->loops = loops;
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+setMinTime( const double& minTime )
+{
+   this->minTime = minTime;
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+newBenchmark( const String & title )
+{
+   Logger::closeTable();
+   Logger::writeTitle( title );
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+newBenchmark( const String & title,
+               MetadataMap metadata )
+{
+   Logger::closeTable();
+   Logger::writeTitle( title );
+   // add loops and reset flag to metadata
+   metadata["loops"] = convertToString(loops);
+   metadata["reset"] = convertToString( reset );
+   metadata["minimal test time"] = convertToString( minTime );
+   Logger::writeMetadata( metadata );
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+setMetadataColumns( const MetadataColumns & metadata )
+{
+   if( Logger::metadataColumns != metadata )
+      Logger::header_changed = true;
+   Logger::metadataColumns = metadata;
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+setOperation( const String & operation,
+              const double datasetSize,
+              const double baseTime )
+{
+   monitor.setStage( operation.getString() );
+   if( Logger::metadataColumns.size() > 0 && String(Logger::metadataColumns[ 0 ].first) == "operation" ) {
+      Logger::metadataColumns[ 0 ].second = operation;
+   }
+   else {
+      Logger::metadataColumns.insert( Logger::metadataColumns.begin(), {"operation", operation} );
+   }
+   setOperation( datasetSize, baseTime );
+   Logger::header_changed = true;
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+setOperation( const double datasetSize,
+              const double baseTime )
+{
+   this->datasetSize = datasetSize;
+   this->baseTime = baseTime;
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+createHorizontalGroup( const String & name,
+                       int subcolumns )
+{
+   if( Logger::horizontalGroups.size() == 0 ) {
+      Logger::horizontalGroups.push_back( {name, subcolumns} );
+   }
+   else {
+      auto & last = Logger::horizontalGroups.back();
+      if( last.first != name && last.second > 0 ) {
+         Logger::horizontalGroups.push_back( {name, subcolumns} );
+      }
+      else {
+         last.first = name;
+         last.second = subcolumns;
+      }
+   }
+}
+
+template< typename Logger >
+   template< typename Device,
+             typename ResetFunction,
+             typename ComputeFunction >
+double
+Benchmark< Logger >::
+time( ResetFunction reset,
+      const String & performer,
+      ComputeFunction & compute,
+      BenchmarkResult< Logger > & result )
+{
+   result.time = std::numeric_limits<double>::quiet_NaN();
+   result.stddev = std::numeric_limits<double>::quiet_NaN();
+   FunctionTimer< Device > functionTimer;
+   try {
+      if( Logger::verbose > 1 ) {
+         // run the monitor main loop
+         Solvers::SolverMonitorThread monitor_thread( monitor );
+         if( this->reset )
+            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, Logger::verbose, monitor );
+         else
+            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
+      }
+      else {
+         if( this->reset )
+            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, Logger::verbose, monitor );
+         else
+            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
+      }
+      this->performedLoops = functionTimer.getPerformedLoops();
+   }
+   catch ( const std::exception& e ) {
+      std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl;
+   }
+
+   result.bandwidth = datasetSize / result.time;
+   result.speedup = this->baseTime / result.time;
+   if( this->baseTime == 0.0 )
+      this->baseTime = result.time;
+
+   Logger::writeTableHeader( performer, result.getTableHeader() );
+   Logger::writeTableRow( performer, result.getRowElements() );
+
+   return this->baseTime;
+}
+
+template< typename Logger >
+   template< typename Device,
+             typename ResetFunction,
+             typename ComputeFunction >
+inline double
+Benchmark< Logger >::
+time( ResetFunction reset,
+      const String& performer,
+      ComputeFunction& compute )
+{
+   BenchmarkResult< Logger > result;
+   return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
+}
+
+template< typename Logger >
+   template< typename Device,
+             typename ComputeFunction >
+double
+Benchmark< Logger >::
+time( const String & performer,
+      ComputeFunction & compute,
+      BenchmarkResult< Logger > & result )
+{
+   // Times `compute` with no reset step; time/stddev stay NaN if the timer throws. Records performed loops like the reset overload.
+   result.time = result.stddev = std::numeric_limits<double>::quiet_NaN();
+   FunctionTimer< Device > functionTimer;
+   try {
+      if( Logger::verbose > 1 ) {
+         Solvers::SolverMonitorThread monitor_thread( monitor );  // run the monitor main loop
+         std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
+      }
+      else {
+         std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
+      }
+      this->performedLoops = functionTimer.getPerformedLoops();
+   }
+   catch ( const std::exception& e ) {
+      std::cerr << "Function timer failed due to a C++ exception with description: " << e.what() << std::endl;
+   }
+
+   result.bandwidth = datasetSize / result.time;
+   result.speedup = this->baseTime / result.time;
+   if( this->baseTime == 0.0 )
+      this->baseTime = result.time;
+
+   Logger::writeTableHeader( performer, result.getTableHeader() );
+   Logger::writeTableRow( performer, result.getRowElements() );
+
+   return this->baseTime;
+}
+
+template< typename Logger >
+   template< typename Device,
+             typename ComputeFunction >
+inline double
+Benchmark< Logger >::
+time( const String & performer,
+      ComputeFunction & compute )
+{
+   BenchmarkResult< Logger > result;
+   return time< Device, ComputeFunction >( performer, compute, result );
+}
+
+template< typename Logger >
+void
+Benchmark< Logger >::
+addErrorMessage( const char* msg,
+                 int numberOfComputations )
+{
+   // each computation has 3 subcolumns
+   const int colspan = 3 * numberOfComputations;
+   Logger::writeErrorMessage( msg, colspan );
+   std::cerr << msg << std::endl;
+}
+
+template< typename Logger >
+auto
+Benchmark< Logger >::
+getMonitor() -> SolverMonitorType&
+{
+   return monitor;
+}
+
+template< typename Logger >
+int
+Benchmark< Logger >::
+getPerformedLoops() const
+{
+   return this->performedLoops;
+}
+
+template< typename Logger >
+bool
+Benchmark< Logger >::
+isResetingOn() const
+{
+   return reset;
+}
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index d9ef1f12a..7d89100bd 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          benchmarks.h  -  description
+                          Benchmarks.h  -  description
                              -------------------
     begin                : Dec 30, 2015
     copyright            : (C) 2015 by Tomas Oberhuber et al.
@@ -66,278 +66,129 @@ template< typename Logger = Logging >
 class Benchmark
 : protected Logger
 {
-public:
-   using typename Logger::MetadataElement;
-   using typename Logger::MetadataMap;
-   using typename Logger::MetadataColumns;
-   using SolverMonitorType = Solvers::IterativeSolverMonitor< double, int >;
-
-   using typename Logger::CommonLogs;
-   using Logger::addCommonLogs;
-   using Logger::addLogsMetadata;
-   using Logger::writeHeader;
-
-   Benchmark( int loops = 10,
-              bool verbose = true,
-              String outputMode = "" )
-   : Logger(verbose, outputMode), loops(loops)
-   {}
-
-   static void configSetup( Config::ConfigDescription& config )
-   {
-      config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
-      config.addEntry< bool >( "reset", "Call reset function between loops.", true );
-      config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 );
-      config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 );
-   }
-
-   void setup( const Config::ParameterContainer& parameters )
-   {
-      this->loops = parameters.getParameter< int >( "loops" );
-      this->reset = parameters.getParameter< bool >( "reset" );
-      this->minTime = parameters.getParameter< double >( "min-time" );
-      const int verbose = parameters.getParameter< int >( "verbose" );
-      Logger::setVerbose( verbose );
-   }
-   // TODO: ensure that this is not called in the middle of the benchmark
-   // (or just remove it completely?)
-   void
-   setLoops( int loops )
-   {
-      this->loops = loops;
-   }
-
-   void setMinTime( const double& minTime )
-   {
-      this->minTime = minTime;
-   }
-
-   // Marks the start of a new benchmark
-   void
-   newBenchmark( const String & title )
-   {
-      Logger::closeTable();
-      Logger::writeTitle( title );
-   }
-
-   // Marks the start of a new benchmark (with custom metadata)
-   void
-   newBenchmark( const String & title,
-                 MetadataMap metadata )
-   {
-      Logger::closeTable();
-      Logger::writeTitle( title );
-      // add loops and reset flag to metadata
-      metadata["loops"] = convertToString(loops);
-      metadata["reset"] = convertToString( reset );
-      metadata["minimal test time"] = convertToString( minTime );
-      Logger::writeMetadata( metadata );
-   }
-
-   // Sets metadata columns -- values used for all subsequent rows until
-   // the next call to this function.
-   void
-   setMetadataColumns( const MetadataColumns & metadata )
-   {
-      if( Logger::metadataColumns != metadata )
-         Logger::header_changed = true;
-      Logger::metadataColumns = metadata;
-   }
-
-   // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark"
-   // Sets current operation -- operations expand the table vertically
-   //  - baseTime should be reset to 0.0 for most operations, but sometimes
-   //    it is useful to override it
-   //  - Order of operations inside a "Benchmark" does not matter, rows can be
-   //    easily sorted while converting to HTML.)
-   void
-   setOperation( const String & operation,
-                 const double datasetSize = 0.0, // in GB
-                 const double baseTime = 0.0 )
-   {
-      monitor.setStage( operation.getString() );
-      if( Logger::metadataColumns.size() > 0 && String(Logger::metadataColumns[ 0 ].first) == "operation" ) {
-         Logger::metadataColumns[ 0 ].second = operation;
-      }
-      else {
-         Logger::metadataColumns.insert( Logger::metadataColumns.begin(), {"operation", operation} );
-      }
-      setOperation( datasetSize, baseTime );
-      Logger::header_changed = true;
-   }
-
-   void
-   setOperation( const double datasetSize = 0.0,
-                 const double baseTime = 0.0 )
-   {
-      this->datasetSize = datasetSize;
-      this->baseTime = baseTime;
-   }
-
-   // Creates new horizontal groups inside a benchmark -- increases the number
-   // of columns in the "Benchmark", implies column spanning.
-   // (Useful e.g. for SpMV formats, different configurations etc.)
-   void
-   createHorizontalGroup( const String & name,
-                          int subcolumns )
-   {
-      if( Logger::horizontalGroups.size() == 0 ) {
-         Logger::horizontalGroups.push_back( {name, subcolumns} );
-      }
-      else {
-         auto & last = Logger::horizontalGroups.back();
-         if( last.first != name && last.second > 0 ) {
-            Logger::horizontalGroups.push_back( {name, subcolumns} );
-         }
-         else {
-            last.first = name;
-            last.second = subcolumns;
-         }
-      }
-   }
-
-   // Times a single ComputeFunction. Subsequent calls implicitly split
-   // the current "horizontal group" into sub-columns identified by
-   // "performer", which are further split into "bandwidth", "time" and
-   // "speedup" columns.
-   // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
-   // Also terminates the recursion of the following variadic template.
-   template< typename Device,
-             typename ResetFunction,
-             typename ComputeFunction >
-   double
-   time( ResetFunction reset,
-         const String & performer,
-         ComputeFunction & compute,
-         BenchmarkResult< Logger > & result )
-   {
-      result.time = std::numeric_limits<double>::quiet_NaN();
-      result.stddev = std::numeric_limits<double>::quiet_NaN();
-      FunctionTimer< Device > functionTimer;
-      try {
-         if( Logger::verbose > 1 ) {
-            // run the monitor main loop
-            Solvers::SolverMonitorThread monitor_thread( monitor );
-            if( this->reset )
-               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, Logger::verbose, monitor );
-            else
-               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
-         }
-         else {
-            if( this->reset )
-               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, Logger::verbose, monitor );
-            else
-               std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
-         }
-         this->performedLoops = functionTimer.getPerformedLoops();
-      }
-      catch ( const std::exception& e ) {
-         std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl;
-      }
-
-      result.bandwidth = datasetSize / result.time;
-      result.speedup = this->baseTime / result.time;
-      if( this->baseTime == 0.0 )
-         this->baseTime = result.time;
-
-      Logger::writeTableHeader( performer, result.getTableHeader() );
-      Logger::writeTableRow( performer, result.getRowElements() );
-
-      return this->baseTime;
-   }
-
-   template< typename Device,
-             typename ResetFunction,
-             typename ComputeFunction >
-   inline double
-   time( ResetFunction reset,
-         const String & performer,
-         ComputeFunction & compute )
-   {
-      BenchmarkResult< Logger > result;
-      return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
-   }
-
-   /****
-    * The same methods as above but without reset function
-    */
-   template< typename Device,
-             typename ComputeFunction >
-   double
-   time( const String & performer,
-         ComputeFunction & compute,
-         BenchmarkResult< Logger > & result )
-   {
-      result.time = std::numeric_limits<double>::quiet_NaN();
-      result.stddev = std::numeric_limits<double>::quiet_NaN();
-      FunctionTimer< Device > functionTimer;
-      try {
-         if( Logger::verbose > 1 ) {
-            // run the monitor main loop
-            Solvers::SolverMonitorThread monitor_thread( monitor );
-            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
-         }
-         else {
-            std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, Logger::verbose, monitor );
-         }
-      }
-      catch ( const std::exception& e ) {
-         std::cerr << "Function timer failed due to a C++ exception with description: " << e.what() << std::endl;
-      }
-
-      result.bandwidth = datasetSize / result.time;
-      result.speedup = this->baseTime / result.time;
-      if( this->baseTime == 0.0 )
-         this->baseTime = result.time;
-
-      Logger::writeTableHeader( performer, result.getTableHeader() );
-      Logger::writeTableRow( performer, result.getRowElements() );
-
-      return this->baseTime;
-   }
-
-   template< typename Device,
-             typename ComputeFunction >
-   inline double
-   time( const String & performer,
-         ComputeFunction & compute )
-   {
-      BenchmarkResult< Logger > result;
-      return time< Device, ComputeFunction >( performer, compute, result );
-   }
-
-   // Adds an error message to the log. Should be called in places where the
-   // "time" method could not be called (e.g. due to failed allocation).
-   void
-   addErrorMessage( const char* msg,
-                    int numberOfComputations = 1 ) {
-      // each computation has 3 subcolumns
-      const int colspan = 3 * numberOfComputations;
-      Logger::writeErrorMessage( msg, colspan );
-      std::cerr << msg << std::endl;
-   }
-
-   using Logger::save;
-
-   SolverMonitorType& getMonitor() {
-      return monitor;
-   }
-
-   int getPerformedLoops() const {
-      return this->performedLoops;
-   }
-
-   bool isResetingOn() const {
-      return reset;
-   }
-
-protected:
-   int loops = 1, performedLoops = 0;
-   double minTime = 0.0;
-   double datasetSize = 0.0;
-   double baseTime = 0.0;
-   bool reset = true;
-   SolverMonitorType monitor;
+   public:
+      using typename Logger::MetadataElement;
+      using typename Logger::MetadataMap;
+      using typename Logger::MetadataColumns;
+      using SolverMonitorType = Solvers::IterativeSolverMonitor< double, int >;
+
+      using typename Logger::CommonLogs;
+      using Logger::addCommonLogs;
+      using Logger::addLogsMetadata;
+      using Logger::writeHeader;
+
+      Benchmark( int loops = 10,
+               bool verbose = true,
+               String outputMode = "",
+               bool logFileAppend = false );
+
+      static void configSetup( Config::ConfigDescription& config );
+
+      void setup( const Config::ParameterContainer& parameters );
+
+      // TODO: ensure that this is not called in the middle of the benchmark
+      // (or just remove it completely?)
+      void setLoops( int loops );
+
+      void setMinTime( const double& minTime );
+
+      // Marks the start of a new benchmark
+      void newBenchmark( const String & title );
+
+      // Marks the start of a new benchmark (with custom metadata)
+      void newBenchmark( const String & title,
+                        MetadataMap metadata );
+
+      // Sets metadata columns -- values used for all subsequent rows until
+      // the next call to this function.
+      void setMetadataColumns( const MetadataColumns & metadata );
+
+      // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark"
+      // Sets current operation -- operations expand the table vertically
+      //  - baseTime should be reset to 0.0 for most operations, but sometimes
+      //    it is useful to override it
+      //  - Order of operations inside a "Benchmark" does not matter, rows can be
+      //    easily sorted while converting to HTML.
+      void
+      setOperation( const String & operation,
+                  const double datasetSize = 0.0, // in GB
+                  const double baseTime = 0.0 );
+
+      void setOperation( const double datasetSize = 0.0,
+                        const double baseTime = 0.0 );
+
+      // Creates new horizontal groups inside a benchmark -- increases the number
+      // of columns in the "Benchmark", implies column spanning.
+      // (Useful e.g. for SpMV formats, different configurations etc.)
+      void
+      createHorizontalGroup( const String & name,
+                           int subcolumns );
+
+      // Times a single ComputeFunction. Subsequent calls implicitly split
+      // the current "horizontal group" into sub-columns identified by
+      // "performer", which are further split into "bandwidth", "time" and
+      // "speedup" columns.
+      // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation)
+      // Also terminates the recursion of the following variadic template.
+      template< typename Device,
+               typename ResetFunction,
+               typename ComputeFunction >
+      double time( ResetFunction reset,
+                  const String & performer,
+                  ComputeFunction & compute,
+                  BenchmarkResult< Logger > & result );
+
+      template< typename Device,
+               typename ResetFunction,
+               typename ComputeFunction >
+      inline double time( ResetFunction reset,
+                        const String & performer,
+                        ComputeFunction & compute );
+      /*{
+         BenchmarkResult< Logger > result;
+         return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result );
+      }*/
+
+      /****
+       * The same methods as above but without reset function
+       */
+      template< typename Device,
+               typename ComputeFunction >
+      double time( const String & performer,
+                  ComputeFunction & compute,
+                  BenchmarkResult< Logger > & result );
+
+      template< typename Device,
+               typename ComputeFunction >
+      inline double time( const String & performer,
+                        ComputeFunction & compute );
+
+      // Adds an error message to the log. Should be called in places where the
+      // "time" method could not be called (e.g. due to failed allocation).
+      void addErrorMessage( const char* msg,
+                           int numberOfComputations = 1 );
+
+      using Logger::save;
+
+      SolverMonitorType& getMonitor();
+
+      int getPerformedLoops() const;
+
+      bool isResetingOn() const;
+
+   protected:
+
+      int loops = 1, performedLoops = 0;
+
+      double minTime = 0.0;
+
+      double datasetSize = 0.0;
+
+      double baseTime = 0.0;
+
+      bool reset = true;
+
+      SolverMonitorType monitor;
 };
 
 
@@ -396,3 +247,5 @@ inline typename Benchmark< Logger >::MetadataMap getHardwareMetadata()
 
 } // namespace Benchmarks
 } // namespace TNL
+
+#include <Benchmarks/Benchmark.hpp>
diff --git a/src/Benchmarks/JsonLogging.h b/src/Benchmarks/JsonLogging.h
index 964a8bbc7..87c7d251a 100644
--- a/src/Benchmarks/JsonLogging.h
+++ b/src/Benchmarks/JsonLogging.h
@@ -89,8 +89,9 @@ public:
    using RowElements = JsonLoggingRowElements;
 
    JsonLogging( int verbose = true,
-                String outputMode = "" )
-   : verbose(verbose), outputMode( outputMode )
+                String outputMode = "",
+                bool logFileAppend = false )
+   : verbose(verbose), outputMode( outputMode ), logFileAppend( logFileAppend )
    {}
 
    void
@@ -313,6 +314,7 @@ protected:
 
    bool lineStarted = false;
    bool resultsStarted = false;
+   bool logFileAppend = false;
 };
 
 } // namespace Benchmarks
diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h
index 75e843c9f..2246da558 100644
--- a/src/Benchmarks/Logging.h
+++ b/src/Benchmarks/Logging.h
@@ -87,7 +87,8 @@ public:
    using RowElements = LoggingRowElements;
 
    Logging( int verbose = true,
-            String outputMode = "" )
+            String outputMode = "",
+            bool logFileAppend = false )
    : verbose(verbose), outputMode( outputMode )
    {}
 
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index de805536c..66b4f034a 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -74,7 +74,7 @@ setupConfig( Config::ConfigDescription & config )
    config.addEntry< bool >( "with-symmetric-matrices", "Perform benchmark even for symmetric matrix formats.", true );
    config.addEntry< bool >( "with-legacy-matrices", "Perform benchmark even for legacy TNL matrix formats.", true );
    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv::" + getCurrDateTime() + ".log");
-   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "append" );
    config.addEntryEnum( "append" );
    config.addEntryEnum( "overwrite" );
    config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
@@ -101,7 +101,7 @@ main( int argc, char* argv[] )
    // FIXME: When ./tnl-benchmark-spmv-dbg is called without parameters:
    //           * The guide on what parameters to use prints twice.
    // FIXME: When ./tnl-benchmark-spmv-dbg is called with '--help':
-   //           * The guide on what parameter to use print once. 
+   //           * The guide on what parameter to use print once.
    //              But then it CRASHES due to segfault:
    //              The program attempts to get unknown parameter openmp-enabled
    //              Aborting the program.
@@ -124,16 +124,24 @@ main( int argc, char* argv[] )
    const int verboseMR = parameters.getParameter< int >( "verbose-MReader" );
 
    // open log file
-   bool exist = std::experimental::filesystem::exists(logFileName.getString());
-   if( ! exist )
-      outputMode = "";
+   bool logFileAppend( false );  // set below when the log file is already present on disk
+   if( std::experimental::filesystem::exists(logFileName.getString()) )
+   {
+      logFileAppend = true;  // NOTE(review): also set when outputMode is "overwrite" -- confirm this is intended
+      std::cout << "Log file " << logFileName << " exists and ";
+      if( outputMode == "append" )
+         std::cout << "new logs will be appended." << std::endl;
+      else
+         std::cout << "will be overwritten." << std::endl;
+   }
+
    auto mode = std::ios::out;
    if( outputMode == "append" )
        mode |= std::ios::app;
    std::ofstream logFile( logFileName.getString(), mode );
 
    // init benchmark and common metadata
-   TNL::Benchmarks::SpMV::BenchmarkType benchmark( loops, verbose, outputMode );
+   TNL::Benchmarks::SpMV::BenchmarkType benchmark( loops, verbose, outputMode, logFileAppend );
 
    // prepare global metadata
    TNL::Benchmarks::SpMV::BenchmarkType::MetadataMap metadata = getHardwareMetadata< Logging >();
-- 
GitLab


From f45610549268e0823db54f3c4b41894332bdc23c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 7 Jun 2021 20:33:29 +0200
Subject: [PATCH 068/117] Deleting useless script file.

---
 src/Benchmarks/scripts/tnl-run-spmv-benchmark | 501 ------------------
 1 file changed, 501 deletions(-)
 delete mode 100755 src/Benchmarks/scripts/tnl-run-spmv-benchmark

diff --git a/src/Benchmarks/scripts/tnl-run-spmv-benchmark b/src/Benchmarks/scripts/tnl-run-spmv-benchmark
deleted file mode 100755
index a20c179d7..000000000
--- a/src/Benchmarks/scripts/tnl-run-spmv-benchmark
+++ /dev/null
@@ -1,501 +0,0 @@
-#!/usr/bin/env bash
-                
-DEBUG="no"
-FORMAT_TEST="yes"
-STOP_TIME="1"
-MAX_ITERATIONS="10"
-export CUDA_PROFILE=0
-
-PWD=`pwd`
-IWD="$PWD"
-BASE="ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/"
-SPARSE_MATRIX_BENCHMARK="tnl-sparse-matrix-benchmark"
-SPARSE_MATRIX_BENCHMARK_DBG="tnl-sparse-matrix-benchmark-dbg"
-#SPARSE_MATRIX_BENCHMARK="tnl-sparse-matrix-benchmark-dbg"
-
-export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf"
-PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl"
-source matrix-market
-source florida-matrix-market
-
-write_header()
-{
-   echo "<html>" > $1
-   echo "   <body>" >> $1
-   echo "      <table border=1>" >> $1
-   echo "          <tr>" >> $1
-   echo "             <td rowspan=4 colspan=4 align=center>Matrix</td>" >> $1
-   echo "             <td rowspan=4 colspan=2 align=center>CSR</td>" >> $1
-   echo "             <td rowspan=4 colspan=3 align=center>Cusparse</td>" >> $1
-   echo "             <td rowspan=4 colspan=3 align=center>Hybrid</td>" >> $1
-   echo "             <td colspan=68 align=center>Row-Grouped CSR</td>" >> $1   
-   echo "             <td colspan=68 align=center>Row-Grouped CSR with rows sorted decreasingly by the number of the nonzeros</td>" >> $1
-   echo "             <td colspan=120 align=center>Adaptive Row-Grouped CSR</td>" >> $1   
-   echo "          </tr>" >> $1
-   
-   echo "          <tr>" >> $1
-   echo "             <td colspan=17>Group Size = 16</td>" >> $1       # RgCSR
-   echo "             <td colspan=17>Group Size = 32</td>" >> $1
-   echo "             <td colspan=17>Group Size = 64</td>" >> $1
-   echo "             <td colspan=17>Group Size Variable</td>" >> $1    # RgCSR adaptive group size
-   echo "             <td colspan=17>Group Size = 16</td>" >> $1       # RgCSR rows sorted decreasingly
-   echo "             <td colspan=17>Group Size = 32</td>" >> $1
-   echo "             <td colspan=17>Group Size = 64</td>" >> $1
-   echo "             <td colspan=17>Group Size Variable</td>" >> $1      # RgCSR rows sorted decreasingly, adaptive group size
-   echo "             <td rowspan=2 colspan=20>Chunk Size = 1</td>" >> $1          # Adaptive RgCSR 
-   echo "             <td rowspan=2 colspan=20>Chunk Size = 2</td>" >> $1
-   echo "             <td rowspan=2 colspan=20>Chunk Size = 4</td>" >> $1
-   echo "             <td rowspan=2 colspan=20>Chunk Size = 8</td>" >> $1
-   echo "             <td rowspan=2 colspan=20>Chunk Size = 16</td>" >> $1
-   echo "             <td rowspan=2 colspan=20>Chunk Size = 32</td>" >> $1   
-   echo "          </tr>" >> $1
-   
-   echo "          <tr>" >> $1
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR format with the group size = 16
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR format with the group size = 32
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1   
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR format with the group size = 64
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1
-
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR format with the variable group size 
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1
-   
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR (sorted rows) format with the group size = 16
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR (sorted rows) format with the group size = 32
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1   
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR (sorted rows) format with the group size = 64
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1
-      
-   echo "             <td rowspan=2 colspan=2></td>" >> $1                         # RgCSR (sorted rows) format with variable group size
-   echo "             <td rowspan=2 colspan=3>CPU</td>" >> $1
-   echo "             <td colspan=12>GPU</td>" >> $1
-   
-#   echo "             <td colspan=20>GPU</td>" >> $1                     # Adaptive RgCSR format with the chunk size 1
-#   echo "             <td colspan=20>GPU</td>" >> $1                     # Adaptive RgCSR format with the chunk size 2
-#   echo "             <td colspan=20>GPU</td>" >> $1                     # Adaptive RgCSR format with the chunk size 4
-#   echo "             <td colspan=20>GPU</td>" >> $1                     # Adaptive RgCSR format with the chunk size 8
-#   echo "             <td colspan=20>GPU</td>" >> $1                     # Adaptive RgCSR format with the chunk size 16
-#   echo "             <td colspan=20>GPU</td>" >> $1                     # Adaptive RgCSR format with the chunk size 32
-   
-   
-   
-                  
-   echo "          </tr>" >> $1
-   
-   echo "          <tr>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR format with the group size = 16
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR format with the group size = 32
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1         
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR format with the group size = 64
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1
-   
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR format with the group size cca 16
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1
-   
-   
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR (sorted rows) format with the group size = 16
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR (sorted rows) format with the group size = 32
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1         
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR (sorted rows) format with the group size = 64
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1
-   
-   echo "             <td colspan=3>CUDA Block Size = 32</td>" >> $1        # RgCSR (sorted rows) format with the group size >= 16
-   echo "             <td colspan=3>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=3>CUDA Block Size = 256</td>" >> $1
-   
-   echo "             <td colspan=5>CUDA Block Size = 32</td>" >> $1        # Adaptive RgCSR format with the chunk size = 1
-   echo "             <td colspan=5>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 256</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 32</td>" >> $1        # Adaptive RgCSR format with the chunk size = 2
-   echo "             <td colspan=5>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 256</td>" >> $1         
-   echo "             <td colspan=5>CUDA Block Size = 32</td>" >> $1        # Adaptive RgCSR format with the chunk size = 4
-   echo "             <td colspan=5>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 256</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 32</td>" >> $1        # Adaptive RgCSR format with the chunk size = 8
-   echo "             <td colspan=5>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 256</td>" >> $1  
-   echo "             <td colspan=5>CUDA Block Size = 32</td>" >> $1        # Adaptive RgCSR format with the chunk size = 16
-   echo "             <td colspan=5>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 256</td>" >> $1  
-   echo "             <td colspan=5>CUDA Block Size = 32</td>" >> $1        # Adaptive RgCSR format with the chunk size = 32
-   echo "             <td colspan=5>CUDA Block Size = 64</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 128</td>" >> $1
-   echo "             <td colspan=5>CUDA Block Size = 256</td>" >> $1  
-     
-            
-   echo "          </tr>" >> $1      
-         
-   echo "          <tr>" >> $1
-   echo "             <td>Name</td>" >> $1                      # Matrix description
-   echo "             <td>Size</td>" >> $1
-   echo "             <td>NonZeros No.</td>" >> $1
-   echo "             <td>NonZeros %</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1                 # CSR format on CPU
-   echo "             <td>GFlops</td>" >> $1                    
-
-   echo "             <td>Time/sec.</td>" >> $1                 # Cusparse
-   echo "             <td>GFlops</td>" >> $1                    
-   echo "             <td>Speed-up</td>" >> $1
-   
-   echo "             <td>Time/sec.</td>" >> $1                 # Hybrid format Bell, Garland on GPU
-   echo "             <td>GFlops</td>" >> $1                    
-   echo "             <td>Speed-up</td>" >> $1
-   
-   echo "             <td>Info</td>" >> $1                      # RgCSR format with the group size = 16
-   echo "             <td>Artificial Zeros</td>" >> $1    
-   echo "             <td>Time/sec.</td>" >> $1  
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1      
-   echo "             <td>Info</td>" >> $1                      # RgCSR format with the group size = 32
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1      
-   echo "             <td>Info</td>" >> $1                      # RgCSR format with the group size = 64   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1          
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   
-   echo "             <td>Info</td>" >> $1                        # RgCSR format with the group size variable
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1      
-   
-   echo "             <td>Info</td>" >> $1                        # RgCSR (sorted rows) format with the group size = 16
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1      
-   echo "             <td>Info</td>" >> $1                        # RgCSR (sorted rows) format with the group size = 32   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1      
-   echo "             <td>Info</td>" >> $1                         # RgCSR (sorted rows) format with the group size = 64   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   
-   echo "             <td>Info</td>" >> $1                          # RgCSR (sorted rows) format with the group size variable
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1      
-   
-   echo "             <td>Info</td>" >> $1                         # Adaptive RgCSR format with the chunk size = 1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1                         # Adaptive RgCSR format with the chunk size = 2   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1                        # Adaptive RgCSR format with the chunk size = 4   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1                        # Adaptive RgCSR format with the chunk size = 8   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1                        # Adaptive RgCSR format with the chunk size = 16   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1                        # Adaptive RgCSR format with the chunk size = 32   
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1
-   echo "             <td>Info</td>" >> $1
-   echo "             <td>Artificial Zeros</td>" >> $1
-   echo "             <td>Time/sec.</td>" >> $1   
-   echo "             <td>GFlops</td>" >> $1
-   echo "             <td>Speed-up</td>" >> $1   
-}
-
-write_closing()
-{
-   echo "      </table>" >> $1
-   echo "   </body>" >> $1
-   echo "</html>" >> $1
-}
-
-write_header sparse-matrix-benchmark-float.log.html
-write_header sparse-matrix-benchmark-double.log.html
-write_header sparse-matrix-benchmark-amd-float.log.html
-write_header sparse-matrix-benchmark-amd-double.log.html
-
-for link in $MM_MATRICES;
-do
-   echo "###############################################################################################"
-   matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'`
-   unzipped_matrix=`echo $matrix | sed 's/.gz//'`
-   if test ! -e $matrix;
-   then
-      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
-      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log            
-   else
-      gunzip -c ${matrix} > ${unzipped_matrix}      
-      echo "Checking with the matrix $unzipped_matrix in single precision ..."
-      export CUDA_PROFILE_LOG=$unzipped_matrix.float.log
-      if test x$DEBUG = xyes;
-      then
-         gdb --args ${SPARSE_MATRIX_BENCHMARK_DBG} --input-mtx-file $unzipped_matrix --log-file sparse-matrix-benchmark-float.log.html --stop-time $STOP_TIME --max-iterations $MAX_ITERATIONS --verbose 1
-      else
-         $SPARSE_MATRIX_BENCHMARK --input-mtx-file $unzipped_matrix --pdf-file $unzipped_matrix.pdf --log-file sparse-matrix-benchmark-float.log.html --stop-time $STOP_TIME --max-iterations $MAX_ITERATIONS --format-test $FORMAT_TEST --verbose 1
-      fi
-      #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log          
-   fi
-done
-
-for link in $FLORIDA_MM_MATRICES;
-do
-   matrix=matrices`echo $link | sed 's/http:\/\/www.cise.ufl.edu\/research\/sparse//'`
-   if test ! -e $matrix;
-   then      
-      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
-      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log
-   else
-     DIRNAME=`dirname $matrix`
-     FILENAME=`basename $matrix`
-     cd $DIRNAME
-     tar zxvf $FILENAME
-     cd $IWD
-     SUBDIRNAME=`echo $FILENAME | sed 's/.tar.gz//'`
-     rm -f $DIRNAME/$SUBDIRNAME/*_b.mtx # these are usualy in array format
-     for file in $DIRNAME/$SUBDIRNAME/*.mtx;
-     do
-         echo "###############################################################################################"
-         echo "Checking with the matrix $file ..."
-         $SPARSE_MATRIX_BENCHMARK --input-file $file.float.bin.bz2 --input-mtx-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark-float.log.html --stop-time $STOP_TIME --max-iterations $MAX_ITERATIONS --format-test $FORMAT_TEST --verbose 1                        
-     done
-   fi
-done
-
-write_closing sparse-matrix-benchmark-float.log.html
-write_closing sparse-matrix-benchmark-double.log.html
-write_closing sparse-matrix-benchmark-amd-float.log.html
-write_closing sparse-matrix-benchmark-amd-double.log.html   
-
-- 
GitLab


From 7f3e53bd623a1840e45979b6e74f1b3f91169a2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 7 Jun 2021 20:33:58 +0200
Subject: [PATCH 069/117] Fixed JSON logging of SpMV benchmark.

---
 src/Benchmarks/JsonLogging.h                  | 79 +++----------------
 src/Benchmarks/SpMV/spmv.h                    |  4 +-
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h      | 19 ++++-
 src/Benchmarks/scripts/run-tnl-benchmark-spmv |  2 +-
 4 files changed, 31 insertions(+), 73 deletions(-)

diff --git a/src/Benchmarks/JsonLogging.h b/src/Benchmarks/JsonLogging.h
index 87c7d251a..58549d339 100644
--- a/src/Benchmarks/JsonLogging.h
+++ b/src/Benchmarks/JsonLogging.h
@@ -102,23 +102,15 @@ public:
 
    void addCommonLogs( const CommonLogs& logs )
    {
-      //if( this->lineStarted )
-      //   log << "," << std::endl;
-      //log << "   \"benchmarks\" : [" << std::endl;
       this->commonLogs = logs;
-      //int idx( 0 );
       for( auto lg : logs )
       {
          if( verbose )
             std::cout << lg.first << " = " << lg.second << std::endl;
-         //if( idx++ > 0 )
-         //   log << "," << std::endl;
-         //log << "      \"" << lg.first << "\" : \"" << lg.second << "\"";
-         //this->lineStarted = true;
       }
    };
 
-   void resetLogsMetadat() { this->logsMetadata.clear(); };
+   void resetLogsMetada() { this->logsMetadata.clear(); };
 
    void addLogsMetadata( const std::vector< String >& md )
    {
@@ -141,17 +133,15 @@ public:
       if( this->lineStarted )
          log << "," << std::endl;
 
-      log << "         {" << std::endl;
+      log << "      {" << std::endl;
 
       // write common logs
       int idx( 0 );
       for( auto lg : this->commonLogs )
       {
-         //if( verbose )
-         //   std::cout << lg.first << " = " << lg.second << std::endl;
          if( idx++ > 0 )
             log << "," << std::endl;
-         log << "      \"" << lg.first << "\" : \"" << lg.second << "\"";
+         log << "         \"" << lg.first << "\" : \"" << lg.second << "\"";
       }
 
       auto md = this->logsMetadata.begin();
@@ -161,9 +151,9 @@ public:
             std::cout << el << "\t";
          if( idx++ > 0 )
             log << "," << std::endl;
-         log << "          \"" << *md++ << "\" : \"" << el << "\"";
+         log << "         \"" << *md++ << "\" : \"" << el << "\"";
       }
-      log << std::endl << "         }";
+      log << std::endl << "      }";
       this->lineStarted = true;
       if( verbose )
          std::cout << std::endl;
@@ -177,18 +167,13 @@ public:
 
       if( verbose )
          std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl;
-      log << "   \"title\" : \"" << title << "\"";
-      this->lineStarted = true;
    }
 
    void
    writeMetadata( const MetadataMap & metadata )
    {
       if( outputMode == "append" )
-      {
-         this->lineStarted = true;
          return;
-      }
 
       if( verbose )
          std::cout << "properties:" << std::endl;
@@ -197,13 +182,7 @@ public:
       for( auto & it : metadata ) {
          if( verbose )
             std::cout << "   " << it.first << " = " << it.second << std::endl;
-         if( idx++ > 0 )
-            log << "," << std::endl;
-         log << "   \"" << it.first << "\" : \"" << it.second << "\"";
-         //this->lineStarted = true;
       }
-      log << "," << std::endl << "      \"results\" : [ " << std::endl;
-      this->lineStarted = false;
 
       if( verbose )
          std::cout << std::endl;
@@ -227,56 +206,22 @@ public:
                       int colspan = 1 )
    {
       log << "\"error\" : \"" << msg << "\"" << std::endl;
-      // initial indent string
-      /*header_indent = "!";
-      log << std::endl;
-      for( auto & it : metadataColumns ) {
-         log << header_indent << " " << it.first << std::endl;
-      }
-
-      // make sure there is a header column for the message
-      if( horizontalGroups.size() == 0 )
-         horizontalGroups.push_back( {"", 1} );
-
-      // dump stacked spanning columns
-      while( horizontalGroups.back().second <= 0 ) {
-         horizontalGroups.pop_back();
-         header_indent.pop_back();
-      }
-      for( size_t i = 0; i < horizontalGroups.size(); i++ ) {
-         if( horizontalGroups[ i ].second > 0 ) {
-            log << header_indent << " " << horizontalGroups[ i ].first << std::endl;
-            header_indent += "!";
-         }
-      }
-      if( horizontalGroups.size() > 0 ) {
-         horizontalGroups.back().second -= colspan;
-         header_indent.pop_back();
-      }
-
-      // only when changed (the header has been already adjusted)
-      // print each element on separate line
-      for( auto & it : metadataColumns ) {
-         log << it.second << std::endl;
-      }
-      log << msg << std::endl;
-      */
    }
 
    void
    closeTable()
    {
-      //log << std::endl << "   ]" << std::endl;
-      //log << "," << std::endl;
-      //header_indent = body_indent = "";
-      //header_changed = true;
-      //horizontalGroups.clear();
    }
 
    bool save( std::ostream & logFile )
    {
-      closeTable();
-      logFile << log.str();
+      if( ! this->logFileAppend )
+      {
+         logFile << "{" << std::endl;
+         logFile << "   \"results\" : [ " << std::endl;
+      }
+      else
+         logFile << log.str();
       if( logFile.good() ) {
          log.str() = "";
          return true;
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index d0d79c4f1..ee0f551aa 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -41,8 +41,8 @@
 
 // Comment the following to turn off some groups of SpMV benchmarks and speed-up the compilation
 #define WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES
-#define WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
-#define WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
+//#define WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
+//#define WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
 
 // Uncomment the following line to enable benchmarking the sandbox sparse matrix.
 //#define WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 66b4f034a..0954558fd 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -70,13 +70,14 @@ void
 setupConfig( Config::ConfigDescription & config )
 {
    config.addDelimiter( "Benchmark settings:" );
-   config.addRequiredEntry< String >( "input-file", "Input file name." );
+   config.addEntry< String >( "input-file", "Input file name.", "" );
    config.addEntry< bool >( "with-symmetric-matrices", "Perform benchmark even for symmetric matrix formats.", true );
    config.addEntry< bool >( "with-legacy-matrices", "Perform benchmark even for legacy TNL matrix formats.", true );
    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv::" + getCurrDateTime() + ".log");
-   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "append" );
+   config.addEntry< String >( "output-mode", "Mode for opening the log file - 'close' will only finalize the log file.", "append" );
    config.addEntryEnum( "append" );
    config.addEntryEnum( "overwrite" );
+   config.addEntryEnum( "close" );
    config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
    config.addEntryEnum( "float" );
    config.addEntryEnum( "double" );
@@ -124,11 +125,23 @@ main( int argc, char* argv[] )
    const int verboseMR = parameters.getParameter< int >( "verbose-MReader" );
 
    // open log file
+   if( outputMode == "close" )
+   {
+      std::fstream file;
+      file.open( logFileName.getString(), std::ios::out | std::ios::app );
+      file << std::endl << "   ]" << std::endl << "}";
+      return EXIT_SUCCESS;
+   }
+   if( inputFileName == "" )
+   {
+      std::cerr << "ERROR: Input file name is required." << std::endl;
+      return EXIT_FAILURE;
+   }
    bool logFileAppend( false );
    if( std::experimental::filesystem::exists(logFileName.getString()) )
    {
       logFileAppend = true;
-      std::cout << "Log file " << logFileName << "exists and ";
+      std::cout << "Log file " << logFileName << " exists and ";
       if( outputMode == "append" )
          std::cout << "new logs will be appended." << std::endl;
       else
diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
index adccbb0aa..c307ec84c 100755
--- a/src/Benchmarks/scripts/run-tnl-benchmark-spmv
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
@@ -73,4 +73,4 @@ do
      done
    fi
 done
-
+$BENCHMARK --log-file log-files/sparse-matrix-benchmark.log --output-mode close --verbose 1
-- 
GitLab


From 2c0a8123ec4e5c849b6d89627595b4bf851cef4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 8 Jun 2021 13:21:36 +0200
Subject: [PATCH 070/117] Improved logging of SpMV benchmark.

---
 src/Benchmarks/Benchmarks.h                   |  7 +-
 src/Benchmarks/JsonLogging.h                  | 21 +++---
 src/Benchmarks/LinearSolvers/benchmarks.h     | 10 ++-
 src/Benchmarks/Logging.h                      |  6 +-
 src/Benchmarks/SpMV/SpmvBenchmarkResult.h     | 12 +++-
 src/Benchmarks/SpMV/spmv.h                    | 65 ++-----------------
 src/Benchmarks/scripts/run-tnl-benchmark-spmv |  8 +--
 7 files changed, 48 insertions(+), 81 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 7d89100bd..77fa9e47c 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -47,7 +47,12 @@ struct BenchmarkResult
 
    virtual HeaderElements getTableHeader() const
    {
-      return HeaderElements({ "time", "stddev", "stddev/time", "bandwidth", "speedup" });
+      return HeaderElements( {
+         std::pair< String, int >( "time", 8 ),
+         std::pair< String, int >( "stddev", 8 ),
+         std::pair< String, int >( "stddev/time", 8 ),
+         std::pair< String, int >( "bandwidth", 8 ),
+         std::pair< String, int >( "speedup", 8 ) } );
    }
 
    virtual RowElements getRowElements() const
diff --git a/src/Benchmarks/JsonLogging.h b/src/Benchmarks/JsonLogging.h
index 58549d339..7d9817c65 100644
--- a/src/Benchmarks/JsonLogging.h
+++ b/src/Benchmarks/JsonLogging.h
@@ -83,9 +83,9 @@ public:
    using MetadataColumns = std::vector<MetadataElement>;
 
    using CommonLogs = std::vector< std::pair< const char*, String > >;
-   using LogsMetadata = std::vector< String >;
+   using LogsMetadata = std::vector< std::pair< String, int > >;
 
-   using HeaderElements = std::vector< String >;
+   using HeaderElements = std::vector< std::pair< String, int > >;
    using RowElements = JsonLoggingRowElements;
 
    JsonLogging( int verbose = true,
@@ -103,16 +103,18 @@ public:
    void addCommonLogs( const CommonLogs& logs )
    {
       this->commonLogs = logs;
-      for( auto lg : logs )
+      if( verbose )
       {
-         if( verbose )
-            std::cout << lg.first << " = " << lg.second << std::endl;
+         std::cout << std::endl << "Benchmark setup:" << std::endl;
+         for( auto lg : logs )
+            std::cout << "   " << lg.first << " = " << lg.second << std::endl;
+         std::cout << std::endl;
       }
    };
 
    void resetLogsMetada() { this->logsMetadata.clear(); };
 
-   void addLogsMetadata( const std::vector< String >& md )
+   void addLogsMetadata( const std::vector< std::pair< String, int > >& md )
    {
       this->logsMetadata.insert( this->logsMetadata.end(), md.begin(), md.end() );
    }
@@ -122,7 +124,7 @@ public:
       if( verbose )
       {
          for( auto md : this->logsMetadata )
-            std::cout << md << "\t";
+            std::cout << std::setw( md.second ) << md.first;
          std::cout << std::endl;
       }
    }
@@ -148,10 +150,10 @@ public:
       for( auto el : rowEls )
       {
          if( verbose )
-            std::cout << el << "\t";
+            std::cout << std::setw( md->second ) << el;
          if( idx++ > 0 )
             log << "," << std::endl;
-         log << "         \"" << *md++ << "\" : \"" << el << "\"";
+         log << "         \"" << md++->first << "\" : \"" << el << "\"";
       }
       log << std::endl << "      }";
       this->lineStarted = true;
@@ -178,7 +180,6 @@ public:
       if( verbose )
          std::cout << "properties:" << std::endl;
 
-      int idx( this->lineStarted );
       for( auto & it : metadata ) {
          if( verbose )
             std::cout << "   " << it.first << " = " << it.second << std::endl;
diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h
index b7f4fded6..59d2ab3de 100644
--- a/src/Benchmarks/LinearSolvers/benchmarks.h
+++ b/src/Benchmarks/LinearSolvers/benchmarks.h
@@ -145,7 +145,15 @@ benchmarkSolver( Benchmark<>& benchmark,
 
       virtual HeaderElements getTableHeader() const override
       {
-         return HeaderElements({"time", "stddev", "stddev/time", "speedup", "converged", "iterations", "residue_precond", "residue_true"});
+         return HeaderElements( {
+            std::pair< String, int >( "time", 8 ),
+            std::pair< String, int >( "stddev", 8 ),
+            std::pair< String, int >( "stddev/time", 8 ),
+            std::pair< String, int >( "speedup", 8 ),
+            std::pair< String, int >( "converged", 8 ),
+            std::pair< String, int >( "iterations", 8 ),
+            std::pair< String, int >( "residue_precond", 8 ),
+            std::pair< String, int >( "residue_true", 8 ) } );
       }
 
       virtual RowElements getRowElements() const override
diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h
index 2246da558..2c8262d21 100644
--- a/src/Benchmarks/Logging.h
+++ b/src/Benchmarks/Logging.h
@@ -83,7 +83,7 @@ public:
 
    using CommonLogs = std::vector< std::pair< const char*, String > >;
 
-   using HeaderElements = std::vector< String >;
+   using HeaderElements = std::vector< std::pair< String, int > >;
    using RowElements = LoggingRowElements;
 
    Logging( int verbose = true,
@@ -148,7 +148,7 @@ public:
          std::cout << std::setw( 15 ) << "";
 
          for( auto & it : subElements ) {
-            std::cout << std::setw( 15 ) << it;
+            std::cout << std::setw( 15 ) << it.first;
          }
          std::cout << std::endl;
 
@@ -177,7 +177,7 @@ public:
 
       log << header_indent << " " << spanningElement << std::endl;
       for( auto & it : subElements ) {
-         log << header_indent << "! " << it << std::endl;
+         log << header_indent << "! " << it.first << std::endl;
       }
 
       if( horizontalGroups.size() > 0 ) {
diff --git a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
index 6e7d3c77d..3c34c8a18 100644
--- a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
+++ b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
@@ -44,7 +44,17 @@ struct SpmvBenchmarkResult
 
    virtual HeaderElements getTableHeader() const override
    {
-      return HeaderElements( {"format", "device", "non-zeros", "time", "stddev", "stddev/time", "bandwidth", "speedup", "CSR Diff.Max", "CSR Diff.L2"} );
+      return HeaderElements( {
+         std::pair< String, int >( "format", 30 ),
+         std::pair< String, int >( "device", 12 ),
+         std::pair< String, int >( "non-zeros", 12 ),
+         std::pair< String, int >( "time", 12 ),
+         std::pair< String, int >( "stddev", 12 ),
+         std::pair< String, int >( "stddev/time", 14 ),
+         std::pair< String, int >( "bandwidth", 12 ),
+         std::pair< String, int >( "speedup", 12 ),
+         std::pair< String, int >( "CSR Diff.Max", 14 ),
+         std::pair< String, int >( "CSR Diff.L2", 14 ) } );
    }
 
    void setFormat( const String& format ) { this->format = format; };
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index ee0f551aa..05ce46304 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -41,8 +41,8 @@
 
 // Comment the following to turn off some groups of SpMV benchmarks and speed-up the compilation
 #define WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES
-//#define WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
-//#define WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
+#define WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
+#define WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
 
 // Uncomment the following line to enable benchmarking the sandbox sparse matrix.
 //#define WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
@@ -220,17 +220,6 @@ std::string getFormatShort( const Matrix& matrix )
     return format;
 }
 
-// Print information about the matrix.
-/*template< typename Matrix >
-void printMatrixInfo( const Matrix& matrix,
-                      std::ostream& str )
-{
-    str << "\n Format: " << getMatrixFormat( matrix ) << std::endl;
-    str << " Rows: " << matrix.getRows() << std::endl;
-    str << " Cols: " << matrix.getColumns() << std::endl;
-    str << " Nonzero Elements: " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
-}*/
-
 template< typename Real,
           template< typename, typename, typename > class Matrix,
           template< typename, typename, typename, typename > class Vector = Containers::Vector >
@@ -250,12 +239,6 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
 
    SpMV::ReferenceFormats::Legacy::LegacyMatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );
 
-   /*benchmark.setMetadataColumns( BenchmarkType::MetadataColumns({
-         { "matrix name", convertToString( inputFileName ) },
-         { "rows", convertToString( hostMatrix.getRows() ) },
-         { "columns", convertToString( hostMatrix.getColumns() ) },
-         { "matrix format", MatrixInfo< HostMatrix >::getFormat() }
-      } ));*/
    const int elements = hostMatrix.getNonzeroElementsCount();
    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
    benchmark.setOperation( datasetSize );
@@ -295,7 +278,6 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
    SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
  #endif
-   // std::cout << std::endl;
 }
 
 template< typename Real,
@@ -325,12 +307,6 @@ benchmarkSpMV( BenchmarkType& benchmark,
       return;
    }
 
-   /*benchmark.setMetadataColumns( BenchmarkType::MetadataColumns({
-         { "matrix name", convertToString( inputFileName ) },
-         { "rows", convertToString( hostMatrix.getRows() ) },
-         { "columns", convertToString( hostMatrix.getColumns() ) },
-         { "matrix format", MatrixInfo< HostMatrix >::getFormat() }
-      } ));*/
    const int elements = hostMatrix.getNonzeroElementsCount();
    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
    benchmark.setOperation( datasetSize );
@@ -371,7 +347,6 @@ benchmarkSpMV( BenchmarkType& benchmark,
    SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
  #endif
-   // std::cout << std::endl;
 }
 
 template< typename Real = double,
@@ -425,13 +400,6 @@ benchmarkSpmv( BenchmarkType& benchmark,
       { "rows", convertToString( csrHostMatrix.getRows() ) },
       { "columns", convertToString( csrHostMatrix.getColumns() ) } } ) );
 
-   /*benchmark.setMetadataColumns( BenchmarkType::MetadataColumns({
-         { "matrix name", convertToString( inputFileName ) },
-         { "rows", convertToString( csrHostMatrix.getRows() ) },
-         { "columns", convertToString( csrHostMatrix.getColumns() ) },
-         { "matrix format", String( "CSR" ) }
-      } ));*/
-
    HostVector hostInVector( csrHostMatrix.getRows() ), hostOutVector( csrHostMatrix.getRows() );
 
    auto resetHostVectors = [&]() {
@@ -452,13 +420,6 @@ benchmarkSpmv( BenchmarkType& benchmark,
    ////
    // Perform benchmark on CUDA device with cuSparse as a reference GPU format
    //
-   /*benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-         { "matrix name", convertToString( inputFileName ) },
-         { "rows", convertToString( csrHostMatrix.getRows() ) },
-         { "columns", convertToString( csrHostMatrix.getColumns() ) },
-         { "matrix format", String( "cuSparse" ) }
-      } ));*/
-
    cusparseHandle_t cusparseHandle;
    cusparseCreate( &cusparseHandle );
 
@@ -487,12 +448,6 @@ benchmarkSpmv( BenchmarkType& benchmark,
    // Perform benchmark on CUDA device with CSR5 as a reference GPU format
    //
    cudaBenchmarkResults.setFormat( String( "CSR5" ) );
-   /*benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-      { "matrix name", convertToString( inputFileName ) },
-      { "rows", convertToString( csrHostMatrix.getRows() ) },
-      { "columns", convertToString( csrHostMatrix.getColumns() ) },
-      { "matrix format", String( "CSR5" ) }
-   } ));*/
 
    CudaVector cudaOutVector2( cudaOutVector );
    CSR5Benchmark::CSR5Benchmark< CSRCudaMatrix > csr5Benchmark( csrCudaMatrix, cudaInVector, cudaOutVector );
@@ -510,12 +465,6 @@ benchmarkSpmv( BenchmarkType& benchmark,
    // Perform benchmark on CUDA device with LightSpMV as a reference GPU format
    //
    cudaBenchmarkResults.setFormat( String( "LightSpMV Vector" ) );
-   /*benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-      { "matrix name", convertToString( inputFileName ) },
-      { "rows", convertToString( csrHostMatrix.getRows() ) },
-      { "columns", convertToString( csrHostMatrix.getColumns() ) },
-      { "matrix format", String( "LightSpMV Vector" ) }
-   } ));*/
 
    LightSpMVCSRHostMatrix lightSpMVCSRHostMatrix;
    lightSpMVCSRHostMatrix = csrHostMatrix;
@@ -530,12 +479,6 @@ benchmarkSpmv( BenchmarkType& benchmark,
    benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cudaBenchmarkResults );
 
    cudaBenchmarkResults.setFormat( String( "LightSpMV Warp" ) );
-   /*benchmark.setMetadataColumns( Benchmark::MetadataColumns({
-      { "matrix name", convertToString( inputFileName ) },
-      { "rows", convertToString( csrHostMatrix.getRows() ) },
-      { "columns", convertToString( csrHostMatrix.getColumns() ) },
-      { "matrix format", String( "LightSpMV Warp" ) }
-   } ));*/
    lightSpMVBenchmark.setKernelType( LightSpMVBenchmarkKernelWarp );
    benchmark.time< Devices::Cuda >( resetLightSpMVVectors, "GPU", spmvLightSpMV, cudaBenchmarkResults );
 #endif
@@ -625,6 +568,6 @@ benchmarkSpmv( BenchmarkType& benchmark,
 #endif
 }
 
-} // namespace SpMVLegacy
-} // namespace Benchmarks
+      } // namespace SpMVLegacy
+   } // namespace Benchmarks
 } // namespace TNL
diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
index c307ec84c..ecf17d53b 100755
--- a/src/Benchmarks/scripts/run-tnl-benchmark-spmv
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
@@ -27,7 +27,7 @@ source florida-matrix-market
 #      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log
 #   else
 #      gunzip -c ${matrix} > ${unzipped_matrix}
-#      echo "Benchmarking with the matrix $unzipped_matrix ..."
+#      #echo "Benchmarking with the matrix $unzipped_matrix ..."
 #      export CUDA_PROFILE_LOG=$unzipped_matrix.float.log
 #      if test x$DEBUG = xyes;
 #      then
@@ -61,9 +61,9 @@ do
      for file in $DIRNAME/$SUBDIRNAME/*.mtx;
      do
          echo "======================================================================================================"
-         echo "Benchmarking with the matrix $file ..."
-	 mtx_file_name=`basename $file`
-	 mtx_file_name=${mtx_file_name%.mtx}
+         #echo "Benchmarking with the matrix $file ..."
+	      mtx_file_name=`basename $file`
+	      mtx_file_name=${mtx_file_name%.mtx}
          if test x$DEBUG = xyes;
          then
             gdb --args ${BENCHMARK_DBG} --input-file $file --log-file log-files/sparse-matrix-benchmark.log --output-mode append --verbose 1
-- 
GitLab


From d0cb19f1301e24a72eec79df78177dafb851cb84 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 8 Jun 2021 14:35:26 +0200
Subject: [PATCH 071/117] Added SpMV benchmark of binary matrices.

---
 src/Benchmarks/SpMV/SpmvBenchmarkResult.h |  5 +-
 src/Benchmarks/SpMV/spmv.h                | 92 ++++++++++++++++++++++-
 src/TNL/Matrices/MatrixInfo.h             | 13 +++-
 3 files changed, 104 insertions(+), 6 deletions(-)

diff --git a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
index 3c34c8a18..61fae4f60 100644
--- a/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
+++ b/src/Benchmarks/SpMV/SpmvBenchmarkResult.h
@@ -18,6 +18,7 @@ namespace Benchmarks {
 template< typename Real,
           typename Device,
           typename Index,
+          typename ResultReal = Real,
           typename Logger = JsonLogging >
 struct SpmvBenchmarkResult
 : public BenchmarkResult< Logger >
@@ -26,7 +27,7 @@ struct SpmvBenchmarkResult
    using DeviceType = Device;
    using IndexType = Index;
    using HostVector = Containers::Vector< Real, Devices::Host, Index >;
-   using BenchmarkVector = Containers::Vector< Real, Device, Index >;
+   using BenchmarkVector = Containers::Vector< ResultReal, Device, Index >;
 
    using typename BenchmarkResult< Logger >::HeaderElements;
    using typename BenchmarkResult< Logger >::RowElements;
@@ -45,7 +46,7 @@ struct SpmvBenchmarkResult
    virtual HeaderElements getTableHeader() const override
    {
       return HeaderElements( {
-         std::pair< String, int >( "format", 30 ),
+         std::pair< String, int >( "format", 35 ),
          std::pair< String, int >( "device", 12 ),
          std::pair< String, int >( "non-zeros", 12 ),
          std::pair< String, int >( "time", 12 ),
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 05ce46304..72fd553a9 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -42,6 +42,7 @@
 // Comment the following to turn off some groups of SpMV benchmarks and speed-up the compilation
 #define WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES
 #define WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
+#define WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
 #define WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
 
 // Uncomment the following line to enable benchmarking the sandbox sparse matrix.
@@ -349,6 +350,75 @@ benchmarkSpMV( BenchmarkType& benchmark,
  #endif
 }
 
+template< typename Real,
+          typename InputMatrix,
+          template< typename, typename, typename > class Matrix,
+          template< typename, typename, typename, typename > class Vector = Containers::Vector >
+void
+benchmarkBinarySpMV( BenchmarkType& benchmark,
+                     const InputMatrix& inputMatrix,
+                     const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
+                     const String& inputFileName,
+                     bool verboseMR )
+{
+   using HostMatrix = Matrix< bool, TNL::Devices::Host, int >;
+   using CudaMatrix = Matrix< bool, TNL::Devices::Cuda, int >;
+   using HostVector = Containers::Vector< Real, Devices::Host, int >;
+   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
+
+   HostMatrix hostMatrix;
+   try
+   {
+      hostMatrix = inputMatrix;
+   }
+   catch(const std::exception& e)
+   {
+      std::cerr << "Unable to convert the matrix to the target format." << std::endl;
+      return;
+   }
+
+   const int elements = hostMatrix.getNonzeroElementsCount();
+   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+   benchmark.setOperation( datasetSize );
+
+   /////
+   // Benchmark SpMV on host
+   //
+   HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );
+
+   auto resetHostVectors = [&]() {
+      hostInVector = 1.0;
+      hostOutVector = 0.0;
+   };
+
+   auto spmvHost = [&]() {
+      hostMatrix.vectorProduct( hostInVector, hostOutVector );
+
+   };
+   SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+   benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
+
+   /////
+   // Benchmark SpMV on CUDA
+   //
+#ifdef HAVE_CUDA
+   CudaMatrix cudaMatrix;
+   cudaMatrix = inputMatrix;
+   CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() );
+
+   auto resetCudaVectors = [&]() {
+      cudaInVector = 1.0;
+      cudaOutVector = 0.0;
+   };
+
+   auto spmvCuda = [&]() {
+      cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
+   };
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+   benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
+ #endif
+}
+
 template< typename Real = double,
           typename Index = int >
 void
@@ -378,9 +448,9 @@ benchmarkSpmv( BenchmarkType& benchmark,
    using CusparseMatrix = TNL::CusparseCSR< Real >;
 #endif
 
-
    using HostVector = Containers::Vector< Real, Devices::Host, int >;
    using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
+   using BinaryHostVector = Containers::Vector< int, Devices::Host, int >;
 
    CSRHostMatrix csrHostMatrix;
 
@@ -526,6 +596,16 @@ benchmarkSpmv( BenchmarkType& benchmark,
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack               >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_BiEllpack                    >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+#ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar             >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector             >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid             >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive           >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_Ellpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack          >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack         >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_BiEllpack              >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+#endif
 #ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
    benchmarkSpMV< Real, HostMatrixType, SparseSandboxMatrix                       >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
 #endif
@@ -564,6 +644,16 @@ benchmarkSpmv( BenchmarkType& benchmark,
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack               >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack                    >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
+#ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive           >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack          >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack         >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
+#endif
    }
 #endif
 }
diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h
index 716423884..820a4d8e5 100644
--- a/src/TNL/Matrices/MatrixInfo.h
+++ b/src/TNL/Matrices/MatrixInfo.h
@@ -67,10 +67,17 @@ struct MatrixInfo< SparseMatrixView< Real, Device, Index, MatrixType, SegmentsVi
 
    static String getFormat()
    {
+      String prefix;
       if( MatrixType::isSymmetric() )
-         return TNL::String( "Symmetric " ) + SegmentsView< Device, Index >::getSegmentsType();
-      else
-         return SegmentsView< Device, Index >::getSegmentsType();
+      {
+         if( std::is_same< Real, bool >::value )
+            prefix = "Symmetric binary ";
+         else
+            prefix = "Symmetric ";
+      }
+      else if( std::is_same< Real, bool >::value )
+         prefix = "Binary ";
+      return prefix + SegmentsView< Device, Index >::getSegmentsType();
    };
 };
 
-- 
GitLab


From 71476a177178329090d1070e2f4246f070d8cc56 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 9 Jun 2021 13:56:11 +0200
Subject: [PATCH 072/117] Enabling symmetric binary sparse matrices - it seems
 to work well. Unit tests need to be created in the future.

---
 src/TNL/Matrices/SparseMatrix.h     | 2 +-
 src/TNL/Matrices/SparseMatrixView.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 3b139d941..2deccd645 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -54,7 +54,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
    static_assert(
          ! MatrixType::isSymmetric() ||
          ! std::is_same< Device, Devices::Cuda >::value ||
-         ( std::is_same< Real, float >::value || std::is_same< Real, double >::value || std::is_same< Real, int >::value || std::is_same< Real, long long int >::value ),
+         ( std::is_same< Real, float >::value || std::is_same< Real, double >::value || std::is_same< Real, int >::value || std::is_same< Real, long long int >::value || std::is_same< Real, bool >::value ),
          "Given Real type is not supported by atomic operations on GPU which are necessary for symmetric operations." );
 
    public:
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index c59a79690..376ceab5c 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -67,7 +67,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
    static_assert(
       ! MatrixType::isSymmetric() ||
       ! std::is_same< Device, Devices::Cuda >::value ||
-      ( std::is_same< Real, float >::value || std::is_same< Real, double >::value || std::is_same< Real, int >::value || std::is_same< Real, long long int >::value ),
+      ( std::is_same< Real, float >::value || std::is_same< Real, double >::value || std::is_same< Real, int >::value || std::is_same< Real, long long int >::value || std::is_same< Real, bool >::value ),
       "Given Real type is not supported by atomic operations on GPU which are necessary for symmetric operations." );
 
    public:
-- 
GitLab


From f32bb18d9ac8a104c7169d52acdd1669dbed5def Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 9 Jun 2021 13:56:59 +0200
Subject: [PATCH 073/117] Disabling various versions of legacy CSR Light SpMV
 benchmarks.

---
 src/Benchmarks/SpMV/spmv.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 72fd553a9..a57719aa8 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -563,12 +563,12 @@ benchmarkSpmv( BenchmarkType& benchmark,
       using namespace Benchmarks::SpMV::ReferenceFormats;
       benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Scalar             >( benchmark, hostOutVector, inputFileName, verboseMR );
       benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Vector             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light              >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light2             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light3             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light4             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light5             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light6             >( benchmark, hostOutVector, inputFileName, verboseMR );
+      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light              >( benchmark, hostOutVector, inputFileName, verboseMR );
+      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light2             >( benchmark, hostOutVector, inputFileName, verboseMR );
+      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light3             >( benchmark, hostOutVector, inputFileName, verboseMR );
+      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light4             >( benchmark, hostOutVector, inputFileName, verboseMR );
+      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light5             >( benchmark, hostOutVector, inputFileName, verboseMR );
+      //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light6             >( benchmark, hostOutVector, inputFileName, verboseMR );
       benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Adaptive           >( benchmark, hostOutVector, inputFileName, verboseMR );
       benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_MultiVector        >( benchmark, hostOutVector, inputFileName, verboseMR );
       benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic >( benchmark, hostOutVector, inputFileName, verboseMR );
-- 
GitLab


From 5e86335704fb58b6dee951127a5f9b59330725fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 11 Jun 2021 12:32:40 +0200
Subject: [PATCH 074/117] Added exception handling to SpMV benchmark.

---
 src/Benchmarks/SpMV/spmv.h | 82 +++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index a57719aa8..3cd831e6d 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -186,41 +186,6 @@ using SparseMatrixLegacy_CSR_LightWithoutAtomic = Benchmarks::SpMV::ReferenceFor
 template< typename Real, typename Device, typename Index >
 using SlicedEllpackAlias = Benchmarks::SpMV::ReferenceFormats::Legacy::SlicedEllpack< Real, Device, Index >;
 
-// Get the name (with extension) of input matrix file
-std::string getMatrixFileName( const String& InputFileName )
-{
-    std::string fileName = InputFileName;
-
-    const size_t last_slash_idx = fileName.find_last_of( "/\\" );
-    if( std::string::npos != last_slash_idx )
-        fileName.erase( 0, last_slash_idx + 1 );
-
-    return fileName;
-}
-
-// Get only the name of the format from getType()
-template< typename Matrix >
-std::string getMatrixFormat( const Matrix& matrix )
-{
-    std::string mtrxFullType = getType( matrix );
-    std::string mtrxType = mtrxFullType.substr( 0, mtrxFullType.find( "<" ) );
-    std::string format = mtrxType.substr( mtrxType.find( ':' ) + 2 );
-
-    return format;
-}
-
-template< typename Matrix >
-std::string getFormatShort( const Matrix& matrix )
-{
-    std::string mtrxFullType = getType( matrix );
-    std::string mtrxType = mtrxFullType.substr( 0, mtrxFullType.find( "<" ) );
-    std::string format = mtrxType.substr( mtrxType.find( ':' ) + 2 );
-    format = format.substr( format.find(':') + 2);
-    format = format.substr( 0, 3 );
-
-    return format;
-}
-
 template< typename Real,
           template< typename, typename, typename > class Matrix,
           template< typename, typename, typename, typename > class Vector = Containers::Vector >
@@ -238,7 +203,15 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
    HostMatrix hostMatrix;
    CudaMatrix cudaMatrix;
 
-   SpMV::ReferenceFormats::Legacy::LegacyMatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );
+   try
+   {
+      SpMV::ReferenceFormats::Legacy::LegacyMatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );
+   }
+   catch(const std::exception& e)
+   {
+      std::cerr << "Unable to read the matrix: " << e.what() << std::endl;
+      return;
+   }
 
    const int elements = hostMatrix.getNonzeroElementsCount();
    const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
@@ -265,7 +238,16 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
    // Benchmark SpMV on CUDA
    //
 #ifdef HAVE_CUDA
-   cudaMatrix = hostMatrix;
+   try
+   {
+      cudaMatrix = hostMatrix;
+   }
+   catch(const std::exception& e)
+   {
+      std::cerr << "Unable to copy the matrix on GPU: " << e.what() << std::endl;
+      return;
+   }
+
    CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() );
 
    auto resetCudaVectors = [&]() {
@@ -304,7 +286,7 @@ benchmarkSpMV( BenchmarkType& benchmark,
    }
    catch(const std::exception& e)
    {
-      std::cerr << "Unable to convert the matrix to the target format." << std::endl;
+      std::cerr << "Unable to convert the matrix to the target format:"  << e.what() << std::endl;
       return;
    }
 
@@ -334,7 +316,16 @@ benchmarkSpMV( BenchmarkType& benchmark,
    //
 #ifdef HAVE_CUDA
    CudaMatrix cudaMatrix;
-   cudaMatrix = inputMatrix;
+   try
+   {
+      cudaMatrix = inputMatrix;
+   }
+   catch(const std::exception& e)
+   {
+      std::cerr << "Unable to copy the matrix on GPU:" << e.what() << std::endl;
+      return;
+   }
+
    CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() );
 
    auto resetCudaVectors = [&]() {
@@ -373,7 +364,7 @@ benchmarkBinarySpMV( BenchmarkType& benchmark,
    }
    catch(const std::exception& e)
    {
-      std::cerr << "Unable to convert the matrix to the target format." << std::endl;
+      std::cerr << "Unable to convert the matrix to the target format:" << e.what() << std::endl;
       return;
    }
 
@@ -403,7 +394,16 @@ benchmarkBinarySpMV( BenchmarkType& benchmark,
    //
 #ifdef HAVE_CUDA
    CudaMatrix cudaMatrix;
-   cudaMatrix = inputMatrix;
+   try
+   {
+      cudaMatrix = inputMatrix;
+   }
+   catch(const std::exception& e)
+   {
+      std::cerr << "Unable to copy the matrix on GPU:" << e.what() << std::endl;
+      return;
+   }
+
    CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() );
 
    auto resetCudaVectors = [&]() {
-- 
GitLab


From a34cb1113c437f1e3c6b180d3e2b11fbf8015c0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 11 Jun 2021 12:35:11 +0200
Subject: [PATCH 075/117] Fixing and refactoring SpMV benchmark script.

---
 src/Benchmarks/scripts/run-tnl-benchmark-spmv | 58 +++++++++----------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
index ecf17d53b..0c75634e4 100755
--- a/src/Benchmarks/scripts/run-tnl-benchmark-spmv
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
@@ -7,37 +7,36 @@ export CUDA_PROFILE=0
 PWD=`pwd`
 IWD="$PWD"
 BASE="ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/"
-BENCHMARK="tnl-benchmark-spmv"
-BENCHMARK_DBG="tnl-benchmark-spmv-dbg"
+BENCHMARK="tnl-benchmark-spmv --with-legacy-matrices yes --precision double"
+BENCHMARK_DBG="tnl-benchmark-spmv-dbg --with-legacy-matrices no"
 
 export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf"
 PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl"
-#source matrix-market
+source matrix-market
+M_MATRICES=""
 source florida-matrix-market
+#FLORIDA_MM_MATRICES=""
 
-# !!!Matrices in MatrixMarket2 don't load properly, formatting issues with every file. MatrixReader fails.
-#for link in $MM_MATRICES;
-#do
-#   echo "======================================================================================================"
-#   matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'`
-#   unzipped_matrix=`echo $matrix | sed 's/.gz//'`
-#   if test ! -e $matrix;
-#   then
-#      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
-#      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log
-#   else
-#      gunzip -c ${matrix} > ${unzipped_matrix}
-#      #echo "Benchmarking with the matrix $unzipped_matrix ..."
-#      export CUDA_PROFILE_LOG=$unzipped_matrix.float.log
-#      if test x$DEBUG = xyes;
-#      then
-#         gdb --args ${BENCHMARK_DBG} --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
-#      else
-#         $BENCHMARK --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
-#      fi
-#      #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log
-#   fi
-#done
+for link in $MM_MATRICES;
+do
+   echo "======================================================================================================"
+   matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'`
+   unzipped_matrix=`echo $matrix | sed 's/.gz//'`
+   if test ! -e $matrix;
+   then
+      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
+   else
+      gunzip -c ${matrix} > ${unzipped_matrix}
+      echo "Benchmarking with the matrix $unzipped_matrix ..."
+      export CUDA_PROFILE_LOG=$unzipped_matrix.float.log
+      if test x$DEBUG = xyes;
+      then
+         gdb --args ${BENCHMARK_DBG} --input-file $unzipped_matrix --log-file log-files/sparse-matrix-benchmark.log --verbose 1
+      else
+         $BENCHMARK --input-file $unzipped_matrix --log-file log-files/sparse-matrix-benchmark.log --verbose 1
+      fi
+   fi
+done
 
 for link in $FLORIDA_MM_MATRICES;
 do
@@ -45,7 +44,6 @@ do
    if test ! -e $matrix;
    then
       echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
-      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log
    else
      DIRNAME=`dirname $matrix`
      FILENAME=`basename $matrix`
@@ -61,9 +59,8 @@ do
      for file in $DIRNAME/$SUBDIRNAME/*.mtx;
      do
          echo "======================================================================================================"
-         #echo "Benchmarking with the matrix $file ..."
-	      mtx_file_name=`basename $file`
-	      mtx_file_name=${mtx_file_name%.mtx}
+         mtx_file_name=`basename $file`
+         mtx_file_name=${mtx_file_name%.mtx}
          if test x$DEBUG = xyes;
          then
             gdb --args ${BENCHMARK_DBG} --input-file $file --log-file log-files/sparse-matrix-benchmark.log --output-mode append --verbose 1
@@ -74,3 +71,4 @@ do
    fi
 done
 $BENCHMARK --log-file log-files/sparse-matrix-benchmark.log --output-mode close --verbose 1
+
-- 
GitLab


From cd398c0f8fd277727e3e56fa6495562d49f09151 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 11 Jun 2021 12:37:51 +0200
Subject: [PATCH 076/117] Trying different setup of CSR Hybrid kernel and
 fixing description in unit test of CSR Hybrid sparse matrix.

---
 src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h   | 6 +++---
 src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp | 2 +-
 src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp   | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h
index 138819352..c3271d776 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h
@@ -22,13 +22,13 @@ namespace TNL {
 
 template< typename Index,
           typename Device,
-          int ThreadsInBlock = 256 >
+          int ThreadsInBlock = 128 >
 struct CSRHybridKernel
 {
    using IndexType = Index;
    using DeviceType = Device;
-   using ViewType = CSRHybridKernel< Index, Device >;
-   using ConstViewType = CSRHybridKernel< Index, Device >;
+   using ViewType = CSRHybridKernel< Index, Device, ThreadsInBlock >;
+   using ConstViewType = CSRHybridKernel< Index, Device, ThreadsInBlock >;
 
    template< typename Offsets >
    void init( const Offsets& offsets );
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
index 64f414cf8..07225cc4e 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
@@ -209,7 +209,7 @@ TNL::String
 CSRHybridKernel< Index, Device, ThreadsInBlock >::
 getKernelType()
 {
-    return "Hybrid";
+    return "Hybrid " + TNL::convertToString( ThreadsInBlock );
 }
 
 template< typename Index,
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp
index 5aef16abb..214ed2ca7 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          SparseMatrixVectorProductTest_CSRHybrid.cpp -  description
+                          SparseMatrixTest_CSRHybrid.cpp -  description
                              -------------------
     begin                : Jan 23, 2021
     copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -8,4 +8,4 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#include "SparseMatrixVectorProductTest_CSRHybrid.h"
+#include "SparseMatrixTest_CSRHybrid.h"
-- 
GitLab


From 14060b9f091d7c93ab0c7d1d5e3fc85c6da0ab8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 11 Jun 2021 12:39:58 +0200
Subject: [PATCH 077/117] Small fixes in Python script for processing of SpMV
 benchmark results.

---
 .../scripts/tnl-spmv-benchmark-make-tables-json.py   | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
index 1f497647f..fc156b4b6 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
@@ -141,7 +141,7 @@ def cusparse_comparison( df, formats, head_size=10 ):
          head_df = filtered_df.head( head_size )
          for f in formats:
             if not f in ['cusparse','CSR',format]:
-               print( f"Droping {f}..." )
+               #print( f"Droping {f}..." )
                head_df.drop( labels=f, axis='columns', level=0, inplace=True )
          head_df.to_html( f"Cusparse-bw/{format}-head.html" )
 
@@ -167,10 +167,10 @@ def csr_comparison( df, formats, head_size=10 ):
          axs[1].set_ylabel( 'Bandwidth in GB/sec' )
          plt.savefig( f"CSR-bw/{format}.pdf")
          plt.close(fig)
-         head_df = filtered_df.head( head_size )
+         head_df = df.head( head_size )
          for f in formats:
             if not f in ['cusparse','CSR',format]:
-               print( f"Droping {f}..." )
+               #print( f"Droping {f}..." )
                head_df.drop( labels=f, axis='columns', level=0, inplace=True )
          head_df.to_html( f"CSR-bw/{format}-head.html" )
 
@@ -202,10 +202,10 @@ def legacy_formats_comparison( df, formats, head_size=10 ):
          axs[1].set_ylabel( 'Bandwidth in GB/sec' )
          plt.savefig( f"Legacy-bw/{ref_format}.pdf")
          plt.close(fig)
-         head_df = filtered_df.head( head_size )
+         head_df = df.head( head_size )
          for f in formats:
             if not f in ['cusparse','CSR',format]:
-               print( f"Droping {f}..." )
+               #print( f"Droping {f}..." )
                head_df.drop( labels=f, axis='columns', level=0, inplace=True )
          head_df.to_html( f"Legacy-bw/{format}-head.html" )
 
@@ -238,7 +238,7 @@ def cusparse_speedup_comparison( df, formats, head_size=10 ):
          head_df = filtered_df.head( head_size )
          for f in formats:
             if not f in ['cusparse','CSR',format]:
-               print( f"Droping {f}..." )
+               #print( f"Droping {f}..." )
                head_df.drop( labels=f, axis='columns', level=0, inplace=True )
          head_df.to_html( f"Cusparse-speed-up/{format}-head.html" )
 
-- 
GitLab


From 45ad3fa79451c194e26fc492a695cb7ed0efd7a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 11 Jun 2021 16:11:16 +0200
Subject: [PATCH 078/117] Implementing Light CSR kernel for segments.

---
 src/TNL/Algorithms/Segments/CSR.h             |   6 +-
 src/TNL/Algorithms/Segments/CSRView.h         |   8 +-
 .../Segments/Kernels/CSRLightKernel.h         |  64 +++
 .../Segments/Kernels/CSRLightKernel.hpp       | 422 ++++++++++++++++++
 src/UnitTests/Matrices/CMakeLists.txt         |   2 +
 .../Matrices/SparseMatrixTest_CSRLight.cpp    |  11 +
 .../Matrices/SparseMatrixTest_CSRLight.cu     |   1 +
 .../Matrices/SparseMatrixTest_CSRLight.h      |  46 ++
 ...SparseMatrixVectorProductTest_CSRLight.cpp |  11 +
 .../SparseMatrixVectorProductTest_CSRLight.cu |   1 +
 .../SparseMatrixVectorProductTest_CSRLight.h  |  46 ++
 11 files changed, 616 insertions(+), 2 deletions(-)
 create mode 100644 src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
 create mode 100644 src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp
 create mode 120000 src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cpp
 create mode 120000 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cu
 create mode 100644 src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.h

diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index eebd186a6..8fba00b2a 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -546,6 +546,11 @@ template< typename Device,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
 using CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAllocator >;
 
+template< typename Device,
+          typename Index,
+          typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
+using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >;
+
 template< typename Device,
           typename Index,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
@@ -556,7 +561,6 @@ template< typename Device,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
 using CSRDefault = CSRScalar< Device, Index, IndexAllocator >;
 
-
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 8770f8ca8..884ed71cf 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -17,6 +17,7 @@
 #include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h>
 #include <TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h>
 #include <TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h>
 #include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.h>
 #include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
@@ -163,9 +164,14 @@ template< typename Device,
           typename Index >
 using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< Index, Device > >;
 
+template< typename Device,
+          typename Index,
+          int ThreadsInBlock = 256 >
+using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device, ThreadsInBlock > >;
+
 template< typename Device,
           typename Index >
-using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device > >;
+using CSRViewLight = CSRView< Device, Index, CSRLightKernel< Index, Device > >;
 
 template< typename Device,
           typename Index >
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
new file mode 100644
index 000000000..a3aa961b4
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
@@ -0,0 +1,64 @@
+/***************************************************************************
+                          CSRLightKernel.h -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+template< typename Index,
+          typename Device >
+struct CSRLightKernel
+{
+   using IndexType = Index;
+   using DeviceType = Device;
+   using ViewType = CSRLightKernel< Index, Device >;
+   using ConstViewType = CSRLightKernel< Index, Device >;
+   /// Examines the segment offsets to decide how many threads cooperate on one segment.
+   template< typename Offsets >
+   void init( const Offsets& offsets );
+   /// Sets threadsPerSegment back to 0.
+   void reset();
+   /// Returns a copy of this object (ViewType is the kernel type itself).
+   ViewType getView();
+   /// Const counterpart of getView(); also returns a copy of this object.
+   ConstViewType getConstView() const;
+   /// Returns the name of the kernel, "Light".
+   static TNL::String getKernelType();
+   /// Reduces segments [first, last): values come from fetch, are combined by reduction starting from zero, and are stored by keeper.
+   template< typename OffsetsView,
+             typename Fetch,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real >
+   void reduceSegments( const OffsetsView& offsets,
+                        Index first,
+                        Index last,
+                        Fetch& fetch,
+                        const Reduction& reduction,
+                        ResultKeeper& keeper,
+                        const Real& zero ) const;
+   /// Number of threads cooperating on one segment; 0 after construction and after reset().
+   protected:
+      int threadsPerSegment = 0;
+};
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
+
+#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp>
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
new file mode 100644
index 000000000..e31ff3f43
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -0,0 +1,422 @@
+/***************************************************************************
+                          CSRLightKernel.hpp -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+#ifdef HAVE_CUDA
+/* Reduces CSR segments [first, last) with groups of 2 threads per segment.
+ * Expects a 1D launch whose block size is a multiple of 2 (the position in the
+ * group is taken from threadIdx.x); gridID numbers the launches of a split grid. */
+template< typename Real, typename Index, typename OffsetsView,
+          typename Fetch, typename Reduce, typename Keep >
+__global__
+void SpMVCSRLightWithoutAtomic2( OffsetsView offsets,
+                                 const Index first,
+                                 const Index last,
+                                 Fetch fetch,
+                                 Reduce reduce,
+                                 Keep keep,
+                                 const Real zero,
+                                 const Index gridID)
+{
+   // Each earlier launch covered getMaxGridXSize() blocks, i.e. that many times blockDim.x threads.
+   const size_t globalThreadIdx =
+      ( size_t( gridID ) * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index segmentIdx = first + globalThreadIdx / 2;
+   if( segmentIdx >= last )
+      return;
+   const Index inGroupID = threadIdx.x & 1; // & is cheaper than %
+   const Index maxID = offsets[ segmentIdx  + 1];
+
+   Real result = zero;
+   for( Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 2)
+      result = reduce( result, fetch( i, true ) );
+
+   /* Parallel reduction */
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );
+
+   /* Only the first thread of a group ends up with the complete result */
+   if( inGroupID == 0 )
+      keep( segmentIdx, result );
+}
+
+/* Reduces CSR segments [first, last) with groups of 4 threads per segment.
+ * Expects a 1D launch whose block size is a multiple of 4 (the position in the
+ * group is taken from threadIdx.x); gridID numbers the launches of a split grid. */
+template< typename Real, typename Index, typename OffsetsView,
+          typename Fetch, typename Reduce, typename Keep >
+__global__
+void SpMVCSRLightWithoutAtomic4( OffsetsView offsets,
+                                 const Index first,
+                                 const Index last,
+                                 Fetch fetch,
+                                 Reduce reduce,
+                                 Keep keep,
+                                 const Real zero,
+                                 const Index gridID )
+{
+   // Each earlier launch covered getMaxGridXSize() blocks, i.e. that many times blockDim.x threads.
+   const size_t globalThreadIdx =
+      ( size_t( gridID ) * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index segmentIdx = first + globalThreadIdx / 4;
+   if( segmentIdx >= last )
+      return;
+   const Index inGroupID = threadIdx.x & 3; // & is cheaper than %
+   const Index maxID = offsets[segmentIdx + 1];
+
+   Real result = zero;
+   for (Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 4)
+      result = reduce( result, fetch( i, true ) );
+
+   /* Parallel reduction */
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );
+
+   /* Only the first thread of a group ends up with the complete result */
+   if( inGroupID == 0 )
+      keep( segmentIdx, result );
+
+}
+
+/* Reduces CSR segments [first, last) with groups of 8 threads per segment.
+ * Expects a 1D launch whose block size is a multiple of 8 (the position in the
+ * group is taken from threadIdx.x); gridID numbers the launches of a split grid. */
+template< typename Real, typename Index, typename OffsetsView,
+          typename Fetch, typename Reduce, typename Keep >
+__global__
+void SpMVCSRLightWithoutAtomic8( OffsetsView offsets,
+                                 const Index first,
+                                 const Index last,
+                                 Fetch fetch,
+                                 Reduce reduce,
+                                 Keep keep,
+                                 const Real zero,
+                                 const Index gridID)
+{
+   // Each earlier launch covered getMaxGridXSize() blocks, i.e. that many times blockDim.x threads.
+   const size_t globalThreadIdx =
+      ( size_t( gridID ) * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index segmentIdx = first + globalThreadIdx / 8;
+   if( segmentIdx >= last )
+      return;
+   Index i;
+   const Index inGroupID = threadIdx.x & 7; // & is cheaper than %
+   const Index maxID = offsets[segmentIdx + 1];
+
+   Real result = zero;
+   for (i = offsets[segmentIdx] + inGroupID; i < maxID; i += 8)
+      result = reduce( result, fetch( i, true ) );
+
+   /* Parallel reduction */
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );
+
+   /* Only the first thread of a group ends up with the complete result */
+   if( inGroupID == 0 )
+      keep( segmentIdx, result );
+}
+
+/* Reduces CSR segments [first, last) with groups of 16 threads per segment. Expects a 1D launch
+ * with a block size divisible by 16; gridID numbers the launches of a split grid. */
+template< typename Real, typename Index, typename OffsetsView,
+          typename Fetch, typename Reduce, typename Keep >
+__global__
+void SpMVCSRLightWithoutAtomic16( OffsetsView offsets,
+                                  const Index first,
+                                  const Index last,
+                                  Fetch fetch,
+                                  Reduce reduce,
+                                  Keep keep,
+                                  const Real zero,
+                                  const Index gridID )
+{
+   // Each earlier launch covered getMaxGridXSize() blocks, i.e. that many times blockDim.x threads.
+   const size_t globalThreadIdx =
+      ( size_t( gridID ) * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index segmentIdx = first + globalThreadIdx / 16;
+   if( segmentIdx >= last )
+      return;
+
+
+   Index i;
+   const Index inGroupID = threadIdx.x & 15; // & is cheaper than %
+   const Index maxID = offsets[segmentIdx + 1];
+
+   Real result = zero;
+   for( i = offsets[segmentIdx] + inGroupID; i < maxID; i += 16 )
+      result = reduce( result, fetch( i, true ) );
+
+   /* Parallel reduction */
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
+   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
+
+   /* Write result */
+   if( inGroupID == 0 )
+      keep( segmentIdx, result );
+}
+
+/* Reduces CSR segments [first, last) with one group of 32 threads (a warp) per segment.
+ * Expects a 1D launch whose block size is a multiple of 32 (the lane is taken
+ * from threadIdx.x); gridID numbers the launches of a split grid. */
+template< typename Real, typename Index, typename OffsetsView,
+          typename Fetch, typename Reduce, typename Keep >
+__global__
+void SpMVCSRVector( OffsetsView offsets,
+                    const Index first,
+                    const Index last,
+                    Fetch fetch,
+                    Reduce reduce,
+                    Keep keep,
+                    const Real zero,
+                    const Index gridID )
+{
+   const int warpSize = 32;
+   const size_t globalThreadIdx = // each earlier launch covered getMaxGridXSize() * blockDim.x threads
+      ( size_t( gridID ) * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index warpID = first + globalThreadIdx / warpSize;
+   if( warpID >= last )
+      return;
+   Real result = zero;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   Index endID = offsets[warpID + 1];
+
+   /* Calculate result */
+   for (Index i = offsets[warpID] + laneID; i < endID; i += warpSize)
+      result = reduce( result, fetch( i, true ) );
+
+   /* Reduction */
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  8 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
+   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   /* Write result */
+   if( laneID == 0 )
+      keep( warpID, result );
+}
+#endif
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduce,
+          typename Keep,
+          bool DispatchScalarCSR =
+            details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ||
+            std::is_same< Device, Devices::Host >::value >
+struct CSRLightKernelreduceSegmentsDispatcher;
+// DispatchScalarCSR == true (hasAllParameters() holds or Device is Devices::Host): delegate to CSRScalarKernel.
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper >
+struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true >
+{
+   // threadsPerSegment is part of the common signature of both specializations; it is not used here.
+   template< typename Offsets,
+             typename Real >
+   static void reduce( const Offsets& offsets,
+                       Index first,
+                       Index last,
+                       Fetch& fetch,
+                       const Reduction& reduce,
+                       ResultKeeper& keep,
+                       const Real& zero,
+                       const Index threadsPerSegment )
+   {
+      TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
+         reduceSegments( offsets, first, last, fetch, reduce, keep, zero );
+   }
+};
+
+/* Specialization for DispatchScalarCSR == false: runs the CUDA kernels defined above.
+ *
+ * Segments [first, last) are reduced by groups of 2, 4, 8 or 16 threads. Any other
+ * threadsPerSegment (32, but also an unset 0) selects SpMVCSRVector, which works
+ * with one warp per segment. If one grid cannot hold all the threads, the work is
+ * split into several launches and `grid` tells each kernel which part it handles.
+ * fetch, reduce and keep are passed to the kernels by value; zero is the initial
+ * value of every partial result. Without HAVE_CUDA the function does nothing.
+ */
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduce,
+          typename Keep >
+struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep, false >
+{
+   template< typename OffsetsView,
+             typename Real >
+   static void reduce( const OffsetsView& offsets,
+                       Index first,
+                       Index last,
+                       Fetch& fetch,
+                       const Reduce& reduce,
+                       Keep& keep,
+                       const Real& zero,
+                       const Index threadsPerSegment )
+   {
+#ifdef HAVE_CUDA
+      // Empty or inverted range: nothing to do, and last - first must not go negative below.
+      if( last <= first )
+         return;
+
+      const unsigned int threads = 128;
+      const size_t maxThreadsPerGrid = static_cast< size_t >( TNL::Cuda::getMaxGridXSize() ) * threads;
+      const bool lightKernel = threadsPerSegment == 2 || threadsPerSegment == 4 ||
+                               threadsPerSegment == 8 || threadsPerSegment == 16;
+      // The fallback kernel always uses whole warps, so count 32 threads per segment for it.
+      const size_t groupSize = lightKernel ? static_cast< size_t >( threadsPerSegment ) : 32;
+      // 64-bit counter: groupSize * ( last - first ) does not have to fit into int.
+      size_t neededThreads = groupSize * static_cast< size_t >( last - first );
+
+      /* Execute kernels on device */
+      for( Index grid = 0; neededThreads != 0; ++grid )
+      {
+         // Take as many threads as one grid can hold and leave the rest for the next launch.
+         const size_t launchThreads = neededThreads < maxThreadsPerGrid ? neededThreads : maxThreadsPerGrid;
+         const unsigned int blocks = static_cast< unsigned int >( ( launchThreads + threads - 1 ) / threads );
+         neededThreads -= launchThreads;
+
+         if( threadsPerSegment == 2 )
+            SpMVCSRLightWithoutAtomic2< Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         else if( threadsPerSegment == 4 )
+            SpMVCSRLightWithoutAtomic4< Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         else if( threadsPerSegment == 8 )
+            SpMVCSRLightWithoutAtomic8< Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         else if( threadsPerSegment == 16 )
+            SpMVCSRLightWithoutAtomic16< Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         else
+         {
+            // CSR Light with 32 threads per segment is CSR Vector; the kernel takes exactly six template arguments.
+            SpMVCSRVector< Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         }
+      }
+#endif
+   }
+};
+
+
+/* Chooses how many threads cooperate on one segment from the average segment
+ * size: 2, 4, 8 or 16 when the (rounded-up) average fits, otherwise 32.
+ * With no segments the value is left at 0. */
+template< typename Index,
+          typename Device >
+    template< typename Offsets >
+void
+CSRLightKernel< Index, Device >::
+init( const Offsets& offsets )
+{
+   this->threadsPerSegment = 0;
+   const Index segmentsCount = offsets.getSize() - 1;
+   // No segments: getElement( segmentsCount ) and the division below would be invalid.
+   if( segmentsCount <= 0 )
+      return;
+
+   // The last offset is the end of the last segment, so this is the average
+   // segment size when the offsets start at zero.
+   const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row
+   // Write to the member: a local variable of the same name would shadow it and leave the kernel unconfigured.
+   if( elementsInSegment <= 2 )
+      this->threadsPerSegment = 2;
+   else if( elementsInSegment <= 4 )
+      this->threadsPerSegment = 4;
+   else if( elementsInSegment <= 8 )
+      this->threadsPerSegment = 8;
+   else if( elementsInSegment <= 16 )
+      this->threadsPerSegment = 16;
+   else
+      this->threadsPerSegment = 32; // CSR Vector
+   TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
+   TNL_ASSERT_LE( this->threadsPerSegment, 33, "" );
+}
+// Returns the kernel to its initial state: threadsPerSegment becomes 0.
+template< typename Index,
+          typename Device >
+void
+CSRLightKernel< Index, Device >::
+reset()
+{
+    this->threadsPerSegment = 0;
+}
+// Returns a copy of this object; ViewType is declared as CSRLightKernel< Index, Device > itself.
+template< typename Index,
+          typename Device >
+auto
+CSRLightKernel< Index, Device >::
+getView() -> ViewType
+{
+    return *this;
+}
+// Returns the name identifying this kernel, the string "Light".
+template< typename Index,
+          typename Device >
+TNL::String
+CSRLightKernel< Index, Device >::
+getKernelType()
+{
+    return "Light";
+}
+// Const counterpart of getView(): returns a copy of this object as ConstViewType.
+template< typename Index,
+          typename Device >
+auto
+CSRLightKernel< Index, Device >::
+getConstView() const -> ConstViewType
+{
+    return *this;
+}
+// Reduces segments [first, last) described by offsets. The work is forwarded, together with
+// threadsPerSegment, to CSRLightKernelreduceSegmentsDispatcher, which picks the implementation.
+template< typename Index,
+          typename Device >
+    template< typename OffsetsView,
+              typename Fetch,
+              typename Reduce,
+              typename Keep,
+              typename Real >
+void
+CSRLightKernel< Index, Device >::
+reduceSegments( const OffsetsView& offsets,
+                Index first,
+                Index last,
+                Fetch& fetch,
+                const Reduce& reduce,
+                Keep& keep,
+                const Real& zero ) const
+{
+   TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
+   TNL_ASSERT_LE( this->threadsPerSegment, 33, "" );
+   CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep >::reduce(
+      offsets, first, last, fetch, reduce, keep, zero, this->threadsPerSegment );
+}
+
+      } // namespace Segments
+   }  // namespace Algorithms
+} // namespace TNL
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index fa8876993..2fe0f39ee 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -8,6 +8,7 @@ set( COMMON_TESTS
             SparseMatrixTest_CSRScalar
             SparseMatrixTest_CSRVector
             SparseMatrixTest_CSRHybrid
+            SparseMatrixTest_CSRLight
             SparseMatrixTest_CSRAdaptive
             SparseMatrixTest_Ellpack
             SparseMatrixTest_SlicedEllpack
@@ -16,6 +17,7 @@ set( COMMON_TESTS
             SparseMatrixVectorProductTest_CSRScalar
             SparseMatrixVectorProductTest_CSRVector
             SparseMatrixVectorProductTest_CSRHybrid
+            SparseMatrixVectorProductTest_CSRLight
             SparseMatrixVectorProductTest_CSRAdaptive
             SparseMatrixVectorProductTest_Ellpack
             SparseMatrixVectorProductTest_SlicedEllpack
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp
new file mode 100644
index 000000000..d6a3a41cd
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRLight.cpp -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixTest_CSRLight.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu
new file mode 120000
index 000000000..e40135b9e
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu
@@ -0,0 +1 @@
+SparseMatrixTest_CSRLight.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h
new file mode 100644
index 000000000..ddd956a52
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixTest_CSRLight.h -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRLight_segments";
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixTest.h"
+#include "../main.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cpp b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cpp
new file mode 100644
index 000000000..274fa20b5
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cpp
@@ -0,0 +1,11 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRLight.cpp -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include "SparseMatrixVectorProductTest_CSRLight.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cu b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cu
new file mode 120000
index 000000000..68e56b2ee
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.cu
@@ -0,0 +1 @@
+SparseMatrixVectorProductTest_CSRLight.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.h
new file mode 100644
index 000000000..eef049eac
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_CSRLight.h
@@ -0,0 +1,46 @@
+/***************************************************************************
+                          SparseMatrixVectorProductTest_CSRLight.h -  description
+                             -------------------
+    begin                : Jun 9, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <iostream>
+#include <TNL/Algorithms/Segments/CSR.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRLight_segments";
+
+// types for which MatrixTest is instantiated
+using MatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >
+#endif
+>;
+
+#endif
+
+#include "SparseMatrixVectorProductTest.h"
+#include "../main.h"
-- 
GitLab


From b9603592ffa8588ffdd39ba12009c040f4c6860f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 11 Jun 2021 16:49:22 +0200
Subject: [PATCH 079/117] Fixing Light CSR kernel.

---
 src/Benchmarks/SpMV/spmv.h                     |  9 ++++++++-
 .../Segments/Kernels/CSRLightKernel.hpp        | 18 +++++++++++-------
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 3cd831e6d..5663f01d2 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -76,6 +76,9 @@ using SparseMatrix_CSR_Vector = Matrices::SparseMatrix< Real, Device, Index, Mat
 template< typename Real, typename Device, typename Index >
 using SparseMatrix_CSR_Hybrid = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRHybrid >;
 
+template< typename Real, typename Device, typename Index >
+using SparseMatrix_CSR_Light = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRLight >;
+
 template< typename Real, typename Device, typename Index >
 using SparseMatrix_CSR_Adaptive = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRAdaptive >;
 
@@ -115,6 +118,9 @@ using SymmetricSparseMatrix_CSR_Vector = Matrices::SparseMatrix< Real, Device, I
 template< typename Real, typename Device, typename Index >
 using SymmetricSparseMatrix_CSR_Hybrid = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRHybrid >;
 
+template< typename Real, typename Device, typename Index >
+using SymmetricSparseMatrix_CSR_Light = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRLight >;
+
 template< typename Real, typename Device, typename Index >
 using SymmetricSparseMatrix_CSR_Adaptive = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRAdaptive >;
 
@@ -591,6 +597,7 @@ benchmarkSpmv( BenchmarkType& benchmark,
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar                   >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector                   >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid                   >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Light                    >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive                 >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_Ellpack                      >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
@@ -599,7 +606,7 @@ benchmarkSpmv( BenchmarkType& benchmark,
 #ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
    benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar             >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector             >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid             >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Light              >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive           >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_Ellpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
    benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack          >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
index e31ff3f43..5dc963d2c 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -47,8 +47,9 @@ void SpMVCSRLightWithoutAtomic2( OffsetsView offsets,
    const Index maxID = offsets[ segmentIdx  + 1];
 
    Real result = zero;
+   bool compute = true;
    for( Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 2)
-      result = reduce( result, fetch( i, true ) );
+      result = reduce( result, fetch( i, compute ) );
 
    /* Parallel reduction */
    result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );
@@ -83,8 +84,9 @@ void SpMVCSRLightWithoutAtomic4( OffsetsView offsets,
    const Index maxID = offsets[segmentIdx + 1];
 
    Real result = zero;
+   bool compute = true;
    for (Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 4)
-      result = reduce( result, fetch( i, true ) );
+      result = reduce( result, fetch( i, compute ) );
 
    /* Parallel reduction */
    result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) );
@@ -122,8 +124,9 @@ void SpMVCSRLightWithoutAtomic8( OffsetsView offsets,
    const Index maxID = offsets[segmentIdx + 1];
 
    Real result = zero;
+   bool compute = true;
    for (i = offsets[segmentIdx] + inGroupID; i < maxID; i += 8)
-      result = reduce( result, fetch( i, true ) );
+      result = reduce( result, fetch( i, compute ) );
 
    /* Parallel reduction */
    result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) );
@@ -156,14 +159,14 @@ void SpMVCSRLightWithoutAtomic16( OffsetsView offsets,
    if( segmentIdx >= last )
       return;
 
-
    Index i;
    const Index inGroupID = threadIdx.x & 15; // & is cheaper than %
    const Index maxID = offsets[segmentIdx + 1];
 
    Real result = zero;
+   bool compute = true;
    for( i = offsets[segmentIdx] + inGroupID; i < maxID; i += 16 )
-      result = reduce( result, fetch( i, true ) );
+      result = reduce( result, fetch( i, compute ) );
 
    /* Parallel reduction */
    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
@@ -202,8 +205,9 @@ void SpMVCSRVector( OffsetsView offsets,
    Index endID = offsets[warpID + 1];
 
    /* Calculate result */
+   bool compute = true;
    for (Index i = offsets[warpID] + laneID; i < endID; i += warpSize)
-      result = reduce( result, fetch( i, true ) );
+      result = reduce( result, fetch( i, compute ) );
 
    /* Reduction */
    result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) );
@@ -304,7 +308,7 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee
                offsets, first, last, fetch, reduce, keep, zero, grid );
          else // if (threadsPerSegment == 32)
          { // CSR SpMV Light with threadsPerSegment = 32 is CSR Vector
-            SpMVCSRVector<Real, Index, OffsetsView, Fetch, Reduce, Keep, warpSize ><<<blocks, threads>>>(
+            SpMVCSRVector<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
                offsets, first, last, fetch, reduce, keep, zero, grid );
          }
          /*else
-- 
GitLab


From 5ac2ba2452580e5189476f3a6ff43561502bf002 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 14 Jun 2021 15:25:55 +0200
Subject: [PATCH 080/117] Fixing CSR Light kernel.

---
 src/Benchmarks/scripts/run-tnl-benchmark-spmv |  2 +-
 .../tnl-spmv-benchmark-make-tables-json.py    |  2 +-
 .../Segments/Kernels/CSRLightKernel.hpp       | 25 ++++++-------------
 3 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
index 0c75634e4..fe511db11 100755
--- a/src/Benchmarks/scripts/run-tnl-benchmark-spmv
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
@@ -13,7 +13,7 @@ BENCHMARK_DBG="tnl-benchmark-spmv-dbg --with-legacy-matrices no"
 export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf"
 PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl"
 source matrix-market
-M_MATRICES=""
+#MM_MATRICES=""
 source florida-matrix-market
 #FLORIDA_MM_MATRICES=""
 
diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
index fc156b4b6..b0af4ccc6 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
@@ -254,7 +254,7 @@ formats = get_formats( input_df )
 multicolumns, df_data = get_multiindex( input_df, formats )
 
 print( "Converting data..." )
-result = convert_data_frame( input_df, multicolumns, df_data, 200 )
+result = convert_data_frame( input_df, multicolumns, df_data, 20000 )
 compute_speedup( result, formats )
 
 print( "Writting to HTML file..." )
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
index 5dc963d2c..2148e405a 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -275,12 +275,8 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee
 #ifdef HAVE_CUDA
       const int threads = 128;
       Index blocks, groupSize;
-      //if (KernelType == CSRLightWithoutAtomic)
       int  neededThreads = threadsPerSegment * ( last - first );
-      //else
-      //   neededThreads = rows * (threadsPerSegment > 32 ? 32 : threadsPerSegment);
 
-      /* Execute kernels on device */
       for (Index grid = 0; neededThreads != 0; ++grid)
       {
          if( TNL::Cuda::getMaxGridXSize() * threads >= neededThreads)
@@ -334,30 +330,25 @@ void
 CSRLightKernel< Index, Device >::
 init( const Offsets& offsets )
 {
-   //const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount );
-   //this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ) ); //TNL::Cuda::getWarpSize() );
-
    const Index segmentsCount = offsets.getSize() - 1;
-   //const Index threads = 128; // !!!!!!!!!!!!!!!!!!!!!! block size
    size_t neededThreads = segmentsCount * 32;//warpSize;
-   Index blocks, threadsPerSegment;
 
    const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row
    if( elementsInSegment <= 2 )
-      threadsPerSegment = 2;
+      this->threadsPerSegment = 2;
    else if( elementsInSegment <= 4 )
-      threadsPerSegment = 4;
+      this->threadsPerSegment = 4;
    else if( elementsInSegment <= 8 )
-      threadsPerSegment = 8;
+      this->threadsPerSegment = 8;
    else if( elementsInSegment <= 16 )
-      threadsPerSegment = 16;
+      this->threadsPerSegment = 16;
    else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
-      threadsPerSegment = 32; // CSR Vector
+      this->threadsPerSegment = 32; // CSR Vector
    //else
    //   threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector
 
-   TNL_ASSERT_GE( threadsPerSegment, 0, "" );
-   TNL_ASSERT_LE( threadsPerSegment, 33, "" );
+   TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
+   TNL_ASSERT_LE( this->threadsPerSegment, 33, "" );
 
 }
 
@@ -367,7 +358,7 @@ void
 CSRLightKernel< Index, Device >::
 reset()
 {
-    this->threadsPerSegment = 0;
+   this->threadsPerSegment = 0;
 }
 
 template< typename Index,
-- 
GitLab


From 1e800ca30a63de5a863b8b58867a4eea1ef8ac5a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 1 Jul 2021 17:24:44 +0200
Subject: [PATCH 081/117] Small fixes in segments and sparse matrix.

---
 src/TNL/Algorithms/Segments/CSR.h     | 2 +-
 src/TNL/Matrices/SparseMatrix.h       | 2 +-
 src/TNL/Matrices/SparseMatrixView.hpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index 8fba00b2a..aa3f16d6b 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -306,7 +306,7 @@ class CSR
        * Declaration of the lambda function \e function is supposed to be
        *
        * ```
-       * auto f = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx ) {...} 
+       * auto f = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx ) {...}
        * ```
        * where \e segmentIdx is index of segment where given element belong to, \e localIdx is rank of the element
        * within the segment and \e globalIdx is index of the element within the related container.
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 2deccd645..d64204381 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -797,7 +797,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * The lambda function `function` should be declared like follows:
        *
        * ```
-       * auto function = [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value ) mutable { ... }
+       * auto function = [] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType& columnIdx, const RealType& value ) mutable { ... }
        * ```
        *
        *  The \e localIdx parameter is a rank of the non-zero element in given row.
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 2d9a06a8c..e32236b3c 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -417,7 +417,7 @@ vectorProduct( const InVector& inVector,
    const auto valuesView = this->values.getConstView();
    const auto columnIndexesView = this->columnIndexes.getConstView();
    const IndexType paddingIndex = this->getPaddingIndex();
-   if( isSymmetric() )
+   if( isSymmetric() && outVectorMultiplicator != 1.0 )
       outVector *= outVectorMultiplicator;
    auto symmetricFetch = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> ComputeRealType {
       const IndexType column = columnIndexesView[ globalIdx ];
-- 
GitLab


From 9e6566fe749dfbfbc6e62875a10e879b791c30f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 9 Jul 2021 10:56:32 +0200
Subject: [PATCH 082/117] Fixed calling of refactored reduction.

---
 src/TNL/Matrices/DenseMatrixView.hpp                 | 2 +-
 src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 4faa25aa0..9168e1c47 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -849,7 +849,7 @@ operator==( const Matrix& m ) const
    {
       return view1.getRow( i ) == view2.getRow( i );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->getRows(), fetch, std::logical_and<>{}, true );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->getRows(), fetch, std::logical_and<>{}, true );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
index 900e3bf15..421b5c129 100644
--- a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
@@ -171,7 +171,7 @@ getNonzeroElementsCount() const
       auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
          return ( columns_view[ i ] != paddingIndex );
       };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->columnIndexes.getSize(), fetch, std::plus<>{}, 0 );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->columnIndexes.getSize(), fetch, std::plus<>{}, 0 );
    }
    else
    {
@@ -901,7 +901,7 @@ operator==( const Matrix& m ) const
    {
       return view1.getRow( i ) == view2.getRow( i );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( 0, this->getRows(), fetch, std::logical_and<>{}, true );
+   return Algorithms::reduce< DeviceType >( 0, this->getRows(), fetch, std::logical_and<>{}, true );
 }
 
 template< typename Real,
-- 
GitLab


From 650b12ed929ea3b2eed5097c96c755c91f191b95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 9 Jul 2021 10:57:54 +0200
Subject: [PATCH 083/117] Added CUDA kernel for RowMajor ordering of Ellpack.

---
 src/TNL/Algorithms/Segments/EllpackView.h     |   2 +
 src/TNL/Algorithms/Segments/EllpackView.hpp   | 147 ++++++++++++++++--
 .../Matrices/SparseMatrixTest_Ellpack.h       |  10 +-
 .../SparseMatrixVectorProductTest_Ellpack.h   |  10 +-
 4 files changed, 155 insertions(+), 14 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h
index b8066f635..1a14db338 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.h
+++ b/src/TNL/Algorithms/Segments/EllpackView.h
@@ -22,6 +22,8 @@ namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
+enum EllpackKernelType { Scalar, Vector, Vector2, Vector4, Vector8, Vector16 };
+
 template< typename Device,
           typename Index,
           ElementsOrganization Organization = Segments::DefaultElementsOrganization< Device >::getOrganization(),
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index 7abf2caed..6f49c55ee 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -19,6 +19,124 @@ namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
+#ifdef HAVE_CUDA
+// One warp reduces one segment of a row-major Ellpack layout: lane l fetches the elements of rank l, l + 32, ...
+// and the per-lane partial results are combined by warp shuffles; lane 0 stores the final value via keep.
+// fetch is called as fetch( segmentIdx, localIdx, globalIdx, compute ). The code relies on blockDim.x being a
+// multiple of 32 so that all lanes of a warp share segmentIdx and exit together before the full-mask shuffles.
+template< typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real >
+__global__ void
+EllpackCudaReductionKernelFull( Index first, Index last, Fetch fetch, const Reduction reduction, ResultKeeper keep, const Real zero, Index segmentSize )
+{
+   const int warpSize = 32;
+   const int gridID = 0;
+   const Index segmentIdx = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize;
+   if (segmentIdx >= last)
+      return;
+
+   Real result = zero;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   const Index begin = segmentIdx * segmentSize;
+   const Index end = begin + segmentSize;
+
+   /* Calculate result: localIdx is the rank of element i within the segment, i.e. i - begin, so it moves with i */
+   Index localIdx( laneID );
+   bool compute( true );
+   for( Index i = begin + laneID; i < end; i += warpSize, localIdx += warpSize )
+      result = reduction( result, fetch( segmentIdx, localIdx, i, compute ) );
+
+   /* Reduction */
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) );
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result,  8 ) );
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   /* Write result */
+   if( laneID == 0 )
+      keep( segmentIdx, result );
+}
+// Warp-per-segment reduction kernel for fetch functors that are called as fetch( globalIdx, compute ).
+template< typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real >
+__global__ void
+EllpackCudaReductionKernelCompact( Index first, Index last, Fetch fetch, const Reduction reduction, ResultKeeper keep, const Real zero, Index segmentSize )
+{
+   const int warpSize = 32;
+   const int gridID = 0; // constant zero, so the gridID term below contributes nothing
+   const Index segmentIdx = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; // 32 consecutive threads map to one segment
+   if (segmentIdx >= last)
+      return;
+
+   Real result = zero;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   const Index begin = segmentIdx * segmentSize;
+   const Index end = begin + segmentSize;
+
+   /* Each lane accumulates the elements begin + laneID, begin + laneID + 32, ... of its segment */
+   bool compute( true ); // handed to fetch; its value is not read back in this kernel
+   for( Index i = begin + laneID; i < end; i += warpSize)
+      result = reduction( result, fetch( i, compute ) );
+
+   /* Combine the 32 per-lane partial results with shuffles; after the last step lane 0 holds the full reduction */
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) );
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result,  8 ) );
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
+   result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   /* Only lane 0 stores the result of the segment */
+   if( laneID == 0 )
+      keep( segmentIdx, result );
+
+}
+#endif
+// Host-side launcher used when FullFetch is true: runs EllpackCudaReductionKernelFull with 32 threads per segment.
+template< typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real,
+          bool FullFetch = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+struct EllpackCudaReductionDispatcher
+{
+   static void
+   exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize )
+   {
+   #ifdef HAVE_CUDA
+      const Index segmentsCount = last - first;
+      const Index threadsCount = segmentsCount * 32; // one warp (32 threads) per segment
+      const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 );
+      dim3 blockSize( 256 );
+      dim3 gridSize( blocksCount ); // NOTE(review): not clamped to the maximal grid size - confirm for very large segment counts
+      EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
+      cudaDeviceSynchronize(); // waits for the kernel to finish; NOTE(review): the returned status is ignored
+   #endif
+   }
+};
+// Specialization for FullFetch == false: runs EllpackCudaReductionKernelCompact with 32 threads per segment.
+template< typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real >
+struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Real, false >
+{
+   static void
+   exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize )
+   {
+   #ifdef HAVE_CUDA
+      const Index segmentsCount = last - first;
+      const Index threadsCount = segmentsCount * 32; // one warp (32 threads) per segment
+      const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 );
+      dim3 blockSize( 256 );
+      dim3 gridSize( blocksCount ); // NOTE(review): not clamped to the maximal grid size - confirm for very large segment counts
+      EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
+      cudaDeviceSynchronize(); // waits for the kernel to finish; NOTE(review): the returned status is ignored
+   #endif
+   }
+};
 
 template< typename Device,
           typename Index,
@@ -277,18 +395,23 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
    using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( Organization == RowMajorOrder )
    {
-      const IndexType segmentSize = this->segmentSize;
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable {
-         const IndexType begin = segmentIdx * segmentSize;
-         const IndexType end = begin + segmentSize;
-         RealType aux( zero );
-         IndexType localIdx( 0 );
-         bool compute( true );
-         for( IndexType j = begin; j < end && compute; j++  )
-            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
-         keeper( segmentIdx, aux );
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l );
+      if( std::is_same< Device, Devices::Cuda >::value )
+         EllpackCudaReductionDispatcher< IndexType, Fetch, Reduction, ResultKeeper, Real>::exec( first, last, fetch, reduction, keeper, zero, segmentSize );
+      else
+      {
+         const IndexType segmentSize = this->segmentSize;
+         auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable {
+            const IndexType begin = segmentIdx * segmentSize;
+            const IndexType end = begin + segmentSize;
+            RealType aux( zero );
+            IndexType localIdx( 0 );
+            bool compute( true );
+            for( IndexType j = begin; j < end && compute; j++  )
+               aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
+            keeper( segmentIdx, aux );
+         };
+         Algorithms::ParallelFor< Device >::exec( first, last, l );
+      }
    }
    else
    {
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h
index ef56ec63a..b13a19c6a 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h
@@ -46,7 +46,15 @@ using MatrixTypes = ::testing::Types
     TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
     TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
     TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >
 #endif
 >;
 
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h
index abb4213ca..c93aace75 100644
--- a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h
@@ -46,7 +46,15 @@ using MatrixTypes = ::testing::Types
     TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
     TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
     TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >
 #endif
 >;
 
-- 
GitLab


From f2e53c3982601658f9ddf8ce397b2fca9fdb746c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 9 Jul 2021 10:58:55 +0200
Subject: [PATCH 084/117] Added BLAS benchmark for dense matrix-vector
 multiplication.

---
 src/Benchmarks/BLAS/cublasWrappers.h     |  25 ++++
 src/Benchmarks/BLAS/dense-mv.h           | 144 +++++++++++++++++++++++
 src/Benchmarks/BLAS/tnl-benchmark-blas.h |  14 +++
 3 files changed, 183 insertions(+)
 create mode 100644 src/Benchmarks/BLAS/dense-mv.h

diff --git a/src/Benchmarks/BLAS/cublasWrappers.h b/src/Benchmarks/BLAS/cublasWrappers.h
index 1e63e139d..f0d8952e6 100644
--- a/src/Benchmarks/BLAS/cublasWrappers.h
+++ b/src/Benchmarks/BLAS/cublasWrappers.h
@@ -118,4 +118,29 @@ cublasGscal( cublasHandle_t handle, int n,
    return cublasDscal( handle, n, alpha, x, incx );
 }
 
+// Single-precision overload of the type-generic gemv wrapper; forwards all arguments unchanged to cublasSgemv.
+inline cublasStatus_t
+cublasGemv( cublasHandle_t handle, cublasOperation_t trans,
+            int m, int n,
+            const float           *alpha,
+            const float           *A, int lda,
+            const float           *x, int incx,
+            const float           *beta,
+            float           *y, int incy )
+{
+   return cublasSgemv( handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy );
+}
+// Double-precision overload of the type-generic gemv wrapper; forwards all arguments unchanged to cublasDgemv.
+inline cublasStatus_t
+cublasGemv( cublasHandle_t handle, cublasOperation_t trans,
+            int m, int n,
+            const double          *alpha,
+            const double          *A, int lda,
+            const double          *x, int incx,
+            const double          *beta,
+            double          *y, int incy )
+{
+   return cublasDgemv( handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy );
+}
+
 #endif
diff --git a/src/Benchmarks/BLAS/dense-mv.h b/src/Benchmarks/BLAS/dense-mv.h
new file mode 100644
index 000000000..99978c55b
--- /dev/null
+++ b/src/Benchmarks/BLAS/dense-mv.h
@@ -0,0 +1,144 @@
+/***************************************************************************
+                          dense-mv.h  -  description
+                             -------------------
+    begin                : Jul 8, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include "../Benchmarks.h"
+#include "cublasWrappers.h"
+
+#include <TNL/Containers/Vector.h>
+#include <TNL/Pointers/DevicePointer.h>
+#include <TNL/Matrices/DenseMatrix.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Devices/Host.h>
+
+namespace TNL {
+namespace Benchmarks {
+// Writes 1.0 into every element that matrix.forAllElements visits.
+template< typename Matrix >
+void setMatrix( Matrix& matrix )
+{
+   using RealType = typename Matrix::RealType;
+   using IndexType = typename Matrix::IndexType;
+   matrix.forAllElements( [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) {
+       value = 1.0; } ); // the three index arguments are unused; only the value reference is written
+}
+
+// Times y = A * x for a size x size dense matrix of ones: host, CUDA column-major, CUDA row-major and cuBLAS gemv.
+template< typename Real >
+void
+benchmarkDenseMVSynthetic( Benchmark<> & benchmark, const int & size )
+{
+   using HostMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Host >;
+   using RowMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda, int, TNL::Algorithms::Segments::RowMajorOrder >;
+   using ColumnMajorCudaMatrix = TNL::Matrices::DenseMatrix< Real, TNL::Devices::Cuda >;
+   using HostVector = Containers::Vector< Real, Devices::Host, int >;
+   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
+
+   HostMatrix hostMatrix;
+   RowMajorCudaMatrix rowMajorCudaMatrix;
+   ColumnMajorCudaMatrix columnMajorCudaMatrix;
+   HostVector inHostVector, outHostVector;
+   CudaVector inCudaVector, outCudaVector1, outCudaVector2;
+
+   // create benchmark group
+   const std::vector< String > parsedType = parseObjectType( getType< HostMatrix >() );
+#ifdef HAVE_CUDA
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 2 ); // NOTE(review): four timings (CPU, GPU col, GPU row, GPU cublas) follow - confirm the width 2
+#else
+   benchmark.createHorizontalGroup( parsedType[ 0 ], 1 );
+#endif
+
+   hostMatrix.setDimensions( size, size );
+   inHostVector.setSize( size );
+   outHostVector.setSize( size );
+
+   setMatrix< HostMatrix >( hostMatrix );
+   const double datasetSize = (double) ( size * size ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; // NOTE(review): confirm the sizeof( int ) term for a dense matrix
+
+   // reset function: restores the input vectors before each measurement (the CUDA outputs are left as they are)
+   auto reset = [&]() {
+      inHostVector = 1.0;
+      outHostVector = 0.0;
+#ifdef HAVE_CUDA
+      inCudaVector = 1.0;
+      //outCudaVector1 = 0.0;
+      //outCudaVector2 = 0.0;
+#endif
+   };
+
+   // compute functions
+   auto spmvHost = [&]() {
+      hostMatrix.vectorProduct( inHostVector, outHostVector );
+   };
+   benchmark.setOperation( datasetSize );
+   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
+
+#ifdef HAVE_CUDA
+   columnMajorCudaMatrix.setDimensions( size, size );
+   inCudaVector.setSize( size );
+   outCudaVector1.setSize( size );
+   outCudaVector2.setSize( size );
+   setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );
+
+   auto columnMajorMvCuda = [&]() {
+      columnMajorCudaMatrix.vectorProduct( inCudaVector, outCudaVector1 );
+   };
+   benchmark.time< Devices::Cuda >( reset, "GPU col", columnMajorMvCuda );
+
+   columnMajorCudaMatrix.reset(); // reset before the row-major matrix is allocated; presumably frees device memory - confirm
+
+   rowMajorCudaMatrix.setDimensions( size, size );
+   setMatrix< RowMajorCudaMatrix >( rowMajorCudaMatrix );
+
+   auto rowMajorMvCuda = [&]() {
+      rowMajorCudaMatrix.vectorProduct( inCudaVector, outCudaVector2 );
+   };
+   benchmark.time< Devices::Cuda >( reset, "GPU row", rowMajorMvCuda );
+
+   //std::cerr << "Diff. = " << TNL::max( abs( outCudaVector2 - outCudaVector1 ) ) << std::endl;
+
+   rowMajorCudaMatrix.reset();
+   columnMajorCudaMatrix.setDimensions( size, size );
+   setMatrix< ColumnMajorCudaMatrix >( columnMajorCudaMatrix );
+
+   cublasHandle_t cublasHandle;
+   cublasCreate( &cublasHandle );
+   auto mvCublas = [&] () {
+      Real alpha = 1.0;
+      Real beta = 0.0;
+      cublasGemv( cublasHandle, CUBLAS_OP_N, size, size, &alpha,
+                  columnMajorCudaMatrix.getValues().getData(), size,
+                  inCudaVector.getData(), 1, &beta,
+                  outCudaVector1.getData(), 1 );
+   };
+   benchmark.time< Devices::Cuda >( reset, "GPU cublas", mvCublas );
+   cublasDestroy( cublasHandle ); // release the handle; this function runs once per benchmarked size
+#endif
+}
+
+/*template< typename Real = double,
+          typename Index = int >
+void
+benchmarkDenseSynthetic( Benchmark<> & benchmark,
+                         const int & size )
+{
+   // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
+   // NOTE: CSR is disabled because it is very slow on GPU
+   //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, size, elementsPerRow );
+   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::Ellpack >( benchmark, size, elementsPerRow );
+   benchmarkSpMV< Real, SlicedEllpack >( benchmark, size, elementsPerRow );
+   benchmarkSpMV< Real, Benchmarks::SpMV::ReferenceFormats::Legacy::ChunkedEllpack >( benchmark, size, elementsPerRow );
+}*/
+
+} // namespace Benchmarks
+} // namespace TNL
diff --git a/src/Benchmarks/BLAS/tnl-benchmark-blas.h b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
index bc58aeb2c..9b061adf6 100644
--- a/src/Benchmarks/BLAS/tnl-benchmark-blas.h
+++ b/src/Benchmarks/BLAS/tnl-benchmark-blas.h
@@ -22,6 +22,8 @@
 #include "vector-operations.h"
 #include "triad.h"
 #include "spmv.h"
+#include "dense-mv.h"
+
 
 using namespace TNL;
 using namespace TNL::Benchmarks;
@@ -100,6 +102,18 @@ runBlasBenchmarks( Benchmark<> & benchmark,
       } ));
       benchmarkSpmvSynthetic< Real >( benchmark, size, elementsPerRow );
    }
+
+   // Dense matrix-vector multiplication
+   benchmark.newBenchmark( String("Dense matrix-vector multiplication (") + precision + ")",
+                           metadata );
+   for( std::size_t size = 10; size <= 20000; size *= 2 ) {
+      benchmark.setMetadataColumns( Benchmark<>::MetadataColumns({
+         { "rows", convertToString( size ) },
+         { "columns", convertToString( size ) }
+      } ));
+      benchmarkDenseMVSynthetic< Real >( benchmark, size );
+   }
+
 }
 
 void
-- 
GitLab


From b7ed8cf1cc0587f3df175d44f27d197b5d9eb666 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 9 Jul 2021 11:48:18 +0200
Subject: [PATCH 085/117] Fixing comparison of different integer types.

---
 src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
index 2148e405a..662a896c6 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -273,9 +273,9 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee
                        const Index threadsPerSegment )
    {
 #ifdef HAVE_CUDA
-      const int threads = 128;
+      const size_t threads = 128;
       Index blocks, groupSize;
-      int  neededThreads = threadsPerSegment * ( last - first );
+      size_t  neededThreads = threadsPerSegment * ( last - first );
 
       for (Index grid = 0; neededThreads != 0; ++grid)
       {
-- 
GitLab


From b002d9451f120fa7a44fd943b93878197c65a255 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 9 Jul 2021 12:26:03 +0200
Subject: [PATCH 086/117] Commenting unused variable in CSR Light kernel.

---
 src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
index 662a896c6..7fddef69d 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -331,7 +331,7 @@ CSRLightKernel< Index, Device >::
 init( const Offsets& offsets )
 {
    const Index segmentsCount = offsets.getSize() - 1;
-   size_t neededThreads = segmentsCount * 32;//warpSize;
+   //size_t neededThreads = segmentsCount * 32;//warpSize;
 
    const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row
    if( elementsInSegment <= 2 )
-- 
GitLab


From 1f968d3a08f4c2ae25d3de1a5816533ec8b14981 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 21 Jul 2021 10:20:08 +0200
Subject: [PATCH 087/117] Implementing CUDA kernel for dense matrix-vector
 multiplication.

---
 src/TNL/Matrices/DenseMatrixView.hpp | 75 ++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 9168e1c47..78101062a 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -19,6 +19,54 @@
 namespace TNL {
 namespace Matrices {
 
+#ifdef HAVE_CUDA
+template< typename Matrix, typename InVector, typename OutVector >
+__global__ void
+DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
+{
+   using Real = typename Matrix::RealType;
+   using Index = typename Matrix::IndexType;
+   constexpr int  inVectorCacheSize = 20480 / sizeof( Real );
+   __shared__ Real inVectorCache[ inVectorCacheSize ];
+   // One thread per matrix row of [begin, end); the index math assumes blocks of 256 threads.
+   const int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x + begin;
+
+   Real result( 0.0 );
+   Index columnIdx( 0 );
+   const auto& values = matrix.getValues();
+   const auto& rowsCount = matrix.getRows();
+   Index valuesPtr = rowIdx;
+   // inVector is consumed in tiles of inVectorCacheSize entries staged in shared memory by the whole block.
+   while( columnIdx < matrix.getColumns() )
+   {
+      const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
+      Index matrixColIdx = columnIdx + threadIdx.x;
+      Index cacheColIdx = threadIdx.x;
+      while( matrixColIdx < lastIdx )
+      {
+         inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ];
+         cacheColIdx += 256;
+         matrixColIdx += 256;
+      }
+      __syncthreads(); // the tile must be complete before any thread reads it
+      matrixColIdx = columnIdx;
+      cacheColIdx = 0;
+      if( rowIdx < end )
+         while( matrixColIdx < lastIdx )
+         {
+            result += values[ valuesPtr ] * inVectorCache[ cacheColIdx ];
+            cacheColIdx++;
+            matrixColIdx++;
+            valuesPtr += rowsCount; // next column of the same row
+         }
+      columnIdx = lastIdx;
+      __syncthreads(); // every thread must be done reading this tile before the next iteration overwrites it
+   }
+   if( rowIdx < end )
+      outVector[ rowIdx ] = result;
+}
+#endif
+
 template< typename Real,
           typename Device,
           typename Index,
@@ -535,6 +583,33 @@ vectorProduct( const InVector& inVector,
    const auto valuesView = this->values.getConstView();
    if( end == 0 )
       end = this->getRows();
+
+   if( std::is_same< DeviceType, Devices::Cuda >::value )
+   {
+#ifdef HAVE_CUDA
+      if( Organization == Algorithms::Segments::ColumnMajorOrder )
+      {
+         const size_t threadsCount = end - begin;
+         const size_t blocksCount = roundUpDivision( threadsCount, 256 );
+         const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() );
+         const size_t sharedMemSize = 20480;
+         for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
+         {
+            dim3 blocks( Cuda::getMaxGridSize() );
+            if( gridIdx == gridsCount - 1 )
+               blocks = blocksCount % Cuda::getMaxGridSize();
+            DenseMatrixViewVectorMultiplicationKernel<<< blocks, 256, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
+         }
+         TNL_CHECK_CUDA_DEVICE;
+         return;
+      }
+#endif
+   }
+
+   /***
+    * The rest is general implementation based on segments
+    */
+
    auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType column, IndexType offset, bool& compute ) -> RealType {
       return valuesView[ offset ] * inVectorView[ column ];
    };
-- 
GitLab


From 2c5667275db0c3b56eb2e126d18fb559d534bb87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 21 Jul 2021 20:42:25 +0200
Subject: [PATCH 088/117] Implemented kernel for row major ordered dense matrix
 vector multiplication on CUDA GPU.

---
 src/TNL/Matrices/DenseMatrixView.hpp | 169 ++++++++++++++++++++++++++-
 1 file changed, 165 insertions(+), 4 deletions(-)

diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 78101062a..418baeee3 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -20,9 +20,77 @@ namespace TNL {
 namespace Matrices {
 
 #ifdef HAVE_CUDA
+/**
+ * Column-major dense matrix-vector product that maps ThreadsPerRow CUDA threads of a BlockSize-thread block to one matrix row.
+ */
+template< int BlockSize, int ThreadsPerRow, typename Matrix, typename InVector, typename OutVector >
+__global__ void
+VectorColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
+{
+   using Real = typename Matrix::RealType;
+   using Index = typename Matrix::IndexType;
+   static_assert( ThreadsPerRow > 0 && ( ThreadsPerRow & ( ThreadsPerRow - 1 ) ) == 0, "ThreadsPerRow must be a power of two" );
+   static_assert( BlockSize % ThreadsPerRow == 0, "BlockSize must be divisible by ThreadsPerRow" );
+   constexpr int  inVectorCacheSize = 20480 / sizeof( Real );
+   __shared__ Real inVectorCache[ inVectorCacheSize ];
+   __shared__ Real result_[ BlockSize ];
+
+   // Thread t works on local row t % rowsPerBlock and starts at local column t / rowsPerBlock,
+   // so the ThreadsPerRow threads of one row get distinct column offsets 0 .. ThreadsPerRow - 1.
+   constexpr Index rowsPerBlock = BlockSize / ThreadsPerRow;
+   const Index localColIdx = threadIdx.x / rowsPerBlock;
+   const Index localRowIdx = threadIdx.x % rowsPerBlock;
+   const Index rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * rowsPerBlock + localRowIdx + begin;
+
+   Real result( 0.0 );
+   Index columnIdx( 0 );
+   const auto& values = matrix.getValues();
+   const auto& rowsCount = matrix.getRows();
+   Index valuesPtr = rowIdx + localColIdx * rowsCount;
+
+   while( columnIdx < matrix.getColumns() )
+   {
+      // Stage the next tile of inVector in shared memory; all threads of the block cooperate.
+      const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
+      Index matrixColIdx = columnIdx + threadIdx.x;
+      Index cacheColIdx = threadIdx.x;
+      while( matrixColIdx < lastIdx )
+      {
+         inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ];
+         cacheColIdx += BlockSize;
+         matrixColIdx += BlockSize;
+      }
+      __syncthreads(); // the tile must be complete before any thread reads it
+      matrixColIdx = columnIdx + localColIdx;
+      cacheColIdx = localColIdx;
+      if( rowIdx < end )
+         while( matrixColIdx < lastIdx )
+         {
+            result += values[ valuesPtr ] * inVectorCache[ cacheColIdx ];
+            cacheColIdx += ThreadsPerRow;
+            matrixColIdx += ThreadsPerRow;
+            valuesPtr += ThreadsPerRow * rowsCount;
+         }
+      columnIdx = lastIdx;
+      __syncthreads(); // every thread must be done reading this tile before the next iteration overwrites it
+   }
+   // Tree reduction of the ThreadsPerRow partial sums of each row. Partner slots are rowsPerBlock entries
+   // apart and can belong to different warps, hence block-wide barriers instead of __syncwarp().
+   result_[ threadIdx.x ] = result;
+   for( int stride = ThreadsPerRow / 2; stride > 0; stride /= 2 )
+   {
+      __syncthreads();
+      if( localColIdx < stride )
+         result_[ threadIdx.x ] += result_[ threadIdx.x + stride * rowsPerBlock ];
+   }
+
+   if( rowIdx < end && localColIdx == 0 )
+      outVector[ rowIdx ] = result_[ threadIdx.x ];
+}
+
 template< typename Matrix, typename InVector, typename OutVector >
 __global__ void
-DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
+ColumnMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int begin, const int end, int gridIdx )
 {
    using Real = typename Matrix::RealType;
    using Index = typename Matrix::IndexType;
@@ -65,6 +133,78 @@ DenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector i
    if( rowIdx < end )
       outVector[ rowIdx ] = result;
 }
+
+template< typename Matrix, typename InVector, typename OutVector >
+__global__ void
+RowMajorDenseMatrixViewVectorMultiplicationKernel( const Matrix matrix, const InVector inVector, OutVector outVector, const int first, const int last, int gridIdx )
+{
+   using Real = typename Matrix::RealType;
+   using Index = typename Matrix::IndexType;
+   constexpr int  inVectorCacheSize = 20480 / sizeof( Real ); // only referenced by the disabled tiled variant below
+   __shared__ Real inVectorCache[ inVectorCacheSize ];
+
+   constexpr int threadsPerRow = 32;
+   // One warp (32 lanes) computes one row of [first, last); the index math assumes blocks of 256 threads.
+   const Index rowIdx = first + ( ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * 256 + threadIdx.x ) /  threadsPerRow;
+
+   Real result = 0.0;
+   const Index laneID = threadIdx.x & 31; // & is cheaper than %
+   const Real* values = matrix.getValues().getData();
+
+   Index columnIdx( laneID ); // lane k handles columns k, k + 32, k + 64, ... so it must start at its own column
+   /*while( columnIdx < matrix.getColumns() )
+   {
+      const Index lastIdx = min( matrix.getColumns(), columnIdx + inVectorCacheSize );
+      Index matrixColIdx = columnIdx + threadIdx.x;
+      Index cacheColIdx = threadIdx.x;
+      while( matrixColIdx < lastIdx )
+      {
+         inVectorCache[ cacheColIdx ] = inVector[ matrixColIdx ];
+         cacheColIdx += 256;
+         matrixColIdx += 256;
+      }
+      __syncthreads();
+
+      // Calculate result
+      if( rowIdx < last )
+      {
+         const Index begin = rowIdx * matrix.getColumns() + columnIdx;
+         const Index end = rowIdx * matrix.getColumns() + lastIdx;
+         Index localColumn( 0 );
+
+         for( Index i = begin + laneID; i < end; i += threadsPerRow, localColumn += threadsPerRow )
+            result += values[ i ] * inVectorCache[ localColumn ];
+      }
+      columnIdx = lastIdx;
+   }*/
+
+   if( rowIdx < last )
+   {
+      const Index begin = rowIdx * matrix.getColumns(); // offset of the first element of this row in values
+      const Index end = begin + matrix.getColumns();
+
+      for( Index i = begin + laneID; i < end; i += threadsPerRow, columnIdx += threadsPerRow )
+         result += values[ i ] * inVector[ columnIdx ]; // columnIdx == i - begin
+   }
+
+   if( rowIdx < last )
+   {
+      // Warp-level tree reduction; all 32 lanes of a warp share rowIdx, so the full mask is valid inside this branch
+      if( threadsPerRow > 16 )
+         result += __shfl_down_sync(0xFFFFFFFF, result, 16 );
+      if( threadsPerRow > 8 )
+         result += __shfl_down_sync(0xFFFFFFFF, result,  8 );
+      if( threadsPerRow > 4 )
+         result += __shfl_down_sync(0xFFFFFFFF, result,  4 );
+      if( threadsPerRow > 2 )
+         result += __shfl_down_sync(0xFFFFFFFF, result,  2 );
+      if( threadsPerRow > 1 )
+         result += __shfl_down_sync(0xFFFFFFFF, result,  1 );
+      // Lane 0 holds the complete row sum after the reduction and writes it out
+      if( laneID == 0 )
+         outVector[ rowIdx ] = result;
+   }
+}
 #endif
 
 template< typename Real,
@@ -589,8 +729,10 @@ vectorProduct( const InVector& inVector,
 #ifdef HAVE_CUDA
       if( Organization == Algorithms::Segments::ColumnMajorOrder )
       {
-         const size_t threadsCount = end - begin;
-         const size_t blocksCount = roundUpDivision( threadsCount, 256 );
+         constexpr int BlockSize = 256;
+         constexpr int ThreadsPerRow = 1;
+         const size_t threadsCount = ( end - begin ) * ThreadsPerRow;
+         const size_t blocksCount = roundUpDivision( threadsCount, BlockSize );
          const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() );
          const size_t sharedMemSize = 20480;
          for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
@@ -598,11 +740,30 @@ vectorProduct( const InVector& inVector,
             dim3 blocks( Cuda::getMaxGridSize() );
             if( gridIdx == gridsCount - 1 )
                blocks = blocksCount % Cuda::getMaxGridSize();
-            DenseMatrixViewVectorMultiplicationKernel<<< blocks, 256, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
+            ColumnMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
          }
          TNL_CHECK_CUDA_DEVICE;
          return;
       }
+      if( Organization == Algorithms::Segments::RowMajorOrder )
+      {
+         constexpr int BlockSize = 256;
+         constexpr int ThreadsPerRow = 32;
+         const size_t threadsCount = ( end - begin ) * ThreadsPerRow;
+         const size_t blocksCount = roundUpDivision( threadsCount, BlockSize );
+         const size_t gridsCount = roundUpDivision( blocksCount, Cuda::getMaxGridSize() );
+         const size_t sharedMemSize = 20480;
+         for( size_t gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
+         {
+            dim3 blocks( Cuda::getMaxGridSize() );
+            if( gridIdx == gridsCount - 1 )
+               blocks = blocksCount % Cuda::getMaxGridSize();
+            RowMajorDenseMatrixViewVectorMultiplicationKernel<<< blocks, BlockSize, sharedMemSize >>>( *this, inVectorView, outVectorView, begin, end, gridIdx );
+         }
+         TNL_CHECK_CUDA_DEVICE;
+         return;
+      }
+
 #endif
    }
 
-- 
GitLab


From 6fc0cab53c5fbb9ab7cf60641a89a14651058e57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 25 Jul 2021 20:51:45 +0200
Subject: [PATCH 089/117] Fixing necessary condition for specialization of
 dense matrix vector multiplication.

---
 src/Benchmarks/BLAS/dense-mv.h       | 4 +++-
 src/TNL/Matrices/DenseMatrixView.hpp | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/Benchmarks/BLAS/dense-mv.h b/src/Benchmarks/BLAS/dense-mv.h
index 99978c55b..1204257cc 100644
--- a/src/Benchmarks/BLAS/dense-mv.h
+++ b/src/Benchmarks/BLAS/dense-mv.h
@@ -105,7 +105,8 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
    };
    benchmark.time< Devices::Cuda >( reset, "GPU row", rowMajorMvCuda );
 
-   //std::cerr << "Diff. = " << TNL::max( abs( outCudaVector2 - outCudaVector1 ) ) << std::endl;
+   auto diff = TNL::max( abs( outCudaVector2 - outCudaVector1 ) );
+   //std::cerr << outCudaVector1 << std::endl << outCudaVector2 << std::endl;
 
    rowMajorCudaMatrix.reset();
    columnMajorCudaMatrix.setDimensions( size, size );
@@ -123,6 +124,7 @@ benchmarkDenseMVSynthetic( Benchmark<> & benchmark,
    };
    benchmark.time< Devices::Cuda >( reset, "GPU cublas", mvCublas );
 
+   //std::cerr << "Diff. = " << diff << std::endl;
 #endif
 }
 
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 418baeee3..b8ad99394 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -724,7 +724,8 @@ vectorProduct( const InVector& inVector,
    if( end == 0 )
       end = this->getRows();
 
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
+   if( std::is_same< DeviceType, Devices::Cuda >::value &&
+      matrixMultiplicator == 1.0 && outVectorMultiplicator == 0.0 )
    {
 #ifdef HAVE_CUDA
       if( Organization == Algorithms::Segments::ColumnMajorOrder )
-- 
GitLab


From fff8442fb3d9ba7a2f8343299e19f6b952bc717b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 27 Jul 2021 22:34:35 +0200
Subject: [PATCH 090/117] Fix of CMakeLists.

---
 Documentation/Examples/Algorithms/CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt
index 339ab5754..a2fc4569d 100644
--- a/Documentation/Examples/Algorithms/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
@@ -6,8 +6,6 @@ set( COMMON_EXAMPLES
    SortingExample3
    ParallelForExample
    SequentialForExample
-   unrolledForExample.out
-   staticForExample.out
 )
 
 set( HOST_EXAMPLES
-- 
GitLab


From 11f7692a5fb5795ee9728d8e0683d8db70ce6508 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 3 Aug 2021 20:40:42 +0200
Subject: [PATCH 091/117] Improving Python script for processing results of
 SpMV benchmark.

---
 .../tnl-spmv-benchmark-make-tables-json.py    | 224 +++++++++++++++---
 1 file changed, 196 insertions(+), 28 deletions(-)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
index b0af4ccc6..69e23fa2d 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
@@ -13,6 +13,11 @@ def slugify(s):
    s = str(s).strip().replace(' ', '_')
    return re.sub(r'(?u)[^-\w.]', '', s)
 
+def latexFormatName( name ):
+   """Return `name` with every '<' and '>' character removed."""
+   stripped = name.replace( '<', '' ).replace( '>', '' )
+   return stripped
+
 ####
 # Extract all formats
 def get_formats( input_df ):
@@ -20,6 +25,7 @@ def get_formats( input_df ):
    df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
    formats = df_matrix.loc[:,'format'].values.tolist() # Get format names - TODO: the first benchmark might not have all of them
    formats = list(dict.fromkeys(formats))              # remove duplicates
+   formats.append('TNL Best')
    return formats
 
 ####
@@ -45,6 +51,19 @@ def get_multiindex( input_df, formats ):
             level3.append( 'speed-up')
             level4.append( speedup )
             df_data[ 0 ].append( ' ' )
+         if format == 'CSR< Light >':
+            level1.append( format )
+            level2.append( 'GPU' )
+            level3.append( 'speed-up')
+            level4.append( 'LightSpMV Vector' )
+            df_data[ 0 ].append( ' ' )
+         if format == 'TNL Best':
+            level1.append( format )
+            level2.append( 'GPU' )
+            level3.append( 'format')
+            level4.append( '' )
+            df_data[ 0 ].append( ' ' )
+
    multiColumns = pd.MultiIndex.from_arrays([ level1, level2, level3, level4 ] )
    return multiColumns, df_data
 
@@ -62,6 +81,7 @@ def convert_data_frame( input_df, multicolumns, df_data, max_rows = -1 ):
       df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
       print( out_idx, ":", in_idx, "/", len(input_df.index), ":", matrixName )
       aux_df = pd.DataFrame( df_data, columns = multicolumns, index = [out_idx] )
+      best_bw = 0
       for index,row in df_matrix.iterrows():
          aux_df.iloc[0]['Matrix name'] = row['matrix name']
          aux_df.iloc[0]['rows']        = row['rows']
@@ -69,7 +89,19 @@ def convert_data_frame( input_df, multicolumns, df_data, max_rows = -1 ):
          current_format = row['format']
          current_device = row['device']
          #print( current_format + " / " + current_device )
-         aux_df.iloc[0][(current_format,current_device,'bandwidth','')]   = pd.to_numeric(row['bandwidth'], errors='coerce')
+         bw = pd.to_numeric(row['bandwidth'], errors='coerce')
+         aux_df.iloc[0][(current_format,current_device,'bandwidth','')] = bw
+         if( current_device == 'GPU' and
+             not 'Binary' in current_format and
+             not 'Symmetric' in current_format and
+             not 'Legacy' in current_format and
+             not 'cusparse' in current_format and
+             not 'LightSpMV' in current_format and
+             bw > best_bw ):
+            best_bw = bw
+            best_format = current_format
+         if current_format == 'cusparse':
+            cusparse_bw = bw
          #aux_df.iloc[0][(current_format,current_device,'time')]        = row['time']
          #aux_df.iloc[0][(current_format,current_device,'speed-up')]    = row['speedup']
          #aux_df.iloc[0][(current_format,current_device,'non-zeros')]   = row['non-zeros']
@@ -77,6 +109,11 @@ def convert_data_frame( input_df, multicolumns, df_data, max_rows = -1 ):
          #aux_df.iloc[0][(current_format,current_device,'stddev/time')] = row['stddev/time']
          #aux_df.iloc[0][(current_format,current_device,'diff.max')]    = row['CSR Diff.Max']
          #aux_df.iloc[0][(current_format,current_device,'diff.l2')]    = row['CSR Diff.L2']
+      aux_df.iloc[0][('TNL Best','GPU','bandwidth','')] = best_bw
+      if best_bw > cusparse_bw:
+         aux_df.iloc[0][('TNL Best','GPU','format','')] = best_format
+      else:
+         aux_df.iloc[0][('TNL Best','GPU','format','')] = 'cusparse'
       frames.append( aux_df )
       out_idx = out_idx + 1
       in_idx = in_idx + len(df_matrix.index)
@@ -114,6 +151,18 @@ def compute_speedup( df, formats ):
          df[(format,'GPU','speed-up','cusparse')] = cusparse_speedup_list
          df[(format,'GPU','speed-up','CSR CPU')] = csr_speedup_list
 
+   csr_light_bdw_list = df[('CSR< Light >','GPU','bandwidth')]
+   light_spmv_bdw_list = df[('LightSpMV Vector','GPU','bandwidth')]
+
+   csr_light_speedup_list = []
+   for ( csr_light_bdw, light_spmv_bdw ) in zip(csr_light_bdw_list,light_spmv_bdw_list):
+      try:
+         csr_light_speedup_list.append( csr_light_bdw / light_spmv_bdw  )
+      except:
+         csr_light_speedup_list.append('')
+   df[('CSR< Light >','GPU','speed-up','LightSpMV Vector')] = csr_light_speedup_list
+
+
 ####
 # Comparison with Cusparse
 def cusparse_comparison( df, formats, head_size=10 ):
@@ -133,8 +182,8 @@ def cusparse_comparison( df, formats, head_size=10 ):
          axs[1].set_yscale( 'log' )
          axs[1].plot( t, filtered_df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[1].plot( t, filtered_df[('cusparse','GPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[1].legend( [ format, 'Cusparse' ], loc='upper right' )
-         axs[1].set_xlabel( 'Matrix ID - sorted w.r.t. Cusparse' )
+         axs[1].legend( [ latexFormatName(format), 'Cusparse' ], loc='lower left' )
+         axs[1].set_xlabel( 'Matrix ID - sorted w.r.t. Cusparse performance' )
          axs[1].set_ylabel( 'Bandwidth in GB/sec' )
          plt.savefig( f"Cusparse-bw/{format}.pdf" )
          plt.close(fig)
@@ -158,13 +207,24 @@ def csr_comparison( df, formats, head_size=10 ):
          t = np.arange(result[(format,'GPU','bandwidth')].size )
          axs[0].plot( t, result[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[0].plot( t, result[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[0].legend( [ format, 'CSR on CPU' ], loc='upper right' )
+         axs[0].legend( [ latexFormatName(format), 'CSR on CPU' ], loc='upper right' )
+         axs[0].set_ylabel( 'Bandwidth in GB/sec' )
          axs[1].set_yscale( 'log' )
          axs[1].plot( t, result[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[1].plot( t, result[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[1].legend( [ format, 'CSR on CPU' ], loc='upper right' )
-         axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {format}" )
+         axs[1].legend( [ latexFormatName(format), 'CSR on CPU' ], loc='lower left' )
+         axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} performance" )
          axs[1].set_ylabel( 'Bandwidth in GB/sec' )
+         plt.rcParams.update({
+            "text.usetex": True,
+            "font.family": "sans-serif",
+            "font.sans-serif": ["Helvetica"]})
+         # for Palatino and other serif fonts use:
+         #plt.rcParams.update({
+         #   "text.usetex": True,
+         #   "font.family": "serif",
+         #   "font.serif": ["Palatino"],
+         #})
          plt.savefig( f"CSR-bw/{format}.pdf")
          plt.close(fig)
          head_df = df.head( head_size )
@@ -193,13 +253,24 @@ def legacy_formats_comparison( df, formats, head_size=10 ):
          t = np.arange(df[(ref_format,'GPU','bandwidth')].size )
          axs[0].plot( t, df[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[0].plot( t, df[(legacy_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[0].legend( [ ref_format, legacy_format ], loc='upper right' )
+         axs[0].legend( [ latexFormatName(ref_format), latexFormatName(legacy_format) ], loc='upper right' )
+         axs[0].set_ylabel( 'Bandwidth in GB/sec' )
          axs[1].set_yscale( 'log' )
          axs[1].plot( t, df[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[1].plot( t, df[(legacy_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[1].legend( [ ref_format, legacy_format ], loc='upper right' )
-         axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {ref_format}" )
+         axs[1].legend( [ latexFormatName(ref_format), latexFormatName(legacy_format) ], loc='lower left' )
+         axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(ref_format)}  performance" )
          axs[1].set_ylabel( 'Bandwidth in GB/sec' )
+         plt.rcParams.update({
+            "text.usetex": True,
+            "font.family": "sans-serif",
+            "font.sans-serif": ["Helvetica"]})
+         # for Palatino and other serif fonts use:
+         #plt.rcParams.update({
+         #   "text.usetex": True,
+         #   "font.family": "serif",
+         #   "font.serif": ["Palatino"],
+         #})
          plt.savefig( f"Legacy-bw/{ref_format}.pdf")
          plt.close(fig)
          head_df = df.head( head_size )
@@ -220,21 +291,37 @@ def cusparse_speedup_comparison( df, formats, head_size=10 ):
          df['tmp'] = df[(format, 'GPU','bandwidth')]
          filtered_df=df.dropna(subset=[('tmp','','','')])
          filtered_df.sort_values(by=[(format,'GPU','speed-up','cusparse')],inplace=True,ascending=False)
-         fig, axs = plt.subplots( 2, 1 )
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
          size = len(filtered_df[(format,'GPU','speed-up','cusparse')].index)
          t = np.arange( size )
          bar = np.full( size, 1 )
-         axs[0].plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
-         axs[0].plot( t, bar, '-', ms=1, lw=1 )
-         axs[0].legend( [ format, 'Cusparse' ], loc='upper right' )
-         axs[1].set_yscale( 'log' )
-         axs[1].plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
-         axs[1].plot( t, bar, '-', ms=1, lw=1 )
-         axs[1].legend( [ format, 'Cusparse' ], loc='upper right' )
-         axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {format}" )
-         axs[1].set_ylabel( 'Bandwidth in GB/sec' )
+         axs.plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), 'Cusparse' ], loc='upper right' )
+         axs.set_ylabel( 'Speedup' )
+         axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         plt.rcParams.update({
+            "text.usetex": True,
+            "font.family": "sans-serif",
+            "font.sans-serif": ["Helvetica"]})
+         # for Palatino and other serif fonts use:
+         #plt.rcParams.update({
+         #   "text.usetex": True,
+         #   "font.family": "serif",
+         #   "font.serif": ["Palatino"],
+         #})
          plt.savefig( f"Cusparse-speed-up/{format}.pdf")
          plt.close(fig)
+
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         axs.set_yscale( 'log' )
+         axs.plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), 'Cusparse' ], loc='lower left' )
+         axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         axs.set_ylabel( 'Speedup' )
+         plt.savefig( f"Cusparse-speed-up/{format}-log.pdf")
+         plt.close(fig)
          head_df = filtered_df.head( head_size )
          for f in formats:
             if not f in ['cusparse','CSR',format]:
@@ -242,6 +329,53 @@ def cusparse_speedup_comparison( df, formats, head_size=10 ):
                head_df.drop( labels=f, axis='columns', level=0, inplace=True )
          head_df.to_html( f"Cusparse-speed-up/{format}-head.html" )
 
+####
+# Comparison of speed-up w.r.t. LightSpMV
+def csr_light_speedup_comparison( df, head_size=10 ):  # plots the 'CSR< Light >' speed-up over 'LightSpMV Vector' and writes PDF/HTML files to the current directory
+   format = 'CSR< Light >'
+   print( f"Writing comparison of speed-up of CSR Light compared to LightSPMV" )
+   df['tmp'] = df[(format, 'GPU','bandwidth')]  # NOTE: adds a helper column 'tmp' to the caller's data frame
+   filtered_df=df.dropna(subset=[('tmp','','','')])  # keep only rows that have a CSR Light GPU bandwidth
+   filtered_df.sort_values(by=[(format,'GPU','speed-up','LightSpMV Vector')],inplace=True,ascending=False)  # largest speed-up first
+   fig, axs = plt.subplots( 1, 1, figsize=(6,4) )  # first figure: linear y axis
+   size = len(filtered_df[(format,'GPU','speed-up','LightSpMV Vector')].index)
+   t = np.arange( size )  # x axis: position of the matrix in the sorted order
+   bar = np.full( size, 1 )  # constant reference line at speed-up 1
+   axs.plot( t, filtered_df[(format,'GPU','speed-up','LightSpMV Vector')], '-o', ms=1, lw=1 )
+   axs.plot( t, bar, '-', ms=1, lw=1 )
+   axs.legend( [ latexFormatName(format), 'LightSpMV' ], loc='upper right' )
+   axs.set_ylabel( 'Speedup' )
+   axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+   plt.rcParams.update({  # NOTE(review): global matplotlib settings, stay in effect after this call; usetex presumably needs a LaTeX installation - confirm
+      "text.usetex": True,
+      "font.family": "sans-serif",
+      "font.sans-serif": ["Helvetica"]})
+   # for Palatino and other serif fonts use:
+   #plt.rcParams.update({
+   #   "text.usetex": True,
+   #   "font.family": "serif",
+   #   "font.serif": ["Palatino"],
+   #})
+   plt.savefig( f"LightSpMV-speed-up.pdf")
+   plt.close(fig)
+
+   fig, axs = plt.subplots( 1, 1, figsize=(6,4) )  # second figure: same data on a logarithmic y axis
+   axs.set_yscale( 'log' )
+   axs.plot( t, filtered_df[(format,'GPU','speed-up','LightSpMV Vector')], '-o', ms=1, lw=1 )
+   axs.plot( t, bar, '-', ms=1, lw=1 )
+   axs.legend( [ latexFormatName(format), 'LightSpMV' ], loc='lower left' )
+   axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+   axs.set_ylabel( 'Speedup' )
+   plt.savefig( f"LightSpMV-speed-up-log.pdf")
+   plt.close(fig)
+   head_df = filtered_df.head( head_size )  # the head_size rows with the largest speed-up
+   for f in formats:  # NOTE(review): 'formats' is not a parameter here; it is resolved from module scope
+      if not f in ['cusparse','CSR',format]:
+         #print( f"Droping {f}..." )
+         head_df.drop( labels=f, axis='columns', level=0, inplace=True )  # keep only the cusparse, CSR and CSR Light columns
+   head_df.to_html( f"LightSpMV-speed-up-head.html" )
+
+
 ####
 # Parse input file
 print( "Parsing input file...." )
@@ -254,19 +388,53 @@ formats = get_formats( input_df )
 multicolumns, df_data = get_multiindex( input_df, formats )
 
 print( "Converting data..." )
-result = convert_data_frame( input_df, multicolumns, df_data, 20000 )
+result = convert_data_frame( input_df, multicolumns, df_data, 20 )
 compute_speedup( result, formats )
 
-print( "Writting to HTML file..." )
-result.to_html( 'output.html' )
-
 result.replace( to_replace=' ',value=np.nan,inplace=True)
 
 ####
-# Generate report = tables and figures
+# Make data analysis: write df to HTML, generate all reports and print how often each format is the best one
+def processDf( df, formats, head_size = 10 ):
+   print( "Writting to HTML file..." )
+   df.to_html( f'output.html' )   # relative path, i.e. the file is written into the current working directory
+
+   # Generate tables and figures
+   cusparse_comparison( df, formats, head_size )
+   csr_comparison( df, formats, head_size )
+   legacy_formats_comparison( df, formats, head_size )
+   cusparse_speedup_comparison( df, formats, head_size )
+   csr_light_speedup_comparison( df, head_size )
+   # Count how many times each format is listed in the ('TNL Best','GPU','format') column
+   best = df[('TNL Best','GPU','format')].tolist()
+   for format in formats:   # formats whose name contains any of the substrings below are not reported
+      if( not 'Binary' in format and
+          not 'Symmetric' in format and
+          not 'Legacy' in format and
+          not 'LightSpMV' in format and
+          not 'TNL Best' in format ):
+         cases = best.count(format)
+         print( f'{format} is best in {cases} cases.')
+
 head_size = 10
-cusparse_comparison( result, formats, head_size )
-csr_comparison( result, formats, head_size )
-legacy_formats_comparison( result, formats, head_size )
-cusparse_speedup_comparison( result, formats, head_size )
+if not os.path.exists( 'general' ):   # results for the whole data set go to the 'general' subdirectory
+   os.mkdir( 'general' )
+os.chdir( 'general' )   # processDf writes output.html with a relative path
+processDf( result, formats, head_size )
+os.chdir( '..' )   # go back to the parent directory
+
+#for rows_count in [ 10, 100, 1000, 10000, 100000, 1000000, 10000000 ]:
+#   filtered_df = result[ result['rows'].astype('int32') <= rows_count ]
+#   if not os.path.exists(f'rows-le-{rows_count}'):
+#      os.mkdir( f'rows-le-{rows_count}')
+#   os.chdir( f'rows-le-{rows_count}')
+#   processDf( filtered_df, formats, head_size )
+#   os.chdir( '..' )
 
+#for rows_count in [ 10, 100, 1000, 10000, 100000, 1000000, 10000000 ]:
+#   filtered_df = result[ result['rows'].astype('int32') >= rows_count ]
+#   if not os.path.exists(f'rows-ge-{rows_count}'):
+#      os.mkdir( f'rows-ge-{rows_count}')
+#   os.chdir( f'rows-ge-{rows_count}')
+#   processDf( filtered_df, formats, head_size )
+#   os.chdir( '..' )
-- 
GitLab


From 777c0c95ceb2e53c693e5165555858d5f791cfd1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 3 Aug 2021 20:41:28 +0200
Subject: [PATCH 092/117] Fixing wrong type in SparseMatrixView documentation.

---
 src/TNL/Matrices/SparseMatrixView.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index 376ceab5c..a20964b0c 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -451,7 +451,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \tparam Keep is a type of lambda function for storing results of reduction in each row. It is declared as
        *
        * ```
-       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const double& value ) { ... };
+       * auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const RealType& value ) { ... };
        * ```
        *
        * \tparam FetchValue is type returned by the Fetch lambda function.
-- 
GitLab


From d05990b9598ffd023005b16f3ec1213b2dd1db83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 3 Aug 2021 20:41:53 +0200
Subject: [PATCH 093/117] Added multi vector CSR kernel to CSR Light.

---
 .../Segments/Kernels/CSRLightKernel.hpp       | 110 ++++++++++++++++--
 1 file changed, 101 insertions(+), 9 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
index 7fddef69d..565ce3ecf 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -219,6 +219,97 @@ void SpMVCSRVector( OffsetsView offsets,
    if( laneID == 0 )
       keep( warpID, result );
 }
+
+template< int BlockSize,
+          int ThreadsPerSegment,
+          typename Offsets,
+          typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real >
+__global__
+void reduceSegmentsCSRLightMultivectorKernel(
+    int gridIdx,
+    const Offsets offsets,
+    Index first,
+    Index last,
+    Fetch fetch,
+    const Reduction reduce,
+    ResultKeeper keep,
+    const Real zero )
+{
+    // Reduction in segments [ first, last ) where each segment is processed by
+    // ThreadsPerSegment threads, i.e. by several whole warps of one block. Each warp
+    // reduces its partial results by shuffles, the per-warp results are exchanged via
+    // shared memory and finally combined and passed to keep( segmentIdx, value ).
+    //
+    //   gridIdx - passed to TNL::Cuda::getGlobalThreadIdx to get the global thread index,
+    //   offsets - segment i consists of elements offsets[ i ] ... offsets[ i + 1 ] - 1,
+    //   fetch   - fetches one element, it is called via details::FetchLambdaAdapter,
+    //   reduce  - binary reduction operation, it does not have to be a sum,
+    //   keep    - stores the result of one segment,
+    //   zero    - initial value of the partial result of every thread.
+    //
+    // The kernel must be launched with blockDim.x == BlockSize and it uses
+    // BlockSize / warp-size elements of static shared memory.
+    constexpr int WarpSize = TNL::Cuda::getWarpSize();
+    constexpr int warpsPerBlock = BlockSize / WarpSize;
+    constexpr int warpsPerSegment = ThreadsPerSegment / WarpSize;
+    constexpr int segmentsPerBlock = BlockSize / ThreadsPerSegment;
+    static_assert( ThreadsPerSegment % WarpSize == 0 && ( ThreadsPerSegment & ( ThreadsPerSegment - 1 ) ) == 0,
+                   "ThreadsPerSegment must be a power of two and a multiple of the warp size." );
+    static_assert( BlockSize % ThreadsPerSegment == 0, "BlockSize must be a multiple of ThreadsPerSegment." );
+
+    // There is intentionally no early return - all threads must reach __syncthreads() below.
+    const Index segmentIdx =  TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first;
+    const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
+    const int inWarpLaneIdx = threadIdx.x & ( WarpSize - 1 ); // & is cheaper than %
+    const int warpIdx = threadIdx.x / WarpSize;
+    __shared__ Real shared[ warpsPerBlock ];
+
+    // A segment covers whole warps, so the following condition is uniform within each warp.
+    Real result = zero;
+    if( segmentIdx < last )
+    {
+        const Index endIdx = offsets[ segmentIdx + 1 ];
+        bool compute( true );
+        Index localIdx = laneIdx;
+        for( Index globalIdx = offsets[ segmentIdx ] + laneIdx; globalIdx < endIdx && compute; globalIdx += ThreadsPerSegment )
+        {
+            result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+            localIdx += ThreadsPerSegment;
+        }
+    }
+
+    // Reduction within each warp - done by `reduce` since the operation does not have to be a sum.
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  8 ) );
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  4 ) );
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  2 ) );
+    result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result,  1 ) );
+
+    // Each element of `shared` is written by exactly one thread, so no initialization is needed.
+    if( inWarpLaneIdx == 0 )
+        shared[ warpIdx ] = result;
+    __syncthreads();
+
+    // Lane i of the first warp combines the per-warp results of the i-th segment of the block.
+    // These lanes belong to the first segment of the block, so the i-th segment of the
+    // block has index segmentIdx + i. Only indexes below warpsPerBlock are accessed.
+    if( warpIdx == 0 && inWarpLaneIdx < segmentsPerBlock )
+    {
+        const Index resultSegmentIdx = segmentIdx + inWarpLaneIdx;
+        if( resultSegmentIdx < last )
+        {
+            Real aux = shared[ inWarpLaneIdx * warpsPerSegment ];
+            for( int i = 1; i < warpsPerSegment; i++ )
+                aux = reduce( aux, shared[ inWarpLaneIdx * warpsPerSegment + i ] );
+            keep( resultSegmentIdx, aux );
+        }
+    }
+}
+
 #endif
 template< typename Index,
           typename Device,
@@ -302,20 +393,21 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee
          else if (threadsPerSegment == 16)
             SpMVCSRLightWithoutAtomic16<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
                offsets, first, last, fetch, reduce, keep, zero, grid );
-         else // if (threadsPerSegment == 32)
+         else if (threadsPerSegment == 32)
          { // CSR SpMV Light with threadsPerSegment = 32 is CSR Vector
             SpMVCSRVector<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
                offsets, first, last, fetch, reduce, keep, zero, grid );
          }
-         /*else
+         else if (threadsPerSegment == 64 )
          { // Execute CSR MultiVector
-            SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>(
-                     inVector, outVector, matrix.getoffsets().getData(),
-                     matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
-                     rows, threadsPerSegment / 32, grid
-            );
-         }*/
-
+            reduceSegmentsCSRLightMultivectorKernel< 128, 64 ><<<blocks, threads>>>(
+                     grid, offsets, first, last, fetch, reduce, keep, zero );
+         }
+         else //if (threadsPerSegment == 64 )
+         { // Execute CSR MultiVector
+            reduceSegmentsCSRLightMultivectorKernel< 128, 128 ><<<blocks, threads>>>(
+                     grid, offsets, first, last, fetch, reduce, keep, zero );
+         }
       }
 #endif
 
-- 
GitLab


From cb1730735bb870a0dc5e35defb9772d71b61531f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 5 Aug 2021 22:12:19 +0200
Subject: [PATCH 094/117] Refactoring Light CSR kernel.

---
 .../Segments/Kernels/CSRLightKernel.hpp       | 91 +++++++++++++++++--
 1 file changed, 85 insertions(+), 6 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
index 565ce3ecf..cc9693f34 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -179,7 +179,7 @@ void SpMVCSRLightWithoutAtomic16( OffsetsView offsets,
       keep( segmentIdx, result );
 }
 
-template< typename Real,
+/*template< typename Real,
           typename Index,
           typename OffsetsView,
           typename Fetch,
@@ -204,22 +204,71 @@ void SpMVCSRVector( OffsetsView offsets,
    const Index laneID = threadIdx.x & 31; // & is cheaper than %
    Index endID = offsets[warpID + 1];
 
-   /* Calculate result */
+   // Calculate result
    bool compute = true;
    for (Index i = offsets[warpID] + laneID; i < endID; i += warpSize)
       result = reduce( result, fetch( i, compute ) );
 
-   /* Reduction */
+   // Reduction
    result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) );
    result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  8 ) );
    result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
    result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
    result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
-   /* Write result */
+   // Write result
+   if( laneID == 0 )
+      keep( warpID, result );
+}*/
+
+// Reduction in segments [ first, last ), each segment is processed by ThreadsPerSegment
+// consecutive threads. Partial results are combined by warp shuffles and the thread with
+// laneID == 0 in each group calls keep( segmentIdx, result ). `zero` is the initial value.
+template< int ThreadsPerSegment,
+          typename Real,
+          typename Index,
+          typename OffsetsView,
+          typename Fetch,
+          typename Reduce,
+          typename Keep >
+__global__
+void SpMVCSRVector( OffsetsView offsets,
+                    const Index first,
+                    const Index last,
+                    Fetch fetch,
+                    Reduce reduce,
+                    Keep keep,
+                    const Real zero,
+                    const Index gridID )
+{
+   // The grid offset is counted in blocks (getMaxGridXSize() per grid), so it is scaled by blockDim.x as well.
+   const Index warpID = first + ( ( gridID * TNL::Cuda::getMaxGridXSize() + blockIdx.x ) * blockDim.x + threadIdx.x ) / ThreadsPerSegment;
+   if (warpID >= last)
+      return;
+
+   Real result = zero;
+   const Index laneID = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
+   Index endID = offsets[warpID + 1];
+   // Calculate result - each thread of the group takes every ThreadsPerSegment-th element
+   bool compute = true;
+   for (Index i = offsets[warpID] + laneID; i < endID; i += ThreadsPerSegment )
+      result = reduce( result, fetch( i, compute ) );
+   // Reduction - only the shuffle steps needed for the given group size are performed
+   if( ThreadsPerSegment > 16 )
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) );
+   if( ThreadsPerSegment > 8 )
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  8 ) );
+   if( ThreadsPerSegment > 4 )
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
+   if( ThreadsPerSegment > 2 )
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
+   if( ThreadsPerSegment > 1 )
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   // Store result
    if( laneID == 0 )
       keep( warpID, result );
 }
 
+
 template< int BlockSize,
           int ThreadsPerSegment,
           typename Offsets,
@@ -381,7 +430,37 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee
             neededThreads -= TNL::Cuda::getMaxGridXSize() * threads;
          }
 
-         if (threadsPerSegment == 2)
+         if( threadsPerSegment == 1 )
+            SpMVCSRVector< 1, Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         if( threadsPerSegment == 2 )
+            SpMVCSRVector< 2, Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         if( threadsPerSegment == 4 )
+            SpMVCSRVector< 4, Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         if( threadsPerSegment == 8 )
+            SpMVCSRVector< 8, Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         if( threadsPerSegment == 16 )
+            SpMVCSRVector< 16, Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         if( threadsPerSegment == 32 )
+            SpMVCSRVector< 32, Real, Index, OffsetsView, Fetch, Reduce, Keep ><<< blocks, threads >>>(
+               offsets, first, last, fetch, reduce, keep, zero, grid );
+         if( threadsPerSegment == 64 )
+         { // Execute CSR MultiVector
+            reduceSegmentsCSRLightMultivectorKernel< 128, 64 ><<<blocks, threads>>>(
+                     grid, offsets, first, last, fetch, reduce, keep, zero );
+         }
+         if (threadsPerSegment >= 128 )
+         { // Execute CSR MultiVector
+            reduceSegmentsCSRLightMultivectorKernel< 128, 128 ><<<blocks, threads>>>(
+                     grid, offsets, first, last, fetch, reduce, keep, zero );
+         }
+
+
+         /*if (threadsPerSegment == 2)
             SpMVCSRLightWithoutAtomic2<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
                offsets, first, last, fetch, reduce, keep, zero, grid );
          else if (threadsPerSegment == 4)
@@ -407,7 +486,7 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee
          { // Execute CSR MultiVector
             reduceSegmentsCSRLightMultivectorKernel< 128, 128 ><<<blocks, threads>>>(
                      grid, offsets, first, last, fetch, reduce, keep, zero );
-         }
+         }*/
       }
 #endif
 
-- 
GitLab


From 36e04c6f23312b9e3cebba0c6f12ab7e00d3aab5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 6 Aug 2021 10:04:04 +0200
Subject: [PATCH 095/117] Commenting symmetric matrix check in SpMV benchmark
 since the comparison operator with symmetric matrices does not work properly
 yet.

---
 src/Benchmarks/SpMV/spmv.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 5663f01d2..03f3e6258 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -639,10 +639,11 @@ benchmarkSpmv( BenchmarkType& benchmark,
       }
       InputMatrix hostMatrix;
       TNL::Matrices::MatrixReader< InputMatrix >::readMtx( inputFileName, hostMatrix, verboseMR );
-      if( hostMatrix != symmetricHostMatrix )
-      {
-         std::cerr << "ERROR: Symmetric matrices do not match !!!" << std::endl;
-      }
+      // TODO: Comparison of symmetric and general matrix does not work yet.
+      //if( hostMatrix != symmetricHostMatrix )
+      //{
+      //   std::cerr << "ERROR: Symmetric matrices do not match !!!" << std::endl;
+      //}
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-- 
GitLab


From a68cfcc8f4d00c63bfaa87744cfcc4033ac62818 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 6 Aug 2021 17:03:58 +0200
Subject: [PATCH 096/117] Working on the script for processing results of SpMV
 benchmark.

---
 .../tnl-spmv-benchmark-make-tables-json.py    | 428 +++++++++++++++---
 1 file changed, 356 insertions(+), 72 deletions(-)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
index 69e23fa2d..b9f2ddeda 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
@@ -7,6 +7,21 @@ from pandas.io.json import json_normalize
 import matplotlib.pyplot as plt
 import numpy as np
 
+#Latex fonts set-up
+
+#plt.rcParams.update({
+#   "text.usetex": True,
+#   "font.family": "sans-serif",
+#   "font.sans-serif": ["Helvetica"]})
+#
+# for Palatino and other serif fonts use:
+#plt.rcParams.update({
+#   "text.usetex": True,
+#   "font.family": "serif",
+#   "font.serif": ["Palatino"],
+#})
+
+
 ####
 # Helper function
 def slugify(s):
@@ -51,35 +66,50 @@ def get_multiindex( input_df, formats ):
             level3.append( 'speed-up')
             level4.append( speedup )
             df_data[ 0 ].append( ' ' )
-         if format == 'CSR< Light >':
-            level1.append( format )
-            level2.append( 'GPU' )
-            level3.append( 'speed-up')
-            level4.append( 'LightSpMV Vector' )
-            df_data[ 0 ].append( ' ' )
-         if format == 'TNL Best':
-            level1.append( format )
-            level2.append( 'GPU' )
-            level3.append( 'format')
-            level4.append( '' )
-            df_data[ 0 ].append( ' ' )
+      if 'Binary' in format:
+         level1.append( format )
+         level2.append( 'GPU' )
+         level3.append( 'speed-up')
+         level4.append( 'non-binary' )
+         df_data[ 0 ].append( ' ' )
+      if 'Symmetric' in format:
+         level1.append( format )
+         level2.append( 'GPU' )
+         level3.append( 'speed-up')
+         level4.append( 'non-symmetric' )
+         df_data[ 0 ].append( ' ' )
+      if format == 'CSR< Light >':
+         level1.append( format )
+         level2.append( 'GPU' )
+         level3.append( 'speed-up')
+         level4.append( 'LightSpMV Vector' )
+         df_data[ 0 ].append( ' ' )
+      if format == 'TNL Best':
+         level1.append( format )
+         level2.append( 'GPU' )
+         level3.append( 'format')
+         level4.append( '' )
+         df_data[ 0 ].append( ' ' )
 
    multiColumns = pd.MultiIndex.from_arrays([ level1, level2, level3, level4 ] )
    return multiColumns, df_data
 
 ####
 # Convert input table to better structured one
-def convert_data_frame( input_df, multicolumns, df_data, max_rows = -1 ):
+def convert_data_frame( input_df, multicolumns, df_data, begin_idx = 0, end_idx = -1 ):
    frames = []
    in_idx = 0
    out_idx = 0
-   max_out_idx = max_rows
-   if max_out_idx == -1:
-      max_out_idx = len(input_df.index)
-   while in_idx < len(input_df.index) and out_idx < max_out_idx:
+   #max_out_idx = max_rows
+   if end_idx == -1:
+      end_idx = len(input_df.index)
+   while in_idx < len(input_df.index) and out_idx < end_idx:
       matrixName = input_df.iloc[in_idx]['matrix name']
       df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
-      print( out_idx, ":", in_idx, "/", len(input_df.index), ":", matrixName )
+      if out_idx >= begin_idx:
+         print( f'{out_idx} : {in_idx} / {len(input_df.index)} : {matrixName}' )
+      else:
+         print( f'{out_idx} : {in_idx} / {len(input_df.index)} : {matrixName} - SKIP' )
       aux_df = pd.DataFrame( df_data, columns = multicolumns, index = [out_idx] )
       best_bw = 0
       for index,row in df_matrix.iterrows():
@@ -97,6 +127,7 @@ def convert_data_frame( input_df, multicolumns, df_data, max_rows = -1 ):
              not 'Legacy' in current_format and
              not 'cusparse' in current_format and
              not 'LightSpMV' in current_format and
+             not 'Hybrid' in current_format and
              bw > best_bw ):
             best_bw = bw
             best_format = current_format
@@ -114,7 +145,8 @@ def convert_data_frame( input_df, multicolumns, df_data, max_rows = -1 ):
          aux_df.iloc[0][('TNL Best','GPU','format','')] = best_format
       else:
          aux_df.iloc[0][('TNL Best','GPU','format','')] = 'cusparse'
-      frames.append( aux_df )
+      if out_idx >= begin_idx:
+         frames.append( aux_df )
       out_idx = out_idx + 1
       in_idx = in_idx + len(df_matrix.index)
    result = pd.concat( frames )
@@ -122,7 +154,7 @@ def convert_data_frame( input_df, multicolumns, df_data, max_rows = -1 ):
 
 ####
 # Compute speed-up of particular formats compared to Cusparse on GPU and CSR on CPU
-def compute_speedup( df, formats ):
+def compute_cusparse_speedup( df, formats ):
    for format in formats:
       if not format in [ 'cusparse', 'CSR' ]:
          print( 'Adding speed-up for ', format )
@@ -135,22 +167,17 @@ def compute_speedup( df, formats ):
             try:
                cusparse_speedup_list.append( format_bdw / cusparse_bdw )
             except:
-               cusparse_speedup_list.append('')
+               cusparse_speedup_list.append(float('nan'))
             try:
                csr_speedup_list.append( format_bdw / csr_bdw )
             except:
-               csr_speedup_list.append('')
-            #print( f'**{type(format_bdw)}** -- {type(5.2)}' )
-            #if type(format_bdw) == "<class 'numpy.float64'>":
-            #   print( f'##########{format_bdw / cusparse_bdw}' )
-            #   cusparse_speedup_list.append( format_bdw / cusparse_bdw )
-            #   csr_speedup_list.append( format_bdw / csr_bdw )
-            #else:
-            #   cusparse_speedup_list.append('')
-            #   csr_speedup_list.append('')
+               csr_speedup_list.append(float('nan'))
          df[(format,'GPU','speed-up','cusparse')] = cusparse_speedup_list
          df[(format,'GPU','speed-up','CSR CPU')] = csr_speedup_list
 
+####
+# Compute speedup of Light CSR
+def compute_csr_light_speedup( df ):
    csr_light_bdw_list = df[('CSR< Light >','GPU','bandwidth')]
    light_spmv_bdw_list = df[('LightSpMV Vector','GPU','bandwidth')]
 
@@ -159,20 +186,154 @@ def compute_speedup( df, formats ):
       try:
          csr_light_speedup_list.append( csr_light_bdw / light_spmv_bdw  )
       except:
-         csr_light_speedup_list.append('')
+         csr_light_speedup_list.append(float('nan'))
    df[('CSR< Light >','GPU','speed-up','LightSpMV Vector')] = csr_light_speedup_list
 
+####
+# Compute speed-up of binary formats: GPU bandwidth of each format containing 'Binary' divided by that of the format with 'Binary ' removed from its name
+def compute_binary_speedup( df, formats ):
+   for format in formats:
+      if 'Binary' in format:
+         non_binary_format = format.replace( 'Binary ', '' )   # name of the non-binary counterpart
+         print( f'Adding speed-up of {format} vs {non_binary_format}' )
+         format_bdw_list = df[(format,'GPU','bandwidth')]
+         non_binary_bdw_list = df[(non_binary_format,'GPU','bandwidth')]
+         binary_speedup_list = []
+         for ( format_bdw, non_binary_bdw ) in zip( format_bdw_list, non_binary_bdw_list ):
+            try:
+               binary_speedup_list.append( format_bdw / non_binary_bdw )
+            except:   # NOTE(review): bare except - presumably meant for non-numeric cells, but it hides any other error too
+               binary_speedup_list.append( float('nan'))
+         df[(format,'GPU','speed-up','non-binary')] = binary_speedup_list   # stored as a column of df, one value per row
+
+####
+# Compute speed-up of symmetric formats: GPU bandwidth of each format containing 'Symmetric' divided by that of the format with 'Symmetric ' removed from its name
+def compute_symmetric_speedup( df, formats ):
+   for format in formats:
+      if 'Symmetric' in format:
+         non_symmetric_format = format.replace( 'Symmetric ', '' )   # name of the non-symmetric counterpart
+         print( f'Adding speed-up of {format} vs {non_symmetric_format}' )
+         format_bdw_list = df[(format,'GPU','bandwidth')]
+         non_symmetric_bdw_list = df[(non_symmetric_format,'GPU','bandwidth')]
+         symmetric_speedup_list = []
+         for ( format_bdw, non_symmetric_bdw ) in zip( format_bdw_list, non_symmetric_bdw_list ):
+            try:
+               symmetric_speedup_list.append( format_bdw / non_symmetric_bdw )
+            except:   # NOTE(review): bare except - presumably meant for non-numeric cells, but it hides any other error too
+               symmetric_speedup_list.append(float('nan'))
+         df[(format,'GPU','speed-up','non-symmetric')] = symmetric_speedup_list   # stored as a column of df, one value per row
+
+def compute_speedup( df, formats ):   # runs all four speed-up computations; each of them stores its results as columns of df
+   compute_cusparse_speedup( df, formats )
+   compute_csr_light_speedup( df )
+   compute_binary_speedup( df, formats )
+   compute_symmetric_speedup( df, formats )
+
+###
+# Draw several profiles into one figure
+def draw_profiles( formats, profiles, xlabel, ylabel, filename, style=[] ):
+   """Plot profiles[format] of every format in formats into one log-scale figure saved to filename.
+   If 'draw-bar' is in style, a constant line y = 1 as long as the last plotted profile is added.
+   """
+   fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+   latexNames = []
+   size = 1
+   for format in formats:
+      size = len( profiles[format] )
+      axs.plot( np.arange( profiles[format].size ), profiles[format], '-o', ms=1, lw=1 )
+      latexNames.append( latexFormatName( format ) )
+   if 'draw-bar' in style:
+      # The x-axis is built from size, so an empty formats list does not hit an unbound variable
+      axs.plot( np.arange( size ), np.full( size, 1 ), '-', ms=1, lw=1.5 )
+   axs.legend( latexNames, loc='upper right' )
+   axs.set_xlabel( xlabel )
+   axs.set_ylabel( ylabel )
+   axs.set_yscale( 'log' )
+   plt.rcParams.update({
+      "text.usetex": True,
+      "font.family": "sans-serif",
+      "font.sans-serif": ["Helvetica"]})
+   plt.savefig( filename )
+   plt.close(fig)
+
+
+####
+# Effective BW profile
+def effective_bw_profile( df, formats, head_size=10 ):
+   """Write the bandwidth profile of every format (linear and log-scale PDF, HTML table) into
+   BW-profile/ and draw the joint Ellpack and CSR profile figures. df is re-sorted in place
+   for each format; head_size is not used here."""
+   if not os.path.exists("BW-profile"):
+      os.mkdir("BW-profile")
+   profiles = {}
+   for format in formats:
+      print( f"Writing BW profile of {format}" )
+      fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+      t = np.arange(df[(format,'GPU','bandwidth')].size )
+      # 'CSR' is profiled by its CPU bandwidth, every other format by its GPU bandwidth
+      column = (format,'CPU','bandwidth') if format == 'CSR' else (format,'GPU','bandwidth')
+      df.sort_values(by=[column],inplace=True,ascending=False)
+      profiles[format] = df[column].copy()
+      axs.plot( t, df[column], '-o', ms=1, lw=1 )
+      axs.legend( [ latexFormatName(format), 'CSR on CPU' ], loc='upper right' )
+      axs.set_ylabel( 'Bandwidth in GB/sec' )
+      plt.rcParams.update({
+         "text.usetex": True,
+         "font.family": "sans-serif",
+         "font.sans-serif": ["Helvetica"]})
+      plt.savefig( f"BW-profile/{format}.pdf")
+      plt.close(fig)
+      fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+      axs.set_yscale( 'log' )
+      axs.plot( t, df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )   # plot the df argument, not the global result
+      axs.legend( [ latexFormatName(format), 'CSR on CPU' ], loc='lower left' )
+      axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} performance" )
+      axs.set_ylabel( 'Bandwidth in GB/sec' )
+      plt.rcParams.update({
+         "text.usetex": True,
+         "font.family": "sans-serif",
+         "font.sans-serif": ["Helvetica"]})
+      plt.savefig( f"BW-profile/{format}-log.pdf")
+      plt.close(fig)
+      copy_df = df.copy()
+      for f in formats:
+         if not f in ['cusparse','CSR',format]:
+            copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+      copy_df.to_html( f"BW-profile/{format}.html" )
+
+   # Draw ellpack formats profiles
+   current_formats = []
+   xlabel = "Matrix ID - sorted by particular formats effective BW"
+   ylabel = "Bandwidth in GB/sec"
+   for format in formats:
+      if( ( 'Ellpack' in format and not 'Binary' in format and not 'Legacy' in format ) or
+          format == 'CSR' or
+          format == 'cusparse' ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "ellpack-profiles-bw.pdf" )
+
+   # Draw CSR formats profiles
+   current_formats.clear()
+   for format in formats:
+      if( ( 'CSR' in format and not 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format ) or
+          format == 'cusparse' ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "csr-profiles-bw.pdf" )
+
 
 ####
 # Comparison with Cusparse
 def cusparse_comparison( df, formats, head_size=10 ):
    if not os.path.exists("Cusparse-bw"):
       os.mkdir("Cusparse-bw")
+   ascend_df = df.copy()
    df.sort_values(by=[('cusparse','GPU','bandwidth')],inplace=True,ascending=False)
+   ascend_df.sort_values(by=[('cusparse','GPU','bandwidth')],inplace=True,ascending=True)
    for format in formats:
       if not format in ['cusparse','CSR']:
          print( f"Writing comparison of {format} and Cusparse" )
          filtered_df = df.dropna( subset=[(format,'GPU','bandwidth','')] )
+         filtered_ascend_df = ascend_df.dropna( subset=[(format,'GPU','bandwidth','')] )
          t = np.arange(filtered_df[(format,'GPU','bandwidth')].size )
          fig, axs = plt.subplots( 2, 1 )
          axs[0].plot( t, filtered_df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
@@ -187,12 +348,11 @@ def cusparse_comparison( df, formats, head_size=10 ):
          axs[1].set_ylabel( 'Bandwidth in GB/sec' )
          plt.savefig( f"Cusparse-bw/{format}.pdf" )
          plt.close(fig)
-         head_df = filtered_df.head( head_size )
+         copy_df = df.copy()
          for f in formats:
             if not f in ['cusparse','CSR',format]:
-               #print( f"Droping {f}..." )
-               head_df.drop( labels=f, axis='columns', level=0, inplace=True )
-         head_df.to_html( f"Cusparse-bw/{format}-head.html" )
+               copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         copy_df.to_html( f"Cusparse-bw/{format}.html" )
 
 ####
 # Comparison with CSR on CPU
@@ -202,11 +362,13 @@ def csr_comparison( df, formats, head_size=10 ):
    for format in formats:
       if not format in ['cusparse','CSR']:
          print( f"Writing comparison of {format} and CSR on CPU" )
-         result.sort_values(by=[(format,'GPU','bandwidth')],inplace=True,ascending=False)
+         ascend_df = df.copy()
+         df.sort_values(by=[(format,'GPU','bandwidth')],inplace=True,ascending=False)
+         ascend_df.sort_values(by=[(format,'GPU','bandwidth')],inplace=True,ascending=True)
          fig, axs = plt.subplots( 2, 1 )
-         t = np.arange(result[(format,'GPU','bandwidth')].size )
-         axs[0].plot( t, result[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[0].plot( t, result[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
+         t = np.arange(df[(format,'GPU','bandwidth')].size )
+         axs[0].plot( t, df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
+         axs[0].plot( t, df[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[0].legend( [ latexFormatName(format), 'CSR on CPU' ], loc='upper right' )
          axs[0].set_ylabel( 'Bandwidth in GB/sec' )
          axs[1].set_yscale( 'log' )
@@ -219,20 +381,13 @@ def csr_comparison( df, formats, head_size=10 ):
             "text.usetex": True,
             "font.family": "sans-serif",
             "font.sans-serif": ["Helvetica"]})
-         # for Palatino and other serif fonts use:
-         #plt.rcParams.update({
-         #   "text.usetex": True,
-         #   "font.family": "serif",
-         #   "font.serif": ["Palatino"],
-         #})
          plt.savefig( f"CSR-bw/{format}.pdf")
          plt.close(fig)
-         head_df = df.head( head_size )
+         copy_df = df.copy()
          for f in formats:
             if not f in ['cusparse','CSR',format]:
-               #print( f"Droping {f}..." )
-               head_df.drop( labels=f, axis='columns', level=0, inplace=True )
-         head_df.to_html( f"CSR-bw/{format}-head.html" )
+               copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         copy_df.to_html( f"CSR-bw/{format}.html" )
 
 ####
 # Comparison of Legacy formats
@@ -248,7 +403,9 @@ def legacy_formats_comparison( df, formats, head_size=10 ):
                                     ('CSR< Vector >', 'CSR Legacy Vector') ]:
       if ref_format in formats and legacy_format in formats:
          print( f"Writing comparison of {ref_format} and {legacy_format}" )
+         ascend_df = df.copy()
          df.sort_values(by=[(ref_format,'GPU','bandwidth')],inplace=True,ascending=False)
+         ascend_df.sort_values(by=[(ref_format,'GPU','bandwidth')],inplace=True,ascending=True)
          fig, axs = plt.subplots( 2, 1 )
          t = np.arange(df[(ref_format,'GPU','bandwidth')].size )
          axs[0].plot( t, df[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
@@ -265,32 +422,27 @@ def legacy_formats_comparison( df, formats, head_size=10 ):
             "text.usetex": True,
             "font.family": "sans-serif",
             "font.sans-serif": ["Helvetica"]})
-         # for Palatino and other serif fonts use:
-         #plt.rcParams.update({
-         #   "text.usetex": True,
-         #   "font.family": "serif",
-         #   "font.serif": ["Palatino"],
-         #})
          plt.savefig( f"Legacy-bw/{ref_format}.pdf")
          plt.close(fig)
-         head_df = df.head( head_size )
+         copy_df = df.copy()
          for f in formats:
             if not f in ['cusparse','CSR',format]:
-               #print( f"Droping {f}..." )
-               head_df.drop( labels=f, axis='columns', level=0, inplace=True )
-         head_df.to_html( f"Legacy-bw/{format}-head.html" )
+               copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         copy_df.to_html( f"Legacy-bw/{format}.html" )
 
 ####
 # Comparison of speed-up w.r.t. Cusparse
 def cusparse_speedup_comparison( df, formats, head_size=10 ):
    if not os.path.exists("Cusparse-speed-up"):
       os.mkdir("Cusparse-speed-up")
+   profiles = {}
    for format in formats:
       if not format in ['cusparse','CSR']:
          print( f"Writing comparison of speed-up of {format} compared to Cusparse" )
          df['tmp'] = df[(format, 'GPU','bandwidth')]
          filtered_df=df.dropna(subset=[('tmp','','','')])
          filtered_df.sort_values(by=[(format,'GPU','speed-up','cusparse')],inplace=True,ascending=False)
+         profiles[format] = filtered_df[(format,'GPU','speed-up','cusparse')].copy()
          fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
          size = len(filtered_df[(format,'GPU','speed-up','cusparse')].index)
          t = np.arange( size )
@@ -300,6 +452,68 @@ def cusparse_speedup_comparison( df, formats, head_size=10 ):
          axs.legend( [ latexFormatName(format), 'Cusparse' ], loc='upper right' )
          axs.set_ylabel( 'Speedup' )
          axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         plt.rcParams.update({
+            "text.usetex": True,
+            "font.family": "sans-serif",
+            "font.sans-serif": ["Helvetica"]})
+         plt.savefig( f"Cusparse-speed-up/{format}.pdf")
+         plt.close(fig)
+
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         axs.set_yscale( 'log' )
+         axs.plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), 'Cusparse' ], loc='lower left' )
+         axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         axs.set_ylabel( 'Speedup' )
+         plt.savefig( f"Cusparse-speed-up/{format}-log.pdf")
+         plt.close(fig)
+         copy_df = df.copy()
+         for f in formats:
+            if not f in ['cusparse','CSR',format]:
+               copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         copy_df.to_html( f"Cusparse-speed-up/{format}.html" )
+
+   # Draw Ellpack formats profiles
+   xlabel = "Matrix ID - sorted particular by formats speedup compared to Cusparse"
+   ylabel = "Speedup"
+   current_formats = []
+   for format in formats:
+      if( 'Ellpack' in format and not 'Binary' in format and not 'Legacy' in format ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "ellpack-profiles-cusparse-speedup.pdf", "draw-bar" )
+
+   # Draw CSR formats profiles
+   current_formats.clear()
+   for format in formats:
+      if( 'CSR' in format and not 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format and format != 'CSR' ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "csr-profiles-cusparse-speedup.pdf", "draw-bar" )
+
+####
+# Comparison of binary matrices
+def binary_matrices_comparison( df, formats, head_size = 10 ):
+   if not os.path.exists("Binary-speed-up"):
+      os.mkdir("Binary-speed-up")
+   for format in formats:
+      if 'Binary' in format:
+         non_binary_format = format.replace('Binary ','')
+         print( f"Writing comparison of speed-up of {format} vs {non_binary_format}" )
+         #df['tmp'] = df[(format, 'GPU','speed-up','non-binary')]
+         filtered_df=df.dropna(subset=[(format, 'GPU','speed-up','non-binary')]) #('tmp','','','')])
+         #print( f"{format} -> {filtered_df[(format,'GPU','speed-up','non-binary')]}" )
+         ascend_df = filtered_df.copy()
+         filtered_df.sort_values(by=[(format,'GPU','speed-up','non-binary')],inplace=True,ascending=False)
+         ascend_df.sort_values(by=[(format,'GPU','speed-up','non-binary')],inplace=True,ascending=True)
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         size = len(filtered_df[(format,'GPU','speed-up','non-binary')].index)
+         t = np.arange( size )
+         bar = np.full( size, 1 )
+         axs.plot( t, filtered_df[(format,'GPU','speed-up','non-binary')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), latexFormatName(non_binary_format) ], loc='upper right' )
+         axs.set_ylabel( 'Speedup' )
+         axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
          plt.rcParams.update({
             "text.usetex": True,
             "font.family": "sans-serif",
@@ -310,24 +524,85 @@ def cusparse_speedup_comparison( df, formats, head_size=10 ):
          #   "font.family": "serif",
          #   "font.serif": ["Palatino"],
          #})
-         plt.savefig( f"Cusparse-speed-up/{format}.pdf")
+         plt.savefig( f"Binary-speed-up/{format}.pdf")
          plt.close(fig)
 
          fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
          axs.set_yscale( 'log' )
-         axs.plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
+         axs.plot( t, filtered_df[(format,'GPU','speed-up','non-binary')], '-o', ms=1, lw=1 )
          axs.plot( t, bar, '-', ms=1, lw=1 )
-         axs.legend( [ latexFormatName(format), 'Cusparse' ], loc='lower left' )
+         axs.legend( [ latexFormatName(format), latexFormatName(non_binary_format) ], loc='upper right' )
          axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
          axs.set_ylabel( 'Speedup' )
-         plt.savefig( f"Cusparse-speed-up/{format}-log.pdf")
+         plt.savefig( f"Binary-speed-up/{format}-log.pdf")
          plt.close(fig)
-         head_df = filtered_df.head( head_size )
+         #head_df = filtered_df.head( head_size )
+         #bottom_df = ascend_df.head( head_size )
+         copy_df = df.copy()
          for f in formats:
-            if not f in ['cusparse','CSR',format]:
+            if not f in ['cusparse','CSR',format,non_binary_format]:
+               #print( f"Droping {f}..." )
+               #head_df.drop( labels=f, axis='columns', level=0, inplace=True )
+               copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         #head_df.to_html( f"Binary-speed-up/{format}-head.html" )
+         copy_df.to_html( f"Binary-speed-up/{format}.html" )
+
+####
+# Comparison of symmetric matrices
+def symmetric_matrices_comparison( df, formats, head_size = 10 ):
+   if not os.path.exists("Symmetric-speed-up"):
+      os.mkdir("Symmetric-speed-up")
+   for format in formats:
+      if 'Symmetric' in format:
+         non_symmetric_format = format.replace('Symmetric ','')
+         print( f"Writing comparison of speed-up of {format} vs {non_symmetric_format}" )
+         #df['tmp'] = df[(format, 'GPU','speed-up','non-symmetric')]
+         filtered_df=df.dropna(subset=[(format, 'GPU','speed-up','non-symmetric')]) #('tmp','','','')])
+         ascend_df = filtered_df.copy()
+         #print( f"{format} -> {filtered_df[(format,'GPU','speed-up','non-symmetric')]}" )
+         filtered_df.sort_values(by=[(format,'GPU','speed-up','non-symmetric')],inplace=True,ascending=False)
+         ascend_df.sort_values(by=[(format,'GPU','speed-up','non-symmetric')],inplace=True,ascending=True)
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         size = len(filtered_df[(format,'GPU','speed-up','non-symmetric')].index)
+         t = np.arange( size )
+         bar = np.full( size, 1 )
+         axs.plot( t, filtered_df[(format,'GPU','speed-up','non-symmetric')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), latexFormatName(non_symmetric_format) ], loc='upper right' )
+         axs.set_ylabel( 'Speedup' )
+         axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         plt.rcParams.update({
+            "text.usetex": True,
+            "font.family": "sans-serif",
+            "font.sans-serif": ["Helvetica"]})
+         # for Palatino and other serif fonts use:
+         #plt.rcParams.update({
+         #   "text.usetex": True,
+         #   "font.family": "serif",
+         #   "font.serif": ["Palatino"],
+         #})
+         plt.savefig( f"Symmetric-speed-up/{format}.pdf")
+         plt.close(fig)
+
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         axs.set_yscale( 'log' )
+         axs.plot( t, filtered_df[(format,'GPU','speed-up','non-symmetric')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), latexFormatName(non_symmetric_format) ], loc='lower left' )
+         axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         axs.set_ylabel( 'Speedup' )
+         plt.savefig( f"Symmetric-speed-up/{format}-log.pdf")
+         plt.close(fig)
+         #head_df = filtered_df.head( head_size )
+         #bottom_df = ascend_df.head( head_size )
+         copy_df = df.copy()
+         for f in formats:
+            if not f in ['cusparse','CSR',format,non_symmetric_format]:
                #print( f"Droping {f}..." )
-               head_df.drop( labels=f, axis='columns', level=0, inplace=True )
-         head_df.to_html( f"Cusparse-speed-up/{format}-head.html" )
+               #head_df.drop( labels=f, axis='columns', level=0, inplace=True )
+               copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         #head_df.to_html( f"Symmetric-speed-up/{format}-head.html" )
+         copy_df.to_html( f"Symmetric-speed-up/{format}.html" )
 
 ####
 # Comparison of speed-up w.r.t. LightSpMV
@@ -336,7 +611,9 @@ def csr_light_speedup_comparison( df, head_size=10 ):
    print( f"Writing comparison of speed-up of CSR Light compared to LightSPMV" )
    df['tmp'] = df[(format, 'GPU','bandwidth')]
    filtered_df=df.dropna(subset=[('tmp','','','')])
+   ascend_df = filtered_df.copy()
    filtered_df.sort_values(by=[(format,'GPU','speed-up','LightSpMV Vector')],inplace=True,ascending=False)
+   ascend_df.sort_values(by=[(format,'GPU','speed-up','LightSpMV Vector')],inplace=True,ascending=True)
    fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
    size = len(filtered_df[(format,'GPU','speed-up','LightSpMV Vector')].index)
    t = np.arange( size )
@@ -368,12 +645,16 @@ def csr_light_speedup_comparison( df, head_size=10 ):
    axs.set_ylabel( 'Speedup' )
    plt.savefig( f"LightSpMV-speed-up-log.pdf")
    plt.close(fig)
-   head_df = filtered_df.head( head_size )
+   #head_df = filtered_df.head( head_size )
+   #bottom_df = ascend_df.head( head_size )
+   copy_df = df.copy()
    for f in formats:
       if not f in ['cusparse','CSR',format]:
          #print( f"Droping {f}..." )
-         head_df.drop( labels=f, axis='columns', level=0, inplace=True )
-   head_df.to_html( f"LightSpMV-speed-up-head.html" )
+         #head_df.drop( labels=f, axis='columns', level=0, inplace=True )
+         copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+   #head_df.to_html( f"LightSpMV-speed-up-head.html" )
+   copy_df.to_html( f"LightSpMV-speed-up-bottom.html" )
 
 
 ####
@@ -388,7 +669,7 @@ formats = get_formats( input_df )
 multicolumns, df_data = get_multiindex( input_df, formats )
 
 print( "Converting data..." )
-result = convert_data_frame( input_df, multicolumns, df_data, 20 )
+result = convert_data_frame( input_df, multicolumns, df_data, 0, 10000 )
 compute_speedup( result, formats )
 
 result.replace( to_replace=' ',value=np.nan,inplace=True)
@@ -400,10 +681,13 @@ def processDf( df, formats, head_size = 10 ):
    df.to_html( f'output.html' )
 
    # Generate tables and figures
+   effective_bw_profile( df, formats, head_size )
    cusparse_comparison( df, formats, head_size )
    csr_comparison( df, formats, head_size )
    legacy_formats_comparison( df, formats, head_size )
    cusparse_speedup_comparison( df, formats, head_size )
+   binary_matrices_comparison( df, formats, head_size )
+   symmetric_matrices_comparison( df, formats, head_size )
    csr_light_speedup_comparison( df, head_size )
 
    best = df[('TNL Best','GPU','format')].tolist()
@@ -416,7 +700,7 @@ def processDf( df, formats, head_size = 10 ):
          cases = best.count(format)
          print( f'{format} is best in {cases} cases.')
 
-head_size = 10
+head_size = 25
 if not os.path.exists( 'general' ):
    os.mkdir( 'general' )
 os.chdir( 'general' )
-- 
GitLab


From b00aed9a7728df77a9f3e73448248015ea4d17b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 9 Aug 2021 21:51:10 +0200
Subject: [PATCH 097/117] Added SpMV benchmark with PETSC library.

---
 CMakeLists.txt                           | 55 +++++++++++++-----------
 src/Benchmarks/SpMV/CMakeLists.txt       |  7 +--
 src/Benchmarks/SpMV/spmv.h               | 36 ++++++++++++++++
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h |  8 ++++
 4 files changed, 79 insertions(+), 27 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ecc556346..18c358ee6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -309,30 +309,34 @@ if( ${WITH_GMP} )
    endif()
 endif()
 
-#if( BUILD_MPI )
-#   FIND_PATH( PETSC_INCLUDE_DIR petsc.h
-#     /usr/include/petsc
-#     ${PETSC_DIR}/${PETSC_ARCH}/include
-#     ${PETSC_DIR}/include
-#     DOC "PETSC headers."
-#   )
-#   if( ${PETSC_INCLUDE_DIR} STREQUAL "PETSC_INCLUDE_DIR-NOTFOUND" )
-#      message( "PETSC not found." )
-#   else()
-#      message( "PETSC headers found -- ${PETSC_INCLUDE_DIR}" )
-#      FIND_LIBRARY(PETSC_LIBRARY petsc
-#                  ${PETSC_INCLUDE_DIR}/../lib
-#                  /usr/local/lib
-#                  /usr/lib)
-#      if( PETSC_LIBRARY )
-#         #string( REPLACE ";" " " MPI_LIBRARIES "${MPI_CXX_LIBRARIES}" )
-#         #set( PETSC_LIBRARY "${MPI_LIBRARIES} ${PETSC_LIBRARY}")
-#         message( "PETSC library found -- ${PETSC_LIBRARY}")
-#         list( GET MPI_CXX_INCLUDE_PATH 0 MPI_CXX_PATH )
-#         set(PETSC_CXX_FLAGS "-DHAVE_PETSC -I${PETSC_INCLUDE_DIR} -DHAVE_MPI -I${MPI_CXX_PATH}")
-#      endif()
-#   endif()
-#endif()
+####
+# Test for PETSc
+if( BUILD_MPI )
+   FIND_PATH( PETSC_INCLUDE_DIR petsc.h
+      /usr/include/petsc
+      ${PETSC_DIR}/${PETSC_ARCH}/include
+      ${PETSC_DIR}/include
+      DOC "PETSC headers."
+   )
+   if( ${PETSC_INCLUDE_DIR} STREQUAL "PETSC_INCLUDE_DIR-NOTFOUND" )
+      message( "PETSC not found." )
+   else()
+      message( "PETSC headers found -- ${PETSC_INCLUDE_DIR}" )
+      FIND_LIBRARY(PETSC_LIBRARY petsc
+                  ${PETSC_INCLUDE_DIR}/../lib
+                  /usr/local/lib
+                  /usr/lib)
+      if( PETSC_LIBRARY )
+         #string( REPLACE ";" " " MPI_LIBRARIES "${MPI_CXX_LIBRARIES}" )
+         #set( PETSC_LIBRARY "${MPI_LIBRARIES} ${PETSC_LIBRARY}")
+         message( "PETSC library found -- ${PETSC_LIBRARY}")
+         #list( GET MPI_CXX_INCLUDE_PATH 0 MPI_CXX_PATH )
+         #set(PETSC_CXX_FLAGS "-DHAVE_PETSC -I${PETSC_INCLUDE_DIR} -DHAVE_MPI -I${MPI_CXX_PATH}")
+         set(PETSC_CXX_FLAGS -DHAVE_PETSC -I${PETSC_INCLUDE_DIR})
+         set(PETSC_LINKER_FLAGS ${PETSC_LIBRARY})
+      endif()
+   endif()
+endif()
 
 # configure build paths
 set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin )
@@ -413,6 +417,9 @@ message( "   CMAKE_SHARED_LINKER_FLAGS_RELEASE = ${CMAKE_SHARED_LINKER_FLAGS_REL
 message( "   CUDA_NVCC_FLAGS = ${CUDA_NVCC_FLAGS}" )
 message( "   CUDA_SAMPLES_FLAGS = ${CUDA_SAMPLES_FLAGS}" )
 message( "   GMP_LIBRARIES = ${GMP_LIBRARIES}" )
+message( "   PETSC_CXX_FLAGS = ${PETSC_CXX_FLAGS}" )
+message( "   PETSC_LINKER_FLAGS = ${PETSC_LINKER_FLAGS}" )
+
 if( MPI_CXX_FOUND AND ${WITH_MPI} )
    message( "   MPI_CXX_COMPILE_OPTIONS = ${MPI_CXX_COMPILE_OPTIONS}" )
    message( "   MPI_CXX_COMPILE_DEFINITIONS = ${MPI_CXX_COMPILE_DEFINITIONS}" )
diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt
index 9976c19c8..93dccab0d 100644
--- a/src/Benchmarks/SpMV/CMakeLists.txt
+++ b/src/Benchmarks/SpMV/CMakeLists.txt
@@ -11,12 +11,13 @@
 if( BUILD_CUDA )
     cuda_include_directories( ${CXX_BENCHMARKS_INCLUDE_DIRS} )
     message( STATUS ${CXX_BENCHMARKS_FLAGS} )
-    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu OPTIONS ${CXX_BENCHMARKS_FLAGS} )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu OPTIONS ${CXX_BENCHMARKS_FLAGS} ${PETSC_CXX_FLAGS} )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} ${PETSC_LINKER_FLAGS})
 else()
     ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp )
-    target_compile_options( tnl-benchmark-spmv  PRIVATE ${CXX_BENCHMARKS_FLAGS} )
+    target_compile_options( tnl-benchmark-spmv  PRIVATE ${CXX_BENCHMARKS_FLAGS} ${PETSC_CXX_FLAGS} )
     target_include_directories( tnl-benchmark-spmv PRIVATE ${CXX_BENCHMARKS_INCLUDE_DIRS} )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${PETSC_LINKER_FLAGS} )
 endif()
 
 install( TARGETS tnl-benchmark-spmv RUNTIME DESTINATION bin )
diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 03f3e6258..2f5dacd4f 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -39,6 +39,10 @@
 #include <TNL/Algorithms/Segments/ChunkedEllpack.h>
 #include <TNL/Algorithms/Segments/BiEllpack.h>
 
+#ifdef HAVE_PETSC
+#include <petscmat.h>
+#endif
+
 // Comment the following to turn off some groups of SpMV benchmarks and speed-up the compilation
 #define WITH_TNL_BENCHMARK_SPMV_GENERAL_MATRICES
 #define WITH_TNL_BENCHMARK_SPMV_SYMMETRIC_MATRICES
@@ -492,6 +496,38 @@ benchmarkSpmv( BenchmarkType& benchmark,
    benchmark.writeHeader();
    benchmark.time< Devices::Host >( resetHostVectors, "", spmvCSRHost, csrBenchmarkResults );
 
+#ifdef HAVE_PETSC
+   Mat petscMatrix;
+   Containers::Vector< PetscInt, Devices::Host, PetscInt > petscRowPointers( csrHostMatrix.getRowPointers() );
+   Containers::Vector< PetscInt, Devices::Host, PetscInt > petscColumns( csrHostMatrix.getColumnIndexes() );
+   Containers::Vector< PetscScalar, Devices::Host, PetscInt > petscValues( csrHostMatrix.getValues() );
+   MatCreateSeqAIJWithArrays( PETSC_COMM_WORLD, //PETSC_COMM_SELF,
+                              csrHostMatrix.getRows(),
+                              csrHostMatrix.getColumns(),
+                              petscRowPointers.getData(),
+                              petscColumns.getData(),
+                              petscValues.getData(),
+                              &petscMatrix );
+   Vec inVector, outVector;
+   VecCreateSeq( PETSC_COMM_WORLD, csrHostMatrix.getColumns(), &inVector );
+   VecCreateSeq( PETSC_COMM_WORLD, csrHostMatrix.getRows(), &outVector );
+
+   auto resetPetscVectors = [&]() {
+      VecSet( inVector, 1.0 );
+      VecSet( outVector, 0.0 );
+   };
+
+   auto petscSpmvCSRHost = [&]() {
+      MatMult( petscMatrix, inVector, outVector );
+   };
+
+   SpmvBenchmarkResult< Real, Devices::Host, int > petscBenchmarkResults( String( "Petsc" ), hostOutVector, hostOutVector, csrHostMatrix.getNonzeroElementsCount() );
+   //benchmark.addLogsMetadata( petscBenchmarkResults.getTableHeader() );
+   //benchmark.writeHeader();
+   benchmark.time< Devices::Host >( resetPetscVectors, "", petscSpmvCSRHost, petscBenchmarkResults );
+#endif
+
+
 #ifdef HAVE_CUDA
    ////
    // Perform benchmark on CUDA device with cuSparse as a reference GPU format
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 0954558fd..54f1cc7a9 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -21,6 +21,11 @@
 #include "spmv.h"
 
 #include <TNL/Matrices/MatrixReader.h>
+
+#ifdef HAVE_PETSC
+#include <petscmat.h>
+#endif
+
 using namespace TNL::Matrices;
 
 #include <exception>
@@ -94,6 +99,9 @@ setupConfig( Config::ConfigDescription & config )
 int
 main( int argc, char* argv[] )
 {
+#ifdef HAVE_PETSC
+   PetscInitialize( &argc, &argv, nullptr, nullptr );
+#endif
    Config::ParameterContainer parameters;
    Config::ConfigDescription conf_desc;
 
-- 
GitLab


From 546a69d0614f60bdcf78b85dcf603569a797b6da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 9 Aug 2021 21:51:51 +0200
Subject: [PATCH 098/117] Optimizing CPU kernel for CSR format.

---
 .../Segments/Kernels/CSRScalarKernel.hpp      | 111 +++++++++++++++++-
 1 file changed, 108 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp
index d98f88661..5b9c5e723 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp
@@ -21,6 +21,108 @@ namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduce,
+          typename Keep,
+          bool DispatchScalarCSR = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+struct CSRScalarKernelreduceSegmentsDispatcher;
+
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper >
+struct CSRScalarKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true >
+{
+
+   template< typename Offsets,
+             typename Real >
+   static void reduce( const Offsets& offsets,
+                       Index first,
+                       Index last,
+                       Fetch& fetch,
+                       const Reduction& reduction,
+                       ResultKeeper& keep,
+                       const Real& zero )
+   {
+      auto l = [=] __cuda_callable__ ( const Index segmentIdx ) mutable {
+         const Index begin = offsets[ segmentIdx ];
+         const Index end = offsets[ segmentIdx + 1 ];
+         Real aux( zero );
+         Index localIdx( 0 );
+         bool compute( true );
+         for( Index globalIdx = begin; globalIdx < end && compute; globalIdx++  )
+             aux = reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute ) );
+         keep( segmentIdx, aux );
+      };
+
+      if( std::is_same< Device, TNL::Devices::Sequential >::value )
+      {
+         for( Index segmentIdx = first; segmentIdx < last; segmentIdx ++ )
+            l( segmentIdx );
+      }
+      else if( std::is_same< Device, TNL::Devices::Host >::value )
+      {
+#ifdef HAVE_OPENMP
+        #pragma omp parallel for firstprivate( l ) schedule( dynamic, 100 ), if( Devices::Host::isOMPEnabled() )
+#endif
+         for( Index segmentIdx = first; segmentIdx < last; segmentIdx ++ )
+            l( segmentIdx );
+      }
+      else
+         Algorithms::ParallelFor< Device >::exec( first, last, l );
+   }
+};
+
+template< typename Index,
+          typename Device,
+          typename Fetch,
+          typename Reduce,
+          typename Keep >
+struct CSRScalarKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep, false >
+{
+   template< typename OffsetsView,
+             typename Real >
+   static void reduce( const OffsetsView& offsets,
+                       Index first,
+                       Index last,
+                       Fetch& fetch,
+                       const Reduce& reduction,
+                       Keep& keep,
+                       const Real& zero )
+   {
+      auto l = [=] __cuda_callable__ ( const Index segmentIdx ) mutable {
+         const Index begin = offsets[ segmentIdx ];
+         const Index end = offsets[ segmentIdx + 1 ];
+         Real aux( zero );
+         bool compute( true );
+         for( Index globalIdx = begin; globalIdx < end && compute; globalIdx++  )
+             aux = reduction( aux, fetch( globalIdx, compute ) );
+         keep( segmentIdx, aux );
+      };
+
+      if( std::is_same< Device, TNL::Devices::Sequential >::value )
+      {
+         for( Index segmentIdx = first; segmentIdx < last; segmentIdx ++ )
+            l( segmentIdx );
+      }
+      else if( std::is_same< Device, TNL::Devices::Host >::value )
+      {
+#ifdef HAVE_OPENMP
+        #pragma omp parallel for firstprivate( l ) schedule( dynamic, 100 ), if( Devices::Host::isOMPEnabled() )
+#endif
+         for( Index segmentIdx = first; segmentIdx < last; segmentIdx ++ )
+            l( segmentIdx );
+      }
+      else
+         Algorithms::ParallelFor< Device >::exec( first, last, l );
+
+   }
+};
+
+
 template< typename Index,
           typename Device >
     template< typename Offsets >
@@ -84,6 +186,9 @@ reduceSegments( const OffsetsView& offsets,
                    const Real& zero,
                    Args... args )
 {
+   CSRScalarKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper >::reduce(
+      offsets, first, last, fetch, reduction, keeper, zero );
+   /*
     auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
         const IndexType begin = offsets[ segmentIdx ];
         const IndexType end = offsets[ segmentIdx + 1 ];
@@ -102,7 +207,7 @@ reduceSegments( const OffsetsView& offsets,
 #endif
         for( Index segmentIdx = first; segmentIdx < last; segmentIdx ++ )
             l( segmentIdx, args... );
-        /*{
+        {
             const IndexType begin = offsets[ segmentIdx ];
             const IndexType end = offsets[ segmentIdx + 1 ];
             Real aux( zero );
@@ -111,10 +216,10 @@ reduceSegments( const OffsetsView& offsets,
             for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
                 aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
             keeper( segmentIdx, aux );
-        }*/
+        }
     }
     else
-        Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+        Algorithms::ParallelFor< Device >::exec( first, last, l, args... );*/
 }
       } // namespace Segments
    }  // namespace Algorithms
-- 
GitLab


From 1f3e97274fdbe56f5bdb3a263e536f2652fddfa3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 11 Aug 2021 18:15:38 +0200
Subject: [PATCH 099/117] Added CSR speedup on CPU into script for processing
 of SpMV benchmark results.

---
 .../tnl-spmv-benchmark-make-tables-json.py    | 150 ++++++++++++------
 1 file changed, 104 insertions(+), 46 deletions(-)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
index b9f2ddeda..eb38fbfab 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
@@ -155,25 +155,31 @@ def convert_data_frame( input_df, multicolumns, df_data, begin_idx = 0, end_idx
 ####
 # Compute speed-up of particular formats compared to Cusparse on GPU and CSR on CPU
 def compute_cusparse_speedup( df, formats ):
-   for format in formats:
-      if not format in [ 'cusparse', 'CSR' ]:
-         print( 'Adding speed-up for ', format )
-         format_bdw_list = df[(format,'GPU','bandwidth')]
-         cusparse_bdw_list = df[('cusparse','GPU','bandwidth')]
-         csr_bdw_list = df[('CSR','CPU','bandwidth')]
-         cusparse_speedup_list = []
-         csr_speedup_list = []
-         for ( format_bdw, cusparse_bdw, csr_bdw ) in zip( format_bdw_list, cusparse_bdw_list,csr_bdw_list ):
-            try:
-               cusparse_speedup_list.append( format_bdw / cusparse_bdw )
-            except:
-               cusparse_speedup_list.append(float('nan'))
+   for device in [ 'CPU', 'GPU' ]:
+      for format in formats:
+         if not format in [ 'cusparse', 'CSR' ]:
+            print( 'Adding speed-up for ', format )
             try:
-               csr_speedup_list.append( format_bdw / csr_bdw )
+               format_bdw_list = df[(format,device,'bandwidth')]
             except:
-               csr_speedup_list.append(float('nan'))
-         df[(format,'GPU','speed-up','cusparse')] = cusparse_speedup_list
-         df[(format,'GPU','speed-up','CSR CPU')] = csr_speedup_list
+               continue
+            cusparse_bdw_list = df[('cusparse','GPU','bandwidth')]
+            csr_bdw_list = df[('CSR','CPU','bandwidth')]
+            cusparse_speedup_list = []
+            csr_speedup_list = []
+            for( format_bdw, cusparse_bdw, csr_bdw ) in zip( format_bdw_list, cusparse_bdw_list,csr_bdw_list ):
+               if( device == 'GPU' ):
+                  try:
+                     cusparse_speedup_list.append( format_bdw / cusparse_bdw )
+                  except:
+                     cusparse_speedup_list.append(float('nan'))
+               try:
+                  csr_speedup_list.append( format_bdw / csr_bdw )
+               except:
+                  csr_speedup_list.append(float('nan'))
+            if( device == 'GPU' ):
+               df[(format,'GPU','speed-up','cusparse')] = cusparse_speedup_list
+            df[(format,device,'speed-up','CSR CPU')] = csr_speedup_list
 
 ####
 # Compute speedup of Light CSR
@@ -359,35 +365,37 @@ def cusparse_comparison( df, formats, head_size=10 ):
 def csr_comparison( df, formats, head_size=10 ):
    if not os.path.exists("CSR-bw"):
       os.mkdir("CSR-bw")
-   for format in formats:
-      if not format in ['cusparse','CSR']:
-         print( f"Writing comparison of {format} and CSR on CPU" )
-         ascend_df = df.copy()
-         df.sort_values(by=[(format,'GPU','bandwidth')],inplace=True,ascending=False)
-         ascend_df.sort_values(by=[(format,'GPU','bandwidth')],inplace=True,ascending=True)
-         fig, axs = plt.subplots( 2, 1 )
-         t = np.arange(df[(format,'GPU','bandwidth')].size )
-         axs[0].plot( t, df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[0].plot( t, df[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[0].legend( [ latexFormatName(format), 'CSR on CPU' ], loc='upper right' )
-         axs[0].set_ylabel( 'Bandwidth in GB/sec' )
-         axs[1].set_yscale( 'log' )
-         axs[1].plot( t, result[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[1].plot( t, result[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[1].legend( [ latexFormatName(format), 'CSR on CPU' ], loc='lower left' )
-         axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} performance" )
-         axs[1].set_ylabel( 'Bandwidth in GB/sec' )
-         plt.rcParams.update({
-            "text.usetex": True,
-            "font.family": "sans-serif",
-            "font.sans-serif": ["Helvetica"]})
-         plt.savefig( f"CSR-bw/{format}.pdf")
-         plt.close(fig)
-         copy_df = df.copy()
-         for f in formats:
-            if not f in ['cusparse','CSR',format]:
-               copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
-         copy_df.to_html( f"CSR-bw/{format}.html" )
+   for device in [ 'CPU', 'GPU' ]:
+      for format in formats:
+         if not format in ['cusparse','CSR']:
+            print( f"Writing comparison of {format} and CSR on CPU" )
+            try:
+               df.sort_values(by=[(format,device,'bandwidth')],inplace=True,ascending=False)
+            except:
+               continue
+            fig, axs = plt.subplots( 2, 1 )
+            t = np.arange(df[(format,device,'bandwidth')].size )
+            axs[0].plot( t, df[(format,device,'bandwidth')], '-o', ms=1, lw=1 )
+            axs[0].plot( t, df[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
+            axs[0].legend( [ latexFormatName(format), 'CSR on CPU' ], loc='upper right' )
+            axs[0].set_ylabel( 'Bandwidth in GB/sec' )
+            axs[1].set_yscale( 'log' )
+            axs[1].plot( t, result[(format,device,'bandwidth')], '-o', ms=1, lw=1 )
+            axs[1].plot( t, result[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
+            axs[1].legend( [ latexFormatName(format), 'CSR on CPU' ], loc='lower left' )
+            axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} performance" )
+            axs[1].set_ylabel( 'Bandwidth in GB/sec' )
+            plt.rcParams.update({
+               "text.usetex": True,
+               "font.family": "sans-serif",
+               "font.sans-serif": ["Helvetica"]})
+            plt.savefig( f"CSR-bw/{format}-{device}.pdf")
+            plt.close(fig)
+            copy_df = df.copy()
+            for f in formats:
+               if not f in ['cusparse','CSR',format]:
+                  copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+            copy_df.to_html( f"CSR-bw/{format}-{device}.html" )
 
 ####
 # Comparison of Legacy formats
@@ -430,6 +438,55 @@ def legacy_formats_comparison( df, formats, head_size=10 ):
                copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
          copy_df.to_html( f"Legacy-bw/{format}.html" )
 
+####
+# Comparison of speed-up w.r.t. CSR
+def csr_speedup_comparison( df, formats, head_size=10 ):
+   if not os.path.exists("CSR-speed-up"):
+      os.mkdir("CSR-speed-up")
+   for device in ['CPU', 'GPU']:
+      profiles = {}
+      for format in formats:
+         if not format in ['cusparse','CSR']:
+            print( f"Writing comparison of speed-up of {format} compared to CSR" )
+            df['tmp'] = df[(format, device,'bandwidth')]
+            filtered_df=df.dropna(subset=[('tmp','','','')])
+            try:
+               filtered_df.sort_values(by=[(format,device,'speed-up','CSR CPU')],inplace=True,ascending=False)
+            except:
+               continue
+            profiles[format] = filtered_df[(format,device,'speed-up','CSR CPU')].copy()
+            fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+            size = len(filtered_df[(format,device,'speed-up','CSR CPU')].index)
+            t = np.arange( size )
+            bar = np.full( size, 1 )
+            axs.plot( t, filtered_df[(format,device,'speed-up','CSR CPU')], '-o', ms=1, lw=1 )
+            axs.plot( t, bar, '-', ms=1, lw=1 )
+            axs.legend( [ latexFormatName(format), 'CSR CPU' ], loc='upper right' )
+            axs.set_ylabel( 'Speedup' )
+            axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+            plt.rcParams.update({
+               "text.usetex": True,
+               "font.family": "sans-serif",
+               "font.sans-serif": ["Helvetica"]})
+            plt.savefig( f"CSR-speed-up/{format}.pdf")
+            plt.close(fig)
+
+            fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+            axs.set_yscale( 'log' )
+            axs.plot( t, filtered_df[(format,device,'speed-up','CSR CPU')], '-o', ms=1, lw=1 )
+            axs.plot( t, bar, '-', ms=1, lw=1 )
+            axs.legend( [ latexFormatName(format), 'CSR' ], loc='lower left' )
+            axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+            axs.set_ylabel( 'Speedup' )
+            plt.savefig( f"CSR-speed-up/{format}-{device}-log.pdf")
+            plt.close(fig)
+            copy_df = df.copy()
+            for f in formats:
+               if not f in ['cusparse','CSR',format]:
+                  copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
+            copy_df.to_html( f"CSR-speed-up/{format}-{device}.html" )
+
+
 ####
 # Comparison of speed-up w.r.t. Cusparse
 def cusparse_speedup_comparison( df, formats, head_size=10 ):
@@ -685,6 +742,7 @@ def processDf( df, formats, head_size = 10 ):
    cusparse_comparison( df, formats, head_size )
    csr_comparison( df, formats, head_size )
    legacy_formats_comparison( df, formats, head_size )
+   csr_speedup_comparison( df, formats, head_size )
    cusparse_speedup_comparison( df, formats, head_size )
    binary_matrices_comparison( df, formats, head_size )
    symmetric_matrices_comparison( df, formats, head_size )
-- 
GitLab


From f220ea79e1000de005a1c36ae87d2e8c83381aa7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 11 Aug 2021 18:16:18 +0200
Subject: [PATCH 100/117] Turning off OpenMP in run script of SpMV benchmark.

---
 src/Benchmarks/scripts/run-tnl-benchmark-spmv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
index fe511db11..84d2536ef 100755
--- a/src/Benchmarks/scripts/run-tnl-benchmark-spmv
+++ b/src/Benchmarks/scripts/run-tnl-benchmark-spmv
@@ -7,7 +7,7 @@ export CUDA_PROFILE=0
 PWD=`pwd`
 IWD="$PWD"
 BASE="ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/"
-BENCHMARK="tnl-benchmark-spmv --with-legacy-matrices yes --precision double"
+BENCHMARK="tnl-benchmark-spmv --with-legacy-matrices yes --precision double --openmp-enabled no"
 BENCHMARK_DBG="tnl-benchmark-spmv-dbg --with-legacy-matrices no"
 
 export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf"
-- 
GitLab


From ab6d5310a66e0ca34e26ec6a234b25e783a7e5e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 11 Aug 2021 18:53:34 +0200
Subject: [PATCH 101/117] Added parameter --with-all-cpu-tests to
 tnl-benchmark-spmv.

---
 src/Benchmarks/SpMV/spmv.h               | 157 ++++++++++++-----------
 src/Benchmarks/SpMV/tnl-benchmark-spmv.h |   1 +
 2 files changed, 86 insertions(+), 72 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 2f5dacd4f..d583fad26 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -203,6 +203,7 @@ void
 benchmarkSpMVLegacy( BenchmarkType& benchmark,
                      const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
                      const String& inputFileName,
+                     bool allCpuTests,
                      bool verboseMR )
 {
    using HostMatrix = Matrix< Real, TNL::Devices::Host, int >;
@@ -230,19 +231,22 @@ benchmarkSpMVLegacy( BenchmarkType& benchmark,
    /////
    // Benchmark SpMV on host
    //
-   HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );
+   if( allCpuTests )
+   {
+      HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );
 
-   auto resetHostVectors = [&]() {
-      hostInVector = 1.0;
-      hostOutVector = 0.0;
-   };
+      auto resetHostVectors = [&]() {
+         hostInVector = 1.0;
+         hostOutVector = 0.0;
+      };
 
-   auto spmvHost = [&]() {
-      hostMatrix.vectorProduct( hostInVector, hostOutVector );
+      auto spmvHost = [&]() {
+         hostMatrix.vectorProduct( hostInVector, hostOutVector );
 
-   };
-   SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
-   benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
+      };
+      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+      benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
+   }
 
    /////
    // Benchmark SpMV on CUDA
@@ -282,6 +286,7 @@ benchmarkSpMV( BenchmarkType& benchmark,
                const InputMatrix& inputMatrix,
                const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
                const String& inputFileName,
+               bool allCpuTests,
                bool verboseMR )
 {
    using HostMatrix = Matrix< Real, TNL::Devices::Host, int >;
@@ -307,19 +312,22 @@ benchmarkSpMV( BenchmarkType& benchmark,
    /////
    // Benchmark SpMV on host
    //
-   HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );
+   if( allCpuTests )
+   {
+      HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );
 
-   auto resetHostVectors = [&]() {
-      hostInVector = 1.0;
-      hostOutVector = 0.0;
-   };
+      auto resetHostVectors = [&]() {
+         hostInVector = 1.0;
+         hostOutVector = 0.0;
+      };
 
-   auto spmvHost = [&]() {
-      hostMatrix.vectorProduct( hostInVector, hostOutVector );
+      auto spmvHost = [&]() {
+         hostMatrix.vectorProduct( hostInVector, hostOutVector );
 
-   };
-   SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
-   benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
+      };
+      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+      benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
+   }
 
    /////
    // Benchmark SpMV on CUDA
@@ -360,6 +368,7 @@ benchmarkBinarySpMV( BenchmarkType& benchmark,
                      const InputMatrix& inputMatrix,
                      const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
                      const String& inputFileName,
+                     bool allCpuTests,
                      bool verboseMR )
 {
    using HostMatrix = Matrix< bool, TNL::Devices::Host, int >;
@@ -385,19 +394,22 @@ benchmarkBinarySpMV( BenchmarkType& benchmark,
    /////
    // Benchmark SpMV on host
    //
-   HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );
+   if( allCpuTests )
+   {
+      HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );
 
-   auto resetHostVectors = [&]() {
-      hostInVector = 1.0;
-      hostOutVector = 0.0;
-   };
+      auto resetHostVectors = [&]() {
+         hostInVector = 1.0;
+         hostOutVector = 0.0;
+      };
 
-   auto spmvHost = [&]() {
-      hostMatrix.vectorProduct( hostInVector, hostOutVector );
+      auto spmvHost = [&]() {
+         hostMatrix.vectorProduct( hostInVector, hostOutVector );
 
-   };
-   SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
-   benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
+      };
+      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+      benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
+   }
 
    /////
    // Benchmark SpMV on CUDA
@@ -596,6 +608,7 @@ benchmarkSpmv( BenchmarkType& benchmark,
 #endif
    csrHostMatrix.reset();
 
+   bool allCpuTests = parameters.getParameter< bool >( "with-all-cpu-tests" );
 #ifdef WITH_TNL_BENCHMARK_SPMV_LEGACY_FORMATS
    /////
    // Benchmarking of TNL legacy formats
@@ -603,21 +616,21 @@ benchmarkSpmv( BenchmarkType& benchmark,
    if( parameters.getParameter< bool >("with-legacy-matrices") )
    {
       using namespace Benchmarks::SpMV::ReferenceFormats;
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Scalar             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Vector             >( benchmark, hostOutVector, inputFileName, verboseMR );
+      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Scalar             >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Vector             >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
       //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light              >( benchmark, hostOutVector, inputFileName, verboseMR );
       //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light2             >( benchmark, hostOutVector, inputFileName, verboseMR );
       //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light3             >( benchmark, hostOutVector, inputFileName, verboseMR );
       //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light4             >( benchmark, hostOutVector, inputFileName, verboseMR );
       //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light5             >( benchmark, hostOutVector, inputFileName, verboseMR );
       //benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Light6             >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Adaptive           >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_MultiVector        >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, Legacy::Ellpack                           >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, SlicedEllpackAlias                        >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, Legacy::ChunkedEllpack                    >( benchmark, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMVLegacy< Real, Legacy::BiEllpack                         >( benchmark, hostOutVector, inputFileName, verboseMR );
+      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_Adaptive           >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_MultiVector        >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVLegacy< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVLegacy< Real, Legacy::Ellpack                           >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVLegacy< Real, SlicedEllpackAlias                        >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVLegacy< Real, Legacy::ChunkedEllpack                    >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVLegacy< Real, Legacy::BiEllpack                         >( benchmark, hostOutVector, inputFileName, allCpuTests, verboseMR );
    }
    // AdEllpack is broken
    //benchmarkSpMV< Real, Matrices::AdEllpack              >( benchmark, hostOutVector, inputFileName, verboseMR );
@@ -630,24 +643,24 @@ benchmarkSpmv( BenchmarkType& benchmark,
    using HostMatrixType = TNL::Matrices::SparseMatrix< Real, TNL::Devices::Host >;
    HostMatrixType hostMatrix;
    TNL::Matrices::MatrixReader< HostMatrixType >::readMtx( inputFileName, hostMatrix, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar                   >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector                   >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid                   >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Light                    >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive                 >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_Ellpack                      >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack               >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_BiEllpack                    >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Light                    >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive                 >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_Ellpack                      >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack               >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_BiEllpack                    >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
 #ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar             >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector             >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Light              >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive           >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_Ellpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack          >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack         >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_BiEllpack              >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar             >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector             >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Light              >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive           >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_Ellpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack          >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack         >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_BiEllpack              >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
 #endif
 #ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
    benchmarkSpMV< Real, HostMatrixType, SparseSandboxMatrix                       >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
@@ -680,23 +693,23 @@ benchmarkSpmv( BenchmarkType& benchmark,
       //{
       //   std::cerr << "ERROR: Symmetric matrices do not match !!!" << std::endl;
       //}
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive                 >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                      >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack               >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack                    >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive                 >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                      >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack               >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack                    >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
 #ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive           >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack          >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack         >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive           >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack          >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack         >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
 #endif
    }
 #endif
diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
index 54f1cc7a9..c5ff2bb3f 100644
--- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
+++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h
@@ -78,6 +78,7 @@ setupConfig( Config::ConfigDescription & config )
    config.addEntry< String >( "input-file", "Input file name.", "" );
    config.addEntry< bool >( "with-symmetric-matrices", "Perform benchmark even for symmetric matrix formats.", true );
    config.addEntry< bool >( "with-legacy-matrices", "Perform benchmark even for legacy TNL matrix formats.", true );
+   config.addEntry< bool >( "with-all-cpu-tests", "All matrix formats are tested on both CPU and GPU. ", false );
    config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv::" + getCurrDateTime() + ".log");
    config.addEntry< String >( "output-mode", "Mode for opening the log file - 'close' will only finalize the log file.", "append" );
    config.addEntryEnum( "append" );
-- 
GitLab


From 40044cd522c72bceda7fe5c551f40d37d01ab66a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 11 Aug 2021 20:13:25 +0200
Subject: [PATCH 102/117] Small possible optimization in Light CSR kernel.

---
 .../Segments/Kernels/CSRLightKernel.hpp       | 35 ++++++++++++-------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
index cc9693f34..59d8ae0e3 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -29,7 +29,7 @@ template< typename Real,
           typename Reduce,
           typename Keep >
 __global__
-void SpMVCSRLightWithoutAtomic2( OffsetsView offsets,
+void SpMVCSRLight2( OffsetsView offsets,
                                  const Index first,
                                  const Index last,
                                  Fetch fetch,
@@ -66,7 +66,7 @@ template< typename Real,
           typename Reduce,
           typename Keep >
 __global__
-void SpMVCSRLightWithoutAtomic4( OffsetsView offsets,
+void SpMVCSRLight4( OffsetsView offsets,
                                  const Index first,
                                  const Index last,
                                  Fetch fetch,
@@ -105,7 +105,7 @@ template< typename Real,
           typename Reduce,
           typename Keep >
 __global__
-void SpMVCSRLightWithoutAtomic8( OffsetsView offsets,
+void SpMVCSRLight8( OffsetsView offsets,
                                  const Index first,
                                  const Index last,
                                  Fetch fetch,
@@ -145,7 +145,7 @@ template< typename Real,
           typename Reduce,
           typename Keep >
 __global__
-void SpMVCSRLightWithoutAtomic16( OffsetsView offsets,
+void SpMVCSRLight16( OffsetsView offsets,
                                   const Index first,
                                   const Index last,
                                   Fetch fetch,
@@ -253,14 +253,25 @@ void SpMVCSRVector( OffsetsView offsets,
 
    // Reduction
    if( ThreadsPerSegment > 16 )
+   {
       result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) );
-   if( ThreadsPerSegment > 8 )
       result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  8 ) );
-   if( ThreadsPerSegment > 4 )
       result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
-   if( ThreadsPerSegment > 2 )
       result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
-   if( ThreadsPerSegment > 1 )
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   } else if( ThreadsPerSegment > 8 ) {
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  8 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   } else if( ThreadsPerSegment > 4 ) {
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  4 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   } else if( ThreadsPerSegment > 2 ) {
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  2 ) );
+      result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
+   } else if( ThreadsPerSegment > 1 )
       result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result,  1 ) );
 
    // Store result
@@ -461,16 +472,16 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee
 
 
          /*if (threadsPerSegment == 2)
-            SpMVCSRLightWithoutAtomic2<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
+            SpMVCSRLight2<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
                offsets, first, last, fetch, reduce, keep, zero, grid );
          else if (threadsPerSegment == 4)
-            SpMVCSRLightWithoutAtomic4<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
+            SpMVCSRLight4<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
                offsets, first, last, fetch, reduce, keep, zero, grid );
          else if (threadsPerSegment == 8)
-            SpMVCSRLightWithoutAtomic8<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
+            SpMVCSRLight8<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
                offsets, first, last, fetch, reduce, keep, zero, grid );
          else if (threadsPerSegment == 16)
-            SpMVCSRLightWithoutAtomic16<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
+            SpMVCSRLight16<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
                offsets, first, last, fetch, reduce, keep, zero, grid );
          else if (threadsPerSegment == 32)
          { // CSR SpMV Light with threadsPerSegment = 32 is CSR Vector
-- 
GitLab


From a1c6fffa3c2f4e09064c1d38540d1db8ea944919 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 11 Aug 2021 20:14:32 +0200
Subject: [PATCH 103/117] Added specialized benchmark of Light CSR.

---
 src/Benchmarks/SpMV/spmv.h | 87 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 1 deletion(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index d583fad26..767d446a6 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -359,6 +359,89 @@ benchmarkSpMV( BenchmarkType& benchmark,
  #endif
 }
 
+template< typename Real,
+          typename InputMatrix,
+          template< typename, typename, typename > class Matrix,
+          template< typename, typename, typename, typename > class Vector = Containers::Vector >
+void
+benchmarkSpMVCSRLight( BenchmarkType& benchmark,
+                       const InputMatrix& inputMatrix,
+                       const TNL::Containers::Vector< Real, Devices::Host, int >& csrResultVector,
+                       const String& inputFileName,
+                       bool allCpuTests,
+                       bool verboseMR )
+{
+   using HostMatrix = Matrix< Real, TNL::Devices::Host, int >;
+   using CudaMatrix = Matrix< Real, TNL::Devices::Cuda, int >;
+   using HostVector = Containers::Vector< Real, Devices::Host, int >;
+   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
+
+   HostMatrix hostMatrix;
+   try
+   {
+      hostMatrix = inputMatrix;
+   }
+   catch(const std::exception& e)
+   {
+      std::cerr << "Unable to convert the matrix to the target format:"  << e.what() << std::endl;
+      return;
+   }
+
+   const int elements = hostMatrix.getNonzeroElementsCount();
+   const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
+   benchmark.setOperation( datasetSize );
+
+   /////
+   // Benchmark SpMV on host
+   //
+   if( allCpuTests )
+   {
+      HostVector hostInVector( hostMatrix.getColumns() ), hostOutVector( hostMatrix.getRows() );
+
+      auto resetHostVectors = [&]() {
+         hostInVector = 1.0;
+         hostOutVector = 0.0;
+      };
+
+      auto spmvHost = [&]() {
+         hostMatrix.vectorProduct( hostInVector, hostOutVector );
+
+      };
+      SpmvBenchmarkResult< Real, Devices::Host, int > hostBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, hostOutVector, hostMatrix.getNonzeroElementsCount() );
+      benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost, hostBenchmarkResults );
+   }
+
+   /////
+   // Benchmark SpMV on CUDA
+   //
+#ifdef HAVE_CUDA
+   CudaMatrix cudaMatrix;
+   try
+   {
+      cudaMatrix = inputMatrix;
+   }
+   catch(const std::exception& e)
+   {
+      std::cerr << "Unable to copy the matrix on GPU:" << e.what() << std::endl;
+      return;
+   }
+
+   CudaVector cudaInVector( hostMatrix.getColumns() ), cudaOutVector( hostMatrix.getRows() );
+
+   auto resetCudaVectors = [&]() {
+      cudaInVector = 1.0;
+      cudaOutVector = 0.0;
+   };
+
+   auto spmvCuda = [&]() {
+      cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
+   };
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+   benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
+ #endif
+}
+
+
 template< typename Real,
           typename InputMatrix,
           template< typename, typename, typename > class Matrix,
@@ -646,7 +729,7 @@ benchmarkSpmv( BenchmarkType& benchmark,
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Light                    >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVCSRLight< Real, HostMatrixType, SparseMatrix_CSR_Light            >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive                 >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_Ellpack                      >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
@@ -696,6 +779,7 @@ benchmarkSpmv( BenchmarkType& benchmark,
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light            >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive                 >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                      >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
@@ -705,6 +789,7 @@ benchmarkSpmv( BenchmarkType& benchmark,
       benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive           >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack          >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-- 
GitLab


From 7bc3df04ea99d0ec09f2935900526abbabfd8364 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 12 Aug 2021 18:54:24 +0200
Subject: [PATCH 104/117] Threads per segment in Light CSR can be set by the
 user.

---
 src/Benchmarks/SpMV/spmv.h                    | 15 ++-
 src/TNL/Algorithms/Segments/CSR.h             |  4 +
 src/TNL/Algorithms/Segments/CSRView.h         |  4 +
 .../Segments/Kernels/CSRLightKernel.h         | 18 +++-
 .../Segments/Kernels/CSRLightKernel.hpp       | 94 ++++++++++++++++---
 src/TNL/Matrices/SparseMatrix.hpp             |  2 +-
 6 files changed, 117 insertions(+), 20 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 767d446a6..e187f5436 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -436,8 +436,14 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
    auto spmvCuda = [&]() {
       cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
-   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( MatrixInfo< HostMatrix >::getFormat(), csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
-   benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
+
+   for( auto threadsPerRow : std::vector< int >{ 1, 2, 4, 8, 16, 32 } )
+   {
+      cudaMatrix.getSegments().getKernel().setThreadsPerSegment( threadsPerRow );
+      String format = MatrixInfo< HostMatrix >::getFormat() + " " + convertToString( threadsPerRow );
+      SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+      benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
+   }
  #endif
 }
 
@@ -570,10 +576,13 @@ benchmarkSpmv( BenchmarkType& benchmark,
    ////
    // Perform benchmark on host with CSR as a reference CPU format
    //
+   auto nonzeros = csrHostMatrix.getNonzeroElementsCount();
    benchmark.addCommonLogs( BenchmarkType::CommonLogs( {
       { "matrix name", convertToString( inputFileName ) },
       { "rows", convertToString( csrHostMatrix.getRows() ) },
-      { "columns", convertToString( csrHostMatrix.getColumns() ) } } ) );
+      { "columns", convertToString( csrHostMatrix.getColumns() ) },
+      { "nonzeros", convertToString( nonzeros ) },
+      { "nonzeros per row", convertToString( ( double ) nonzeros / ( double ) csrHostMatrix.getRows() ) } } ) );
 
    HostVector hostInVector( csrHostMatrix.getRows() ), hostOutVector( csrHostMatrix.getRows() );
 
diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index aa3f16d6b..27bdfe3e2 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -507,6 +507,10 @@ class CSR
       template< typename Fetch >
       SegmentsPrinter< CSR, Fetch > print( Fetch&& fetch ) const;
 
+      KernelType& getKernel() { return kernel; }
+
+      const KernelType& getKernel() const { return kernel; }
+
    protected:
 
       OffsetsContainer offsets;
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 884ed71cf..b593dc467 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -143,6 +143,10 @@ class CSRView
       template< typename Fetch >
       SegmentsPrinter< CSRView, Fetch > print( Fetch&& fetch ) const;
 
+      KernelType& getKernel() { return kernel; }
+
+      const KernelType& getKernel() const { return kernel; }
+
    protected:
 
       OffsetsView offsets;
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
index a3aa961b4..49a662ccf 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
@@ -20,6 +20,8 @@ namespace TNL {
    namespace Algorithms {
       namespace Segments {
 
+enum LightCSRSThreadsMapping { LightCSRConstantThreads, CSRLightAutomaticThreads, CSRLightAutomaticThreadsLightSpMV };
+
 template< typename Index,
           typename Device >
 struct CSRLightKernel
@@ -40,6 +42,8 @@ struct CSRLightKernel
 
    static TNL::String getKernelType();
 
+   TNL::String getSetup() const;
+
    template< typename OffsetsView,
              typename Fetch,
              typename Reduction,
@@ -53,8 +57,20 @@ struct CSRLightKernel
                         ResultKeeper& keeper,
                         const Real& zero ) const;
 
+
+   void setThreadsMapping( LightCSRSThreadsMapping mapping );
+
+   LightCSRSThreadsMapping getThreadsMapping() const;
+
+   void setThreadsPerSegment( int threadsPerSegment );
+
+   int getThreadsPerSegment() const;
+
    protected:
-      int threadsPerSegment = 0;
+
+      LightCSRSThreadsMapping mapping = LightCSRConstantThreads;
+
+      int threadsPerSegment = 32;
 };
 
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
index 59d8ae0e3..1c3518288 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -426,6 +426,7 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee
 #ifdef HAVE_CUDA
       const size_t threads = 128;
       Index blocks, groupSize;
+
       size_t  neededThreads = threadsPerSegment * ( last - first );
 
       for (Index grid = 0; neededThreads != 0; ++grid)
@@ -513,21 +514,40 @@ CSRLightKernel< Index, Device >::
 init( const Offsets& offsets )
 {
    const Index segmentsCount = offsets.getSize() - 1;
-   //size_t neededThreads = segmentsCount * 32;//warpSize;
-
-   const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row
-   if( elementsInSegment <= 2 )
-      this->threadsPerSegment = 2;
-   else if( elementsInSegment <= 4 )
-      this->threadsPerSegment = 4;
-   else if( elementsInSegment <= 8 )
-      this->threadsPerSegment = 8;
-   else if( elementsInSegment <= 16 )
-      this->threadsPerSegment = 16;
-   else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
-      this->threadsPerSegment = 32; // CSR Vector
-   //else
-   //   threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector
+
+   if( this->getThreadsMapping() == CSRLightAutomaticThreads )
+   {
+      const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row
+      if( elementsInSegment <= 2 )
+         this->threadsPerSegment = 2;
+      else if( elementsInSegment <= 4 )
+         this->threadsPerSegment = 4;
+      else if( elementsInSegment <= 8 )
+         this->threadsPerSegment = 8;
+      else if( elementsInSegment <= 16 )
+         this->threadsPerSegment = 16;
+      else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
+         this->threadsPerSegment = 32; // CSR Vector
+      //else
+      //   threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector
+   }
+
+   if( this->getThreadsMapping() == CSRLightAutomaticThreadsLightSpMV )
+   {
+      const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row
+      if( elementsInSegment <= 2 )
+         this->threadsPerSegment = 2;
+      else if( elementsInSegment <= 4 )
+         this->threadsPerSegment = 4;
+      else if( elementsInSegment <= 8 )
+         this->threadsPerSegment = 8;
+      else if( elementsInSegment <= 16 )
+         this->threadsPerSegment = 16;
+      else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
+         this->threadsPerSegment = 32; // CSR Vector
+      //else
+      //   threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector
+   }
 
    TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
    TNL_ASSERT_LE( this->threadsPerSegment, 33, "" );
@@ -594,6 +614,50 @@ reduceSegments( const OffsetsView& offsets,
       offsets, first, last, fetch, reduce, keep, zero, this->threadsPerSegment );
 }
 
+template< typename Index,
+          typename Device >
+void
+CSRLightKernel< Index, Device >::
+setThreadsMapping( LightCSRSThreadsMapping mapping )
+{
+   this-> mapping = mapping;
+}
+
+template< typename Index,
+          typename Device >
+LightCSRSThreadsMapping
+CSRLightKernel< Index, Device >::
+getThreadsMapping() const
+{
+   return this->mapping;
+}
+
+template< typename Index,
+          typename Device >
+void
+CSRLightKernel< Index, Device >::
+setThreadsPerSegment( int threadsPerSegment )
+{
+   if( threadsPerSegment !=  1 &&
+       threadsPerSegment !=  2 &&
+       threadsPerSegment !=  4 &&
+       threadsPerSegment !=  8 &&
+       threadsPerSegment != 16 &&
+       threadsPerSegment != 32 )
+       throw std::runtime_error( "Number of threads per segment must be power of 2 - 1, 2, ... 32." );
+   this->threadsPerSegment = threadsPerSegment;
+}
+
+template< typename Index,
+          typename Device >
+int
+CSRLightKernel< Index, Device >::
+getThreadsPerSegment() const
+{
+   return this->threadsPerSegment;
+}
+
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index ac8688425..dd11c6cf7 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -524,7 +524,7 @@ vectorProduct( const InVector& inVector,
                const IndexType firstRow,
                const IndexType lastRow ) const
 {
-   this->view.vectorProduct( inVector, outVector, matrixMultiplicator, outVectorMultiplicator, firstRow, lastRow );
+   this->getView().vectorProduct( inVector, outVector, matrixMultiplicator, outVectorMultiplicator, firstRow, lastRow );
 }
 
 template< typename Real,
-- 
GitLab


From 6224bccfe1feb030a0ca33857666afeda0cb5b4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 17 Sep 2021 10:38:09 +0200
Subject: [PATCH 105/117] Fixing large dense matrices.

---
 src/TNL/Matrices/DenseMatrix.hpp         |  2 +-
 src/UnitTests/Matrices/DenseMatrixTest.h | 49 ++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/DenseMatrix.hpp b/src/TNL/Matrices/DenseMatrix.hpp
index e265f6f15..a42421aa7 100644
--- a/src/TNL/Matrices/DenseMatrix.hpp
+++ b/src/TNL/Matrices/DenseMatrix.hpp
@@ -162,7 +162,7 @@ setDimensions( const IndexType rows,
 {
    Matrix< Real, Device, Index, RealAllocator >::setDimensions( rows, columns );
    this->segments.setSegmentsSizes( rows, columns );
-   this->values.setSize( rows * columns );
+   this->values.setSize( this->segments.getStorageSize() );
    this->values = 0.0;
    this->view = this->getView();
 }
diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index 9a5fee6e9..4558387e4 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -996,6 +996,48 @@ void test_VectorProduct()
     EXPECT_EQ( outVector.getElement( 4 ), 148 );
 }
 
+template< typename Matrix >
+void test_LargeVectorProduct()
+{
+   using RealType = typename Matrix::RealType;
+   using DeviceType = typename Matrix::DeviceType;
+   using IndexType = typename Matrix::IndexType;
+
+   if( std::is_same< IndexType, short >::value )
+      return;
+
+   const IndexType rows = 5000;
+   const IndexType cols = 5000;
+
+   Matrix m( rows, cols );
+   m.forAllElements(
+      [] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, RealType& value ) {
+         value = columnIdx + 1.0;
+      }
+   );
+
+
+   using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;
+
+   VectorType inVector( cols );
+   inVector.forAllElements( [] __cuda_callable__ ( IndexType i, RealType& value ) {
+      value = 1.0;
+   } );
+
+   VectorType outVector( rows, 0.0 );
+
+   m.vectorProduct( inVector, outVector);
+
+   for( IndexType i = 0; i < rows; i++ )
+   {
+      //RealType diag = ( i % 2 == 1 ? cols - 1 : -cols + 1 );
+      //RealType non_diag = ( cols % 2 == 0 ? 0.0 : 1.0 );
+      RealType rcols = cols;
+      EXPECT_EQ( outVector.getElement( i ),  ( 0.5 * rcols ) * ( rcols + 1.0 ) );
+   }
+}
+
+
 template< typename Matrix >
 void test_AddMatrix()
 {
@@ -1622,6 +1664,13 @@ TYPED_TEST( MatrixTest, vectorProductTest )
     test_VectorProduct< MatrixType >();
 }
 
+TYPED_TEST( MatrixTest, largeVectorProductTest )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_LargeVectorProduct< MatrixType >();
+}
+
 TYPED_TEST( MatrixTest, addMatrixTest )
 {
     using MatrixType = typename TestFixture::MatrixType;
-- 
GitLab


From 4bc80e2b7162d54a715b4415ce277ed7190d58b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 17 Sep 2021 10:40:00 +0200
Subject: [PATCH 106/117] Adding SpMV benchmark for CSRLightAutomaticThreads.

---
 src/Benchmarks/SpMV/spmv.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index e187f5436..b596d9ea8 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -437,13 +437,18 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
       cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
 
-   for( auto threadsPerRow : std::vector< int >{ 1, 2, 4, 8, 16, 32 } )
+   cudaMatrix.getSegments().getKernel().setThreadsMapping( Algorithms::Segments::CSRLightAutomaticThreadsLightSpMV );
+   String format = MatrixInfo< HostMatrix >::getFormat();
+   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+   benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
+
+   /*for( auto threadsPerRow : std::vector< int >{ 1, 2, 4, 8, 16, 32 } )
    {
       cudaMatrix.getSegments().getKernel().setThreadsPerSegment( threadsPerRow );
       String format = MatrixInfo< HostMatrix >::getFormat() + " " + convertToString( threadsPerRow );
       SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
       benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
-   }
+   }*/
  #endif
 }
 
-- 
GitLab


From d02d59ae894638ed66a909d7e5c25b5910078ee7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 20 Sep 2021 21:02:33 +0200
Subject: [PATCH 107/117] Fixed format name for symmetric binary matrices.

---
 src/TNL/Matrices/MatrixInfo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h
index 820a4d8e5..fd48ddc5f 100644
--- a/src/TNL/Matrices/MatrixInfo.h
+++ b/src/TNL/Matrices/MatrixInfo.h
@@ -71,7 +71,7 @@ struct MatrixInfo< SparseMatrixView< Real, Device, Index, MatrixType, SegmentsVi
       if( MatrixType::isSymmetric() )
       {
          if( std::is_same< Real, bool >::value )
-            prefix = "Symmetric binary ";
+            prefix = "Symmetric Binary ";
          else
             prefix = "Symmetric ";
       }
-- 
GitLab


From 51fe72e275c4492c9a89830488f0fee45befc76f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 20 Sep 2021 21:03:46 +0200
Subject: [PATCH 108/117] Removed CSR hybrid kernel from SpMV benchmarks.

---
 src/Benchmarks/SpMV/spmv.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index b596d9ea8..babbfb759 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -742,7 +742,7 @@ benchmarkSpmv( BenchmarkType& benchmark,
    TNL::Matrices::MatrixReader< HostMatrixType >::readMtx( inputFileName, hostMatrix, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   //benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid                   >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
    benchmarkSpMVCSRLight< Real, HostMatrixType, SparseMatrix_CSR_Light            >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive                 >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_Ellpack                      >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
@@ -792,7 +792,7 @@ benchmarkSpmv( BenchmarkType& benchmark,
       //}
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      //benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light            >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive                 >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                      >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
@@ -802,7 +802,7 @@ benchmarkSpmv( BenchmarkType& benchmark,
 #ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
       benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      //benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive           >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-- 
GitLab


From 922acbc389ced1f9c0e62a29cdc472bb5fb46387 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 20 Sep 2021 21:04:28 +0200
Subject: [PATCH 109/117] Fixing detection of formats list in SpMV benchmark
 results processing script.

---
 .../tnl-spmv-benchmark-make-tables-json.py        | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
index eb38fbfab..b576de5a0 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
@@ -33,16 +33,6 @@ def latexFormatName( name ):
    name = name.replace('>','')
    return name
 
-####
-# Extract all formats
-def get_formats( input_df ):
-   matrixName = input_df.iloc[0]['matrix name']
-   df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
-   formats = df_matrix.loc[:,'format'].values.tolist() # Get format names - TODO: the first benchmark might not have all of them
-   formats = list(dict.fromkeys(formats))              # remove duplicates
-   formats.append('TNL Best')
-   return formats
-
 ####
 # Create multiindex for columns
 def get_multiindex( input_df, formats ):
@@ -231,7 +221,7 @@ def compute_symmetric_speedup( df, formats ):
 
 def compute_speedup( df, formats ):
    compute_cusparse_speedup( df, formats )
-   compute_csr_light_speedup( df )
+   #compute_csr_light_speedup( df )
    compute_binary_speedup( df, formats )
    compute_symmetric_speedup( df, formats )
 
@@ -722,7 +712,8 @@ with open('sparse-matrix-benchmark.log') as f:
 input_df = json_normalize( d, record_path=['results'] )
 #input_df.to_html( "orig-pandas.html" )
 
-formats = get_formats( input_df )
+formats = list(set( input_df['format'].values.tolist() )) # list of all formats in the benchmark results
+formats.append('TNL Best')
 multicolumns, df_data = get_multiindex( input_df, formats )
 
 print( "Converting data..." )
-- 
GitLab


From 47606affd9ca13ec7ef662e91b42ef1aa5413132 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 20 Sep 2021 21:08:59 +0200
Subject: [PATCH 110/117] Changing default setting for CSR light threads
 mapping.

---
 src/Benchmarks/SpMV/spmv.h                           | 2 +-
 src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index babbfb759..b1ef02263 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -437,7 +437,7 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
       cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
 
-   cudaMatrix.getSegments().getKernel().setThreadsMapping( Algorithms::Segments::CSRLightAutomaticThreadsLightSpMV );
+   cudaMatrix.getSegments().getKernel().setThreadsMapping( Algorithms::Segments::CSRLightAutomaticThreads );
    String format = MatrixInfo< HostMatrix >::getFormat();
    SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
    benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
index 49a662ccf..50322b826 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
@@ -68,7 +68,7 @@ struct CSRLightKernel
 
    protected:
 
-      LightCSRSThreadsMapping mapping = LightCSRConstantThreads;
+      LightCSRSThreadsMapping mapping = CSRLightAutomaticThreads;
 
       int threadsPerSegment = 32;
 };
-- 
GitLab


From 00409974980649645f09ca3c6d5ca3a63ebc0879 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 21 Sep 2021 20:24:27 +0200
Subject: [PATCH 111/117] Added both strategies of threads mapping in CSR Light
 to SpMV benchmark.

---
 src/Benchmarks/SpMV/spmv.h | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index b1ef02263..c46cff95c 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -437,10 +437,19 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
       cudaMatrix.vectorProduct( cudaInVector, cudaOutVector );
    };
 
-   cudaMatrix.getSegments().getKernel().setThreadsMapping( Algorithms::Segments::CSRLightAutomaticThreads );
-   String format = MatrixInfo< HostMatrix >::getFormat();
-   SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
-   benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
+   {
+      cudaMatrix.getSegments().getKernel().setThreadsMapping( Algorithms::Segments::CSRLightAutomaticThreads );
+      String format = MatrixInfo< HostMatrix >::getFormat() + " Automatic";
+      SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+      benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
+   };
+
+   {
+      cudaMatrix.getSegments().getKernel().setThreadsMapping( Algorithms::Segments::CSRLightAutomaticThreadsLightSpMV );
+      String format = MatrixInfo< HostMatrix >::getFormat() + " Automatic Light";
+      SpmvBenchmarkResult< Real, Devices::Cuda, int > cudaBenchmarkResults( format, csrResultVector, cudaOutVector, cudaMatrix.getNonzeroElementsCount() );
+      benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda, cudaBenchmarkResults );
+   };
 
    /*for( auto threadsPerRow : std::vector< int >{ 1, 2, 4, 8, 16, 32 } )
    {
-- 
GitLab


From 389129345d4e77919a2f0fc51479439d3b7d809e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 23 Sep 2021 08:33:00 +0200
Subject: [PATCH 112/117] Fixing names of CSR Light kernels in spmv benchmarks.

---
 src/Benchmarks/SpMV/spmv.h | 55 +++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index c46cff95c..559adadff 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -362,6 +362,7 @@ benchmarkSpMV( BenchmarkType& benchmark,
 template< typename Real,
           typename InputMatrix,
           template< typename, typename, typename > class Matrix,
+          typename TestReal = Real,
           template< typename, typename, typename, typename > class Vector = Containers::Vector >
 void
 benchmarkSpMVCSRLight( BenchmarkType& benchmark,
@@ -371,8 +372,8 @@ benchmarkSpMVCSRLight( BenchmarkType& benchmark,
                        bool allCpuTests,
                        bool verboseMR )
 {
-   using HostMatrix = Matrix< Real, TNL::Devices::Host, int >;
-   using CudaMatrix = Matrix< Real, TNL::Devices::Cuda, int >;
+   using HostMatrix = Matrix< TestReal, TNL::Devices::Host, int >;
+   using CudaMatrix = Matrix< TestReal, TNL::Devices::Cuda, int >;
    using HostVector = Containers::Vector< Real, Devices::Host, int >;
    using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;
 
@@ -759,14 +760,14 @@ benchmarkSpmv( BenchmarkType& benchmark,
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack               >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
    benchmarkSpMV< Real, HostMatrixType, SparseMatrix_BiEllpack                    >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
 #ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar             >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector             >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Light              >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive           >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_Ellpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack          >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack         >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_BiEllpack              >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar              >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector              >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkSpMVCSRLight< Real, HostMatrixType, SparseMatrix_CSR_Light, bool >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive            >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_Ellpack                 >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack           >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_ChunkedEllpack          >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_BiEllpack               >( benchmark, hostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
 #endif
 #ifdef WITH_TNL_BENCHMARK_SPMV_SANDBOX_MATRIX
    benchmarkSpMV< Real, HostMatrixType, SparseSandboxMatrix                       >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
@@ -799,25 +800,25 @@ benchmarkSpmv( BenchmarkType& benchmark,
       //{
       //   std::cerr << "ERROR: Symmetric matrices do not match !!!" << std::endl;
       //}
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar                    >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector                    >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
       //benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid                   >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light            >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive                 >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                      >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack               >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack                    >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive                  >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                       >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack                 >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack                     >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
 #ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      //benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid             >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive           >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack          >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack         >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
-      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Scalar              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Vector              >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      //benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Hybrid            >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkSpMVCSRLight< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Light, bool       >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_CSR_Adaptive            >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_Ellpack                 >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_SlicedEllpack           >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_ChunkedEllpack          >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
+      benchmarkBinarySpMV< Real, SymmetricInputMatrix, SymmetricSparseMatrix_BiEllpack               >( benchmark, symmetricHostMatrix, hostOutVector, inputFileName, allCpuTests, verboseMR );
 #endif
    }
 #endif
-- 
GitLab


From 9651e3c3d85d676dc7ca597ccae4440b7afd4eb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 23 Sep 2021 08:33:44 +0200
Subject: [PATCH 113/117] Splitting graphs of symmetric and binary matrices in
 spmv benchmark results postprocessing script.

---
 .../tnl-spmv-benchmark-make-tables-json.py    | 69 +++++++++++++++----
 1 file changed, 54 insertions(+), 15 deletions(-)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
index b576de5a0..341d88ab9 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
@@ -68,7 +68,7 @@ def get_multiindex( input_df, formats ):
          level3.append( 'speed-up')
          level4.append( 'non-symmetric' )
          df_data[ 0 ].append( ' ' )
-      if format == 'CSR< Light >':
+      if format == 'CSR< Light > Automatic':
          level1.append( format )
          level2.append( 'GPU' )
          level3.append( 'speed-up')
@@ -174,16 +174,17 @@ def compute_cusparse_speedup( df, formats ):
 ####
 # Compute speedup of Light CSR
 def compute_csr_light_speedup( df ):
-   csr_light_bdw_list = df[('CSR< Light >','GPU','bandwidth')]
-   light_spmv_bdw_list = df[('LightSpMV Vector','GPU','bandwidth')]
-
-   csr_light_speedup_list = []
-   for ( csr_light_bdw, light_spmv_bdw ) in zip(csr_light_bdw_list,light_spmv_bdw_list):
-      try:
-         csr_light_speedup_list.append( csr_light_bdw / light_spmv_bdw  )
-      except:
-         csr_light_speedup_list.append(float('nan'))
-   df[('CSR< Light >','GPU','speed-up','LightSpMV Vector')] = csr_light_speedup_list
+   for light in [ 'CSR< Light > Automatic', 'CSR< Light > Automatic Light']:
+      csr_light_bdw_list = df[(light,'GPU','bandwidth')]
+      light_spmv_bdw_list = df[('LightSpMV Vector','GPU','bandwidth')]
+
+      csr_light_speedup_list = []
+      for ( csr_light_bdw, light_spmv_bdw ) in zip(csr_light_bdw_list,light_spmv_bdw_list):
+         try:
+            csr_light_speedup_list.append( csr_light_bdw / light_spmv_bdw  )
+         except:
+            csr_light_speedup_list.append(float('nan'))
+      df[(light,'GPU','speed-up','LightSpMV Vector')] = csr_light_speedup_list
 
 ####
 # Compute speed-up of binary formats
@@ -211,6 +212,7 @@ def compute_symmetric_speedup( df, formats ):
          print( f'Adding speed-up of {format} vs {non_symmetric_format}' )
          format_bdw_list = df[(format,'GPU','bandwidth')]
          non_symmetric_bdw_list = df[(non_symmetric_format,'GPU','bandwidth')]
+
          symmetric_speedup_list = []
          for ( format_bdw, non_symmetric_bdw ) in zip( format_bdw_list, non_symmetric_bdw_list ):
             try:
@@ -526,16 +528,53 @@ def cusparse_speedup_comparison( df, formats, head_size=10 ):
    ylabel = "Speedup"
    current_formats = []
    for format in formats:
-      if( 'Ellpack' in format and not 'Binary' in format and not 'Legacy' in format ):
+      if( 'Ellpack' in format and not 'Symmetric' in format and not 'Binary' in format and not 'Legacy' in format ):
          current_formats.append( format )
    draw_profiles( current_formats, profiles, xlabel, ylabel, "ellpack-profiles-cusparse-speedup.pdf", "draw-bar" )
 
+   current_formats.clear()
+   for format in formats:
+      if( 'Ellpack' in format and 'Symmetric' in format and not 'Binary' in format and not 'Legacy' in format ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "symmetric-ellpack-profiles-cusparse-speedup.pdf", "draw-bar" )
+
+   current_formats.clear()
+   for format in formats:
+      if( 'Ellpack' in format and not 'Symmetric' in format and 'Binary' in format and not 'Legacy' in format ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "binary-ellpack-profiles-cusparse-speedup.pdf", "draw-bar" )
+
+   current_formats.clear()
+   for format in formats:
+      if( 'Ellpack' in format and 'Symmetric' in format and 'Binary' in format and not 'Legacy' in format ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "symmetric-binary-ellpack-profiles-cusparse-speedup.pdf", "draw-bar" )
+
+
    # Draw CSR formats profiles
    current_formats.clear()
    for format in formats:
-      if( 'CSR' in format and not 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format and format != 'CSR' ):
+      if( 'CSR' in format and not 'Symmetric' in format and not 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format and format != 'CSR' ):
          current_formats.append( format )
    draw_profiles( current_formats, profiles, xlabel, ylabel, "csr-profiles-cusparse-speedup.pdf", "draw-bar" )
+   current_formats.clear()
+   for format in formats:
+      if( 'CSR' in format and 'Symmetric' in format and not 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format and format != 'CSR' ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "symmetric-csr-profiles-cusparse-speedup.pdf", "draw-bar" )
+   current_formats.clear()
+
+   for format in formats:
+      if( 'CSR' in format and not 'Symmetric' in format and 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format and format != 'CSR' ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "binary-csr-profiles-cusparse-speedup.pdf", "draw-bar" )
+   current_formats.clear()
+
+   for format in formats:
+      if( 'CSR' in format and 'Symmetric' in format and 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format and format != 'CSR' ):
+         current_formats.append( format )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "-symmetric-binary-csr-profiles-cusparse-speedup.pdf", "draw-bar" )
+   current_formats.clear()
 
 ####
 # Comparison of binary matrices
@@ -654,7 +693,7 @@ def symmetric_matrices_comparison( df, formats, head_size = 10 ):
 ####
 # Comparison of speed-up w.r.t. LightSpMV
 def csr_light_speedup_comparison( df, head_size=10 ):
-   format = 'CSR< Light >'
+   format = 'CSR< Light > Automatic Light'
    print( f"Writing comparison of speed-up of CSR Light compared to LightSPMV" )
    df['tmp'] = df[(format, 'GPU','bandwidth')]
    filtered_df=df.dropna(subset=[('tmp','','','')])
@@ -717,7 +756,7 @@ formats.append('TNL Best')
 multicolumns, df_data = get_multiindex( input_df, formats )
 
 print( "Converting data..." )
-result = convert_data_frame( input_df, multicolumns, df_data, 0, 10000 )
+result = convert_data_frame( input_df, multicolumns, df_data, 0, 200 )
 compute_speedup( result, formats )
 
 result.replace( to_replace=' ',value=np.nan,inplace=True)
-- 
GitLab


From 52ee50b2e9b81fc21364b8e70a5d04bc2ed42227 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 25 Sep 2021 07:55:42 +0200
Subject: [PATCH 114/117] Added FIXME notes for the getSerializationType
 methods in segments

---
 src/TNL/Algorithms/Segments/BiEllpack.hpp          | 1 +
 src/TNL/Algorithms/Segments/BiEllpackView.hpp      | 1 +
 src/TNL/Algorithms/Segments/CSR.hpp                | 1 +
 src/TNL/Algorithms/Segments/CSRView.hpp            | 3 ++-
 src/TNL/Algorithms/Segments/ChunkedEllpack.hpp     | 1 +
 src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp | 1 +
 src/TNL/Algorithms/Segments/Ellpack.hpp            | 1 +
 src/TNL/Algorithms/Segments/EllpackView.hpp        | 1 +
 src/TNL/Algorithms/Segments/SlicedEllpack.hpp      | 1 +
 src/TNL/Algorithms/Segments/SlicedEllpackView.hpp  | 1 +
 10 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Algorithms/Segments/BiEllpack.hpp b/src/TNL/Algorithms/Segments/BiEllpack.hpp
index 4bbccbb0e..ddb3c0342 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpack.hpp
@@ -84,6 +84,7 @@ String
 BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization and WarpSize parameters, so it should be reflected in the serialization type
    return "BiEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
index ab79d5833..8a1b035aa 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
@@ -97,6 +97,7 @@ String
 BiEllpackView< Device, Index, Organization, WarpSize >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization and WarpSize parameters, so it should be reflected in the serialization type
    return "BiEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index 0d15790bf..0bd2d33ca 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -80,6 +80,7 @@ getSerializationType()
 {
    return "CSR< [any_device], " +
       TNL::getSerializationType< IndexType >() + ", " +
+      // FIXME: the serialized data do not depend on the the kernel type so it should not be in the serialization type
       TNL::getSerializationType< KernelType >() + " >";
 }
 
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index 4343e672b..7aac457af 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -80,7 +80,8 @@ CSRView< Device, Index, Kernel >::
 getSerializationType()
 {
    return "CSR< [any_device], " +
-      TNL::getSerializationType< IndexType >() +
+      TNL::getSerializationType< IndexType >() + ", " +
+      // FIXME: the serialized data do not depend on the the kernel type so it should not be in the serialization type
       TNL::getSerializationType< KernelType >() + " >";
 }
 
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
index e39a16670..9a08957da 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
@@ -88,6 +88,7 @@ String
 ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization parameter, so it should be reflected in the serialization type
    return "ChunkedEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
index 6fc767107..a48afead5 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
@@ -122,6 +122,7 @@ String
 ChunkedEllpackView< Device, Index, Organization >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization parameter, so it should be reflected in the serialization type
    return "ChunkedEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
diff --git a/src/TNL/Algorithms/Segments/Ellpack.hpp b/src/TNL/Algorithms/Segments/Ellpack.hpp
index 27e7dcbe3..baef4d161 100644
--- a/src/TNL/Algorithms/Segments/Ellpack.hpp
+++ b/src/TNL/Algorithms/Segments/Ellpack.hpp
@@ -99,6 +99,7 @@ String
 Ellpack< Device, Index, IndexAllocator, Organization, Alignment >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization and Alignment parameters, so it should be reflected in the serialization type
    return "Ellpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index 6f49c55ee..e283a75d0 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -205,6 +205,7 @@ String
 EllpackView< Device, Index, Organization, Alignment >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization and Alignment parameters, so it should be reflected in the serialization type
    return "Ellpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
index 8a4903cbd..652eceb56 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
@@ -93,6 +93,7 @@ String
 SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization and Alignment parameters, so it should be reflected in the serialization type
    return "SlicedEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
index 5b97c72e2..d7ef9524c 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
@@ -83,6 +83,7 @@ String
 SlicedEllpackView< Device, Index, Organization, SliceSize >::
 getSerializationType()
 {
+   // FIXME: the serialized data DEPEND on the Organization and Alignment parameters, so it should be reflected in the serialization type
    return "SlicedEllpack< [any_device], " + TNL::getSerializationType< IndexType >() + " >";
 }
 
-- 
GitLab


From 340107a2434c33c937d622fbfb170471c708c030 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 4 Oct 2021 21:58:50 +0200
Subject: [PATCH 115/117] Improving script for SpMV benchmark results
 processing.

---
 .../tnl-spmv-benchmark-make-tables-json.py    | 43 ++++++++++++-------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
index 341d88ab9..ac8d6e1ce 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
@@ -31,6 +31,7 @@ def slugify(s):
 def latexFormatName( name ):
    name = name.replace('<','')
    name = name.replace('>','')
+   name = name.replace( 'Light  Automatic ', '')
    return name
 
 ####
@@ -68,7 +69,7 @@ def get_multiindex( input_df, formats ):
          level3.append( 'speed-up')
          level4.append( 'non-symmetric' )
          df_data[ 0 ].append( ' ' )
-      if format == 'CSR< Light > Automatic':
+      if format == 'CSR< Light > Automatic' or format == 'CSR< Light > Automatic Light':
          level1.append( format )
          level2.append( 'GPU' )
          level3.append( 'speed-up')
@@ -93,6 +94,7 @@ def convert_data_frame( input_df, multicolumns, df_data, begin_idx = 0, end_idx
    #max_out_idx = max_rows
    if end_idx == -1:
       end_idx = len(input_df.index)
+   best_count = 0
    while in_idx < len(input_df.index) and out_idx < end_idx:
       matrixName = input_df.iloc[in_idx]['matrix name']
       df_matrix = input_df.loc[input_df['matrix name'] == matrixName]
@@ -118,6 +120,7 @@ def convert_data_frame( input_df, multicolumns, df_data, begin_idx = 0, end_idx
              not 'cusparse' in current_format and
              not 'LightSpMV' in current_format and
              not 'Hybrid' in current_format and
+             current_format != 'CSR< Light > Automatic' and
              bw > best_bw ):
             best_bw = bw
             best_format = current_format
@@ -135,6 +138,7 @@ def convert_data_frame( input_df, multicolumns, df_data, begin_idx = 0, end_idx
          aux_df.iloc[0][('TNL Best','GPU','format','')] = best_format
       else:
          aux_df.iloc[0][('TNL Best','GPU','format','')] = 'cusparse'
+      best_count += 1
       if out_idx >= begin_idx:
          frames.append( aux_df )
       out_idx = out_idx + 1
@@ -173,18 +177,19 @@ def compute_cusparse_speedup( df, formats ):
 
 ####
 # Compute speedup of Light CSR
-def compute_csr_light_speedup( df ):
+def compute_csr_light_speedup( df, formats ):
    for light in [ 'CSR< Light > Automatic', 'CSR< Light > Automatic Light']:
-      csr_light_bdw_list = df[(light,'GPU','bandwidth')]
-      light_spmv_bdw_list = df[('LightSpMV Vector','GPU','bandwidth')]
+      if light in formats:
+         csr_light_bdw_list = df[(light,'GPU','bandwidth')]
+         light_spmv_bdw_list = df[('LightSpMV Vector','GPU','bandwidth')]
 
-      csr_light_speedup_list = []
-      for ( csr_light_bdw, light_spmv_bdw ) in zip(csr_light_bdw_list,light_spmv_bdw_list):
-         try:
-            csr_light_speedup_list.append( csr_light_bdw / light_spmv_bdw  )
-         except:
-            csr_light_speedup_list.append(float('nan'))
-      df[(light,'GPU','speed-up','LightSpMV Vector')] = csr_light_speedup_list
+         csr_light_speedup_list = []
+         for ( csr_light_bdw, light_spmv_bdw ) in zip(csr_light_bdw_list,light_spmv_bdw_list):
+            try:
+               csr_light_speedup_list.append( csr_light_bdw / light_spmv_bdw  )
+            except:
+               csr_light_speedup_list.append(float('nan'))
+         df[(light,'GPU','speed-up','LightSpMV Vector')] = csr_light_speedup_list
 
 ####
 # Compute speed-up of binary formats
@@ -223,7 +228,7 @@ def compute_symmetric_speedup( df, formats ):
 
 def compute_speedup( df, formats ):
    compute_cusparse_speedup( df, formats )
-   #compute_csr_light_speedup( df )
+   compute_csr_light_speedup( df, formats )
    compute_binary_speedup( df, formats )
    compute_symmetric_speedup( df, formats )
 
@@ -487,7 +492,7 @@ def cusparse_speedup_comparison( df, formats, head_size=10 ):
    profiles = {}
    for format in formats:
       if not format in ['cusparse','CSR']:
-         print( f"Writing comparison of speed-up of {format} compared to Cusparse" )
+         print( f"Writing comparison of speed-up of {format} ({latexFormatName(format)}) compared to Cusparse" )
          df['tmp'] = df[(format, 'GPU','bandwidth')]
          filtered_df=df.dropna(subset=[('tmp','','','')])
          filtered_df.sort_values(by=[(format,'GPU','speed-up','cusparse')],inplace=True,ascending=False)
@@ -752,11 +757,15 @@ input_df = json_normalize( d, record_path=['results'] )
 #input_df.to_html( "orig-pandas.html" )
 
 formats = list(set( input_df['format'].values.tolist() )) # list of all formats in the benchmark results
+formats.remove('CSR< Light > Automatic')
+formats.remove('Binary CSR< Light > Automatic')
+formats.remove('Symmetric CSR< Light > Automatic')
+formats.remove('Symmetric Binary CSR< Light > Automatic')
 formats.append('TNL Best')
 multicolumns, df_data = get_multiindex( input_df, formats )
 
 print( "Converting data..." )
-result = convert_data_frame( input_df, multicolumns, df_data, 0, 200 )
+result = convert_data_frame( input_df, multicolumns, df_data, 0, 2000 )
 compute_speedup( result, formats )
 
 result.replace( to_replace=' ',value=np.nan,inplace=True)
@@ -779,6 +788,8 @@ def processDf( df, formats, head_size = 10 ):
    csr_light_speedup_comparison( df, head_size )
 
    best = df[('TNL Best','GPU','format')].tolist()
+   best_formats = list(set(best))
+   sum = 0
    for format in formats:
       if( not 'Binary' in format and
           not 'Symmetric' in format and
@@ -787,7 +798,9 @@ def processDf( df, formats, head_size = 10 ):
           not 'TNL Best' in format ):
          cases = best.count(format)
          print( f'{format} is best in {cases} cases.')
-
+         sum += cases
+   print( f'Total is {sum}.' )
+   print( f'Best formats {best_formats}.')
 head_size = 25
 if not os.path.exists( 'general' ):
    os.mkdir( 'general' )
-- 
GitLab


From ca680ebd72c3e982df8ab82f825fcd75a9bb44b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 27 Oct 2021 22:55:47 +0200
Subject: [PATCH 116/117] Improving Python script for processing results of
 SpMV benchmark.

---
 .../tnl-spmv-benchmark-make-tables-json.py    | 357 ++++++++++++++----
 1 file changed, 289 insertions(+), 68 deletions(-)

diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
index ac8d6e1ce..4d77faffe 100755
--- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
+++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables-json.py
@@ -6,6 +6,7 @@ import pandas as pd
 from pandas.io.json import json_normalize
 import matplotlib.pyplot as plt
 import numpy as np
+import math
 
 #Latex fonst set-up
 
@@ -22,6 +23,38 @@ import numpy as np
 #})
 
 
+####
+# A map of rgb points in your distribution
+# [distance, (r, g, b)]
+# distance is percentage from left edge
+# https://stackoverflow.com/questions/25668828/how-to-create-colour-gradient-in-python/50784012#50784012
+heatmap = [
+    [0.0,  (0.1, 0.1, 1.0)],
+ #  [0.20, (0, 0, .5)],
+ #  [0.40, (0, .5, 0)],
+    [0.40, (0.1, 1.0, 0.1)],
+#   [0.80, (.75, .75, 0)],
+#   [0.90, (1.0, .75, 0)],
+    [1.00, (1.0, 0.1, 0.1)],
+]
+
+def gaussian(x, a, b, c, d=0):
+    return a * math.exp(-(x - b)**2 / (2 * c**2)) + d
+
+def color_map(x, width=100, map=[], spread=1):
+    width = float(width)
+    r = sum([gaussian(x, p[1][0], p[0] * width, width/(spread*len(map))) for p in map])
+    g = sum([gaussian(x, p[1][1], p[0] * width, width/(spread*len(map))) for p in map])
+    b = sum([gaussian(x, p[1][2], p[0] * width, width/(spread*len(map))) for p in map])
+    return min(1.0, r), min(1.0, g), min(1.0, b)
+
+#for x in range(im.size[0]):
+#    r, g, b = pixel(x, width=im.size[0], map=heatmap)
+#    r, g, b = [int(256*v) for v in (r, g, b)]
+#    for y in range(im.size[1]):
+#        ld[x, y] = r, g, b
+
+
 ####
 # Helper function
 def slugify(s):
@@ -32,6 +65,25 @@ def latexFormatName( name ):
    name = name.replace('<','')
    name = name.replace('>','')
    name = name.replace( 'Light  Automatic ', '')
+   #print( f'~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{name}~~~')
+   if name == 'CSR':
+      return 'CSR on CPU'
+   if name == 'cusparse':
+      return 'cuSPARSE'
+   if 'SlicedEllpack' in name:
+      return name.replace( 'SlicedEllpack', 'Sliced Ellpack' )
+   if 'ChunkedEllpack' in name:
+      return name.replace( 'ChunkedEllpack', 'Chunked Ellpack' )
+   if 'BiEllpack' in name:
+      return name.replace( 'BiEllpack', 'Bisection Ellpack' )
+   if 'CSR Scalar' in name:
+      return name.replace( 'CSR Scalar', 'Scalar CSR' )
+   if 'CSR Vector' in name:
+      return name.replace( 'CSR Vector', 'Vector CSR' )
+   if 'CSR Light' in name:
+      return name.replace( 'CSR Light', 'Light CSR' )
+   if 'CSR Adaptive' in name:
+      return name.replace( 'CSR Adaptive', 'Adaptive CSR' )
    return name
 
 ####
@@ -44,7 +96,7 @@ def get_multiindex( input_df, formats ):
    df_data = [[ ' ',' ',' ']]
    for format in formats:
       for device in ['CPU','GPU']:
-         for data in ['bandwidth' ]: #,'time','speed-up','non-zeros','stddev','stddev/time','diff.max','diff.l2']:
+         for data in ['bandwidth', 'time', 'diff.max' ]: #,'time','speed-up','non-zeros','stddev','stddev/time','diff.max','diff.l2']:
             level1.append( format )
             level2.append( device )
             level3.append( data )
@@ -112,7 +164,11 @@ def convert_data_frame( input_df, multicolumns, df_data, begin_idx = 0, end_idx
          current_device = row['device']
          #print( current_format + " / " + current_device )
          bw = pd.to_numeric(row['bandwidth'], errors='coerce')
+         time = pd.to_numeric(row['time'], errors='coerce')
+         diff_max = pd.to_numeric(row['CSR Diff.Max'], errors='coerce')
          aux_df.iloc[0][(current_format,current_device,'bandwidth','')] = bw
+         aux_df.iloc[0][(current_format,current_device,'time','')] = time
+         aux_df.iloc[0][(current_format,current_device,'diff.max','')] = diff_max
          if( current_device == 'GPU' and
              not 'Binary' in current_format and
              not 'Symmetric' in current_format and
@@ -154,21 +210,21 @@ def compute_cusparse_speedup( df, formats ):
          if not format in [ 'cusparse', 'CSR' ]:
             print( 'Adding speed-up for ', format )
             try:
-               format_bdw_list = df[(format,device,'bandwidth')]
+               format_times_list = df[(format,device,'time')]
             except:
                continue
-            cusparse_bdw_list = df[('cusparse','GPU','bandwidth')]
-            csr_bdw_list = df[('CSR','CPU','bandwidth')]
+            cusparse_times_list = df[('cusparse','GPU','time')]
+            csr_times_list = df[('CSR','CPU','time')]
             cusparse_speedup_list = []
             csr_speedup_list = []
-            for( format_bdw, cusparse_bdw, csr_bdw ) in zip( format_bdw_list, cusparse_bdw_list,csr_bdw_list ):
+            for( format_time, cusparse_time, csr_time ) in zip( format_times_list, cusparse_times_list,csr_times_list ):
                if( device == 'GPU' ):
                   try:
-                     cusparse_speedup_list.append( format_bdw / cusparse_bdw )
+                     cusparse_speedup_list.append( cusparse_time / format_time  )
                   except:
                      cusparse_speedup_list.append(float('nan'))
                try:
-                  csr_speedup_list.append( format_bdw / csr_bdw )
+                  csr_speedup_list.append( csr_time / format_time  )
                except:
                   csr_speedup_list.append(float('nan'))
             if( device == 'GPU' ):
@@ -215,13 +271,13 @@ def compute_symmetric_speedup( df, formats ):
       if 'Symmetric' in format:
          non_symmetric_format = format.replace( 'Symmetric ', '' )
          print( f'Adding speed-up of {format} vs {non_symmetric_format}' )
-         format_bdw_list = df[(format,'GPU','bandwidth')]
-         non_symmetric_bdw_list = df[(non_symmetric_format,'GPU','bandwidth')]
+         format_times_list = df[(format,'GPU','time')]
+         non_symmetric_times_list = df[(non_symmetric_format,'GPU','time')]
 
          symmetric_speedup_list = []
-         for ( format_bdw, non_symmetric_bdw ) in zip( format_bdw_list, non_symmetric_bdw_list ):
+         for ( format_time, non_symmetric_time ) in zip( format_times_list, non_symmetric_times_list ):
             try:
-               symmetric_speedup_list.append( format_bdw / non_symmetric_bdw )
+               symmetric_speedup_list.append( non_symmetric_time / format_time  )
             except:
                symmetric_speedup_list.append(float('nan'))
          df[(format,'GPU','speed-up','non-symmetric')] = symmetric_speedup_list
@@ -234,7 +290,7 @@ def compute_speedup( df, formats ):
 
 ###
 # Draw several profiles into one figure
-def draw_profiles( formats, profiles, xlabel, ylabel, filename, style=[] ):
+def draw_profiles( formats, profiles, xlabel, ylabel, filename, legend_loc='upper right', bar='none' ):
    fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
    latexNames = []
    size = 1
@@ -243,12 +299,14 @@ def draw_profiles( formats, profiles, xlabel, ylabel, filename, style=[] ):
       axs.plot( t, profiles[format], '-o', ms=1, lw=1 )
       size = len( profiles[format] )
       latexNames.append( latexFormatName( format ) )
-   if 'draw-bar' in style:
+   if bar != 'none':
       #print( f'size = {size}' )
-      bar = np.full( size, 1 )
-      axs.plot( t, bar, '-', ms=1, lw=1.5 )
+      bar_data = np.full( size, 1 )
+      axs.plot( t, bar_data, '-', ms=1, lw=1.5 )
+      if bar != '':
+         latexNames.append( bar )
 
-   axs.legend( latexNames, loc='upper right' )
+   axs.legend( latexNames, loc=legend_loc )
    axs.set_xlabel( xlabel )
    axs.set_ylabel( ylabel )
    axs.set_yscale( 'log' )
@@ -279,7 +337,7 @@ def effective_bw_profile( df, formats, head_size=10 ):
          profiles[format] = df[(format,'GPU','bandwidth')].copy()
          axs.plot( t, df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
       axs.legend( [ latexFormatName(format), 'CSR on CPU' ], loc='upper right' )
-      axs.set_ylabel( 'Bandwidth in GB/sec' )
+      axs.set_ylabel( 'Effective bandwidth in GB/sec' )
       plt.rcParams.update({
          "text.usetex": True,
          "font.family": "sans-serif",
@@ -290,8 +348,8 @@ def effective_bw_profile( df, formats, head_size=10 ):
       axs.set_yscale( 'log' )
       axs.plot( t, result[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
       axs.legend( [ latexFormatName(format), 'CSR on CPU' ], loc='lower left' )
-      axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} performance" )
-      axs.set_ylabel( 'Bandwidth in GB/sec' )
+      axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} performance" )
+      axs.set_ylabel( 'Effective bandwidth in GB/sec' )
       plt.rcParams.update({
          "text.usetex": True,
          "font.family": "sans-serif",
@@ -306,22 +364,22 @@ def effective_bw_profile( df, formats, head_size=10 ):
 
    # Draw ellpack formats profiles
    current_formats = []
-   xlabel = "Matrix ID - sorted by particular formats effective BW"
-   ylabel = "Bandwidth in GB/sec"
+   xlabel = "Matrix number - sorted by particular formats effective bandwidth"
+   ylabel = "Effective bandwidth in GB/sec"
    for format in formats:
-      if( ( 'Ellpack' in format and not 'Binary' in format and not 'Legacy' in format ) or
+      if( ( 'Ellpack' in format and not 'Binary' in format and not 'Symmetric' in format and not 'Legacy' in format ) or
           format == 'CSR' or
           format == 'cusparse' ):
          current_formats.append( format )
-   draw_profiles( current_formats, profiles, xlabel, ylabel, "ellpack-profiles-bw.pdf" )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "ellpack-profiles-bw.pdf", 'lower left', "none" )
 
    # Draw CSR formats profiles
    current_formats.clear()
    for format in formats:
-      if( ( 'CSR' in format and not 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format ) or
+      if( ( 'CSR' in format and not 'Binary' in format and not 'Symmetric' in format and not 'Legacy' in format and not 'Hybrid' in format ) or
           format == 'cusparse' ):
          current_formats.append( format )
-   draw_profiles( current_formats, profiles, xlabel, ylabel, "csr-profiles-bw.pdf" )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "csr-profiles-bw.pdf", 'lower left', 'none' )
 
 
 ####
@@ -334,21 +392,21 @@ def cusparse_comparison( df, formats, head_size=10 ):
    ascend_df.sort_values(by=[('cusparse','GPU','bandwidth')],inplace=True,ascending=True)
    for format in formats:
       if not format in ['cusparse','CSR']:
-         print( f"Writing comparison of {format} and Cusparse" )
+         print( f"Writing comparison of {format} and cuSPARSE" )
          filtered_df = df.dropna( subset=[(format,'GPU','bandwidth','')] )
          filtered_ascend_df = ascend_df.dropna( subset=[(format,'GPU','bandwidth','')] )
          t = np.arange(filtered_df[(format,'GPU','bandwidth')].size )
          fig, axs = plt.subplots( 2, 1 )
          axs[0].plot( t, filtered_df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[0].plot( t, filtered_df[('cusparse','GPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[0].legend( [ format, 'Cusparse' ], loc='upper right' )
-         axs[0].set_ylabel( 'Bandwidth in GB/sec' )
+         axs[0].legend( [ format, 'cuSPARSE' ], loc='upper right' )
+         axs[0].set_ylabel( 'Effective bandwidth in GB/sec' )
          axs[1].set_yscale( 'log' )
          axs[1].plot( t, filtered_df[(format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[1].plot( t, filtered_df[('cusparse','GPU','bandwidth')], '-o', ms=1, lw=1 )
-         axs[1].legend( [ latexFormatName(format), 'Cusparse' ], loc='lower left' )
-         axs[1].set_xlabel( 'Matrix ID - sorted w.r.t. Cusparse performance' )
-         axs[1].set_ylabel( 'Bandwidth in GB/sec' )
+         axs[1].legend( [ latexFormatName(format), 'cuSPARSE' ], loc='lower left' )
+         axs[1].set_xlabel( 'Matrix number - sorted w.r.t. cuSPARSE performance' )
+         axs[1].set_ylabel( 'Effective bandwidth in GB/sec' )
          plt.savefig( f"Cusparse-bw/{format}.pdf" )
          plt.close(fig)
          copy_df = df.copy()
@@ -375,13 +433,13 @@ def csr_comparison( df, formats, head_size=10 ):
             axs[0].plot( t, df[(format,device,'bandwidth')], '-o', ms=1, lw=1 )
             axs[0].plot( t, df[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
             axs[0].legend( [ latexFormatName(format), 'CSR on CPU' ], loc='upper right' )
-            axs[0].set_ylabel( 'Bandwidth in GB/sec' )
+            axs[0].set_ylabel( 'Effective bandwidth in GB/sec' )
             axs[1].set_yscale( 'log' )
             axs[1].plot( t, result[(format,device,'bandwidth')], '-o', ms=1, lw=1 )
             axs[1].plot( t, result[('CSR','CPU','bandwidth')], '-o', ms=1, lw=1 )
             axs[1].legend( [ latexFormatName(format), 'CSR on CPU' ], loc='lower left' )
-            axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} performance" )
-            axs[1].set_ylabel( 'Bandwidth in GB/sec' )
+            axs[1].set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} performance" )
+            axs[1].set_ylabel( 'Effective bandwidth in GB/sec' )
             plt.rcParams.update({
                "text.usetex": True,
                "font.family": "sans-serif",
@@ -416,13 +474,13 @@ def legacy_formats_comparison( df, formats, head_size=10 ):
          axs[0].plot( t, df[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[0].plot( t, df[(legacy_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[0].legend( [ latexFormatName(ref_format), latexFormatName(legacy_format) ], loc='upper right' )
-         axs[0].set_ylabel( 'Bandwidth in GB/sec' )
+         axs[0].set_ylabel( 'Effective bandwidth in GB/sec' )
          axs[1].set_yscale( 'log' )
          axs[1].plot( t, df[(ref_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[1].plot( t, df[(legacy_format,'GPU','bandwidth')], '-o', ms=1, lw=1 )
          axs[1].legend( [ latexFormatName(ref_format), latexFormatName(legacy_format) ], loc='lower left' )
-         axs[1].set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(ref_format)}  performance" )
-         axs[1].set_ylabel( 'Bandwidth in GB/sec' )
+         axs[1].set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(ref_format)}  performance" )
+         axs[1].set_ylabel( 'Effective bandwidth in GB/sec' )
          plt.rcParams.update({
             "text.usetex": True,
             "font.family": "sans-serif",
@@ -460,7 +518,7 @@ def csr_speedup_comparison( df, formats, head_size=10 ):
             axs.plot( t, bar, '-', ms=1, lw=1 )
             axs.legend( [ latexFormatName(format), 'CSR CPU' ], loc='upper right' )
             axs.set_ylabel( 'Speedup' )
-            axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+            axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
             plt.rcParams.update({
                "text.usetex": True,
                "font.family": "sans-serif",
@@ -473,7 +531,7 @@ def csr_speedup_comparison( df, formats, head_size=10 ):
             axs.plot( t, filtered_df[(format,device,'speed-up','CSR CPU')], '-o', ms=1, lw=1 )
             axs.plot( t, bar, '-', ms=1, lw=1 )
             axs.legend( [ latexFormatName(format), 'CSR' ], loc='lower left' )
-            axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+            axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
             axs.set_ylabel( 'Speedup' )
             plt.savefig( f"CSR-speed-up/{format}-{device}-log.pdf")
             plt.close(fig)
@@ -492,7 +550,7 @@ def cusparse_speedup_comparison( df, formats, head_size=10 ):
    profiles = {}
    for format in formats:
       if not format in ['cusparse','CSR']:
-         print( f"Writing comparison of speed-up of {format} ({latexFormatName(format)}) compared to Cusparse" )
+         print( f"Writing comparison of speed-up of {format} ({latexFormatName(format)}) compared to cuSPARSE" )
          df['tmp'] = df[(format, 'GPU','bandwidth')]
          filtered_df=df.dropna(subset=[('tmp','','','')])
          filtered_df.sort_values(by=[(format,'GPU','speed-up','cusparse')],inplace=True,ascending=False)
@@ -503,9 +561,9 @@ def cusparse_speedup_comparison( df, formats, head_size=10 ):
          bar = np.full( size, 1 )
          axs.plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
          axs.plot( t, bar, '-', ms=1, lw=1 )
-         axs.legend( [ latexFormatName(format), 'Cusparse' ], loc='upper right' )
+         axs.legend( [ latexFormatName(format), 'cuSPARSE' ], loc='upper right' )
          axs.set_ylabel( 'Speedup' )
-         axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
          plt.rcParams.update({
             "text.usetex": True,
             "font.family": "sans-serif",
@@ -517,8 +575,8 @@ def cusparse_speedup_comparison( df, formats, head_size=10 ):
          axs.set_yscale( 'log' )
          axs.plot( t, filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
          axs.plot( t, bar, '-', ms=1, lw=1 )
-         axs.legend( [ latexFormatName(format), 'Cusparse' ], loc='lower left' )
-         axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         axs.legend( [ latexFormatName(format), 'cuSPARSE' ], loc='lower left' )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
          axs.set_ylabel( 'Speedup' )
          plt.savefig( f"Cusparse-speed-up/{format}-log.pdf")
          plt.close(fig)
@@ -529,31 +587,31 @@ def cusparse_speedup_comparison( df, formats, head_size=10 ):
          copy_df.to_html( f"Cusparse-speed-up/{format}.html" )
 
    # Draw Ellpack formats profiles
-   xlabel = "Matrix ID - sorted particular by formats speedup compared to Cusparse"
+   xlabel = "Matrix number - sorted by particular formats speedup compared to cuSPARSE"
    ylabel = "Speedup"
    current_formats = []
    for format in formats:
       if( 'Ellpack' in format and not 'Symmetric' in format and not 'Binary' in format and not 'Legacy' in format ):
          current_formats.append( format )
-   draw_profiles( current_formats, profiles, xlabel, ylabel, "ellpack-profiles-cusparse-speedup.pdf", "draw-bar" )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "ellpack-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
 
    current_formats.clear()
    for format in formats:
       if( 'Ellpack' in format and 'Symmetric' in format and not 'Binary' in format and not 'Legacy' in format ):
          current_formats.append( format )
-   draw_profiles( current_formats, profiles, xlabel, ylabel, "symmetric-ellpack-profiles-cusparse-speedup.pdf", "draw-bar" )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "symmetric-ellpack-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
 
    current_formats.clear()
    for format in formats:
       if( 'Ellpack' in format and not 'Symmetric' in format and 'Binary' in format and not 'Legacy' in format ):
          current_formats.append( format )
-   draw_profiles( current_formats, profiles, xlabel, ylabel, "binary-ellpack-profiles-cusparse-speedup.pdf", "draw-bar" )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "binary-ellpack-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
 
    current_formats.clear()
    for format in formats:
       if( 'Ellpack' in format and 'Symmetric' in format and 'Binary' in format and not 'Legacy' in format ):
          current_formats.append( format )
-   draw_profiles( current_formats, profiles, xlabel, ylabel, "symmetric-binary-ellpack-profiles-cusparse-speedup.pdf", "draw-bar" )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "symmetric-binary-ellpack-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
 
 
    # Draw CSR formats profiles
@@ -561,24 +619,24 @@ def cusparse_speedup_comparison( df, formats, head_size=10 ):
    for format in formats:
       if( 'CSR' in format and not 'Symmetric' in format and not 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format and format != 'CSR' ):
          current_formats.append( format )
-   draw_profiles( current_formats, profiles, xlabel, ylabel, "csr-profiles-cusparse-speedup.pdf", "draw-bar" )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "csr-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
    current_formats.clear()
    for format in formats:
       if( 'CSR' in format and 'Symmetric' in format and not 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format and format != 'CSR' ):
          current_formats.append( format )
-   draw_profiles( current_formats, profiles, xlabel, ylabel, "symmetric-csr-profiles-cusparse-speedup.pdf", "draw-bar" )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "symmetric-csr-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
    current_formats.clear()
 
    for format in formats:
       if( 'CSR' in format and not 'Symmetric' in format and 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format and format != 'CSR' ):
          current_formats.append( format )
-   draw_profiles( current_formats, profiles, xlabel, ylabel, "binary-csr-profiles-cusparse-speedup.pdf", "draw-bar" )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "binary-csr-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
    current_formats.clear()
 
    for format in formats:
       if( 'CSR' in format and 'Symmetric' in format and 'Binary' in format and not 'Legacy' in format and not 'Hybrid' in format and format != 'CSR' ):
          current_formats.append( format )
-   draw_profiles( current_formats, profiles, xlabel, ylabel, "-symmetric-binary-csr-profiles-cusparse-speedup.pdf", "draw-bar" )
+   draw_profiles( current_formats, profiles, xlabel, ylabel, "-symmetric-binary-csr-profiles-cusparse-speedup.pdf", 'upper right', "cuSPARSE" )
    current_formats.clear()
 
 ####
@@ -604,7 +662,7 @@ def binary_matrices_comparison( df, formats, head_size = 10 ):
          axs.plot( t, bar, '-', ms=1, lw=1 )
          axs.legend( [ latexFormatName(format), latexFormatName(non_binary_format) ], loc='upper right' )
          axs.set_ylabel( 'Speedup' )
-         axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
          plt.rcParams.update({
             "text.usetex": True,
             "font.family": "sans-serif",
@@ -623,7 +681,7 @@ def binary_matrices_comparison( df, formats, head_size = 10 ):
          axs.plot( t, filtered_df[(format,'GPU','speed-up','non-binary')], '-o', ms=1, lw=1 )
          axs.plot( t, bar, '-', ms=1, lw=1 )
          axs.legend( [ latexFormatName(format), latexFormatName(non_binary_format) ], loc='upper right' )
-         axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
          axs.set_ylabel( 'Speedup' )
          plt.savefig( f"Binary-speed-up/{format}-log.pdf")
          plt.close(fig)
@@ -649,10 +707,14 @@ def symmetric_matrices_comparison( df, formats, head_size = 10 ):
          print( f"Writing comparison of speed-up of {format} vs {non_symmetric_format}" )
          #df['tmp'] = df[(format, 'GPU','speed-up','non-symmetric')]
          filtered_df=df.dropna(subset=[(format, 'GPU','speed-up','non-symmetric')]) #('tmp','','','')])
-         ascend_df = filtered_df.copy()
+         #ascend_df = filtered_df.copy()
          #print( f"{format} -> {filtered_df[(format,'GPU','speed-up','non-symmetric')]}" )
          filtered_df.sort_values(by=[(format,'GPU','speed-up','non-symmetric')],inplace=True,ascending=False)
-         ascend_df.sort_values(by=[(format,'GPU','speed-up','non-symmetric')],inplace=True,ascending=True)
+         #ascend_df.sort_values(by=[(format,'GPU','speed-up','non-symmetric')],inplace=True,ascending=True)
+
+         cusparse_filtered_df=df.dropna(subset=[(format, 'GPU','speed-up','cusparse')]) #('tmp','','','')])
+         cusparse_filtered_df.sort_values(by=[(format,'GPU','speed-up','cusparse')],inplace=True,ascending=False)
+
          fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
          size = len(filtered_df[(format,'GPU','speed-up','non-symmetric')].index)
          t = np.arange( size )
@@ -661,17 +723,11 @@ def symmetric_matrices_comparison( df, formats, head_size = 10 ):
          axs.plot( t, bar, '-', ms=1, lw=1 )
          axs.legend( [ latexFormatName(format), latexFormatName(non_symmetric_format) ], loc='upper right' )
          axs.set_ylabel( 'Speedup' )
-         axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
          plt.rcParams.update({
             "text.usetex": True,
             "font.family": "sans-serif",
             "font.sans-serif": ["Helvetica"]})
-         # for Palatino and other serif fonts use:
-         #plt.rcParams.update({
-         #   "text.usetex": True,
-         #   "font.family": "serif",
-         #   "font.serif": ["Palatino"],
-         #})
          plt.savefig( f"Symmetric-speed-up/{format}.pdf")
          plt.close(fig)
 
@@ -680,12 +736,40 @@ def symmetric_matrices_comparison( df, formats, head_size = 10 ):
          axs.plot( t, filtered_df[(format,'GPU','speed-up','non-symmetric')], '-o', ms=1, lw=1 )
          axs.plot( t, bar, '-', ms=1, lw=1 )
          axs.legend( [ latexFormatName(format), latexFormatName(non_symmetric_format) ], loc='lower left' )
-         axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
          axs.set_ylabel( 'Speedup' )
          plt.savefig( f"Symmetric-speed-up/{format}-log.pdf")
          plt.close(fig)
          #head_df = filtered_df.head( head_size )
          #bottom_df = ascend_df.head( head_size )
+
+         size = len(cusparse_filtered_df[(format,'GPU','speed-up','cusparse')].index)
+         t = np.arange( size )
+         bar = np.full( size, 1 )
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         axs.plot( t, cusparse_filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), 'cuSPARSE' ], loc='upper right' )
+         axs.set_ylabel( 'Speedup' )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         plt.rcParams.update({
+            "text.usetex": True,
+            "font.family": "sans-serif",
+            "font.sans-serif": ["Helvetica"]})
+         plt.savefig( f"Symmetric-speed-up/{format}-cusparse.pdf")
+         plt.close(fig)
+
+         fig, axs = plt.subplots( 1, 1, figsize=(6,4) )
+         axs.set_yscale( 'log' )
+         axs.plot( t, cusparse_filtered_df[(format,'GPU','speed-up','cusparse')], '-o', ms=1, lw=1 )
+         axs.plot( t, bar, '-', ms=1, lw=1 )
+         axs.legend( [ latexFormatName(format), 'cuSPARSE' ], loc='lower left' )
+         axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
+         axs.set_ylabel( 'Speedup' )
+         plt.savefig( f"Symmetric-speed-up/{format}-cusparse-log.pdf")
+         plt.close(fig)
+
+
          copy_df = df.copy()
          for f in formats:
             if not f in ['cusparse','CSR',format,non_symmetric_format]:
@@ -693,7 +777,10 @@ def symmetric_matrices_comparison( df, formats, head_size = 10 ):
                #head_df.drop( labels=f, axis='columns', level=0, inplace=True )
                copy_df.drop( labels=f, axis='columns', level=0, inplace=True )
          #head_df.to_html( f"Symmetric-speed-up/{format}-head.html" )
+         copy_df.sort_values(by=[(format,'GPU','speed-up','non-symmetric')],inplace=True,ascending=False)
          copy_df.to_html( f"Symmetric-speed-up/{format}.html" )
+         #copy_df.sort_values(by=[(format,'GPU','speed-up','non-symmetric')],inplace=True,descending=True)
+         #copy_df.to_html( f"Symmetric-speed-up/{format}-sort.html" )
 
 ####
 # Comparison of speed-up w.r.t. LightSpMV
@@ -713,7 +800,7 @@ def csr_light_speedup_comparison( df, head_size=10 ):
    axs.plot( t, bar, '-', ms=1, lw=1 )
    axs.legend( [ latexFormatName(format), 'LightSpMV' ], loc='upper right' )
    axs.set_ylabel( 'Speedup' )
-   axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+   axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
    plt.rcParams.update({
       "text.usetex": True,
       "font.family": "sans-serif",
@@ -732,7 +819,7 @@ def csr_light_speedup_comparison( df, head_size=10 ):
    axs.plot( t, filtered_df[(format,'GPU','speed-up','LightSpMV Vector')], '-o', ms=1, lw=1 )
    axs.plot( t, bar, '-', ms=1, lw=1 )
    axs.legend( [ latexFormatName(format), 'LightSpMV' ], loc='lower left' )
-   axs.set_xlabel( f"Matrix ID - sorted w.r.t. {latexFormatName(format)} speed-up" )
+   axs.set_xlabel( f"Matrix number - sorted w.r.t. {latexFormatName(format)} speed-up" )
    axs.set_ylabel( 'Speedup' )
    plt.savefig( f"LightSpMV-speed-up-log.pdf")
    plt.close(fig)
@@ -747,6 +834,120 @@ def csr_light_speedup_comparison( df, head_size=10 ):
    #head_df.to_html( f"LightSpMV-speed-up-head.html" )
    copy_df.to_html( f"LightSpMV-speed-up-bottom.html" )
 
+def write_colormap( file, max_bw, size, x_position, y_position, standalone = False ):
+   """Write the colour-map legend as TikZ commands to file.
+   Values 0, 5, 10, ... up to max_bw become coloured dots stacked from y_position over the
+   height size at x_position; every 400th value is labelled. standalone adds a LaTeX wrapper.
+   """
+   if standalone:
+      for line in ( '\\documentclass{standalone}', '\\usepackage[utf8]{inputenc}', '\\usepackage{tikz}',
+                    '\\begin{document}', '\\begin{tikzpicture}' ):
+         file.write( line + '\n' )
+   level = 0
+   while level <= max_bw:
+      y = y_position + level / max_bw * size
+      r, g, b = color_map(level, max_bw, map=heatmap)
+      file.write( f'\\definecolor{{color_hm_{level}}}{{rgb}}{{ {r}, {g}, {b} }}; \n' )
+      file.write( f'\\filldraw[color_hm_{level}] ({x_position},{y}) circle (2pt); \n' )
+      level += 5
+   tick = 0
+   while tick <= max_bw:
+      y = y_position + tick / max_bw * size
+      file.write( f'\\filldraw[black] ({x_position},{y}) circle (1pt) node[anchor=west] {{{tick}}}; \n' )
+      tick += 400
+   if standalone:
+      for line in ( '\\end{tikzpicture}', '\\end{document}' ):
+         file.write( line + '\n' )
+
+def write_performance_circle_latex_base( file_name ):
+   """Write {file_name}-base.tex: a standalone LaTeX document whose tikzpicture inputs {file_name}.tex."""
+   lines = [ '\\documentclass{standalone}', '\\usepackage[utf8]{inputenc}', '\\usepackage{tikz}',
+             '\\begin{document}', '\\begin{tikzpicture}', f'\\input{{{file_name}.tex}}',
+             '\\end{tikzpicture}', '\\end{document}' ]
+   # The with-block closes the file deterministically instead of relying on garbage collection,
+   # so the wrapper is completely on disk before a caller runs pdflatex on it.
+   with open( f'{file_name}-base.tex', 'w') as file:
+      for line in lines:
+         file.write( line + '\n' )
+
+#####
+# Draw performance circle in tikz
+def write_performance_circle( df, formats, circle_formats, file_name, scale=1, with_color_map = False ):
+   """Write the TikZ performance circle to `{file_name}.tex` and compile it with pdflatex.
+
+   Each entry of circle_formats that is also in formats becomes a labelled point on a circle.
+   Each row of df becomes a dot at the average of those points weighted by the row's
+   (format,'GPU','bandwidth','') values, coloured via color_map from the largest of them.
+   Rows whose position evaluates to NaN are skipped and counted in the final message.
+
+   file_name      -- base name; `{file_name}-base.tex` is the wrapper passed to pdflatex
+   scale          -- multiplies the circle centre, the radius and the colour-map position
+   with_color_map -- when true, write_colormap appends its output to the same file
+   """
+   write_performance_circle_latex_base( file_name )
+   used_formats = [ f for f in circle_formats if f in formats ]
+   formats_number = len( used_formats )
+   pos_x = pos_y = rad = 5 * scale
+   formats_pos_x = {}
+   formats_pos_y = {}
+   elim = 0
+   # The with-block closes (and flushes) the file before pdflatex reads it below; leaving it
+   # open would let pdflatex see a drawing whose buffered tail has not been written yet.
+   with open( f'{file_name}.tex', 'w') as file:
+      for format_idx, format in enumerate( used_formats ):
+         format_angle = math.pi/2 - 2*math.pi/formats_number*format_idx - math.pi / formats_number
+         if format_angle < 0:
+            format_angle = 2*math.pi + format_angle
+         x = pos_x + rad*math.cos( format_angle )
+         y = pos_y + rad*math.sin( format_angle )
+         formats_pos_x[ format ] = x
+         formats_pos_y[ format ] = y
+         # Choose the node anchor from the direction of the point as seen from the centre.
+         anchor = ''
+         if format_angle <= math.pi * 1/4  or format_angle > math.pi * 7/4:
+            anchor = 'west'
+         elif format_angle <= math.pi * 3/4 and format_angle > math.pi * 1/4:
+            anchor = 'south'
+         elif format_angle <= math.pi * 5/4 and format_angle > math.pi * 3/4:
+            anchor = 'east'
+         elif format_angle <= math.pi * 7/4 and format_angle > math.pi * 5/4:
+            anchor = 'north'
+         file.write( f'\\filldraw[black] ({x},{y}) circle (2pt) node[anchor={anchor}]{{{latexFormatName(format)}}}; \n' )
+         # Dashed line from the centre to the circle, pi/formats_number past the format's point.
+         div_angle = format_angle + math.pi / formats_number
+         div_x = pos_x + rad*math.cos( div_angle )
+         div_y = pos_y + rad*math.sin( div_angle )
+         file.write( f'\\draw [dashed] ({div_x},{div_y}) -- ({pos_x},{pos_y}); \n')
+      # One dot per row of df.
+      for line_idx in range( len(df.index) ):
+         row = df.iloc[line_idx]
+         formats_bw = {}
+         sum_bw = 0
+         max_bw = 0
+         for format in used_formats:
+            format_bw = row[(format,'GPU','bandwidth','')]
+            formats_bw[ format ] = format_bw
+            sum_bw = sum_bw + format_bw
+            if format_bw > max_bw:
+               max_bw = format_bw
+         # Normalise the bandwidths to weights and accumulate the weighted average position.
+         format_pos_x = 0
+         format_pos_y = 0
+         for format in used_formats:
+            weight = formats_bw[ format ] / sum_bw
+            format_pos_x = format_pos_x + formats_pos_x[ format ] * weight
+            format_pos_y = format_pos_y + formats_pos_y[ format ] * weight
+         if( format_pos_x == format_pos_x  and format_pos_y == format_pos_y ):  # check for NaN
+            r, g, b = color_map(max_bw, 1200, map=heatmap)
+            file.write( f'\\definecolor{{color_{line_idx}}}{{rgb}}{{ {r}, {g}, {b} }} \n' )
+            file.write( f'\\filldraw[color_{line_idx},opacity=0.75] ({format_pos_x},{format_pos_y}) circle (1pt); \n' )
+         else:
+            elim = elim + 1
+      if with_color_map:
+         write_colormap( file, 1200, 5, 13*scale, 1.5*scale, standalone=False )
+   # Compile only after the drawing file has been closed above.
+   os.system( f'pdflatex {file_name}-base.tex' )
+   print( f'Eliminated formats: {elim}')
 
 ####
 # Parse input file
@@ -765,7 +966,7 @@ formats.append('TNL Best')
 multicolumns, df_data = get_multiindex( input_df, formats )
 
 print( "Converting data..." )
-result = convert_data_frame( input_df, multicolumns, df_data, 0, 2000 )
+result = convert_data_frame( input_df, multicolumns, df_data, 0, 20000 )
 compute_speedup( result, formats )
 
 result.replace( to_replace=' ',value=np.nan,inplace=True)
@@ -801,6 +1002,26 @@ def processDf( df, formats, head_size = 10 ):
          sum += cases
    print( f'Total is {sum}.' )
    print( f'Best formats {best_formats}.')
+   write_performance_circle( df, formats,
+         ['cusparse', 'Ellpack', 'SlicedEllpack', 'ChunkedEllpack', 'BiEllpack', 'CSR< Scalar >', 'CSR< Adaptive >', 'CSR< Vector >', 'CSR< Light > Automatic Light'],
+         'performance-graph' )
+
+   scale = 0.6
+   aux_df = df
+   aux_df.sort_values(by=[('SlicedEllpack','GPU','bandwidth')],inplace=True,ascending=True)
+   write_performance_circle( aux_df, formats, ['Ellpack', 'ChunkedEllpack', 'SlicedEllpack' ], 'performance-graph-ellpacks-1', scale, with_color_map = False )
+   write_performance_circle( aux_df, formats, ['BiEllpack', 'ChunkedEllpack', 'SlicedEllpack',  ], 'performance-graph-ellpacks-2', scale, with_color_map = True )
+   #write_performance_circle( df, formats, ['CSR< Scalar >', 'CSR< Adaptive >', 'CSR< Vector >', 'CSR< Light > Automatic Light'], 'performance-graph-csr-1' )
+   aux_df.sort_values(by=[('CSR< Light > Automatic Light','GPU','bandwidth')],inplace=True,ascending=True)
+   write_performance_circle( aux_df, formats, ['CSR< Scalar >', 'CSR< Vector >', 'CSR< Light > Automatic Light'], 'performance-graph-csr-1', scale, with_color_map = False )
+   write_performance_circle( aux_df, formats, ['CSR< Adaptive >', 'CSR< Vector >', 'CSR< Light > Automatic Light'], 'performance-graph-csr-2', scale, with_color_map = False )
+   aux_df.sort_values(by=[('cusparse','GPU','bandwidth')],inplace=True,ascending=True)
+   write_performance_circle( aux_df, formats, ['cusparse', 'SlicedEllpack', 'ChunkedEllpack' ], 'performance-graph-cusparse-ellpacks', scale, with_color_map = False )
+   write_performance_circle( aux_df, formats, ['cusparse', 'CSR< Vector >', 'CSR< Light > Automatic Light'], 'performance-graph-cusparse-csr-1', scale, with_color_map = False )
+   write_performance_circle( aux_df, formats, ['cusparse', 'CSR< Adaptive >', 'CSR< Light > Automatic Light'], 'performance-graph-cusparse-csr-2', scale, with_color_map = True )
+   write_performance_circle( aux_df, formats, ['cusparse', 'CSR< Scalar >', 'CSR< Light > Automatic Light'], 'performance-graph-cusparse-csr-3', scale, with_color_map = False )
+   write_performance_circle( aux_df, formats, ['cusparse', 'SlicedEllpack', 'CSR< Light > Automatic Light'], 'performance-graph-cusparse-csr-ellpack', scale, with_color_map = True )
+
 head_size = 25
 if not os.path.exists( 'general' ):
    os.mkdir( 'general' )
-- 
GitLab


From 2bc2e9b8ce3e9515818e2ac2324b10d93731c367 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 28 Oct 2021 17:51:36 +0200
Subject: [PATCH 117/117] Fixes after rebase.

---
 Documentation/Examples/CMakeLists.txt                     | 6 ++++--
 Documentation/Tutorials/Pointers/CMakeLists.txt           | 8 ++++----
 src/Python/pytnl/tnl/SparseMatrix.h                       | 2 +-
 src/TNL/Algorithms/Segments/BiEllpackView.h               | 4 ++--
 src/TNL/Algorithms/Segments/BiEllpackView.hpp             | 2 +-
 src/TNL/Algorithms/Segments/ChunkedEllpack.hpp            | 2 +-
 src/TNL/Algorithms/Segments/ChunkedEllpackView.h          | 4 ++--
 src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp        | 2 +-
 src/TNL/Algorithms/Segments/EllpackView.hpp               | 8 ++++----
 .../Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp | 2 +-
 src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp   | 2 +-
 src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h      | 2 +-
 src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp    | 6 +++---
 src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp   | 2 +-
 src/TNL/Algorithms/Segments/SlicedEllpack.hpp             | 3 ++-
 src/TNL/Algorithms/Segments/SlicedEllpackView.hpp         | 2 +-
 src/TNL/Algorithms/Segments/detail/BiEllpack.h            | 2 +-
 src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h       | 4 ++--
 src/TNL/Matrices/DenseMatrixView.hpp                      | 8 ++------
 src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp          | 5 +++--
 src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp      | 2 +-
 src/TNL/Matrices/SparseMatrixView.hpp                     | 4 ++--
 22 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/Documentation/Examples/CMakeLists.txt b/Documentation/Examples/CMakeLists.txt
index e984d2f1f..7aa736429 100644
--- a/Documentation/Examples/CMakeLists.txt
+++ b/Documentation/Examples/CMakeLists.txt
@@ -3,7 +3,9 @@ ADD_SUBDIRECTORY( Containers )
 ADD_SUBDIRECTORY( Pointers )
 ADD_SUBDIRECTORY( Matrices )
 
-set( COMMON_EXAMPLES
+set( COMMON_EXAMPLES )
+
+set( CUDA_EXAMPLES
    FileExampleCuda
 )
 
@@ -24,7 +26,7 @@ set( HOST_EXAMPLES
    TimerExampleLogger )
 
 if( BUILD_CUDA )
-   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} ${CUDA_EXAMPLES} )
       cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
       add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
       set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
diff --git a/Documentation/Tutorials/Pointers/CMakeLists.txt b/Documentation/Tutorials/Pointers/CMakeLists.txt
index 0535e8fd5..9b83841fb 100644
--- a/Documentation/Tutorials/Pointers/CMakeLists.txt
+++ b/Documentation/Tutorials/Pointers/CMakeLists.txt
@@ -1,13 +1,13 @@
 IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
+   CUDA_ADD_EXECUTABLE( UniquePointerExample_ UniquePointerExample.cu )
+   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample_ > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
    CUDA_ADD_EXECUTABLE( SharedPointerExample SharedPointerExample.cu )
    ADD_CUSTOM_COMMAND( COMMAND SharedPointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SharedPointerExample.out OUTPUT SharedPointerExample.out )
    CUDA_ADD_EXECUTABLE( DevicePointerExample DevicePointerExample.cu )
    ADD_CUSTOM_COMMAND( COMMAND DevicePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DevicePointerExample.out OUTPUT DevicePointerExample.out )
 ELSE()
-   ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp )
-   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
+   ADD_EXECUTABLE( UniquePointerExample_ UniquePointerExample.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample_ > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
 ENDIF()
 
 ADD_EXECUTABLE( UniquePointerHostExample UniquePointerHostExample.cpp )
diff --git a/src/Python/pytnl/tnl/SparseMatrix.h b/src/Python/pytnl/tnl/SparseMatrix.h
index 1dc375f98..aa0ea3394 100644
--- a/src/Python/pytnl/tnl/SparseMatrix.h
+++ b/src/Python/pytnl/tnl/SparseMatrix.h
@@ -67,7 +67,7 @@ struct export_CSR< Segments, typename TNL::enable_if_type< decltype(Segments{}.g
    static void e( Scope & s )
    {
       s
-         .def("getOffsets", []( const Segments& segments ) -> const typename Segments::OffsetsHolder& {
+         .def("getOffsets", []( const Segments& segments ) -> const typename Segments::OffsetsContainer& {
                   return segments.getOffsets();
             }, py::return_value_policy::reference_internal)
       ;
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h
index 91b055e26..f14282efb 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.h
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.h
@@ -15,7 +15,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
 #include <TNL/Algorithms/Segments/BiEllpackSegmentView.h>
-#include <TNL/Algorithms/Segments/details/BiEllpack.h>
+#include <TNL/Algorithms/Segments/detail/BiEllpack.h>
 #include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 namespace TNL {
@@ -205,7 +205,7 @@ class BiEllpackView
                                              Real_ zero );
 
       template< typename Index_, typename Fetch_, int BlockDim_, int WarpSize_, bool B_ >
-      friend struct details::BiEllpackreduceSegmentsDispatcher;
+      friend struct detail::BiEllpackreduceSegmentsDispatcher;
 #endif
 };
 
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
index 8a1b035aa..2014ae3dc 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
@@ -425,7 +425,7 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
          dim3 cudaGridSize = Cuda::getMaxGridSize();
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         details::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim  >
+         detail::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
             ( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
          cudaThreadSynchronize();
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
index 9a08957da..6218a451c 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
@@ -349,7 +349,7 @@ template< typename Device,
 auto ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
-   return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
+   return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
       rowToSliceMapping.getConstView(),
       slices.getConstView(),
       rowToChunkMapping.getConstView(),
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
index ae400d2fe..0ed8ed413 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
@@ -16,7 +16,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpackSegmentView.h>
-#include <TNL/Algorithms/Segments/details/ChunkedEllpack.h>
+#include <TNL/Algorithms/Segments/detail/ChunkedEllpack.h>
 #include <TNL/Algorithms/Segments/SegmentsPrinting.h>
 
 namespace TNL {
@@ -228,7 +228,7 @@ class ChunkedEllpackView
                                                   Real_ zero );
 
       template< typename Index_, typename Fetch_, bool B_ >
-      friend struct details::ChunkedEllpackreduceSegmentsDispatcher;
+      friend struct detail::ChunkedEllpackreduceSegmentsDispatcher;
 #endif
 };
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
index a48afead5..6133a8438 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
@@ -456,7 +456,7 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
       {
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         details::ChunkedEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real  >
+         detail::ChunkedEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
             ( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
       }
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index e283a75d0..b5311d793 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -98,7 +98,7 @@ template< typename Index,
           typename Reduction,
           typename ResultKeeper,
           typename Real,
-          bool FullFetch = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+          bool FullFetch = detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
 struct EllpackCudaReductionDispatcher
 {
    static void
@@ -393,7 +393,7 @@ void EllpackView< Device, Index, Organization, Alignment >::
 reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
    //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >() ) );
-   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( Organization == RowMajorOrder )
    {
       if( std::is_same< Device, Devices::Cuda >::value )
@@ -404,11 +404,11 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
          auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable {
             const IndexType begin = segmentIdx * segmentSize;
             const IndexType end = begin + segmentSize;
-            RealType aux( zero );
+            Real aux( zero );
             IndexType localIdx( 0 );
             bool compute( true );
             for( IndexType j = begin; j < end && compute; j++  )
-               aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
+               aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
             keeper( segmentIdx, aux );
          };
          Algorithms::ParallelFor< Device >::exec( first, last, l );
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
index f213e9523..9c495fd70 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
@@ -237,7 +237,7 @@ struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reductio
 
       Index blocksCount;
 
-      const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
+      const Index threads = detail::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
       constexpr size_t maxGridSize = TNL::Cuda::getMaxGridXSize();
 
       // Fill blocks
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
index 07225cc4e..68198f995 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
@@ -111,7 +111,7 @@ void reduceSegmentsCSRHybridMultivectorKernel(
     Index localIdx = laneIdx;
     for( Index globalIdx = beginIdx + laneIdx; globalIdx < endIdx && compute; globalIdx += ThreadsPerSegment )
     {
-       result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+       result = reduce( result, detail::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
        localIdx += ThreadsPerSegment;
     }
     result += __shfl_down_sync(0xFFFFFFFF, result, 16);
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
index 50322b826..be5fc1331 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h
@@ -14,7 +14,7 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 namespace TNL {
    namespace Algorithms {
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
index 1c3518288..93d3e2800 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -14,7 +14,7 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 #include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h>
 
 namespace TNL {
@@ -317,7 +317,7 @@ void reduceSegmentsCSRLightMultivectorKernel(
     Index localIdx = laneIdx;
     for( Index globalIdx = beginIdx + laneIdx; globalIdx < endIdx && compute; globalIdx += ThreadsPerSegment )
     {
-       result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+       result = reduce( result, detail::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
        localIdx += ThreadsPerSegment;
     }
     result += __shfl_down_sync(0xFFFFFFFF, result, 16);
@@ -377,7 +377,7 @@ template< typename Index,
           typename Reduce,
           typename Keep,
           bool DispatchScalarCSR =
-            details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ||
+            detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() ||
             std::is_same< Device, Devices::Host >::value >
 struct CSRLightKernelreduceSegmentsDispatcher;
 
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp
index 5b9c5e723..e901acfb9 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRScalarKernel.hpp
@@ -26,7 +26,7 @@ template< typename Index,
           typename Fetch,
           typename Reduce,
           typename Keep,
-          bool DispatchScalarCSR = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+          bool DispatchScalarCSR = detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
 struct CSRScalarKernelreduceSegmentsDispatcher;
 
 template< typename Index,
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
index 652eceb56..b1e0a21f3 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
@@ -168,7 +168,8 @@ setSegmentsSizes( const SizesHolder& sizes )
       slice_segment_size_view[ i ] = res;
    };
    ellpack.reduceAllSegments( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
-   this->sliceOffsets.template scan< Algorithms::ScanType::Exclusive >();
+   Algorithms::inplaceExclusiveScan( this->sliceOffsets );
+   //this->sliceOffsets.template exclusiveScan< Algorithms::detail::ScanType::Exclusive >();
    this->size = sum( sizes );
    this->alignedSize = this->sliceOffsets.getElement( slicesCount );
 }
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
index d7ef9524c..80700367c 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
@@ -330,7 +330,7 @@ void
 SlicedEllpackView< Device, Index, Organization, SliceSize >::
 reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const
 {
-   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >() ) );
    const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView();
    const auto sliceOffsets_view = this->sliceOffsets.getConstView();
diff --git a/src/TNL/Algorithms/Segments/detail/BiEllpack.h b/src/TNL/Algorithms/Segments/detail/BiEllpack.h
index db64d392d..f5f51f020 100644
--- a/src/TNL/Algorithms/Segments/detail/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/detail/BiEllpack.h
@@ -292,7 +292,7 @@ template< typename Index,
           typename Fetch,
           int BlockDim = 256,
           int WarpSize = 32,
-          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+          bool HasAllParameters = detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
 struct BiEllpackreduceSegmentsDispatcher{};
 
 template< typename Index, typename Fetch, int BlockDim, int WarpSize >
diff --git a/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
index d9a6c30f2..ed6163f3f 100644
--- a/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
@@ -65,7 +65,7 @@ class ChunkedEllpack
       using OffsetsContainer = Containers::Vector< IndexType, DeviceType, IndexType >;
       using OffsetsHolderView = typename OffsetsContainer::ConstViewType;
       using SegmentsSizes = OffsetsContainer;
-      using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >;
+      using ChunkedEllpackSliceInfoType = detail::ChunkedEllpackSliceInfo< IndexType >;
       using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
       using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
       using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ConstViewType;
@@ -233,7 +233,7 @@ class ChunkedEllpack
 #ifdef HAVE_CUDA
 template< typename Index,
           typename Fetch,
-          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+          bool HasAllParameters = detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
 struct ChunkedEllpackreduceSegmentsDispatcher{};
 
 template< typename Index, typename Fetch >
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index b8ad99394..3a44269d1 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -485,7 +485,7 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
          return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
       return identity;
    };
-   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, zero );
+   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -502,11 +502,7 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
          return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
       return identity;
    };
-<<<<<<< HEAD
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
-=======
-   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, zero );
->>>>>>> Renaming segmentsReduction to reduceSegments.
+   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp
index 63f49e6c8..e21e42042 100644
--- a/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp
@@ -12,7 +12,7 @@
 
 #include <functional>
 #include <sstream>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Matrices/Sandbox/SparseSandboxMatrix.h>
 
 namespace TNL {
@@ -253,7 +253,8 @@ setRowCapacities( const RowsCapacitiesVector& rowsCapacities )
       }
    }
    this->rowPointers.setElement( this->getRows(), 0 );
-   this->rowPointers.template scan< Algorithms::ScanType::Exclusive >();
+   Algorithms::inplaceExclusiveScan( this->rowPointers );
+   //this->rowPointers.template scan< Algorithms::ScanType::Exclusive >();
    // End of sparse matrix format initiation.
 
    // SANDBOX_TODO: Compute number of all elements that need to be allocated by your format.
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
index 421b5c129..07342e8e7 100644
--- a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
@@ -12,7 +12,7 @@
 
 #include <functional>
 #include <TNL/Matrices/Sandbox/SparseSandboxMatrixView.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Algorithms/AtomicOperations.h>
 #include <TNL/Matrices/details/SparseMatrix.h>
 
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index e32236b3c..c3f7387fd 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -520,7 +520,7 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
       }
       return identity;
    };
-   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, zero );
+   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -549,7 +549,7 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
       }
       return identity;
    };
-   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, zero );
+   this->segments.reduceSegments( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
-- 
GitLab