diff --git a/.gitignore b/.gitignore index c5045f3562d2db78a4111ef0c1a7dd19e5ca9aa3..15a758dbdd2eee9d951328e4b7dd408c9d30b1e1 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ /.settings /.project /.pydevproject + +# VSCode +/.vscode diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d9b66026890c5a179a655f76e33c6bb44f7061e1..4ad81fd81169cc932f0326a8c37cf5b05a494c51 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -76,7 +76,7 @@ stages: - cmake ../.. -G Ninja -DCMAKE_BUILD_TYPE=${BUILD_TYPE} - -DCMAKE_INSTALL_PREFIX=$(pwd)/${BUILD_TYPE}_install_prefix + -DCMAKE_INSTALL_PREFIX="$(pwd)/${BUILD_TYPE}_install_prefix" -DWITH_OPENMP=${WITH_OPENMP} -DWITH_MPI=${WITH_MPI} -DWITH_CUDA=${WITH_CUDA} @@ -102,8 +102,6 @@ stages: only: changes: - src/**/*.{h,hpp,cpp,cu} - - Documentation/Examples/**/*.{h,hpp,cpp,cu} - - Documentation/Tutorials/**/*.{h,hpp,cpp,cu} - "**/CMakeLists.txt" - .gitlab-ci.yml interruptible: true @@ -117,11 +115,12 @@ dummy build job: - merge_requests except: changes: + # .build_template - src/**/*.{h,hpp,cpp,cu} - - Documentation/Examples/**/*.{h,hpp,cpp,cu} - - Documentation/Tutorials/**/*.{h,hpp,cpp,cu} - "**/CMakeLists.txt" - .gitlab-ci.yml + # build documentation + - Documentation/**/* # Cuda builds are specified first because they take more time than host-only builds, # which can be allocated on hosts whitout GPUs. 
@@ -189,12 +188,6 @@ cuda_examples_Debug: WITH_CUDA: "yes" BUILD_TYPE: Debug WITH_EXAMPLES: "yes" - # build output snippets for documentation - WITH_DOC: "yes" - # store output snippets for documentation - artifacts: - paths: - - Documentation/output_snippets/ cuda_examples_Release: extends: .build_template @@ -468,6 +461,28 @@ clang_mpi_benchmarks_tools_python_Release: +documentation examples: + extends: .build_template + stage: build:cuda + tags: + - docker + - nvidia + variables: + <<: *default_cmake_flags + WITH_CUDA: "yes" + BUILD_TYPE: Debug + # build output snippets for documentation + WITH_DOC: "yes" + only: + changes: + - Documentation/**/* + - src/TNL/**/*.{h,hpp} + - .gitlab-ci.yml + # store output snippets for documentation + artifacts: + paths: + - Documentation/output_snippets/ + build documentation: stage: build:doc only: @@ -477,16 +492,13 @@ build documentation: - .gitlab-ci.yml # use "needs" instead of "dependencies" to allow out-of-order start of this job needs: - # the job which builds Documentation/output_snippets/ - - job: cuda_examples_Debug + - job: documentation examples artifacts: true script: - ./Documentation/build artifacts: paths: - ./Documentation/html/ -# tags: -# - doxygen deploy documentation: stage: deploy diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_setElement.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_setElement.cpp index 9441cc60d8418030bf9cd6951483a726532d85f0..221a23c0c460203203eb9117a2a7c34bfd9122ac 100644 --- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_setElement.cpp +++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_setElement.cpp @@ -16,8 +16,8 @@ void setElements() std::cout << "Matrix set from the host:" << std::endl; std::cout << *matrix << std::endl; - auto f = [=] __cuda_callable__ ( int i ) mutable { - matrix->setElement( i, i, -i ); + auto f = [=] __cuda_callable__ ( int i, int j ) mutable { + matrix->addElement( i, j, 
5.0 ); }; /*** @@ -26,7 +26,7 @@ void setElements() * DenseMatrixView::getRow example for details. */ TNL::Pointers::synchronizeSmartPointersOnDevice< Device >(); - TNL::Algorithms::ParallelFor< Device >::exec( 0, 5, f ); + TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, 5, 5, f ); std::cout << "Matrix set from its native device:" << std::endl; std::cout << *matrix << std::endl; diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_setElement.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_setElement.cpp index 92985bc5aafc465277d2c571a20d7f64391d6357..b3760d976121f9c6eda53fa2a10cb7cdca61ce10 100644 --- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_setElement.cpp +++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_setElement.cpp @@ -14,10 +14,10 @@ void setElements() std::cout << "Matrix set from the host:" << std::endl; std::cout << matrix << std::endl; - auto f = [=] __cuda_callable__ ( int i ) mutable { - matrixView.setElement( i, i, -i ); + auto f = [=] __cuda_callable__ ( int i, int j ) mutable { + matrixView.addElement( i, j, 5.0 ); }; - TNL::Algorithms::ParallelFor< Device >::exec( 0, 5, f ); + TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, 5, 5, f ); std::cout << "Matrix set from its native device:" << std::endl; std::cout << matrix << std::endl; diff --git a/Documentation/Examples/Matrices/LambdaMatrix/CMakeLists.txt b/Documentation/Examples/Matrices/LambdaMatrix/CMakeLists.txt index 6315309b2362e822e35569316331e819e115566d..9bb9556267c8efb15163a6479835785df3930ac0 100644 --- a/Documentation/Examples/Matrices/LambdaMatrix/CMakeLists.txt +++ b/Documentation/Examples/Matrices/LambdaMatrix/CMakeLists.txt @@ -13,9 +13,19 @@ ADD_CUSTOM_COMMAND( COMMAND LambdaMatrixExample_getNonzeroElementsCount > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/LambdaMatrixExample_getNonzeroElementsCount.out OUTPUT LambdaMatrixExample_getNonzeroElementsCount.out ) - IF( 
BUILD_CUDA ) - CUDA_ADD_EXECUTABLE( LambdaMatrixExample_rowsReduction_cuda LambdaMatrixExample_rowsReduction.cu ) + CUDA_ADD_EXECUTABLE( LambdaMatrixExample_Laplace_cuda LambdaMatrixExample_Laplace.cu ) + ADD_CUSTOM_COMMAND( COMMAND LambdaMatrixExample_Laplace_cuda > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/LambdaMatrixExample_Laplace.out + OUTPUT LambdaMatrixExample_Laplace.out ) + + CUDA_ADD_EXECUTABLE( LambdaMatrixExample_Laplace_2_cuda LambdaMatrixExample_Laplace_2.cu ) + ADD_CUSTOM_COMMAND( COMMAND LambdaMatrixExample_Laplace_2_cuda > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/LambdaMatrixExample_Laplace_2.out + OUTPUT LambdaMatrixExample_Laplace_2.out ) + + + CUDA_ADD_EXECUTABLE( LambdaMatrixExample_rowsReduction_cuda LambdaMatrixExample_rowsReduction.cu ) ADD_CUSTOM_COMMAND( COMMAND LambdaMatrixExample_rowsReduction_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/LambdaMatrixExample_rowsReduction.out OUTPUT LambdaMatrixExample_rowsReduction.out ) @@ -36,6 +46,16 @@ IF( BUILD_CUDA ) OUTPUT LambdaMatrixExample_forAllRows.out ) ELSE() + ADD_EXECUTABLE( LambdaMatrixExample_Laplace LambdaMatrixExample_Laplace.cpp ) + ADD_CUSTOM_COMMAND( COMMAND LambdaMatrixExample_Laplace > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/LambdaMatrixExample_Laplace.out + OUTPUT LambdaMatrixExample_Laplace.out ) + + ADD_EXECUTABLE( LambdaMatrixExample_Laplace_2 LambdaMatrixExample_Laplace_2.cpp ) + ADD_CUSTOM_COMMAND( COMMAND LambdaMatrixExample_Laplace_2 > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/LambdaMatrixExample_Laplace_2.out + OUTPUT LambdaMatrixExample_Laplace_2.out ) + ADD_EXECUTABLE( LambdaMatrixExample_rowsReduction LambdaMatrixExample_rowsReduction.cpp ) ADD_CUSTOM_COMMAND( COMMAND LambdaMatrixExample_rowsReduction > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/LambdaMatrixExample_rowsReduction.out @@ -59,6 +79,8 @@ ENDIF() ADD_CUSTOM_TARGET( RunLambdaMatricesExamples ALL DEPENDS LambdaMatrixExample_Constructor.out + LambdaMatrixExample_Laplace.out + 
LambdaMatrixExample_Laplace_2.out LambdaMatrixExample_getCompressedRowLengths.out LambdaMatrixExample_getNonzeroElementsCount.out LambdaMatrixExample_rowsReduction.out diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp index 2c418dd5407db95fa75a3ea49c9664db4be19fa2..fd6d2e1dfcc12a67e2efd83c69d7541f52a0c7d6 100644 --- a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp @@ -6,7 +6,7 @@ int main( int argc, char* argv[] ) /*** * Lambda functions defining the matrix. */ - auto rowLengths = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx ) -> int { return 1; }; + auto compressedRowLengths = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx ) -> int { return 1; }; auto matrixElements1 = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx, const int localIdx, int& columnIdx, double& value ) { columnIdx = rowIdx; value = 1.0; @@ -21,13 +21,13 @@ int main( int argc, char* argv[] ) /*** * Matrix construction with explicit type definition. */ - using MatrixType = decltype( TNL::Matrices::LambdaMatrixFactory< double, TNL::Devices::Host, int >::create( matrixElements1, rowLengths ) ); - MatrixType m1( size, size, matrixElements1, rowLengths ); + using MatrixType = decltype( TNL::Matrices::LambdaMatrixFactory< double, TNL::Devices::Host, int >::create( matrixElements1, compressedRowLengths ) ); + MatrixType m1( size, size, matrixElements1, compressedRowLengths ); /*** * Matrix construction using 'auto'. 
*/ - auto m2 = TNL::Matrices::LambdaMatrixFactory< double, TNL::Devices::Host, int >::create( matrixElements2, rowLengths ); + auto m2 = TNL::Matrices::LambdaMatrixFactory< double, TNL::Devices::Host, int >::create( matrixElements2, compressedRowLengths ); m2.setDimensions( size, size ); std::cout << "The first lambda matrix: " << std::endl << m1 << std::endl; diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cpp new file mode 100644 index 0000000000000000000000000000000000000000..26b3e741a855ab0d7b886f42ca3aee3899e4648e --- /dev/null +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cpp @@ -0,0 +1,75 @@ +#include +#include +#include +#include + + +template< typename Device > +void laplaceOperatorMatrix() +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method. + */ + const int gridSize( 4 ); + const int matrixSize = gridSize * gridSize; + auto rowLengths = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx ) -> int + { + const int gridRow = rowIdx / gridSize; // coordinates in the numerical grid + const int gridColumn = rowIdx % gridSize; + if( gridRow == 0 || gridRow == gridSize - 1 || // boundary grid node + gridColumn == 0 || gridColumn == gridSize - 1 ) + return 1; + return 5; + }; + auto matrixElements = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx, const int localIdx, int& columnIdx, double& value) { + const int gridRow = rowIdx / gridSize; // coordinates in the numerical grid + const int gridColumn = rowIdx % gridSize; + if( gridRow == 0 || gridRow == gridSize - 1 || // boundary grid node + gridColumn == 0 || gridColumn == gridSize - 1 ) + { + columnIdx = rowIdx; // diagonal element .... + value = 1.0; // ... 
is set to 1 + } + else // interior grid node + { + switch( localIdx ) // set diagonal element to -4 + { // and the others to 1 + case 0: + columnIdx = rowIdx - gridSize; + value = 1; + break; + case 1: + columnIdx = rowIdx - 1; + value = 1; + break; + case 2: + columnIdx = rowIdx; + value = -4; + break; + case 3: + columnIdx = rowIdx + 1; + value = 1; + break; + case 4: + columnIdx = rowIdx + gridSize; + value = 1; + break; + } + } + }; + auto matrix = TNL::Matrices::LambdaMatrixFactory< double, Device, int >::create( + matrixSize, matrixSize, matrixElements, rowLengths ); + std::cout << "Laplace operator matrix: " << std::endl << matrix << std::endl; +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Creating Laplace operator matrix on CPU ... " << std::endl; + laplaceOperatorMatrix< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Creating Laplace operator matrix on CUDA GPU ... " << std::endl; + laplaceOperatorMatrix< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cu b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cu new file mode 120000 index 0000000000000000000000000000000000000000..288e1097e4ec64e5b9490a8a0a67371a50151128 --- /dev/null +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace.cu @@ -0,0 +1 @@ +LambdaMatrixExample_Laplace.cpp \ No newline at end of file diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace_2.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace_2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3c70325d34a2c8ce3f17341f217e79f77168ff35 --- /dev/null +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace_2.cpp @@ -0,0 +1,63 @@ +#include +#include +#include +#include + + +template< typename Device > +void laplaceOperatorMatrix() +{ + /*** + * Set matrix representing approximation of the 
Laplace operator on regular + * grid using the finite difference method. + */ + const int gridSize( 4 ); + const int matrixSize = gridSize * gridSize; + TNL::Containers::Vector< int, Device > columnOffsets{ 0, -1, 0, 1, 0 }; // helper vector for getting matrix elements column indexes + columnOffsets.setElement( 0, -gridSize ); + columnOffsets.setElement( 4, gridSize ); + auto columnOffsetsView = columnOffsets.getView(); + TNL::Containers::Vector< double, Device > values{ 1, 1, -4, 1, 1 }; // helper vector for getting matrix elements values + auto valuesView = values.getView(); + + auto rowLengths = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx ) -> int + { + const int gridRow = rowIdx / gridSize; // coordinates in the numerical grid + const int gridColumn = rowIdx % gridSize; + + if( gridRow == 0 || gridRow == gridSize - 1 || // boundary grid node + gridColumn == 0 || gridColumn == gridSize - 1 ) + return 1; + return 5; + }; + + auto matrixElements = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx, const int localIdx, int& columnIdx, double& value) { + const int gridRow = rowIdx / gridSize; // coordinates in the numerical grid + const int gridColumn = rowIdx % gridSize; + if( gridRow == 0 || gridRow == gridSize - 1 || // boundary grid node + gridColumn == 0 || gridColumn == gridSize - 1 ) + { + columnIdx = rowIdx; // diagonal element .... + value = 1.0; // ... is set to 1 + } + else // interior grid node + { + columnIdx = rowIdx + columnOffsetsView[ localIdx ]; + value = valuesView[ localIdx ]; + } + }; + auto matrix = TNL::Matrices::LambdaMatrixFactory< double, Device, int >::create( + matrixSize, matrixSize, matrixElements, rowLengths ); + std::cout << "Laplace operator matrix: " << std::endl << matrix << std::endl; +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Creating Laplace operator matrix on CPU ... 
" << std::endl; + laplaceOperatorMatrix< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Creating Laplace operator matrix on CUDA GPU ... " << std::endl; + laplaceOperatorMatrix< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace_2.cu b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace_2.cu new file mode 120000 index 0000000000000000000000000000000000000000..30b1ab049b1f264ab6faca73f9ce154ce282b8a7 --- /dev/null +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_Laplace_2.cu @@ -0,0 +1 @@ +LambdaMatrixExample_Laplace_2.cpp \ No newline at end of file diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllRows.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllRows.cpp index 72ff9610141cdcc702bfc25128d37fbff2eec423..88ceb5687d2351fbdee9ee77d1613aef0291b115 100644 --- a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllRows.cpp +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_forAllRows.cpp @@ -10,14 +10,14 @@ void forRowsExample() /*** * Lambda functions defining the matrix. 
*/ - auto rowLengths = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx ) -> int { return columns; }; + auto compressedRowLengths = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx ) -> int { return columns; }; auto matrixElements = [=] __cuda_callable__ ( const int rows, const int columns, const int rowIdx, const int localIdx, int& columnIdx, double& value ) { columnIdx = localIdx; value = TNL::max( rowIdx - columnIdx + 1, 0 ); }; using MatrixFactory = TNL::Matrices::LambdaMatrixFactory< double, Device, int >; - auto matrix = MatrixFactory::create( 5, 5, matrixElements, rowLengths ); + auto matrix = MatrixFactory::create( 5, 5, matrixElements, compressedRowLengths ); TNL::Matrices::DenseMatrix< double, Device > denseMatrix( 5, 5 ); auto denseView = denseMatrix.getView(); diff --git a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_rowsReduction.cpp b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_rowsReduction.cpp index 17f3ace0d64ec2f95b4c20f28ec2609c2a36f3f7..4cb0aedab1f684f9196e6c9e56439cb5b195d452 100644 --- a/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_rowsReduction.cpp +++ b/Documentation/Examples/Matrices/LambdaMatrix/LambdaMatrixExample_rowsReduction.cpp @@ -40,7 +40,7 @@ void rowsReduction() /*** * Reduce lambda return maximum of given values. 
*/ - auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double { + auto reduce = [=] __cuda_callable__ ( const double& a, const double& b ) -> double { return TNL::max( a, b ); }; diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_Constructor.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_Constructor.cpp index 8f8b8139bc172321d2037d931b51652c506581f2..0744804adedd5dbe51bf1ea7cece3c6f51d0294b 100644 --- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_Constructor.cpp +++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_Constructor.cpp @@ -14,8 +14,8 @@ void laplaceOperatorMatrix() */ const int gridSize( 4 ); const int matrixSize = gridSize * gridSize; - TNL::Containers::Vector< int, Device > shifts { - gridSize, -1, 0, 1, gridSize }; - TNL::Matrices::MultidiagonalMatrix< double, Device > matrix( matrixSize, matrixSize, shifts ); + TNL::Containers::Vector< int, Device > offsets { - gridSize, -1, 0, 1, gridSize }; + TNL::Matrices::MultidiagonalMatrix< double, Device > matrix( matrixSize, matrixSize, offsets ); auto matrixView = matrix.getView(); auto f = [=] __cuda_callable__ ( int i, int j ) mutable { const int elementIdx = j * gridSize + i; diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_rowsReduction.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_rowsReduction.cpp index dc3d4048384c48e85d952d2f35a10ad55a40d491..2b579d96372fff132b063ecd166536fcd3f57f10 100644 --- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_rowsReduction.cpp +++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_rowsReduction.cpp @@ -48,7 +48,7 @@ void rowsReduction() /*** * Reduce lambda return maximum of given values. 
*/ - auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double { + auto reduce = [=] __cuda_callable__ ( const double& a, const double& b ) -> double { return TNL::max( a, b ); }; diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getRow.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getRow.cpp index ac322f9aa61ccece6b0f5da4e6d911b29d0d48b7..db0872b103ca768d19c0ce63434a88036f5f140a 100644 --- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getRow.cpp +++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getRow.cpp @@ -8,14 +8,11 @@ template< typename Device > void getRowExample() { const int matrixSize( 5 ); - auto diagonalsOffsets = { -1, 0, 1 }; // Variadic templates in SharedPointer - // constructor do not recognize initializer - // list so we give it a hint. using MatrixType = TNL::Matrices::MultidiagonalMatrix< double, Device >; MatrixType matrix( matrixSize, // number of matrix rows matrixSize, // number of matrix columns - diagonalsOffsets ); + { -1, 0, 1 } ); auto view = matrix.getView(); auto f = [=] __cuda_callable__ ( int rowIdx ) mutable { diff --git a/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt b/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt index 3f0410315d45d92555668125a50a258d07df97d1..e4000dec8f5469f7b22dc6e8eaecda76cad1682c 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt +++ b/Documentation/Examples/Matrices/SparseMatrix/CMakeLists.txt @@ -9,6 +9,11 @@ IF( BUILD_CUDA ) ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_init_list_2.out OUTPUT SparseMatrixExample_Constructor_init_list_2.out ) + CUDA_ADD_EXECUTABLE( SparseMatrixExample_Constructor_rowCapacities_vector_cuda SparseMatrixExample_Constructor_rowCapacities_vector.cu ) + ADD_CUSTOM_COMMAND( COMMAND 
SparseMatrixExample_Constructor_rowCapacities_vector_cuda > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_rowCapacities_vector.out + OUTPUT SparseMatrixExample_Constructor_rowCapacities_vector.out ) + CUDA_ADD_EXECUTABLE( SparseMatrixExample_Constructor_std_map_cuda SparseMatrixExample_Constructor_std_map.cu ) ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_Constructor_std_map_cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_std_map.out @@ -150,6 +155,11 @@ ELSE() ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_init_list_2.out OUTPUT SparseMatrixExample_Constructor_init_list_2.out ) + ADD_EXECUTABLE( SparseMatrixExample_Constructor_rowCapacities_vector SparseMatrixExample_Constructor_rowCapacities_vector.cpp ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_Constructor_rowCapacities_vector > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_rowCapacities_vector.out + OUTPUT SparseMatrixExample_Constructor_rowCapacities_vector.out ) + ADD_EXECUTABLE( SparseMatrixExample_Constructor_std_map SparseMatrixExample_Constructor_std_map.cpp ) ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_Constructor_std_map > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_std_map.out @@ -285,6 +295,7 @@ ENDIF() ADD_CUSTOM_TARGET( RunSparseMatricesExamples ALL DEPENDS SparseMatrixExample_Constructor_init_list_1.out SparseMatrixExample_Constructor_init_list_2.out + SparseMatrixExample_Constructor_rowCapacities_vector.out SparseMatrixExample_Constructor_std_map.out SparseMatrixExample_getSerializationType.out SparseMatrixExample_setRowCapacities.out diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..0ad6e6c4c310a9907767cb3f723ee8fab6f46f03 --- /dev/null +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp @@ -0,0 +1,30 @@ +#include +#include +#include +#include + + +template< typename Device > +void initializerListExample() +{ + TNL::Containers::Vector< int, Device > rowCapacities{ 1, 2, 3, 4, 5 }; + TNL::Matrices::SparseMatrix< double, Device > matrix { + rowCapacities, // row capacities + 6 }; // number of matrix columns + + for( int row = 0; row < matrix.getRows(); row++ ) + for( int column = 0; column <= row; column++ ) + matrix.setElement( row, column, row - column + 1 ); + std::cout << "General sparse matrix: " << std::endl << matrix << std::endl; +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Creating matrices on CPU ... " << std::endl; + initializerListExample< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Creating matrices on CUDA GPU ... " << std::endl; + initializerListExample< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cu b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cu new file mode 120000 index 0000000000000000000000000000000000000000..e409998b7935872bd54daf3f4d5909416587f66a --- /dev/null +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cu @@ -0,0 +1 @@ +SparseMatrixExample_Constructor_rowCapacities_vector.cpp \ No newline at end of file diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllRows.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllRows.cpp index 739600539260bba9e11c703c83b2d56ed8a75ff7..a8f6108bcc774ad55641e87f56e88f6040a98e93 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllRows.cpp +++ 
b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forAllRows.cpp @@ -9,13 +9,13 @@ void forAllRowsExample() TNL::Matrices::SparseMatrix< double, Device > matrix( { 1, 2, 3, 4, 5 }, 5 ); auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) { - if( rowIdx < columnIdx ) // This is important, some matrix formats may allocate more matrix elements - // than we requested. These padding elements are processed here as well. + if( rowIdx < localIdx ) // This is important, some matrix formats may allocate more matrix elements + // than we requested. These padding elements are processed here as well. compute = false; else { columnIdx = localIdx; - value = rowIdx + localIdx; + value = rowIdx + localIdx + 1; } }; diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp index 2330c2ca5d94439726dc4df53ef9977116d43de0..0e2ee342356ff67f8734c584e368ce8e133d34e3 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp @@ -9,13 +9,13 @@ void forRowsExample() TNL::Matrices::SparseMatrix< double, Device > matrix( { 1, 2, 3, 4, 5 }, 5 ); auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) { - if( rowIdx < columnIdx ) // This is important, some matrix formats may allocate more matrix elements - // than we requested. These padding elements are processed here as well. + if( rowIdx < localIdx ) // This is important, some matrix formats may allocate more matrix elements + // than we requested. These padding elements are processed here as well. 
compute = false; else { columnIdx = localIdx; - value = rowIdx + localIdx; + value = rowIdx + localIdx + 1; } }; diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setElement.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setElement.cpp index 178e502dca1fe4b249397173fda21305e3e152e6..d9a3ae4634a5b1481b5ef7875db5a6193fe3c0f5 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setElement.cpp +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setElement.cpp @@ -11,12 +11,19 @@ void setElements() { auto rowCapacities = { 1, 1, 1, 1, 1 }; TNL::Pointers::SharedPointer< TNL::Matrices::SparseMatrix< double, Device > > matrix( rowCapacities, 5 ); + + /*** + * Calling the method setElements from host (CPU). + */ for( int i = 0; i < 5; i++ ) matrix->setElement( i, i, i ); std::cout << "Matrix set from the host:" << std::endl; std::cout << *matrix << std::endl; + /*** + * This lambda function will run on the native device of the matrix which can be CPU or GPU. + */ auto f = [=] __cuda_callable__ ( int i ) mutable { matrix->setElement( i, i, -i ); }; diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setRowCapacities.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setRowCapacities.cpp index f282aee6d724c01925d83ee9c9ec79ac3d1a8a66..fbf87e57a41999c381dabc8f9a8349a6234baa66 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setRowCapacities.cpp +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_setRowCapacities.cpp @@ -19,11 +19,11 @@ void setRowCapacitiesExample() int main( int argc, char* argv[] ) { - std::cout << "Creating matrices on CPU ... " << std::endl; + std::cout << "Creating matrix on CPU ... " << std::endl; setRowCapacitiesExample< TNL::Devices::Host >(); #ifdef HAVE_CUDA - std::cout << "Creating matrices on CUDA GPU ... 
" << std::endl; + std::cout << "Creating matrix on CUDA GPU ... " << std::endl; setRowCapacitiesExample< TNL::Devices::Cuda >(); #endif } diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllRows.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllRows.cpp index fda71a42ff2cbf17520e36ca1390f311441a0c98..ee09d61212b410404e7f267c1abb2beef0d78541 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllRows.cpp +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forAllRows.cpp @@ -10,13 +10,13 @@ void forAllRowsExample() auto view = matrix.getView(); auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) { - if( rowIdx < columnIdx ) // This is important, some matrix formats may allocate more matrix elements - // than we requested. These padding elements are processed here as well. + if( rowIdx < localIdx ) // This is important, some matrix formats may allocate more matrix elements + // than we requested. These padding elements are processed here as well. 
compute = false; else { columnIdx = localIdx; - value = rowIdx + localIdx; + value = rowIdx + localIdx + 1; } }; diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp index 987c3dec4364fac94cf9c25f2dd7c4aa8493f184..8b76bae18572c1d7e1aa827d3625b0d674c0054f 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_forRows.cpp @@ -10,13 +10,13 @@ void forRowsExample() auto view = matrix.getView(); auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, double& value, bool& compute ) { - if( rowIdx < columnIdx ) // This is important, some matrix formats may allocate more matrix elements - // than we requested. These padding elements are processed here as well. + if( rowIdx < localIdx ) // This is important, some matrix formats may allocate more matrix elements + // than we requested. These padding elements are processed here as well. compute = false; else { columnIdx = localIdx; - value = rowIdx + localIdx; + value = rowIdx + localIdx + 1; } }; diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp index 3de6634a3bdd492135e83badc0d1febf02e9b5d3..ada2b2a82c75b72ce72bd086017ba2dd6f6a9952 100644 --- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp +++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp @@ -10,6 +10,10 @@ template< typename Device > void setElements() { TNL::Matrices::SparseMatrix< double, Device > matrix( { 1, 1, 1, 1, 1 }, 5 ); + + /**** + * Get the matrix view. 
+ */ auto view = matrix.getView(); for( int i = 0; i < 5; i++ ) view.setElement( i, i, i ); diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_rowsReduction.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_rowsReduction.cpp index 792dc98d386fa3797b2ee06c13262c1838359269..aae0bd4e3adf87a37090a05d934328ec2e641204 100644 --- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_rowsReduction.cpp +++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_rowsReduction.cpp @@ -46,7 +46,7 @@ void rowsReduction() /*** * Reduce lambda return maximum of given values. */ - auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double { + auto reduce = [=] __cuda_callable__ ( const double& a, const double& b ) -> double { return TNL::max( a, b ); }; diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forRows.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forRows.cpp index 24fe78f7f7f34a472e83ac3c060d9ba44171998b..d3ddd62085bee29ecd899a8960aa609618813e6b 100644 --- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forRows.cpp +++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forRows.cpp @@ -10,11 +10,11 @@ void forRowsExample() * Set the following matrix (dots represent zero matrix elements and zeros are * padding zeros for memory alignment): * - * 0 / 1 3 . . . \ -> { 0, 1, 3 } - * | 2 1 3 . . | -> { 2, 1, 3 } - * | . 2 1 3 . | -> { 2, 1, 3 } - * | . . 2 1 3 | -> { 2, 1, 3 } - * \ . . . 2 1 / 0 -> { 2, 1, 0 } + * 0 / 2 1 . . . \ -> { 0, 2, 1 } + * | 3 2 1 . . | -> { 3, 2, 1 } + * | . 3 2 1 . | -> { 3, 2, 1 } + * | . . 3 2 1 | -> { 3, 2, 1 } + * \ . . . 
3 2 / 0 -> { 3, 2, 0 } */ TNL::Matrices::TridiagonalMatrix< double, Device > matrix( 5, // number of matrix rows @@ -29,11 +29,11 @@ void forRowsExample() * * 0 1 2 <- localIdx values * ------- - * 0 / 1 3 . . . \ -> { 0, 1, 3 } - * | 2 1 3 . . | -> { 2, 1, 3 } - * | . 2 1 3 . | -> { 2, 1, 3 } - * | . . 2 1 3 | -> { 2, 1, 3 } - * \ . . . 2 1 / 0 -> { 2, 1, 0 } + * 0 / 2 1 . . . \ -> { 0, 2, 1 } + * | 3 2 1 . . | -> { 3, 2, 1 } + * | . 3 2 1 . | -> { 3, 2, 1 } + * | . . 3 2 1 | -> { 3, 2, 1 } + * \ . . . 3 2 / 0 -> { 3, 2, 0 } * */ value = 3 - localIdx; diff --git a/Documentation/Tutorials/Arrays/tutorial_Arrays.md b/Documentation/Tutorials/Arrays/tutorial_Arrays.md index 80e7c3816926c3da8bbc09848aaf5ae4c02898fe..b82caf0c3455ebd124844eb395171186e0499a97 100644 --- a/Documentation/Tutorials/Arrays/tutorial_Arrays.md +++ b/Documentation/Tutorials/Arrays/tutorial_Arrays.md @@ -1,22 +1,24 @@ \page tutorial_Arrays Arrays tutorial +## Table of Contents +- [Table of Contents](#table-of-contents) +- [Introduction](#introduction) +- [Arrays](#arrays) + - [Array views](#array-views) + - [Accessing the array elements](#accessing-the-array-elements) + - [Accessing the array elements with `operator[]`](#accessing-the-array-elements-with-operator) + - [Accessing the array elements with `setElement` and `getElement`](#accessing-the-array-elements-with-setelement-and-getelement) + - [Arrays initiation with lambdas](#arrays-initiation-with-lambdas) + - [Checking the array contents](#checking-the-array-contents) + - [IO operations with arrays](#io-operations-with-arrays) +- [Static arrays](#static-arrays) +- [Distributed arrays](#distributed-arrays) + ## Introduction This tutorial introduces arrays in TNL. There are three types - common arrays with dynamic allocation, static arrays allocated on stack and distributed arrays with dynamic allocation. Arrays are one of the most important structures for memory management. 
Methods implemented in arrays are particularly useful for GPU programming. From this point of view, the reader will learn how to easily allocate memory on GPU, transfer data between GPU and CPU but also, how to initialize data allocated on GPU. In addition, the resulting code is hardware platform independent, so it can be ran on CPU nad GPU without any changes. -## Table of Contents -1. [Arrays](#arrays) - 1. [Array views](#array_views) - 2. [Accessing the array elements](#accessing_the_array_elements) - 1. [Accessing the array elements with `operator[]`](#accessing_the_array_elements_with_operator) - 2. [Accessing the array elements with `setElement` and `getElement`](#accessing_the_array_elements_with_set_get_element) - 3. [Arrays initiation with lambdas](#arrays_initiation_with_lambdas) - 4. [Checking the array contents](#checking_the_array_contents) - 5. [IO operations with arrays](#io_operations_with-arrays) -2. [Static arrays](#static_arrays) -3. [Distributed arrays](#distributed_arrays) - -## Arrays +## Arrays Array is templated class defined in namespace `TNL::Containers` having three template parameters: @@ -33,7 +35,7 @@ The result looks as follows: \include ArrayAllocation.out -### Array views +### Array views Arrays cannot share data with each other or data allocated elsewhere. This can be achieved with the `ArrayView` structure which has similar semantics to `Array`, but it does not handle allocation and deallocation of the data. Hence, array view cannot be resized, but it can be used to wrap data allocated elsewhere (e.g. using an `Array` or an operator `new`) and to partition large arrays into subarrays. The process of wrapping external data with a view is called _binding_. @@ -55,11 +57,11 @@ Output: Since array views do not allocate or deallocate memory, they can be created even in CUDA kernels, which is not possible with `Array`. 
`ArrayView` can also be passed-by-value into CUDA kernels or captured-by-value by device lambda functions, because the `ArrayView`'s copy-constructor makes only a shallow copy (i.e., it copies only the data pointer and size). -### Accessing the array elements +### Accessing the array elements There are two ways how to work with the array (or array view) elements - using the indexing operator (`operator[]`) which is more efficient or using methods `setElement` and `getElement` which is more flexible. -#### Accessing the array elements with `operator[]` +#### Accessing the array elements with `operator[]` Indexing operator `operator[]` is implemented in both `Array` and `ArrayView` and it is defined as `__cuda_callable__`. It means that it can be called even in CUDA kernels if the data is allocated on GPU, i.e. the `Device` parameter is `Devices::Cuda`. This operator returns a reference to given array element and so it is very efficient. However, calling this operator from host for data allocated on device (or vice versa) leads to segmentation fault (on the host system) or broken state of the device. It means: @@ -76,7 +78,7 @@ Output: In general in TNL, each method defined as `__cuda_callable__` can be called from the CUDA kernels. The method `ArrayView::getSize` is another example. We also would like to point the reader to better ways of arrays initiation for example with method `ArrayView::evaluate` or with `ParallelFor`. -#### Accessing the array element with `setElement` and `getElement` +#### Accessing the array elements with `setElement` and `getElement` On the other hand, the methods `setElement` and `getElement` can be called from the host **no matter where the array is allocated**. In addition they can be called from kernels on device where the array is allocated. `getElement` returns copy of an element rather than a reference. Therefore it is slightly slower. 
If the array is on GPU and the methods are called from the host, the array element is copied from the device on the host (or vice versa) which is significantly slower. In the parts of code where the performance matters, these methods shall not be called from the host when the array is allocated on the device. In this way, their use is, however, easier compared to `operator[]` and they allow to write one simple code for both CPU and GPU. Both methods are good candidates for: @@ -92,7 +94,7 @@ Output: \include ElementsAccessing-2.out -### Arrays initiation with lambdas +### Arrays initiation with lambdas More efficient and still quite simple method for the arrays initiation is with the use of C++ lambda functions and method `evaluate`. This method is implemented in `ArrayView` only. As an argument a lambda function is passed which is then evaluated for all elements. Optionally one may define only subinterval of element indexes where the lambda shall be evaluated. If the underlying array is allocated on GPU, the lambda function is called from CUDA kernel. This is why it is more efficient than use of `setElement`. On the other hand, one must be careful to use only `__cuda_callable__` methods inside the lambda. The use of the method `evaluate` demonstrates the following example. @@ -102,7 +104,7 @@ Output: \include ArrayViewEvaluate.out -### Checking the array contents +### Checking the array contents Methods `containsValue` and `containsOnlyValue` serve for testing the contents of the arrays. `containsValue` returns `true` of there is at least one element in the array with given value. `containsOnlyValue` returns `true` only if all elements of the array equal given value. The test can be restricted to subinterval of array elements. Both methods are implemented in `Array` as well as in `ArrayView`. See the following code snippet for example of use. 
@@ -112,7 +114,7 @@ Output: \include ContainsValue.out -### IO operations with arrays +### IO operations with arrays Methods `save` and `load` serve for storing/restoring the array to/from a file in a binary form. In case of `Array`, loading of an array from a file causes data reallocation. `ArrayView` cannot do reallocation, therefore the data loaded from a file is copied to the memory managed by the `ArrayView`. The number of elements managed by the array view and those loaded from the file must be equal. See the following example. @@ -122,7 +124,7 @@ Output: \include ArrayIO.out -## Static arrays +## Static arrays Static arrays are allocated on stack and thus they can be created even in CUDA kernels. Their size is fixed and it is given by a template parameter. Static array is a templated class defined in namespace `TNL::Containers` having two template parameters: @@ -137,4 +139,4 @@ The output looks as: \include StaticArrayExample.out -## Distributed arrays +## Distributed arrays diff --git a/Documentation/Tutorials/CMakeLists.txt b/Documentation/Tutorials/CMakeLists.txt index 7f6aea702e1e98b41f4a415fd86e93b2a7d725a6..53ce4df627f319ae6015464407bceb4410c5bb36 100644 --- a/Documentation/Tutorials/CMakeLists.txt +++ b/Documentation/Tutorials/CMakeLists.txt @@ -1,5 +1,7 @@ +add_subdirectory( GeneralConcepts ) add_subdirectory( Arrays ) add_subdirectory( Vectors ) add_subdirectory( ReductionAndScan ) add_subdirectory( ForLoops ) add_subdirectory( Pointers ) +add_subdirectory( Matrices ) diff --git a/Documentation/Tutorials/GeneralConcepts/CMakeLists.txt b/Documentation/Tutorials/GeneralConcepts/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_parallel_for.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_parallel_for.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..5477541cb6804d6a09fefff1ce5a2fd39f7c10c0 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_parallel_for.cpp @@ -0,0 +1,8 @@ +template< typename Device > +void vectorAddition( double* v1, double* v2, double* sum, const int size ) +{ + auto sum_lambda = [=] __cuda_callable__ ( int i ) mutable { + sum[ i ] = v1[ i ] + v2[ i ]; + }; + TNL::Algorithms::ParallelFor< Device >::exec( 0, size, sum_lambda ); +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp new file mode 100644 index 0000000000000000000000000000000000000000..85ba93408035aa72b5fa31411b38f53d276ed122 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp @@ -0,0 +1,10 @@ +template< typename Device > +void scalarProduct( double* v1, double* v2, double* product, const int size ) +{ + auto fetch = [=] __cuda_callable__ ( int i ) -> double { + return v1[ i ] * v2[ i ]; + }; + auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { + return a + b; }; + TNL::Algorithms::Reduction< Device >::reduce( 0, size, reduce, fetch, 0.0 ); +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..deeb49dd5161b90a65adf00164340635f035bfd9 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp @@ -0,0 +1,12 @@ +template< typename Device > +void scalarProduct( double* u1, double* u2, + double* v1, double* v2, + double* product, const int size ) +{ + auto fetch = [=]
 __cuda_callable__ ( int i ) -> double { + return ( u1[ i ] + u2[ i ] ) * ( v1[ i ] + v2[ i ] ); + }; + auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { + return a + b; }; + TNL::Algorithms::Reduction< Device >::reduce( 0, size, reduce, fetch, 0.0 ); +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_cublas.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_cublas.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ccb0329b94a6519fc4a73a2f2428b26b379d0248 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_cublas.cpp @@ -0,0 +1,9 @@ +void scalarProduct( double* u1, double* u2, + double* v1, double* v2, + double* product, const int size ) +{ + cublasHandle_t handle; + cublasSaxpy( handle, size, 1.0, u1, 1, u2, 1 ); + cublasSaxpy( handle, size, 1.0, v1, 1, v2, 1 ); + cublasSdot ( handle, size, 1.0, u1, 1, v1, 1, &product ); +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_assignment_example.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_assignment_example.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3ee8150d40611ac72d973cf574c4e1a53b5578c2 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_assignment_example.cpp @@ -0,0 +1 @@ +cuda_array = host_array; \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_deduction.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_deduction.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f3cad42fe4deb23679fb1d0c024c52f54ccdad1c --- /dev/null +++ 
b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_deduction.cpp @@ -0,0 +1,6 @@ +template< typename Array > +void deduceDevice() +{ + using Device = typename Array::DeviceType; + TNL::Containers::Array< int, Device > array; +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_test.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3f6d217217a919665e9fccb3413ecaeaaf7f41be --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_device_test.cpp @@ -0,0 +1,9 @@ +template< typename Array > +void testDevice() +{ + using Device = typename Array::DeviceType; + if( std::is_same< Device, TNL::Devices::Host >::value ) + std::cout << "Device is host CPU." << std::endl; + if( std::is_same< Device, TNL::Devices::Cuda >::value ) + std::cout << "Device is CUDA GPU." 
<< std::endl; +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_example.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_example.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2fcb006f5993147681f6d979f47e38b1d6035ff0 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_example.cpp @@ -0,0 +1,2 @@ +TNL::Containers::Array< int, TNL::Devices::Host > host_array; +TNL::Containers::Array< int, TNL::Devices::Cuda > cuda_array; \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_setsize_example.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_setsize_example.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8369de9f3c0b74b3c776497fec2f906e48eb5508 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_devices_and_allocators_arrays_setsize_example.cpp @@ -0,0 +1,2 @@ +host_array.setSize( 10 ); +cuda_array.setSize( 10 ); \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_reference.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_reference.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8eeb657a470d1eca16310915fb096355c226348a --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_reference.cpp @@ -0,0 +1,11 @@ +template< typename Device > +void lambda_capture_by_value( int size ) +{ + TNL::Containers::Array< int, Device > a( size ); + auto f = [&a] __cuda_callable__ ( int i ) mutable { + a[ i ] = 1; + }; + TNL::Algorithms::ParallelFor< Device >::exec( 0, size, f ); +} + + diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_value.cpp 
b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_value.cpp new file mode 100644 index 0000000000000000000000000000000000000000..77bfccca2ba19532c59f9d1f438c2d1bb96ef553 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_value.cpp @@ -0,0 +1,11 @@ +template< typename Device > +void lambda_capture_by_value( int size ) +{ + TNL::Containers::Array< int, Device > a( size ); + auto f = [=] __cuda_callable__ ( int i ) mutable { + a[ i ] = 1; + }; + TNL::Algorithms::ParallelFor< Device >::exec( 0, size, f ); +} + + diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view.cpp new file mode 100644 index 0000000000000000000000000000000000000000..96d82fe0b62d3c6c8a99c1cbc243fa1d6a325e0a --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view.cpp @@ -0,0 +1,10 @@ +template< typename Device > +void lambda_capture_by_value( int size ) +{ + TNL::Containers::Array< int, Device > a( size ); + auto view = a.getView(); + auto f = [=] __cuda_callable__ ( int i ) mutable { + view[ i ] = 1; + }; + TNL::Algorithms::ParallelFor< Device >::exec( 0, size, f ); +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view_change.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view_change.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4d994df5a0eb0e0f8fa613e379cd7c65ee168a84 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view_change.cpp @@ -0,0 +1,11 @@ +template< typename Device > +void lambda_capture_by_value( int size ) +{ + TNL::Containers::Array< int, Device > a( size ); + auto view = a.getView(); + a.setSize( 2 * size ); + auto f = 
[=] __cuda_callable__ ( int i ) mutable { + view[ i ] = 1; + }; + TNL::Algorithms::ParallelFor< Device >::exec( 0, size, f ); +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view_change_2.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view_change_2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e6699e9f94eea8a317c358e55692c7cd088874a2 --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/snippet_shared_pointers_and_views_capture_view_change_2.cpp @@ -0,0 +1,11 @@ +template< typename Device > +void lambda_capture_by_value( int size ) +{ + TNL::Containers::Array< int, Device > a( size ); + auto view = a.getView(); + a.setElement( 0, 1 ); + auto f = [=] __cuda_callable__ ( int i ) mutable { + view[ i ] = 1; + }; + TNL::Algorithms::ParallelFor< Device >::exec( 0, size, f ); +} \ No newline at end of file diff --git a/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md b/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md new file mode 100644 index 0000000000000000000000000000000000000000..0e91cd6eef405042f4eb53b7f0ca722b91328b5a --- /dev/null +++ b/Documentation/Tutorials/GeneralConcepts/tutorial_GeneralConcepts.md @@ -0,0 +1,115 @@ +\page tutorial_GeneralConcepts General concepts + +## Table of Contents +- [Table of Contents](#table-of-contents) +- [Introduction](#introduction) +- [Devices and allocators](#devices-and-allocators) +- [Algorithms and lambda functions](#algorithms-and-lambda-functions) +- [Shared pointers and views](#shared-pointers-and-views) + - [Data structures views](#data-structures-views) + - [Shared pointers](#shared-pointers) + +## Introduction + +In this part we describe some general and core concepts of programming with TNL. 
Understanding these ideas may significantly help to understand the design of TNL algorithms and data structure and it also helps to use TNL more efficiently. The main goal of TNL is to allow developing high performance algorithms that could run on multicore CPUs and GPUs. TNL offers unified interface and so the developer writes one code for both architectures. + +## Devices and allocators + +TNL offers unified interface for both CPUs (also referred as a host system) and GPUs (referred as device). Connection between CPU and GPU is usually represented by [PCI-Express bus](https://en.wikipedia.org/wiki/PCI_Express) which is orders of magnitude slower compared to speed of the global memory of GPU. Therefore, the communication between CPU and GPU must be reduced as much as possible. As a result, the programmer operates with two different address spaces, one for CPU and one for GPU. To distinguish between the address spaces, each data structure requiring dynamic allocation of memory needs to know on what device it resides. This is done by a template parameter `Device`. For example the following code creates two arrays, one on CPU and the other on GPU + +\includelineno snippet_devices_and_allocators_arrays_example.cpp + +Since now, [C++ template specialization](https://en.wikipedia.org/wiki/Partial_template_specialization) takes care of using the right methods for given device (in meaning hardware architecture and so the device can be even CPU). For example, calling a method `setSize` + +\includelineno snippet_devices_and_allocators_arrays_setsize_example.cpp + +results in different memory allocation on CPU (for `host_array`) and on GPU (for `cuda_array`). The same holds for assignment + +\includelineno snippet_devices_and_allocators_arrays_assignment_example.cpp + +in which case appropriate data transfer from CPU to GPU is performed. 
Each such data structure contains inner type named `DeviceType` which tells where it resides as we can see here: + +\includelineno snippet_devices_and_allocators_arrays_device_deduction.cpp + +If we need to specialize some parts of algorithm with respect to its device we can do it by means of \ref std::is_same : + +\includelineno snippet_devices_and_allocators_arrays_device_test.cpp + +TODO: Allocators + +## Algorithms and lambda functions + +Developing a code for GPUs (in [CUDA](https://developer.nvidia.com/CUDA-zone) for example) consists mainly of writing [kernels](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#kernels) which are special functions running on GPU in parallel. This can be very hard and tedious work especially when it comes to debugging. [Parallel reduction](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf) is a perfect example of an algorithm which is relatively hard to understand and implement on one hand but it is necessary to use frequently. Writing tens of lines of code every time we need to sum up some data is exactly what we mean by tedious programming. TNL offers skeletons or patterns of such algorithms and combines them with user defined [lambda functions](https://en.cppreference.com/w/cpp/language/lambda). This approach is not absolutely general, which means that you can use it only in situation when there is a skeleton/pattern (see \ref TNL::Algorithms) suitable for your problem. But when there is, it offers several advantages: + +1. Implementing lambda functions is much easier compared to implementing GPU kernels. +2. Code implemented this way works even on CPU, so the developer writes only one code for both hardware architectures. +3. The developer may debug the code on CPU first and then just run it on GPU. Quite likely it will work with only a little or no changes. 
+ +The following code snippet demonstrates it on use of \ref TNL::Algorithms::ParallelFor: + +\includelineno snippet_algorithms_and_lambda_functions_parallel_for.cpp + +In this example, we assume that all arrays `v1`, `v2` and `sum` were properly allocated on given `Device`. If `Device` equals \ref TNL::Devices::Host , the lambda function is processed sequentially or in parallel by several OpenMP threads on CPU. If `Device` equals \ref TNL::Devices::Cuda , the lambda function is called from CUDA kernel (this is why it is defined as `__cuda_callable__` which is just a substitute for `__host__ __device__` ) by appropriate number of CUDA threads. One more example demonstrates use of \ref TNL::Algorithms::Reduction . + +\includelineno snippet_algorithms_and_lambda_functions_reduction.cpp + +We will not explain the parallel reduction in TNL at this moment (see the section about [flexible parallel reduction](tutorial_ReductionAndScan.html#flexible_parallel_reduction) ), we hope that the idea is more or less clear from the code snippet. If `Device` equals to \ref TNL::Devices::Host , the scalar product is evaluated sequentially or in parallel by several OpenMP threads on CPU, if `Device` equals \ref TNL::Devices::Cuda, the [parallel reduction](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf) fine tuned with the lambda functions is performed. Fortunately, there is no performance drop. On the contrary, since it is easy to generate CUDA kernels for particular situations, we may get more efficient code. Consider computing a scalar product of sum of vectors like this + +\f[ +s = (u_1 + u_2, v_1 + v_2). +\f] + +This can be solved by the following code + +\includelineno snippet_algorithms_and_lambda_functions_reduction_2.cpp + +We have changed only the `fetch` lambda function to perform the sums of `u1[ i ] + u2[ i ]` and `v1[ i ] + v2[ i ]` (line 7). Now we get completely new CUDA kernel tailored exactly for our problem. 
Doing the same with [Cublas](https://developer.nvidia.com/cublas), for example, would require splitting into three separate kernels: + +1. Kernel to compute \f$u_1 = u_1 + u_2\f$. +2. Kernel to compute \f$v_1 = v_1 + v_2\f$. +3. Kernel to compute \f$product = ( u_1, v_1 )\f$. + +This could be achieved with the following code: + +\includelineno snippet_algorithms_and_lambda_functions_reduction_cublas.cpp + +We believe that C++ lambda functions with properly designed patterns of parallel algorithms could make programming of GPUs significantly easier. We see a parallel with [MPI standard](https://en.wikipedia.org/wiki/Message_Passing_Interface) which in nineties defined frequent communication operations in distributed parallel computing. It made programming of distributed systems much easier and at the same time MPI helps to write efficient programs. We aim to add additional skeletons or patterns to \ref TNL::Algorithms. + +## Shared pointers and views + +You might notice that in the previous section we used only C style arrays represented by pointers in the lambda functions. There is a difficulty when we want to access TNL arrays or other data structures inside the lambda functions. We may capture the outside variables either by a value or a reference. The first case would be as follows: + +\includelineno snippet_shared_pointers_and_views_capture_value.cpp + +In this case a deep copy of array `a` will be made and so there will be no effect of what we do with the array `a` in the lambda function. Capturing by a reference may look as follows: + +\includelineno snippet_shared_pointers_and_views_capture_reference.cpp + +This would be correct on CPU (i.e. when `Device` is \ref TNL::Devices::Host ). However, we are not allowed to pass references to CUDA kernels and so this source code would not even compile with CUDA compiler. To overcome this issue, TNL offers two solutions: + +1. Data structures views +2. 
Shared pointers + +### Data structures views + +View is a kind of lightweight reference object which makes only a shallow copy of itself in copy constructor. Therefore view can be captured by value, but because it is, in fact, a reference to another object, everything we do with the view will affect the original object. The example with the array would look as follows: + +\includelineno snippet_shared_pointers_and_views_capture_view.cpp + +The differences are on the line 5 where we fetch the view by means of method `getView` and on the line 7 where we work with the `view` and not with the array `a`. The view has very similar interface (see \ref TNL::Containers::ArrayView) as the array (\ref TNL::Containers::Array) and so mostly there is no difference in using array and its view for the programmer. In TNL, each data structure which can be accessed from GPU kernels (it means that it has methods defined as `__cuda_callable__`) provides also a method `getView` for getting appropriate view of the object. + +Views are simple objects because they must be transferred to GPU in each kernel call. So there are no smart links between a view and the original object. In fact, the array view contains just a pointer to the data managed by the array and the size of the array. Therefore if the original object gets changed, all views obtained from the object before may become invalid. See the following example: + +\includelineno snippet_shared_pointers_and_views_capture_view_change.cpp + +Such code would not work because after obtaining the view on the line 5, we change the size of the array `a` which will cause data reallocation. As we mentioned, there is no pointer from the view to the array and so the view has no chance to check if it is still up-to-date with the original object. However, if you fetch all necessary views immediately before capturing by a lambda function, there will be no problem. 
And this is why **the views are recommended for accessing TNL data structures in lambda functions or GPU kernels**. + +Note, that changing the data managed by the array after fetching the view is not an issue. See the following example: + +\includelineno snippet_shared_pointers_and_views_capture_view_change_2.cpp + +On the line 6, we change value of the first element. This causes no data reallocation or change of size and so the view fetched on the line 5 is still valid and up-to-date. + +### Shared pointers + +TNL offers smart pointers working across different devices (meaning CPU or GPU). diff --git a/Documentation/Tutorials/Matrices/CMakeLists.txt b/Documentation/Tutorials/Matrices/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d672aa0b04f51eeda133f5b785568632dfe8b0e --- /dev/null +++ b/Documentation/Tutorials/Matrices/CMakeLists.txt @@ -0,0 +1,139 @@ +IF( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( DenseMatrixExample_Constructor_init_list DenseMatrixExample_Constructor_init_list.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_Constructor_init_list > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_Constructor_init_list.out + OUTPUT DenseMatrixExample_Constructor_init_list.out ) + + CUDA_ADD_EXECUTABLE( DenseMatrixExample_addElement DenseMatrixExample_addElement.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_addElement > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_addElement.out + OUTPUT DenseMatrixExample_addElement.out ) + + CUDA_ADD_EXECUTABLE( DenseMatrixExample_setElement DenseMatrixExample_setElement.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_setElement > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_setElement.out + OUTPUT DenseMatrixExample_setElement.out ) + + CUDA_ADD_EXECUTABLE( DenseMatrixExample_forRows DenseMatrixExample_forRows.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_forRows > + 
${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_forRows.out + OUTPUT DenseMatrixExample_forRows.out ) + + CUDA_ADD_EXECUTABLE( DenseMatrixExample_rowsReduction_vectorProduct DenseMatrixExample_rowsReduction_vectorProduct.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_rowsReduction_vectorProduct > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_rowsReduction_vectorProduct.out + OUTPUT DenseMatrixExample_rowsReduction_vectorProduct.out ) + + CUDA_ADD_EXECUTABLE( DenseMatrixExample_rowsReduction_maxNorm DenseMatrixExample_rowsReduction_maxNorm.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixExample_rowsReduction_maxNorm > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixExample_rowsReduction_maxNorm.out + OUTPUT DenseMatrixExample_rowsReduction_maxNorm.out ) + + CUDA_ADD_EXECUTABLE( DenseMatrixViewExample_setElement DenseMatrixViewExample_setElement.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixViewExample_setElement > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixViewExample_setElement.out OUTPUT + DenseMatrixViewExample_setElement.out ) + + CUDA_ADD_EXECUTABLE( DenseMatrixViewExample_data_encapsulation DenseMatrixViewExample_data_encapsulation.cu ) + ADD_CUSTOM_COMMAND( COMMAND DenseMatrixViewExample_data_encapsulation > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DenseMatrixViewExample_data_encapsulation.out OUTPUT + DenseMatrixViewExample_data_encapsulation.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_Constructor_init_list_2 SparseMatrixExample_Constructor_init_list_2.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_Constructor_init_list_2 > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_init_list_2.out + OUTPUT SparseMatrixExample_Constructor_init_list_2.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_setRowCapacities SparseMatrixExample_setRowCapacities.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_setRowCapacities > + 
${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_setRowCapacities.out + OUTPUT SparseMatrixExample_setRowCapacities.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_Constructor_rowCapacities_vector SparseMatrixExample_Constructor_rowCapacities_vector.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_Constructor_rowCapacities_vector > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_rowCapacities_vector.out + OUTPUT SparseMatrixExample_Constructor_rowCapacities_vector.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_Constructor_std_map SparseMatrixExample_Constructor_std_map.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_Constructor_std_map > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_Constructor_std_map.out + OUTPUT SparseMatrixExample_Constructor_std_map.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_setElements SparseMatrixExample_setElements.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_setElements > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_setElements.out + OUTPUT SparseMatrixExample_setElements.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_setElements_map SparseMatrixExample_setElements_map.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_setElements_map > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_setElements_map.out + OUTPUT SparseMatrixExample_setElements_map.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_setElement SparseMatrixExample_setElement.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_setElement > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_setElement.out + OUTPUT SparseMatrixExample_setElement.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_addElement SparseMatrixExample_addElement.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_addElement > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_addElement.out + OUTPUT SparseMatrixExample_addElement.out ) 
+ + CUDA_ADD_EXECUTABLE( SparseMatrixExample_forRows SparseMatrixExample_forRows.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_forRows > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_forRows.out + OUTPUT SparseMatrixExample_forRows.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixExample_rowsReduction_vectorProduct SparseMatrixExample_rowsReduction_vectorProduct.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixExample_rowsReduction_vectorProduct > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixExample_rowsReduction_vectorProduct.out + OUTPUT SparseMatrixExample_rowsReduction_vectorProduct.out ) + + CUDA_ADD_EXECUTABLE( SparseMatrixViewExample_setElement SparseMatrixViewExample_setElement.cu ) + ADD_CUSTOM_COMMAND( COMMAND SparseMatrixViewExample_setElement > + ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SparseMatrixViewExample_setElement.out + OUTPUT SparseMatrixViewExample_setElement.out ) + + #### + # THe following examples/benchmarks run for very long time + CUDA_ADD_EXECUTABLE( DenseMatrixSetup_Benchmark_cuda DenseMatrixSetup_Benchmark.cu ) + CUDA_ADD_EXECUTABLE( SparseMatrixSetup_Benchmark_cuda SparseMatrixSetup_Benchmark.cu ) + CUDA_ADD_EXECUTABLE( MultidiagonalMatrixSetup_Benchmark_cuda MultidiagonalMatrixSetup_Benchmark.cu ) + +ELSE() + + #### + # THe following examples/benchmarks run for very long time + ADD_EXECUTABLE( DenseMatrixSetup_Benchmark DenseMatrixSetup_Benchmark_cuda.cpp ) + ADD_EXECUTABLE( SparseMatrixSetup_Benchmark SparseMatrixSetup_Benchmark_cuda.cpp ) + ADD_EXECUTABLE( MultidiagonalMatrixSetup_Benchmark MultidiagonalMatrixSetup_Benchmark_cuda.cpp ) +ENDIF() + +IF( BUILD_CUDA ) +ADD_CUSTOM_TARGET( TutorialsMatricesCuda ALL DEPENDS + DenseMatrixExample_Constructor_init_list.out + DenseMatrixExample_addElement.out + DenseMatrixExample_setElement.out + DenseMatrixExample_forRows.out + DenseMatrixExample_rowsReduction_vectorProduct.out + DenseMatrixExample_rowsReduction_maxNorm.out + 
DenseMatrixViewExample_setElement.out + DenseMatrixViewExample_data_encapsulation.out + SparseMatrixExample_Constructor_init_list_2.out + SparseMatrixExample_setRowCapacities.out + SparseMatrixExample_Constructor_std_map.out + SparseMatrixExample_setElements.out + SparseMatrixExample_setElements_map.out + SparseMatrixExample_setElement.out + SparseMatrixExample_forRows.out + SparseMatrixExample_rowsReduction_vectorProduct.out + SparseMatrixViewExample_setElement.out + ) +ELSE() +ADD_CUSTOM_TARGET( TutorialsMatrices ALL DEPENDS +) +ENDIF() +# +#ADD_CUSTOM_TARGET( TutorialsPointers ALL DEPENDS +# UniquePointerHostExample.out +#) \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_Constructor_init_list.cpp b/Documentation/Tutorials/Matrices/DenseMatrixExample_Constructor_init_list.cpp new file mode 120000 index 0000000000000000000000000000000000000000..faa270f15a4dc3c4e835a1fc7888d40cd26f4b59 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_Constructor_init_list.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_Constructor_init_list.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_Constructor_init_list.cu b/Documentation/Tutorials/Matrices/DenseMatrixExample_Constructor_init_list.cu new file mode 120000 index 0000000000000000000000000000000000000000..e633e76a9b80504e7a2b750eb43655f9215820b8 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_Constructor_init_list.cu @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_Constructor_init_list.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_addElement.cpp b/Documentation/Tutorials/Matrices/DenseMatrixExample_addElement.cpp new file mode 120000 index 0000000000000000000000000000000000000000..c471b0ce3155b8942a78da330767e5bda04259be --- /dev/null +++ 
b/Documentation/Tutorials/Matrices/DenseMatrixExample_addElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_addElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_addElement.cu b/Documentation/Tutorials/Matrices/DenseMatrixExample_addElement.cu new file mode 120000 index 0000000000000000000000000000000000000000..67dd6dced3eb6d3e7149be4907542b77f8326a36 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_addElement.cu @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_addElement.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_forRows.cpp b/Documentation/Tutorials/Matrices/DenseMatrixExample_forRows.cpp new file mode 120000 index 0000000000000000000000000000000000000000..690bdbf92976467e0a91dd1e6ce0d0792baf0856 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_forRows.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_forRows.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_forRows.cu b/Documentation/Tutorials/Matrices/DenseMatrixExample_forRows.cu new file mode 120000 index 0000000000000000000000000000000000000000..0783daeded45612f99d01857c0bc74692f550e42 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_forRows.cu @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_forRows.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_maxNorm.cpp b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_maxNorm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a1837ebc7f9769fedf3fd03456f39130519d76a9 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_maxNorm.cpp @@ -0,0 +1,66 @@ +#include +#include +#include +#include +#include + +template< typename 
Device > +void rowsReduction() +{ + TNL::Matrices::DenseMatrix< double, Device > matrix { + { 1, 0, 0, 0, 0 }, + { 1, 2, 0, 0, 0 }, + { 0, 1, 8, 0, 0 }, + { 0, 0, 1, 9, 0 }, + { 0, 0, 0, 0, 1 } }; + + /*** + * Find largest element in each row. + */ + TNL::Containers::Vector< double, Device > rowMax( matrix.getRows() ); + + /*** + * Prepare vector view for lambdas. + */ + auto rowMaxView = rowMax.getView(); + + /*** + * Fetch lambda just returns absolute value of matrix elements. + */ + auto fetch = [=] __cuda_callable__ ( int rowIdx, int columnIdx, const double& value ) -> double { + return TNL::abs( value ); + }; + + /*** + * Reduce lambda return maximum of given values. + */ + auto reduce = [=] __cuda_callable__ ( double& a, const double& b ) -> double { + return TNL::max( a, b ); + }; + + /*** + * Keep lambda store the largest value in each row to the vector rowMax. + */ + auto keep = [=] __cuda_callable__ ( int rowIdx, const double& value ) mutable { + rowMaxView[ rowIdx ] = value; + }; + + /*** + * Compute the largest values in each row. + */ + matrix.rowsReduction( 0, matrix.getRows(), fetch, reduce, keep, std::numeric_limits< double >::lowest() ); + + std::cout << "Max. elements in rows are: " << rowMax << std::endl; + std::cout << "Max. 
matrix norm is: " << TNL::max( rowMax ) << std::endl; +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Rows reduction on host:" << std::endl; + rowsReduction< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Rows reduction on CUDA device:" << std::endl; + rowsReduction< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_maxNorm.cu b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_maxNorm.cu new file mode 120000 index 0000000000000000000000000000000000000000..04b5e78e1eb43ca3ee016e86de75333c73241041 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_maxNorm.cu @@ -0,0 +1 @@ +DenseMatrixExample_rowsReduction_maxNorm.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cpp b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1dcef95dda4907ef1658a2f0dac4ed06ef38cf1d --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cpp @@ -0,0 +1,70 @@ +#include +#include +#include +#include +#include +#include + +template< typename Device > +void rowsReduction() +{ + TNL::Matrices::DenseMatrix< double, Device > matrix { + { 1, 0, 0, 0, 0 }, + { 1, 2, 0, 0, 0 }, + { 0, 1, 8, 0, 0 }, + { 0, 0, 1, 9, 0 }, + { 0, 0, 0, 0, 1 } }; + + /*** + * Allocate input and output vectors for matrix-vector product + */ + TNL::Containers::Vector< double, Device > x( matrix.getColumns() ), + y( matrix.getRows() ); + + /*** + * Fill the input vectors with ones. + */ + x = 1.0; + + /*** + * Prepare vector view for lambdas. + */ + auto xView = x.getView(); + auto yView = y.getView(); + + /*** + * Fetch lambda just returns product of appropriate matrix elements and the + * input vector elements. 
+ */ + auto fetch = [=] __cuda_callable__ ( int rowIdx, int columnIdx, const double& value ) -> double { + return xView[ columnIdx ] * value; + }; + + /*** + * Keep lambda store the result of matrix-vector product to output vector y. + */ + auto keep = [=] __cuda_callable__ ( int rowIdx, const double& value ) mutable { + yView[ rowIdx ] = value; + }; + + /*** + * Compute matrix-vector product. + */ + matrix.rowsReduction( 0, matrix.getRows(), fetch, std::plus<>{}, keep, 0.0 ); + + std::cout << "The matrix reads as:" << std::endl << matrix << std::endl; + std::cout << "The input vector is:" << x << std::endl; + std::cout << "Result of matrix-vector multiplication is: " << y << std::endl; +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Rows reduction on host:" << std::endl; + rowsReduction< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << std::endl; + std::cout << "Rows reduction on CUDA device:" << std::endl; + rowsReduction< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cu b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cu new file mode 120000 index 0000000000000000000000000000000000000000..36e05a773c60ccd436358108bc9e0ef6362ad100 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_rowsReduction_vectorProduct.cu @@ -0,0 +1 @@ +DenseMatrixExample_rowsReduction_vectorProduct.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixExample_setElement.cpp b/Documentation/Tutorials/Matrices/DenseMatrixExample_setElement.cpp new file mode 120000 index 0000000000000000000000000000000000000000..cb68721bb18b354fee17a97b3b9ef3815bd54746 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_setElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_setElement.cpp \ No newline at end of file diff --git 
a/Documentation/Tutorials/Matrices/DenseMatrixExample_setElement.cu b/Documentation/Tutorials/Matrices/DenseMatrixExample_setElement.cu new file mode 120000 index 0000000000000000000000000000000000000000..79539e197f7448949ffe2cde01ccee65348454f6 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixExample_setElement.cu @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixExample_setElement.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7545376c927e5d3633f7df19dfe06ace1531e7df --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cpp @@ -0,0 +1,151 @@ +#include +#include +#include +#include +#include +#include + +const int testsCount = 5; + +template< typename Matrix > +void setElement_on_host( const int matrixSize, Matrix& matrix ) +{ + matrix.setDimensions( matrixSize, matrixSize ); + + for( int j = 0; j < matrixSize; j++ ) + for( int i = 0; i < matrixSize; i++ ) + matrix.setElement( i, j, i + j ); +} + +template< typename Matrix > +void setElement_on_host_and_transfer( const int matrixSize, Matrix& matrix ) +{ + using RealType = typename Matrix::RealType; + using IndexType = typename Matrix::IndexType; + using HostMatrix = TNL::Matrices::DenseMatrix< RealType, TNL::Devices::Host, IndexType >; + HostMatrix hostMatrix( matrixSize, matrixSize ); + + for( int j = 0; j < matrixSize; j++ ) + for( int i = 0; i < matrixSize; i++ ) + hostMatrix.setElement( i, j, i + j ); + matrix = hostMatrix; +} + +template< typename Matrix > +void setElement_on_device( const int matrixSize, Matrix& matrix ) +{ + matrix.setDimensions( matrixSize, matrixSize ); + + auto matrixView = matrix.getView(); + auto f = [=] __cuda_callable__ ( int i, int j ) mutable { + matrixView.setElement( i, j, i + j ); + }; + TNL::Algorithms::ParallelFor2D< typename 
Matrix::DeviceType >::exec( 0, 0, matrixSize, matrixSize, f ); +} + +template< typename Matrix > +void getRow( const int matrixSize, Matrix& matrix ) +{ + matrix.setDimensions( matrixSize, matrixSize ); + + auto matrixView = matrix.getView(); + auto f = [=] __cuda_callable__ ( int rowIdx ) mutable { + auto row = matrixView.getRow( rowIdx ); + for( int i = 0; i < matrixSize; i++ ) + row.setElement( i, rowIdx + i ); + }; + TNL::Algorithms::ParallelFor< typename Matrix::DeviceType >::exec( 0, matrixSize, f ); +} + +template< typename Matrix > +void forRows( const int matrixSize, Matrix& matrix ) +{ + matrix.setDimensions( matrixSize, matrixSize ); + + auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, float& value, bool& compute ) mutable { + value = rowIdx + columnIdx; + }; + matrix.forRows( 0, matrixSize, f ); +} + +template< typename Device > +void setupDenseMatrix() +{ + std::cout << " Dense matrix test:" << std::endl; + for( int matrixSize = 16; matrixSize <= 8192; matrixSize *= 2 ) + { + std::cout << " Matrix size = " << matrixSize << std::endl; + TNL::Timer timer; + + std::cout << " setElement on host: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::DenseMatrix< float, Device, int > matrix; + setElement_on_host( matrixSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " setElement on device: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::DenseMatrix< float, Device, int > matrix; + setElement_on_device( matrixSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." 
<< std::endl; + + if( std::is_same< Device, TNL::Devices::Cuda >::value ) + { + std::cout << " setElement on host and transfer on GPU: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::DenseMatrix< float, Device, int > matrix; + setElement_on_host_and_transfer( matrixSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + } + + std::cout << " getRow: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::DenseMatrix< float, Device, int > matrix; + getRow( matrixSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " forRows: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::DenseMatrix< float, Device, int > matrix; + forRows( matrixSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + } +} + + +int main( int argc, char* argv[] ) +{ + std::cout << "Creating dense matrix on CPU ... " << std::endl; + setupDenseMatrix< TNL::Devices::Host >(); + + +#ifdef HAVE_CUDA + std::cout << "Creating dense matrix on CUDA GPU ... 
" << std::endl; + setupDenseMatrix< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cu b/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cu new file mode 120000 index 0000000000000000000000000000000000000000..d9b61a3cf8ea64a1f4b3f33e910e6d9206a75fad --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixSetup_Benchmark.cu @@ -0,0 +1 @@ +DenseMatrixSetup_Benchmark.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixViewExample_data_encapsulation.cpp b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_data_encapsulation.cpp new file mode 100644 index 0000000000000000000000000000000000000000..99cf67583994d0de03487d213ce5bf53a378edf1 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_data_encapsulation.cpp @@ -0,0 +1,64 @@ +#include +#ifdef HAVE_CUDA +#include +#endif +#include +#include +#include +#include + +template< typename Device > +void encapsulation() +{ + const int size = 5; + + /*** + * Allocate the dense matrix with no use of TNL + */ + double* host_data = new double[ size * size ]; + for( int row = 0; row < size; row++ ) + for( int column = 0; column < size; column++ ) + host_data[ row * size + column ] = row * size + column + 1; + double* data = nullptr; + if( std::is_same< Device, TNL::Devices::Host >::value ) + { + data = new double[ size * size ]; + memcpy( data, host_data, sizeof( double ) * size * size ); + } +#ifdef HAVE_CUDA + else if( std::is_same< Device, TNL::Devices::Cuda >::value ) + { + cudaMalloc( ( void**) &data, sizeof( double ) * size * size ); + cudaMemcpy( data, host_data, sizeof( double ) * size * size, cudaMemcpyHostToDevice ); + } +#endif + + /*** + * Encapsulate the matrix into DenseMatrixView. 
+ */ + TNL::Containers::VectorView< double, Device > dataView( data, size * size ); + TNL::Matrices::DenseMatrixView< double, Device, int, TNL::Algorithms::Segments::RowMajorOrder > matrix( 5, 5, dataView ); + + std::cout << "Dense matrix view reads as:" << std::endl; + std::cout << matrix << std::endl; + + auto f = [=] __cuda_callable__ ( int i ) mutable { + matrix.setElement( i, i, -i ); + }; + TNL::Algorithms::ParallelFor< Device >::exec( 0, 5, f ); + + std::cout << "Dense matrix view after elements manipulation:" << std::endl; + std::cout << matrix << std::endl; +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Dense matrix encapsulation on host:" << std::endl; + encapsulation< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Dense matrix encapsulation on CUDA device:" << std::endl; + encapsulation< TNL::Devices::Cuda >(); +#endif +} + diff --git a/Documentation/Tutorials/Matrices/DenseMatrixViewExample_data_encapsulation.cu b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_data_encapsulation.cu new file mode 120000 index 0000000000000000000000000000000000000000..8be2d545adc09f06a58967288261d1b9b73c3a30 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_data_encapsulation.cu @@ -0,0 +1 @@ +DenseMatrixViewExample_data_encapsulation.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixViewExample_setElement.cpp b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_setElement.cpp new file mode 120000 index 0000000000000000000000000000000000000000..a3832e2e8dee2e173b044a2039efef818676a41a --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_setElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixViewExample_setElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/DenseMatrixViewExample_setElement.cu b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_setElement.cu new file mode 120000 index 
0000000000000000000000000000000000000000..9d1266dd3a5795d35fb97da20c24b91d6208cad2 --- /dev/null +++ b/Documentation/Tutorials/Matrices/DenseMatrixViewExample_setElement.cu @@ -0,0 +1 @@ +../../Examples/Matrices/DenseMatrix/DenseMatrixViewExample_setElement.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/MultidiagonalMatrixExample_Constructor.cpp b/Documentation/Tutorials/Matrices/MultidiagonalMatrixExample_Constructor.cpp new file mode 120000 index 0000000000000000000000000000000000000000..da76904277e43a262fcd293b44defacaee4b96ff --- /dev/null +++ b/Documentation/Tutorials/Matrices/MultidiagonalMatrixExample_Constructor.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_Constructor.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/MultidiagonalMatrixExample_Constructor_init_list_1.cpp b/Documentation/Tutorials/Matrices/MultidiagonalMatrixExample_Constructor_init_list_1.cpp new file mode 120000 index 0000000000000000000000000000000000000000..1e5ca52b04d64ba3560dbbd682f7bc6b55356bd3 --- /dev/null +++ b/Documentation/Tutorials/Matrices/MultidiagonalMatrixExample_Constructor_init_list_1.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_Constructor_init_list_1.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp new file mode 100644 index 0000000000000000000000000000000000000000..713394cedb5060b7d3ef4bdb37d8026a289d840d --- /dev/null +++ b/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cpp @@ -0,0 +1,263 @@ +#include +#include +#include +#include +#include +#include + +const int testsCount = 5; + +template< typename Device > +TNL::Containers::Vector< int, Device > getOffsets( const int gridSize ) +{ + TNL::Containers::Vector< int, Device > offsets( 5 ); + offsets.setElement( 0, 
-gridSize ); + offsets.setElement( 1, -1 ); + offsets.setElement( 2, 0 ); + offsets.setElement( 3, 1 ); + offsets.setElement( 4, gridSize ); + return offsets; +} + +template< typename Matrix > +void setElement_on_host( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means setElement method called + * from the host system. + */ + const int matrixSize = gridSize * gridSize; + matrix.setDimensions( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) ); + + for( int j = 0; j < gridSize; j++ ) + for( int i = 0; i < gridSize; i++ ) + { + const int rowIdx = j * gridSize + i; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + matrix.setElement( rowIdx, rowIdx, 1.0 ); + else + { + matrix.setElement( rowIdx, rowIdx - gridSize, 1.0 ); + matrix.setElement( rowIdx, rowIdx - 1, 1.0 ); + matrix.setElement( rowIdx, rowIdx, -4.0 ); + matrix.setElement( rowIdx, rowIdx + 1, 1.0 ); + matrix.setElement( rowIdx, rowIdx + gridSize, 1.0 ); + } + } +} + +template< typename Matrix > +void setElement_on_host_and_transfer( const int gridSize, Matrix& matrix ) +{ + using RealType = typename Matrix::RealType; + using IndexType = typename Matrix::IndexType; + using HostMatrix = TNL::Matrices::MultidiagonalMatrix< RealType, TNL::Devices::Host, IndexType >; + const int matrixSize = gridSize * gridSize; + HostMatrix hostMatrix( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) ); + + for( int j = 0; j < gridSize; j++ ) + for( int i = 0; i < gridSize; i++ ) + { + const int rowIdx = j * gridSize + i; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + hostMatrix.setElement( rowIdx, rowIdx, 1.0 ); + else + { + hostMatrix.setElement( rowIdx, rowIdx - gridSize, 1.0 ); + hostMatrix.setElement( rowIdx, rowIdx - 1, 1.0 ); + hostMatrix.setElement( rowIdx, rowIdx, -4.0 ); + hostMatrix.setElement( 
rowIdx, rowIdx + 1, 1.0 ); + hostMatrix.setElement( rowIdx, rowIdx + gridSize, 1.0 ); + } + } + matrix = hostMatrix; +} + + +template< typename Matrix > +void setElement_on_device( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means of setElement method called + * from the native device. + */ + const int matrixSize = gridSize * gridSize; + matrix.setDimensions( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) ); + + auto matrixView = matrix.getView(); + auto f = [=] __cuda_callable__ ( int i, int j ) mutable { + const int rowIdx = j * gridSize + i; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + matrixView.setElement( rowIdx, rowIdx, 1.0 ); + else + { + matrixView.setElement( rowIdx, rowIdx - gridSize, 1.0 ); + matrixView.setElement( rowIdx, rowIdx - 1, 1.0 ); + matrixView.setElement( rowIdx, rowIdx, -4.0 ); + matrixView.setElement( rowIdx, rowIdx + 1, 1.0 ); + matrixView.setElement( rowIdx, rowIdx + gridSize, 1.0 ); + } + }; + TNL::Algorithms::ParallelFor2D< typename Matrix::DeviceType >::exec( 0, 0, gridSize, gridSize, f ); +} + +template< typename Matrix > +void getRow( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means of getRow method. 
+ */ + const int matrixSize = gridSize * gridSize; + matrix.setDimensions( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) ); + + auto matrixView = matrix.getView(); + auto f = [=] __cuda_callable__ ( int rowIdx ) mutable { + const int i = rowIdx % gridSize; + const int j = rowIdx / gridSize; + auto row = matrixView.getRow( rowIdx ); + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + row.setElement( 2, 1.0 ); + else + { + row.setElement( 0, 1.0 ); + row.setElement( 1, 1.0 ); + row.setElement( 2, -4.0 ); + row.setElement( 3, 1.0 ); + row.setElement( 4, 1.0 ); + } + }; + TNL::Algorithms::ParallelFor< typename Matrix::DeviceType >::exec( 0, matrixSize, f ); +} + +template< typename Matrix > +void forRows( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means of forRows method. + */ + + const int matrixSize = gridSize * gridSize; + matrix.setDimensions( matrixSize, matrixSize, getOffsets< typename Matrix::DeviceType >( gridSize ) ); + + auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int columnIdx, float& value, bool& compute ) mutable { + const int i = rowIdx % gridSize; + const int j = rowIdx / gridSize; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 && localIdx == 0 ) + { + columnIdx = rowIdx; + value = 1.0; + } + else + { + switch( localIdx ) + { + case 0: + columnIdx = rowIdx - gridSize; + value = 1.0; + break; + case 1: + columnIdx = rowIdx - 1; + value = 1.0; + break; + case 2: + columnIdx = rowIdx; + value = -4.0; + break; + case 3: + columnIdx = rowIdx + 1; + value = 1.0; + break; + case 4: + columnIdx = rowIdx + gridSize; + value = 1.0; + break; + } + } + }; + matrix.forRows( 0, matrixSize, f ); +} + +template< typename Device > +void laplaceOperatorMultidiagonalMatrix() +{ + std::cout << " Sparse matrix test:" << std::endl; + for( int gridSize = 16; gridSize <= 
8192; gridSize *= 2 ) + { + std::cout << " Grid size = " << gridSize << std::endl; + TNL::Timer timer; + + std::cout << " setElement on host: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::MultidiagonalMatrix< float, Device, int > matrix; + setElement_on_host( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + if( std::is_same< Device, TNL::Devices::Cuda >::value ) + { + std::cout << " setElement on host and transfer on GPU: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::MultidiagonalMatrix< float, Device, int > matrix; + setElement_on_host_and_transfer( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + } + + std::cout << " setElement on device: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::MultidiagonalMatrix< float, Device, int > matrix; + setElement_on_device( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " getRow: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::MultidiagonalMatrix< float, Device, int > matrix; + getRow( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " forRows: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::MultidiagonalMatrix< float, Device, int > matrix; + forRows( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + } +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Creating Laplace operator matrix on CPU ... 
" << std::endl; + laplaceOperatorMultidiagonalMatrix< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Creating Laplace operator matrix on CUDA GPU ... " << std::endl; + laplaceOperatorMultidiagonalMatrix< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cu b/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cu new file mode 120000 index 0000000000000000000000000000000000000000..ec14fc9eaf8be3a38719a4e8b18238eda6fd45c6 --- /dev/null +++ b/Documentation/Tutorials/Matrices/MultidiagonalMatrixSetup_Benchmark.cu @@ -0,0 +1 @@ +MultidiagonalMatrixSetup_Benchmark.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_init_list_2.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_init_list_2.cpp new file mode 120000 index 0000000000000000000000000000000000000000..9d23bbb1c8210f48cb7fc30ee6017e30b8139f7d --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_init_list_2.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_init_list_2.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_init_list_2.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_init_list_2.cu new file mode 120000 index 0000000000000000000000000000000000000000..759f1a1ca5a3942d37c35f5f28e3774095008067 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_init_list_2.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_init_list_2.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_rowCapacities_vector.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_rowCapacities_vector.cpp new file mode 120000 index 
0000000000000000000000000000000000000000..ddeed9a7bc3fe6628aee27b3c0f8fcbf8fb48645 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_rowCapacities_vector.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_rowCapacities_vector.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_rowCapacities_vector.cu new file mode 120000 index 0000000000000000000000000000000000000000..5957448c64b03905dcd5c9e7935d275e33f6bedb --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_rowCapacities_vector.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_std_map.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_std_map.cpp new file mode 120000 index 0000000000000000000000000000000000000000..dcc4ec9aef3a7b7e53fafee9e4738304f68f33b7 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_std_map.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_std_map.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_std_map.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_std_map.cu new file mode 120000 index 0000000000000000000000000000000000000000..75a75befb2cda5813fb7d01d1eb4681fb47b4528 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_Constructor_std_map.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_Constructor_std_map.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_addElement.cpp 
b/Documentation/Tutorials/Matrices/SparseMatrixExample_addElement.cpp new file mode 120000 index 0000000000000000000000000000000000000000..215dde5df06dfa5c541b862359267f2e83efccb9 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_addElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_addElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_addElement.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_addElement.cu new file mode 120000 index 0000000000000000000000000000000000000000..c2425241fcb6bb03cfe57b09dbd70b1eaeca7f0a --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_addElement.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_addElement.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_forRows.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_forRows.cpp new file mode 120000 index 0000000000000000000000000000000000000000..6115ba2275a5346258589783760bfefd6c203c53 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_forRows.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_forRows.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_forRows.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_forRows.cu new file mode 120000 index 0000000000000000000000000000000000000000..b6d3f173231fd2f3f01892a8492dca32eb72d9ee --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_forRows.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_forRows.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_rowsReduction_vectorProduct.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_rowsReduction_vectorProduct.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..dd72230feab644ab6f0593f45e837de08e159721 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_rowsReduction_vectorProduct.cpp @@ -0,0 +1,70 @@ +#include +#include +#include +#include +#include +#include + +template< typename Device > +void rowsReduction() +{ + TNL::Matrices::SparseMatrix< double, Device > matrix { 5, 5, { + { 0, 0, 1 }, + { 1, 0, 1 }, { 1, 1, 2 }, + { 2, 1, 1 }, { 2, 2, 8 }, + { 3, 2, 1 }, { 3, 3, 9 }, + { 4, 4, 1 } } }; + + /*** + * Allocate input and output vectors for matrix-vector product + */ + TNL::Containers::Vector< double, Device > x( matrix.getColumns() ), + y( matrix.getRows() ); + + /*** + * Fill the input vectors with ones. + */ + x = 1.0; + + /*** + * Prepare vector view for lambdas. + */ + auto xView = x.getView(); + auto yView = y.getView(); + + /*** + * Fetch lambda just returns product of appropriate matrix elements and the + * input vector elements. + */ + auto fetch = [=] __cuda_callable__ ( int rowIdx, int columnIdx, const double& value ) -> double { + return xView[ columnIdx ] * value; + }; + + /*** + * Keep lambda store the result of matrix-vector product to output vector y. + */ + auto keep = [=] __cuda_callable__ ( int rowIdx, const double& value ) mutable { + yView[ rowIdx ] = value; + }; + + /*** + * Compute matrix-vector product. 
+ */ + matrix.rowsReduction( 0, matrix.getRows(), fetch, std::plus<>{}, keep, 0.0 ); + + std::cout << "The matrix reads as:" << std::endl << matrix << std::endl; + std::cout << "The input vector is:" << x << std::endl; + std::cout << "Result of matrix-vector multiplication is: " << y << std::endl; +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Rows reduction on host:" << std::endl; + rowsReduction< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << std::endl; + std::cout << "Rows reduction on CUDA device:" << std::endl; + rowsReduction< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_rowsReduction_vectorProduct.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_rowsReduction_vectorProduct.cu new file mode 120000 index 0000000000000000000000000000000000000000..1be7a26d87fe2fb8c12ab3bc22ee82333c7ed21d --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_rowsReduction_vectorProduct.cu @@ -0,0 +1 @@ +SparseMatrixExample_rowsReduction_vectorProduct.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setElement.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElement.cpp new file mode 120000 index 0000000000000000000000000000000000000000..1507393de5fbb5eff4088c8a08eafc6fadc72eab --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setElement.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElement.cu new file mode 120000 index 0000000000000000000000000000000000000000..2f13c04edeb3506ca1f0aa9b73754c00c7759068 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElement.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setElement.cu \ No newline 
at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements.cpp new file mode 120000 index 0000000000000000000000000000000000000000..0f5e5d1dd5d2ab59ce265203779509479203ba54 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setElements.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements.cu new file mode 120000 index 0000000000000000000000000000000000000000..120be66591ee38ebaba3c5fa18c76076de9d7df9 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setElements.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements_map.cpp b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements_map.cpp new file mode 120000 index 0000000000000000000000000000000000000000..5206fcc2e5d70994d67de439117a1acfa4dde872 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements_map.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setElements_map.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements_map.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements_map.cu new file mode 120000 index 0000000000000000000000000000000000000000..9c5f7c0f52dc07f51cd25d62111ff1106a47b1f1 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setElements_map.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setElements_map.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setRowCapacities.cpp 
b/Documentation/Tutorials/Matrices/SparseMatrixExample_setRowCapacities.cpp new file mode 120000 index 0000000000000000000000000000000000000000..973b2f3a8aef18f13f2b0c7d4defbd88a1996291 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setRowCapacities.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setRowCapacities.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixExample_setRowCapacities.cu b/Documentation/Tutorials/Matrices/SparseMatrixExample_setRowCapacities.cu new file mode 120000 index 0000000000000000000000000000000000000000..ef674e0f0c9fc59ff2d375ba437e2b94594e5e4c --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixExample_setRowCapacities.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixExample_setRowCapacities.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c53a8f5b4a9edc06b499448dcec71e91b4428529 --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp @@ -0,0 +1,299 @@ +#include +#include +#include +#include +#include +#include + +const int testsCount = 5; + +template< typename Matrix > +void STL_Map( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means of STL map. 
+ */ + const int matrixSize = gridSize * gridSize; + matrix.setDimensions( matrixSize, matrixSize ); + std::map< std::pair< int, int >, double > map; + for( int j = 0; j < gridSize; j++ ) + for( int i = 0; i < gridSize; i++ ) + { + const int rowIdx = j * gridSize + i; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + map.insert( std::make_pair( std::make_pair( rowIdx, rowIdx ), 1.0 ) ); + else + { + map.insert( std::make_pair( std::make_pair( rowIdx, rowIdx - gridSize ), 1.0 ) ); + map.insert( std::make_pair( std::make_pair( rowIdx, rowIdx - 1 ), 1.0 ) ); + map.insert( std::make_pair( std::make_pair( rowIdx, rowIdx ), -4.0 ) ); + map.insert( std::make_pair( std::make_pair( rowIdx, rowIdx + 1 ), 1.0 ) ); + map.insert( std::make_pair( std::make_pair( rowIdx, rowIdx + gridSize ), 1.0 ) ); + } + } + matrix.setElements( map ); +} + +template< typename Matrix > +void setElement_on_host( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means setElement method called + * from the host system. 
+ */ + const int matrixSize = gridSize * gridSize; + TNL::Containers::Vector< int, typename Matrix::DeviceType, int > rowCapacities( matrixSize, 5 ); + matrix.setDimensions( matrixSize, matrixSize ); + matrix.setRowCapacities( rowCapacities ); + + for( int j = 0; j < gridSize; j++ ) + for( int i = 0; i < gridSize; i++ ) + { + const int rowIdx = j * gridSize + i; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + matrix.setElement( rowIdx, rowIdx, 1.0 ); + else + { + matrix.setElement( rowIdx, rowIdx - gridSize, 1.0 ); + matrix.setElement( rowIdx, rowIdx - 1, 1.0 ); + matrix.setElement( rowIdx, rowIdx, -4.0 ); + matrix.setElement( rowIdx, rowIdx + 1, 1.0 ); + matrix.setElement( rowIdx, rowIdx + gridSize, 1.0 ); + } + } +} + +template< typename Matrix > +void setElement_on_host_and_transfer( const int gridSize, Matrix& matrix ) +{ + using RealType = typename Matrix::RealType; + using HostMatrix = typename Matrix::Self< RealType, TNL::Devices::Host >; + + const int matrixSize = gridSize * gridSize; + TNL::Containers::Vector< int, typename HostMatrix::DeviceType, int > rowCapacities( matrixSize, 5 ); + HostMatrix hostMatrix( matrixSize, matrixSize ); + hostMatrix.setRowCapacities( rowCapacities ); + + for( int j = 0; j < gridSize; j++ ) + for( int i = 0; i < gridSize; i++ ) + { + const int rowIdx = j * gridSize + i; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + hostMatrix.setElement( rowIdx, rowIdx, 1.0 ); + else + { + hostMatrix.setElement( rowIdx, rowIdx - gridSize, 1.0 ); + hostMatrix.setElement( rowIdx, rowIdx - 1, 1.0 ); + hostMatrix.setElement( rowIdx, rowIdx, -4.0 ); + hostMatrix.setElement( rowIdx, rowIdx + 1, 1.0 ); + hostMatrix.setElement( rowIdx, rowIdx + gridSize, 1.0 ); + } + } + matrix = hostMatrix; +} + +template< typename Matrix > +void setElement_on_device( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite 
difference method by means of setElement method called + * from the native device. + */ + const int matrixSize = gridSize * gridSize; + TNL::Containers::Vector< int, typename Matrix::DeviceType, int > rowCapacities( matrixSize, 5 ); + matrix.setDimensions( matrixSize, matrixSize ); + matrix.setRowCapacities( rowCapacities ); + + auto matrixView = matrix.getView(); + auto f = [=] __cuda_callable__ ( int i, int j ) mutable { + const int rowIdx = j * gridSize + i; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + matrixView.setElement( rowIdx, rowIdx, 1.0 ); + else + { + matrixView.setElement( rowIdx, rowIdx - gridSize, 1.0 ); + matrixView.setElement( rowIdx, rowIdx - 1, 1.0 ); + matrixView.setElement( rowIdx, rowIdx, -4.0 ); + matrixView.setElement( rowIdx, rowIdx + 1, 1.0 ); + matrixView.setElement( rowIdx, rowIdx + gridSize, 1.0 ); + } + }; + TNL::Algorithms::ParallelFor2D< typename Matrix::DeviceType >::exec( 0, 0, gridSize, gridSize, f ); +} + +template< typename Matrix > +void getRow( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means of getRow method. 
+ */ + const int matrixSize = gridSize * gridSize; + TNL::Containers::Vector< int, typename Matrix::DeviceType, int > rowCapacities( matrixSize, 5 ); + matrix.setDimensions( matrixSize, matrixSize ); + matrix.setRowCapacities( rowCapacities ); + + auto matrixView = matrix.getView(); + auto f = [=] __cuda_callable__ ( int rowIdx ) mutable { + const int i = rowIdx % gridSize; + const int j = rowIdx / gridSize; + auto row = matrixView.getRow( rowIdx ); + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 ) + row.setElement( 2, rowIdx, 1.0 ); + else + { + row.setElement( 0, rowIdx - gridSize, 1.0 ); + row.setElement( 1, rowIdx - 1, 1.0 ); + row.setElement( 2, rowIdx, -4.0 ); + row.setElement( 3, rowIdx + 1, 1.0 ); + row.setElement( 4, rowIdx + gridSize, 1.0 ); + } + }; + TNL::Algorithms::ParallelFor< typename Matrix::DeviceType >::exec( 0, matrixSize, f ); +} + +template< typename Matrix > +void forRows( const int gridSize, Matrix& matrix ) +{ + /*** + * Set matrix representing approximation of the Laplace operator on regular + * grid using the finite difference method by means of forRows method. 
+ */ + + const int matrixSize = gridSize * gridSize; + TNL::Containers::Vector< int, typename Matrix::DeviceType, int > rowCapacities( matrixSize, 5 ); + matrix.setDimensions( matrixSize, matrixSize ); + matrix.setRowCapacities( rowCapacities ); + + auto f = [=] __cuda_callable__ ( int rowIdx, int localIdx, int& columnIdx, float& value, bool& compute ) mutable { + const int i = rowIdx % gridSize; + const int j = rowIdx / gridSize; + if( i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1 && localIdx == 0 ) + { + columnIdx = rowIdx; + value = 1.0; + } + else + { + switch( localIdx ) + { + case 0: + columnIdx = rowIdx - gridSize; + value = 1.0; + break; + case 1: + columnIdx = rowIdx - 1; + value = 1.0; + break; + case 2: + columnIdx = rowIdx; + value = -4.0; + break; + case 3: + columnIdx = rowIdx + 1; + value = 1.0; + break; + case 4: + columnIdx = rowIdx + gridSize; + value = 1.0; + break; + } + } + }; + matrix.forRows( 0, matrixSize, f ); +} + +template< typename Device > +void laplaceOperatorSparseMatrix() +{ + std::cout << " Sparse matrix test:" << std::endl; + for( int gridSize = 16; gridSize <= 8192; gridSize *= 2 ) + { + std::cout << " Grid size = " << gridSize << std::endl; + TNL::Timer timer; + + std::cout << " STL map: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::SparseMatrix< float, Device, int > matrix; + STL_Map( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " setElement on host: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::SparseMatrix< float, Device, int > matrix; + setElement_on_host( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." 
<< std::endl; + + if( std::is_same< Device, TNL::Devices::Cuda >::value ) + { + std::cout << " setElement on host and transfer on GPU: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::SparseMatrix< float, Device, int > matrix; + setElement_on_host_and_transfer( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + } + + std::cout << " setElement on device: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::SparseMatrix< float, Device, int > matrix; + setElement_on_device( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " getRow: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::SparseMatrix< float, Device, int > matrix; + getRow( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + std::cout << " forRows: "; + timer.reset(); + timer.start(); + for( int i = 0; i < testsCount; i++ ) + { + TNL::Matrices::SparseMatrix< float, Device, int > matrix; + forRows( gridSize, matrix ); + } + timer.stop(); + std::cout << timer.getRealTime() / ( double ) testsCount << " sec." << std::endl; + + } +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Creating Laplace operator matrix on CPU ... " << std::endl; + laplaceOperatorSparseMatrix< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Creating Laplace operator matrix on CUDA GPU ... 
" << std::endl; + laplaceOperatorSparseMatrix< TNL::Devices::Cuda >(); +#endif +} diff --git a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cu b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cu new file mode 120000 index 0000000000000000000000000000000000000000..f5a79c1329a2e809e2a0bede0f89741b92eb0e7e --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cu @@ -0,0 +1 @@ +SparseMatrixSetup_Benchmark.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixViewExample_setElement.cpp b/Documentation/Tutorials/Matrices/SparseMatrixViewExample_setElement.cpp new file mode 120000 index 0000000000000000000000000000000000000000..0b861369e1d298d1a4a28d50d1d1a41186fd7f7c --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixViewExample_setElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/SparseMatrixViewExample_setElement.cu b/Documentation/Tutorials/Matrices/SparseMatrixViewExample_setElement.cu new file mode 120000 index 0000000000000000000000000000000000000000..9a6e8304dda6e10d06cfd87fe8b1989b16dfa7ce --- /dev/null +++ b/Documentation/Tutorials/Matrices/SparseMatrixViewExample_setElement.cu @@ -0,0 +1 @@ +../../Examples/Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cu \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_Constructor_init_list_1.cpp b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_Constructor_init_list_1.cpp new file mode 120000 index 0000000000000000000000000000000000000000..f074fa48bc889cf387995910e92895dcedbd8ec2 --- /dev/null +++ b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_Constructor_init_list_1.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_Constructor_init_list_1.cpp \ No newline at end of file diff 
--git a/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_rowsReduction.cpp b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_rowsReduction.cpp new file mode 120000 index 0000000000000000000000000000000000000000..5a8b79027874d4ee8a1d70e07228811e6cafc872 --- /dev/null +++ b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_rowsReduction.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_rowsReduction.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_setElement.cpp b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_setElement.cpp new file mode 120000 index 0000000000000000000000000000000000000000..aa3443952bb40bb19e089d06357c8d433c5ea204 --- /dev/null +++ b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_setElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_setElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_setElements.cpp b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_setElements.cpp new file mode 120000 index 0000000000000000000000000000000000000000..6a1a2e1ef872fe6664d627e73a895d9d3bf744c0 --- /dev/null +++ b/Documentation/Tutorials/Matrices/TridiagonalMatrixExample_setElements.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_setElements.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_forRows.cpp b/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_forRows.cpp new file mode 120000 index 0000000000000000000000000000000000000000..8f072994a75e9fd388c7ed858be86fd0fd741348 --- /dev/null +++ b/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_forRows.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_forRows.cpp \ No newline at end of file diff --git 
a/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_getRow.cpp b/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_getRow.cpp new file mode 120000 index 0000000000000000000000000000000000000000..960c717f44d6fbbbe82eb34363d840b6412ba6e6 --- /dev/null +++ b/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_getRow.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_getRow.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_setElement.cpp b/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_setElement.cpp new file mode 120000 index 0000000000000000000000000000000000000000..59094634e4b36a17746a4bedfe5bb2c3fb66b4f2 --- /dev/null +++ b/Documentation/Tutorials/Matrices/TridiagonalMatrixViewExample_setElement.cpp @@ -0,0 +1 @@ +../../Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_setElement.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/snippet_CompressedRowLengthsLambda_declaration.cpp b/Documentation/Tutorials/Matrices/snippet_CompressedRowLengthsLambda_declaration.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ac3a9de9490406229c4aca8947518438180bddb5 --- /dev/null +++ b/Documentation/Tutorials/Matrices/snippet_CompressedRowLengthsLambda_declaration.cpp @@ -0,0 +1,4 @@ +auto compressedRowLengths = [=] __cuda_callable__ ( + Index rows, + Index columns, + Index row ) -> Index; \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/snippet_MatrixElementsLambda_declaration.cpp b/Documentation/Tutorials/Matrices/snippet_MatrixElementsLambda_declaration.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0db51ee48f6225284e767bcf88eacc8aa5c7ba00 --- /dev/null +++ b/Documentation/Tutorials/Matrices/snippet_MatrixElementsLambda_declaration.cpp @@ -0,0 +1,8 @@ +auto matrixElements = [=] __cuda_callable__ ( + Index rows, + Index 
columns, + Index row, + Index localIdx, + Index& columnIdx, + Real& value ); + diff --git a/Documentation/Tutorials/Matrices/snippet_rows_reduction_fetch_declaration.cpp b/Documentation/Tutorials/Matrices/snippet_rows_reduction_fetch_declaration.cpp new file mode 100644 index 0000000000000000000000000000000000000000..57439abfa3cb3faa7147a2e0f3c0d6567a2e4a69 --- /dev/null +++ b/Documentation/Tutorials/Matrices/snippet_rows_reduction_fetch_declaration.cpp @@ -0,0 +1 @@ +auto fetch = [=] __cuda_callable__ ( Index rowIdx, Index columnIdx, const Real& value ) -> Real; \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/snippet_rows_reduction_keep_declaration.cpp b/Documentation/Tutorials/Matrices/snippet_rows_reduction_keep_declaration.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8428768ee4560646f5aedc56c29ea83d144295ab --- /dev/null +++ b/Documentation/Tutorials/Matrices/snippet_rows_reduction_keep_declaration.cpp @@ -0,0 +1 @@ +auto keep = [=] __cuda_callable__ ( Index rowIdx, const Real& value ) mutable; \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/snippet_rows_reduction_reduce_declaration.cpp b/Documentation/Tutorials/Matrices/snippet_rows_reduction_reduce_declaration.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e2d63c014839b9e21c05f7e428982b668b993821 --- /dev/null +++ b/Documentation/Tutorials/Matrices/snippet_rows_reduction_reduce_declaration.cpp @@ -0,0 +1 @@ +auto reduce = [] __cuda_callable__ ( const Real& a, const Real& b ) -> Real; \ No newline at end of file diff --git a/Documentation/Tutorials/Matrices/tutorial_Matrices.md b/Documentation/Tutorials/Matrices/tutorial_Matrices.md new file mode 100644 index 0000000000000000000000000000000000000000..c6847e64c8f10673fb6201161dc36bc4a3ebbd26 --- /dev/null +++ b/Documentation/Tutorials/Matrices/tutorial_Matrices.md @@ -0,0 +1,1416 @@ +\page tutorial_Matrices Matrices tutorial + +## Table of Contents +1. 
[Overview of matrix types](#overview_of_matrix_types) +2. [Indexing of nonzero matrix elements in sparse matrices](#indexing_of_nonzero_matrix_elements_in_sparse_matrices) +3. [Matrix view](#matrix_view) +4. [Allocation and setup of different matrix types](#allocation_and_setup_of_different_matrix_types) + 1. [Dense matrices](#dense_matrices_setup) + 2. [Sparse matrices](#sparse_matrices_setup) + 3. [Tridiagonal matrices](#tridiagonal_matrices_setup) + 4. [Multidiagonal matrices](#multidiagonal_matrices_setup) + 5. [Lambda matrices](#lambda_matrices_setup) + 6. [Distributed matrices](#distributed-matrices-setup) +5. [Flexible reduction in matrix rows](#flexible_reduction_in_matrix_rows) + 1. [Dense matrices example](#dense-matrices-flexible-reduction-example) + 2. [Sparse matrices example](#sparse-matrices-flexible-reduction-example) + 3. [Tridiagonal matrices example](#tridiagonal-matrices-flexible-reduction-example) + 4. [Multidiagonal matrices example](#multidiagonal-matrices-flexible-reduction-example) + 5. [Lambda matrices example](#lambda-matrices-flexible-reduction-example) +6. [Matrix-vector product](#matrix_vector_product) +7. [Matrix I/O operations](#matrix_io_operations) + 1. [Matrix reader](#matrix-reader) + 2. [Matrix writer](#matrix-writer) +8. [Appendix](#appendix) + +## Introduction + +TNL offers several types of matrices like dense (\ref TNL::Matrices::DenseMatrix), sparse (\ref TNL::Matrices::SparseMatrix), tridiagonal (\ref TNL::Matrices::TridiagonalMatrix), multidiagonal (\ref TNL::Matrices::MultidiagonalMatrix) and lambda matrices (\ref TNL::Matrices::LambdaMatrix). The sparse matrices can be symmetric to lower the memory requirements. The interfaces of given matrix types are designed to be as unified as possible to ensure that the user can easily switch between different matrix types while making no or only a little changes in the source code. 
All matrix types allows traversing all matrix elements and manipulate them using lambda functions as well as performing flexible reduction in matrix rows. The following text describes particular matrix types and their unified interface in details. + + +## Overview of matrix types + +In a lot of numerical algorithms either dense or sparse matrices are used. The dense matrix (\ref TNL::Matrices::DenseMatrix) is such that all or at least most of its matrix elements are nonzero. On the other hand [sparse matrix](https://en.wikipedia.org/wiki/Sparse_matrix) (\ref TNL::Matrices::SparseMatrix) is a matrix which has most of the matrix elements equal to zero. From the implementation point of view, the data structures for the dense matrices allocates all matrix elements while formats for the sparse matrices aim to store explicitly only the nonzero matrix elements. The most popular format for storing the sparse matrices is [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)). However, especially for better data alignment in memory of GPUs, many other formats were designed. In TNL, the user may choose between several different sparse matrix formats. There are also sparse matrices with specific pattern of the nonzero elements like [tridiagonal matrices](https://en.wikipedia.org/wiki/Tridiagonal_matrix) (\ref TNL::Matrices::TridiagonalMatrix) which "has nonzero elements on the main diagonal, the first diagonal below this, and the first diagonal above the main diagonal only". An example of such matrix may look as follows: + +\f[ +\left( + \begin{array}{ccccccc} + -2 & 1 & . & . & . & . \\ + 1 & -2 & 1 & . & . & . \\ + . & 1 & -2 & 1 & . & . \\ + . & . & 1 & -2 & 1 & . \\ + . & . & . & 1 & -2 & 1 \\ + . & . & . & . 
& 1 & -2 + \end{array} + \right) +\f] + +Similar but more general type of matrices are multidiagonal matrices (\ref TNL::Matrices::MultidiagonalMatrix) which have the nonzero matrix elements positioned only on lines parallel to the main diagonal like the following matrix: + +\f[ + \left( + \begin{array}{ccccccc} + -4 & 1 & . & 1 & . & . \\ + 1 & -4 & 1 & . & 1 & . \\ + . & 1 & -4 & 1 & . & 1 \\ + 1 & . & 1 & -4 & 1 & . \\ + . & 1 & . & 1 & -4 & 1 \\ + . & . & 1 & . & 1 & -4 + \end{array} + \right) + \f] + +Finally, TNL offers so called *lambda matrices* (\ref TNL::Matrices::LambdaMatrix) which are kind of "matrix-free matrices". They do not store the matrix elements explicitly in the memory, but rather evaluates them on-the-fly based on user defined lambda functions. + +In the following table we show comparison of expressing a tridiagonal matrix by means of different matrix types. + +| Matrix dimensions | Dense elems. | Dense mem. | Sparse elems. | Sparse mem. | Tridiag. elems. | Tridiag. mem. | Multidiag. elems. | Mutlidiag. mem. | +|------------------:|---------------:|-----------:|--------------:|-------------:|----------------:|--------------:|------------------:|----------------:| +| 10x10 | 100 | 800 B | >28 | >336 B | 30 | 240 B | 30 | 252 B | +| 100x100 | 10,000 | 80 kB | >298 | >3,576 B | 300 | 2,400 B | 300 | 2,412 B | +| 1,000x1,000 | 1,000,000 | 8 MB | >2,998 | >35,976 B | 3,000 | 24,000 B | 3,000 | 24,012 B | +| 10,000x10,000 | 100,000,000 | 800 MB | >29,998 | >359,976 B | 30,000 | 240,000 B | 30,000 | 240,012 B | +| 100,000x100,000 | 10,000,000,000 | 80 GB | >299,998 | >3,599,876 B | 300,000 | 2,400,000 B | 300,000 | 2,400,012 B | + +In the table: + +* **Matrix dimensions** is the number of matrix rows and columns +* **Dense elems.** is the number of allocated matrix elements in the dense matrix. +* **Dense mem.** is the allocated memory for the matrix elements in the dense matrix if the elements are stored in the double precision. 
+* **Sparse elems.** is the number of allocated matrix elements in the sparse matrix. Some formats may allocate padding zeros for better data alignment in the memory and so the number of allocated matrix elements may increase.
+* **Sparse mem.** is the allocated memory for the matrix elements in the sparse matrix if the elements are stored in the double precision and column indexes in 32-bit integer.
+* **Tridiag. elems** is the number of allocated matrix elements in the tridiagonal matrix.
+* **Tridiag mem.** is the allocated memory for the matrix elements in the tridiagonal matrix if the elements are stored in the double precision.
+* **Multidiag. elems** is the number of allocated matrix elements in the multidiagonal matrix.
+* **Multidiag mem.** is the allocated memory for the matrix elements in the multidiagonal matrix if the elements are stored in the double precision.
+
+Choosing the best matrix type can have tremendous impact on the performance but also on memory requirements. If we treated each matrix as a dense one, we would not be able to work with matrices larger than 50,000x50,000 on common personal computers, because we would need tens of gigabytes of memory. At the same time, we see that the other matrix types can do the same job with only a few megabytes. In addition, the other matrix types work with much fewer matrix elements and so operations like matrix-vector multiplication can be done with significantly fewer operations, which means much faster. Since in modern hardware architectures the computing units are limited mainly by the performance of the memory chips (the so-called [memory wall](https://en.wikipedia.org/wiki/Random-access_memory#Memory_wall)), transferring less data from the memory increases the performance even more.
+
+The following table shows the same as the one above but when storing a matrix which has only five nonzero elements in each row.
Such matrices arise often from the finite difference method for solution of the partial differential equations:
+
+| Matrix dimensions | Dense elems.   | Dense mem. | Sparse elems. | Sparse mem.  | Multidiag. elems. | Multidiag. mem. |
+|------------------:|---------------:|-----------:|--------------:|-------------:|------------------:|----------------:|
+| 10x10             | 100            | 800 B      | >50           | >600 B       | 50                | 420 B           |
+| 100x100           | 10,000         | 80 kB      | >500          | >6,000 B     | 500               | 4,020 B         |
+| 1,000x1,000       | 1,000,000      | 8 MB       | >5,000        | >60,000 B    | 5,000             | 40,020 B        |
+| 10,000x10,000     | 100,000,000    | 800 MB     | >50,000       | >600,000 B   | 50,000            | 400,020 B       |
+| 100,000x100,000   | 10,000,000,000 | 80 GB      | >500,000      | >6,000,000 B | 500,000           | 4,000,020 B     |
+
+There is no change in the dense matrix part of the table. The numbers grow proportionally in case of sparse and multidiagonal matrix. General sparse matrix formats need to store column indexes for each matrix element which is not true for the multidiagonal matrix. The following table shows how many bytes we need for storing one matrix element with different matrix types depending on the type of the matrix elements (`Real`) and column indexes (`Index`):
+
+| Real   | Index  | Dense matrix | Multidiagonal matrix | Sparse matrix  | Fill ratio |
+|:------:|:------:|:------------:|:--------------------:|:--------------:|:----------:|
+| float  | 32-bit | 4 B          | 4 B                  | 8 B            | << 50%     |
+| float  | 64-bit | 4 B          | 4 B                  | 12 B           | << 30%     |
+| double | 32-bit | 8 B          | 8 B                  | 12 B           | << 60%     |
+| double | 64-bit | 8 B          | 8 B                  | 16 B           | << 50%     |
+
+In this table:
+
+* **Real** is matrix element type.
+* **Index** is column index type.
+* **Dense matrix** is number of bytes needed to store one matrix element in the dense matrix.
+* **Multidiagonal matrix** is number of bytes needed to store one matrix element in the multidiagonal matrix.
+* **Sparse matrix** is number of bytes needed to store one matrix element in the sparse matrix.
+* **Fill ratio** is maximal percentage of the nonzero matrix elements until which the sparse matrix can perform better. + +The multidiagonal matrix type is especially suitable for the finite difference method or similar numerical methods for solution of the partial differential equations. + +## Indexing of nonzero matrix elements in sparse matrices + +The sparse matrix formats usually, in the first step, compress the matrix rows by omitting the zero matrix elements as follows + +\f[ +\left( +\begin{array}{ccccc} +0 & 1 & 0 & 2 & 0 \\ +0 & 0 & 5 & 0 & 0 \\ +4 & 0 & 0 & 0 & 7 \\ +0 & 3 & 0 & 8 & 5 \\ +0 & 5 & 7 & 0 & 0 +\end{array} +\right) +\rightarrow +\left( +\begin{array}{ccccc} +1 & 2 & . & . & . \\ +5 & . & . & . & . \\ +4 & 7 & . & . & . \\ +3 & 8 & 5 & . & . \\ +5 & 7 & . & . & . +\end{array} +\right) +\f] + +In such a form, it is more efficient to refer the nonzero matrix elements in given row by their rank in the compressed matrix row rather than by their column index in the original matrix. In methods for the sparse matrices, this parameter is called `localIdx`. Some sparse matrix formats add padding zeros for better alignment of data in memory. But if this is not the case, the variable `localIdx` of particular matrix elements would read as: + +\f[ +\left( +\begin{array}{ccccc} +0 & 1 & . & . & . \\ +0 & . & . & . & . \\ +0 & 1 & . & . & . \\ +0 & 1 & 2 & . & . \\ +0 & 1 & . & . & . +\end{array} +\right) +\f] + +## Matrix view + +Matrix views are small reference objects which help accessing the matrix in GPU kernels or lambda functions being executed on GPUs. We describe this in details in section about [Shared pointers and views](tutorial_GeneralConcepts.html#shared-pointers-and-views). The problem lies in fact that we cannot pass references to GPU kernels and we do not want to pass there deep copies of matrices. Matrix view is some kind of reference to a matrix. A copy of matrix view is always shallow and so it behaves like a reference. 
The following example shows how to obtain the matrix view by means of method `getView` and pass it to a lambda function: + +\includelineno SparseMatrixViewExample_getRow.cpp + +Here we create sparse matrix `matrix` on the line 11, and use the method `getView` to get the matrix view on the line 12. The view is then used in the lambda function on the line 15 where it is captured by value (see `[=]` in the definition of the lambda function `f` on the line 14). + + +## Allocation and setup of different matrix types + +There are several ways how to create a new matrix: + +1. **Initializer lists** allow to create matrix from the [C++ initializer lists](https://en.cppreference.com/w/cpp/utility/initializer_list). The matrix elements must be therefore encoded in the source code and so it is useful for rather smaller matrices. Methods and constructors with initializer lists are user friendly and simple to use. It is a good choice for tool problems with small matrices. +2. **STL map** can be used for creation of sparse matrices only. The user first insert all matrix elements together with their coordinates into [`std::map`](https://en.cppreference.com/w/cpp/container/map) based on which the sparse matrix is created in the next step. It is simple and user friendly approach suitable for creation of large matrices. An advantage is that we do not need to know the distribution of the nonzero matrix elements in matrix rows in advance like we do in other ways of construction of sparse matrices. This makes the use of STL map suitable for combining of sparse matrices from TNL with other numerical packages. However, the sparse matrix is constructed on the host and then copied on GPU if necessary. Therefore, this approach is not a good choice if fast and efficient matrix construction is required. +3. **Methods `setElement` and `addElement` called from the host** allow to change particular matrix elements. The methods can be called from host even for matrices allocated on GPU. 
In this case, however, the matrix elements are transferred on GPU one by one which is very inefficient. If the matrix is allocated on the host system (CPU), the efficiency is nearly optimal. In case of sparse matrices, one must set row capacities (i.e. maximal number of nonzero elements in each row) before using these methods. If the row capacity is exceeded, the matrix has to be reallocated and all matrix elements are lost.
+4. **Methods `setElement` and `addElement` called on the host and copy matrix on GPU** setting particular matrix elements by the methods `setElement` and `addElement` when the matrix is allocated on GPU can be time consuming for large matrices. Setting up the matrix on CPU using the same methods and copying it on GPU at once when the setup is finished can be significantly more efficient. A drawback is that we need to temporarily allocate the whole matrix on CPU.
+5. **Methods `setElement` and `addElement` called from native device** allow efficient setup of matrix elements even on devices (GPUs). In this case, the methods must be called from a GPU kernel or a lambda function combined with the parallel for (\ref TNL::Algorithms::ParallelFor). The user gets very good performance even when manipulating a matrix allocated on GPU. On the other hand, only data structures allocated on GPUs can be accessed from the kernel or lambda function. The matrix can be accessed in the GPU kernel or lambda function by means of [matrix view](#matrix_view) or the shared pointer (\ref TNL::Pointers::SharedPointer).
+6. **Method `getRow` combined with `ParallelFor`** is very similar to the previous one. The difference is that we first fetch a helper object called *matrix row* which is linked to a particular matrix row. Using methods of this object, one may change the matrix elements in the given matrix row. An advantage is that the access to the matrix row is resolved only once for all elements in the row.
In some more sophisticated sparse matrix formats, this can be nontrivial operation and this approach may slightly improve the performance. Another advantage for sparse matrices is that we access the matrix elements based on their *local index* ('localIdx', see [Indexing of nonzero matrix elements in sparse matrices](indexing_of_nonzero_matrix_elements_in_sparse_matrices)) in the row which is something like a rank of the nonzero element in the row. This is more efficient than addressing the matrix elements by the column indexes which requires searching in the matrix row. So this may significantly improve the performance of setup of sparse matrices. When it comes to dense matrices, there should not be great difference in performance compared to use of the methods `setElement` and `getElement`. Note that when the method is called from a GPU kernel or a lambda function, only data structures allocated on GPU can be accessed and the matrix must be made accessible by the means of matrix view. +7. **Method `forRows`** this approach is very similar to the previous one but it avoids using `ParallelFor` and necessity of passing the matrix to GPU kernels by matrix view or shared pointers. + +The following table shows pros and cons of particular methods: + +| Method | Efficient | Easy to use | Pros | Cons | +|:----------------------------------------|:----------|:------------|:----------------------------------------------------------------------|:----------------------------------------------------------------------| +| **Initializer list** | ** | ***** | Very easy to use. | Only for small matrices. | +| | | | Does not need setting of matrix rows capacities | | +| **STL map** | ** | ***** | Very easy to use. | Higher memory requirements. | +| | | | Does not need setting of matrix rows capacities | Slow transfer on GPU. | +| **[set,add]Element on host** | ****/* | ***** | Very easy to use. | Requires setting of row capacities. | +| | | | | Extremely slow transfer on GPU. 
|
+| **[set,add]Element on host & copy on GPU**| *** | **** | Easy to use. | Requires setting of row capacities. |
+| | | | Reasonable efficiency. | Allocation of auxiliary matrix on CPU. |
+| **[set,add]Element on native device** | **** | | Good efficiency. | Requires setting of row capacities. |
+| | | | | Requires writing GPU kernel or lambda function. |
+| | | | | Allows accessing only data allocated on the same device/memory space. |
+| **getRow and ParallelFor** | ***** | ** | Best efficiency for sparse matrices. | Requires setting of row capacities. |
+| | | | | Requires writing GPU kernel or lambda function. |
+| | | | | Allows accessing only data allocated on the same device/memory space. |
+| | | | | Use of matrix local indexes can be less intuitive. |
+| **forRows** | ***** | ** | Best efficiency for sparse matrices. | Requires setting of row capacities. |
+| | | | Avoid use of matrix view or shared pointer in kernels/lambda function.| Requires writing GPU kernel or lambda function. |
+| | | | | Allows accessing only data allocated on the same device/memory space. |
+| | | | | Use of matrix local indexes is less intuitive. |
+
+Though it may seem that the latter methods come with more cons than pros, they offer much higher performance and we believe that they are still user friendly. On the other hand, if the matrix setup performance is not a priority, using an easy-to-use but slow method can still be a good choice. The following tables demonstrate the performance of different methods. The tests were performed with the following setup:
+
+| | |
+|--------------|---------------------------------------------------|
+| CPU | Intel i9-9900KF, 3.60GHz, 8 cores, 16384 KB cache |
+| GPU | GeForce RTX 2070 |
+| g++ version | 10.2.0 |
+| nvcc version | 11.2.67 |
+| Precision | single precision |
+
+### Dense matrix
+
+In the test of dense matrices, we set each matrix element to a value equal to `rowIdx + columnIdx`.
The times in seconds obtained on CPU looks as follows: + +| Matrix rows and columns | `setElement` on host | `setElement` with `ParallelFor` | `getRow` | `forRows` | +|----------------------------:|---------------------:|--------------------------------:|-------------:|------------:| +| 16 | 0.00000086 | 0.0000053 | 0.00000035 | 0.0000023 | +| 32 | 0.00000278 | 0.0000050 | 0.00000201 | 0.0000074 | +| 64 | 0.00000703 | 0.0000103 | 0.00000354 | 0.0000203 | +| 128 | 0.00002885 | 0.0000312 | 0.00000867 | 0.0000709 | +| 256 | 0.00017543 | 0.0000439 | 0.00002490 | 0.0001054 | +| 512 | 0.00078153 | 0.0001683 | 0.00005999 | 0.0002713 | +| 1024 | 0.00271989 | 0.0006691 | 0.00003808 | 0.0003942 | +| 2048 | 0.01273520 | 0.0038295 | 0.00039116 | 0.0017083 | +| 4096 | 0.08381450 | 0.0716542 | 0.00937997 | 0.0116771 | +| 8192 | 0.51596800 | 0.3535530 | 0.03971900 | 0.0467374 | + +Here: + +* **setElement on host** tests run in one thread. Therefore they are faster for small matrices compared to "`setElement` with `ParallelFor`" tests. +* **setElement with ParallelFor** tests run in parallel in several OpenMP threads. This approach is faster for larger matrices. +* **getRow** tests run in parallel in several OpenMP threads mapping of which is more efficient compared to "`setElement` on host" tests. 
+ +And the same on GPU is in the following table: + +| Matrix rows and columns | `setElement` on host | `setElement` on host and copy | `setElement` on GPU | `getRow` | `forRows` | +|----------------------------:|---------------------:|------------------------------:|--------------------:|-------------:|------------:| +| 16 | 0.027835 | 0.02675 | 0.000101198 | 0.00009903 | 0.000101214 | +| 32 | 0.002776 | 0.00018 | 0.000099197 | 0.00009901 | 0.000100481 | +| 64 | 0.010791 | 0.00015 | 0.000094446 | 0.00009493 | 0.000101796 | +| 128 | 0.043014 | 0.00021 | 0.000099397 | 0.00010024 | 0.000102729 | +| 256 | 0.171029 | 0.00056 | 0.000100469 | 0.00010448 | 0.000105893 | +| 512 | 0.683627 | 0.00192 | 0.000103346 | 0.00011034 | 0.000112752 | +| 1024 | 2.736680 | 0.00687 | 0.000158805 | 0.00016932 | 0.000170302 | +| 2048 | 10.930300 | 0.02474 | 0.000509000 | 0.00050917 | 0.000511183 | +| 4096 | 43.728700 | 0.13174 | 0.001557030 | 0.00156117 | 0.001557930 | +| 8192 | 174.923000 | 0.70602 | 0.005312470 | 0.00526658 | 0.005263870 | + +Here: + +* **setElement on host** tests are very slow especially for large matrices since each matrix element is copied on GPU separately. +* **setElement on host and copy** tests are much faster because the matrix is copied from CPU to GPU on the whole which is more efficient. +* **setElement on GPU** tests are even more faster since there is no transfer of data between CPU and GPU. +* **getRow** tests have the same performance as "`setElement` on GPU". +* **forRows** tests have the same performance as both "`setElement` on GPU" and "`getRow`". + +You can see the source code of the previous benchmark in [Appendix](#benchmark-of-dense-matrix-setup). + +### Sparse matrix + +The sparse matrices are tested on computation of matrix the [discrete Laplace operator in 2D](https://en.wikipedia.org/wiki/Discrete_Laplace_operator). This matrix has at most five nonzero elements in each row. 
The times for sparse matrix (with CSR format) on CPU in seconds looks as follows: + +| Matrix rows and columns | STL Map | `setElement` on host | `setElement` with `ParallelFor` | `getRow` | `forRows` | +|----------------------------:|-------------:|---------------------:|--------------------------------:|------------:|-------------:| +| 256 | 0.00016 | 0.000017 | 0.000014 | 0.000013 | 0.000020 | +| 1,024 | 0.00059 | 0.000044 | 0.000021 | 0.000019 | 0.000022 | +| 4,096 | 0.00291 | 0.000130 | 0.000031 | 0.000022 | 0.000031 | +| 16,384 | 0.01414 | 0.000471 | 0.000067 | 0.000031 | 0.000065 | +| 65,536 | 0.06705 | 0.001869 | 0.000218 | 0.000074 | 0.000209 | +| 262,144 | 0.31728 | 0.007436 | 0.000856 | 0.000274 | 0.000799 | +| 1,048,576 | 1.46388 | 0.027087 | 0.006162 | 0.005653 | 0.005904 | +| 4,194,304 | 7.46147 | 0.102808 | 0.028385 | 0.027925 | 0.027937 | +| 16,777,216 | 38.95900 | 0.413823 | 0.125870 | 0.124588 | 0.123858 | +| 67,108,864 | 185.75700 | 1.652580 | 0.505232 | 0.501003 | 0.500927 | + +Here: + +* **STL Map** tests show that use of STL Map can be very slow on large matrices and, of course, they need to allocate the map containing all the matrix elements. This can be memory consuming. On the other hand, it is the only way which does not require knowing the matrix row capacities in advance. +* **setElement on host** tests are much faster compared to STL map, it does not need to allocate anything else except the sparse matrix. However, matrix row capacities must be known in advance. +* **setElement with ParallelFor** tests run in parallel in several OpenMP threads and so this can be faster for larger matrices. +* **getRow** tests perform the same as "setElement with ParallelFor". +* **forRows** tests perform the same as both "setElement with ParallelFor" and "forRows". + +We see, that the use of STL map makes sense only in situation when it is hard to estimate necessary row capacities. Otherwise very easy setup with `setElement` method is much faster. 
If the performance is the highest priority, `getRow` method should be preferred. The results for GPU are in the following table: + +| Matrix rows and columns | STL Map | `setElement` on host | `setElement` on host and copy |`setElement` on GPU | `getRow` | `forRows` | +|----------------------------:|-------------:|---------------------:|------------------------------:|-------------------:|------------:|------------:| +| 256 | 0.002 | 0.036 | 0.0280 | 0.00017 | 0.00017 | 0.00017 | +| 1,024 | 0.001 | 0.161 | 0.0006 | 0.00017 | 0.00017 | 0.00017 | +| 4,096 | 0.003 | 0.680 | 0.0010 | 0.00020 | 0.00020 | 0.00020 | +| 16,384 | 0.015 | 2.800 | 0.0034 | 0.00021 | 0.00020 | 0.00021 | +| 65,536 | 0.074 | 11.356 | 0.0130 | 0.00048 | 0.00047 | 0.00048 | +| 262,144 | 0.350 | 45.745 | 0.0518 | 0.00088 | 0.00087 | 0.00088 | +| 1,048,576 | 1.630 | 183.632 | 0.2057 | 0.00247 | 0.00244 | 0.00245 | +| 4,194,304 | 8.036 | 735.848 | 0.8119 | 0.00794 | 0.00783 | 0.00788 | +| 16,777,216 | 41.057 | 2946.610 | 3.2198 | 0.02481 | 0.02429 | 0.02211 | +| 67,108,864 | 197.581 | 11791.601 | 12.7775 | 0.07196 | 0.06329 | 0.06308 | + +Here: + +* **STL Map** tests show that the times are comparable to CPU times which means the most of the time is spent by creating the matrix on CPU. +* **setElement on host** tests are again extremely slow for large matrices. It is even slower than the use of STL map. So in case of GPU, this is another reason for using the STL map. +* **setElement on host and copy** tests are, similar to the dense matrix, much faster compared to the previous approaches. So it is the best way when you need to use data structures available only on the host system (CPU). +* **setElement on GPU** tests exhibit the best performance together with `getRow` and `forRows` methods. Note, however, that this method can be slower that `getRow` and `forRows` if there would be more nonzero matrix elements in a row. 
+* **getRow** tests exhibit the best performance together with `setElement` on GPU and `forRows` methods.
+* **forRows** tests exhibit the best performance together with `getRow` and `setElement` on GPU methods.
+
+Here we see that the `setElement` method performs extremely badly because all matrix elements are transferred to GPU one-by-one. Even STL map is much faster. Note that the times for STL map are not much higher compared to CPU which indicates that the transfer of the matrix on GPU is not dominant. Setup of the matrix on CPU by means of the `setElement` method and transfer on GPU is even faster. However, the best performance can be obtained only when creating the matrix directly on GPU by methods `setElement`, `getRow` and `forRows`. Note, however, that even if all of them perform the same way, for matrices with more nonzero matrix elements in a row, `setElement` could be slower compared to `getRow` and `forRows`.
+
+You can see the source code of the previous benchmark in [Appendix](#benchmark-of-sparse-matrix-setup).
+
+### Multidiagonal matrix
+
+Finally, the following tables show the times of the same test performed with multidiagonal matrix.
Times on CPU in seconds looks as follows: + +| Matrix rows and columns | `setElement` on host | `setElement` with `ParallelFor` | `getRow` | `forRows` | +|----------------------------:|--------------------------:|--------------------------------:|------------:|------------:| +| 256 | 0.000055 | 0.0000038 | 0.000004 | 0.000009 | +| 1,024 | 0.000002 | 0.0000056 | 0.000003 | 0.000006 | +| 4,096 | 0.000087 | 0.0000130 | 0.000005 | 0.000014 | +| 16,384 | 0.000347 | 0.0000419 | 0.000010 | 0.000046 | +| 65,536 | 0.001378 | 0.0001528 | 0.000032 | 0.000177 | +| 262,144 | 0.005504 | 0.0006025 | 0.000131 | 0.000711 | +| 1,048,576 | 0.019392 | 0.0028773 | 0.001005 | 0.003265 | +| 4,194,304 | 0.072078 | 0.0162378 | 0.011915 | 0.018065 | +| 16,777,216 | 0.280085 | 0.0642682 | 0.048876 | 0.072084 | +| 67,108,864 | 1.105120 | 0.2427610 | 0.181974 | 0.272579 | + +Here: + +* **setElement on host** tests show that this method is fairly efficient. +* **setElement with ParallelFor** tests run in parallel in several OpenMP threads compared to "setElement on host" tests. For larger matrices, this way of matrix setup performs better. +* **getRow** tests perform more or less the same as "setElement with ParallelFor" and `forRows`. +* **forRows** tests perform more or less the same as "setElement with ParallelFor" and `getRow`. + +Note, that setup of multidiagonal matrix is faster compared to the same matrix stored in general sparse format. 
Results for GPU are in the following table: + +| Matrix rows and columns | `setElement` on host | `setElement` on host and copy | `setElement` on GPU | `getRow` | `forRows` | +|----------------------------:|---------------------:|------------------------------:|--------------------:|------------:|------------:| +| 256 | 0.035 | 0.02468 | 0.000048 | 0.000045 | 0.000047 | +| 1,024 | 0.059 | 0.00015 | 0.000047 | 0.000045 | 0.000047 | +| 4,096 | 0.251 | 0.00044 | 0.000048 | 0.000045 | 0.000047 | +| 16,384 | 1.030 | 0.00158 | 0.000049 | 0.000046 | 0.000048 | +| 65,536 | 4.169 | 0.00619 | 0.000053 | 0.000048 | 0.000052 | +| 262,144 | 16.807 | 0.02187 | 0.000216 | 0.000214 | 0.000217 | +| 1,048,576 | 67.385 | 0.08043 | 0.000630 | 0.000629 | 0.000634 | +| 4,194,304 | 270.025 | 0.31272 | 0.001939 | 0.001941 | 0.001942 | +| 16,777,216 | 1080.741 | 1.18849 | 0.003212 | 0.004185 | 0.004207 | +| 67,108,864 | 4326.120 | 4.74481 | 0.013672 | 0.022494 | 0.030369 | + +* **setElement on host** tests are extremely slow again, especially for large matrices. +* **setElement on host and copy** tests are much faster compared to the previous. +* **setElement with ParallelFor** tests offer the best performance. They are even faster then `getRow` and `forRows` method. This, however, does not have be true for matrices having more nonzero elements in a row. +* **getRow** tests perform more or less the same as `forRows`. For matrices having more nonzero elements in a row this method could be faster than `setElement`. +* **forRows** tests perform more or less the same as `getRow`. + +Note that multidiagonal matrix performs better compared to general sparse matrix. One reason for it is the fact, that the multidiagonal type does not store explicitly column indexes of all matrix elements. Because of this, less data need to be transferred from the memory. + +You can see the source code of the previous benchmark in [Appendix](#benchmark-of-multidiagonal-matrix-setup). 
+
+In the following parts we will describe how to set up particular matrix types by means of the methods mentioned above.
+
+### Dense matrices
+
+Dense matrix (\ref TNL::Matrices::DenseMatrix) is a templated class defined in the namespace \ref TNL::Matrices. It has five template parameters:
+
+* `Real` is a type of the matrix elements. It is `double` by default.
+* `Device` is a device where the matrix shall be allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for CUDA supporting GPUs. It is \ref TNL::Devices::Host by default.
+* `Index` is a type to be used for indexing of the matrix elements. It is `int` by default.
+* `ElementsOrganization` defines the organization of the matrix elements in memory. It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. By default, it is the row-major order if the matrix is allocated on the host system and column-major order if it is allocated on GPU.
+* `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type -- see \ref TNL::Allocators::Default.
+
+The following examples show how to allocate the dense matrix and how to initialize the matrix elements.
+
+#### Initializer list
+
+Small matrices can be created simply by the constructor with an [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list).
+
+\includelineno Matrices/DenseMatrix/DenseMatrixExample_Constructor_init_list.cpp
+
+In fact, the constructor takes a list of initializer lists. Each embedded list defines one matrix row and so the number of matrix rows is given by the size of the outer initializer list. The number of matrix columns is given by the longest inner initializer list.
Shorter inner lists are filled with zeros from the right side. The result looks as follows: + +\include DenseMatrixExample_Constructor_init_list.out + +#### Methods `setElement` and `addElement` + +Larger matrices can be setup with methods `setElement` and `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement). The following example shows how to call these methods from the host. + +\includelineno DenseMatrixExample_addElement.cpp + +As we can see, both methods can be called from the host no matter where the matrix is allocated. If it is on GPU, each call of `setElement` or `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement) causes slow transfer of tha data between CPU and GPU. Use this approach only if the performance is not a priority. The result looks as follows: + +\include DenseMatrixExample_addElement.out + +More efficient way of the matrix initialization on GPU consists of calling the methods `setElement` and `addElement` (\ref TNL::Matrices::DenseMatrix::setElement, \ref TNL::Matrices::DenseMatrix::addElement) directly from GPU, for example by means of lambda function and `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D). It is demonstrated in the following example (of course it works even on CPU): + +\includelineno DenseMatrixViewExample_setElement.cpp + +Here we get the matrix view (\ref TNL::Matrices::DenseMatrixView) (line 10) to make the matrix accessible in lambda function even on GPU (see [Shared pointers and views](tutorial_GeneralConcepts.html#shared-pointers-and-views) ). We first call the `setElement` method from CPU to set the `i`-th diagonal element to `i` (lines 11-12). Next we iterate over the matrix rows with `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D) (line 20) and for each row we call the lambda function `f`. 
This is done on the same device where the matrix is allocated and so we get optimal performance even for matrices on GPU. In the lambda function we add one to each matrix element (line 18). The result looks as follows:
+
+\include DenseMatrixExample_setElement.out
+
+#### Method `getRow`
+
+This method is available for the dense matrix (\ref TNL::Matrices::DenseMatrix::getRow) mainly for two reasons:
+
+1. The method `getRow` is recommended for sparse matrices. In most cases, it is not optimal for dense matrices. However, if one needs to have one code for both dense and sparse matrices, this method is a good choice.
+2. In general, use of `setElement` (\ref TNL::Matrices::DenseMatrix::setElement) combined with `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D) is preferred, for dense matrices, since it offers more parallelism for GPUs. `ParallelFor2D` creates one CUDA thread per each matrix element which is desirable for GPUs. With the use of the method `getRow` we have only one CUDA thread per each matrix row. This makes sense only in situations when we need to set up each matrix row sequentially.
+
+Here we show an example:
+
+\includelineno DenseMatrixViewExample_getRow.cpp
+
+Here we create the matrix on the line 10 and get the matrix view on the line 16. Next we use `ParallelFor` (\ref TNL::Algorithms::ParallelFor) (line 26) to iterate over the matrix rows and the lambda function `f` (lines 18-21) for each of them. In the lambda function, we first fetch the matrix row by means of the method `getRow` (\ref TNL::Matrices::DenseMatrixView::getRow) and next we set the matrix elements by using the method `setElement` of the matrix row (\ref TNL::Matrices::DenseMatrixRowView::setElement). For the compatibility with the sparse matrices, use the variant of `setElement` with the parameter `localIdx`. It has no effect here, it is only for compatibility of the interface.
+ +#### Method `forRows` + + The next example demonstrates the method `forRows` (\ref TNL::Matrices::DenseMatrix::forRows) which works in very similar way as the method `getRow` but it is slightly easier to use. It is also compatible with sparse matrices. See the following example: + +\includelineno DenseMatrixExample_forRows.cpp + +We do not need any matrix view and instead of calling `ParallelFor` (\ref TNL::Algorithms::ParallelFor) we call just the method `forRows` (line 18). The lambda function `f` (line 11) must accept the following parameters: + +* `rowIdx` is the row index of given matrix element. +* `columnIdx` is the column index of given matrix element. +* `value` is a reference on the matrix element value and so by changing this value we can modify the matrix element. +* `compute` is a boolean which, when set to `false`, indicates that we can skip the rest of the matrix row. This is, however, only a hint and it does not guarantee that the rest of the matrix row is really skipped. + +The result looks as follows: + +\include DenseMatrixExample_forRows.out + +### Sparse matrices + +[Sparse matrices](https://en.wikipedia.org/wiki/Sparse_matrix) are extremely important in a lot of numerical algorithms. They are used at situations when we need to operate with matrices having majority of the matrix elements equal to zero. In this case, only the non-zero matrix elements are stored with possibly some *padding zeros* used for memory alignment. This is necessary mainly on GPUs. See the [Overview of matrix types](#overview_of_matrix_types) for the differences in memory requirements. + +Major disadvantage of sparse matrices is that there are a lot of different formats for their storage in memory. Though [CSR (Compressed Sparse Row)](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) format is the most popular of all, especially for GPUs, there are many other formats. 
Often their performance differs significantly for various matrices. So it is a good idea to test several sparse matrix formats if you want to get the best performance. In TNL, there is one templated class \ref TNL::Matrices::SparseMatrix representing general sparse matrices. The change of underlying matrix format can be done just by changing one template parameter. The list of the template parameters is as follows: + +* `Real` is the type of the matrix elements. It is `double` by default. +* `Device` is a device where the matrix is allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for CUDA supporting GPUs. It is \ref TNL::Devices::Host by default. +* `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. +* `MatrixType` tells if the matrix is symmetric (\ref TNL::Matrices::SymmetricMatrix) or general (\ref TNL::Matrices::GeneralMatrix). It is a \ref TNL::Matrices::GeneralMatrix by default. +* `Segments` define the format of the sparse matrix. It can be one of the following (by default, it is \ref TNL::Algorithms::Segments::CSR): + * \ref TNL::Algorithms::Segments::CSR for [CSR format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)). + * \ref TNL::Algorithms::Segments::Ellpack for [Ellpack format](http://mgarland.org/files/papers/nvr-2008-004.pdf). + * \ref TNL::Algorithms::Segments::SlicedEllpack for [SlicedEllpack format](https://link.springer.com/chapter/10.1007/978-3-642-11515-8_10) which was also presented as [Row-grouped CSR format](https://arxiv.org/abs/1012.2270). + * \ref TNL::Algorithms::Segments::ChunkedEllpack for [ChunkedEllpack format](http://geraldine.fjfi.cvut.cz/~oberhuber/data/vyzkum/publikace/12-heller-oberhuber-improved-rgcsr-format.pdf) which we referred to as Improved Row-grouped CSR and we renamed it to Ellpack format since it uses padding zeros. 
+ * \ref TNL::Algorithms::Segments::BiEllpack for [BiEllpack format](https://www.sciencedirect.com/science/article/pii/S0743731514000458?casa_token=2phrEj0Ef1gAAAAA:Lgf6rMBUN6T7TJne6mAgI_CSUJ-jR8jz7Eghdv6L0SJeGm4jfso-x6Wh8zgERk3Si7nFtTAJngg). +* `ComputeReal` is type which is used for internal computations. By default it is the same as `Real` if `Real` is not `bool`. If `Real` is `bool`, `ComputeReal` is set to `Index` type. This can be changed, of course, by the user. +* `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type – see \ref TNL::Allocators::Default. +* `IndexAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the column indexes of the matrix elements. By default, it is the default allocator for given `Index` type and `Device` type – see \ref TNL::Allocators::Default. + +**If `Real` is set to `bool`, we get *a binary matrix* for which the non-zero elements can be equal only to one and so the matrix elements values are not stored explicitly in the memory.** This can significantly reduce the memory requirements and also increase performance. + +In the following text we will show how to create and setup sparse matrices. + +#### Setting of row capacities + +Larger sparse matrices are created in two steps: + +1. We use a method \ref TNL::Matrices::SparseMatrix::setRowCapacities to initialize the underlying matrix format and to allocate memory for the matrix elements. This method only needs to know how many non-zero elements are supposed to be in each row. Once this is set, it cannot be changed only by resetting the whole matrix. In most situations, it is not an issue to compute the number of nonzero elements in each row. Otherwise, we can currently only recommend the use of matrix setup with [STL map](#sparse-matrix-stl-map), which is, however, quite slow. 
+2. Now, the nonzero matrix elements can be set one after another by telling its coordinates and a value. Since majority of sparse matrix formats are designed to allow quick access to particular matrix rows the insertion can be done in parallel by mapping different threads to different matrix rows. This approach is usually optimal or nearly optimal when it comes to efficiency. + +See the following example which creates lower triangular matrix like this one + +\f[ +\left( +\begin{array}{ccccc} + 1 & 0 & 0 & 0 & 0 \\ + 2 & 1 & 0 & 0 & 0 \\ + 3 & 2 & 1 & 0 & 0 \\ + 4 & 3 & 2 & 1 & 0 \\ + 5 & 4 & 3 & 2 & 1 +\end{array} +\right). +\f] + +\includelineno SparseMatrixExample_setRowCapacities.cpp + +The method \ref TNL::Matrices::SparseMatrix::setRowCapacities reads the required capacities of the matrix rows from a vector (or simmilar container - \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector and \ref TNL::Containers::VectorView) which has the same number of elements as the number of matrix rows and each element defines the capacity of the related row. The result looks as follows: + +\include SparseMatrixExample_setRowCapacities.out + +There are constructors which also set the row capacities. The first one uses a vector: + +\includelineno SparseMatrixExample_Constructor_rowCapacities_vector.cpp + +The second one uses an initializer list: + +\includelineno SparseMatrixExample_Constructor_init_list_1.cpp + +The result of both examples looks as follows: + +\include SparseMatrixExample_Constructor_init_list_1.out + + +#### Initializer list + +Small matrices can be initialized by a constructor with an [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list). We assume having the following sparse matrix + +\f[ +\left( +\begin{array}{ccccc} + 1 & 0 & 0 & 0 & 0 \\ +-1 & 2 & -1 & 0 & 0 \\ + 0 & -1 & 2 & -1 & 0 \\ + 0 & 0 & -1 & 2 & -1 \\ + 0 & 0 & 0 & -1 & 0 +\end{array} +\right). 
+\f] + +It can be created with the initializer list constructor as we show in the following example: + +\includelineno SparseMatrixExample_Constructor_init_list_2.cpp + +The constructor accepts the following parameters (lines 9-17): + +* `rows` is a number of matrix rows. +* `columns` is a number of matrix columns. +* `data` is a definition of nonzero matrix elements. It is an initializer list of triples having a form `{ row_index, column_index, value }`. In fact, it is very much like the Coordinate format - [COO](https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)). + +The constructor also accepts `Real` and `Index` allocators (\ref TNL::Allocators) but the default ones are used in this example. A method `setElements` (\ref TNL::Matrices::SparseMatrix::setElements) works the same way: + +\includelineno SparseMatrixExample_setElements.cpp + +In this example, we create the matrix in two steps. Firstly, we use a constructor with only matrix dimensions as parameters (line 9) and next we set the matrix elements by `setElements` method (lines 10-15). The result of both examples looks as follows: + +\include SparseMatrixExample_Constructor_init_list_2.out + +#### STL map + +The constructor which creates the sparse matrix from [`std::map`](https://en.cppreference.com/w/cpp/container/map) is useful especially in situations when you cannot estimate the [matrix row capacities](#setting-of-matrix-row-capacities) in advance. You can first store the matrix elements in [`std::map`](https://en.cppreference.com/w/cpp/container/map) data structure in a [COO](https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)) format manner. It means that each entry of the `map` is the following pair: + +``` +std::pair( std::pair( row_index, column_index ), element_value ) +``` + +which defines one matrix element at given coordinates `(row_index,column_index)` with given value (`element_value`). Of course, you can insert such entries into the `map` in arbitrary order. 
When it is complete, you pass the map to the sparse matrix. See the following example: + +\includelineno SparseMatrixExample_Constructor_std_map.cpp + +The method `setElements` (\ref TNL::Matrices::SparseMatrix::setElements) works the same way for already existing instances of sparse matrix: + +\includelineno SparseMatrixExample_setElements_map.cpp + +The result of both examples looks as follows: + +\include SparseMatrixExample_setElements_map.out + +Note, however, that the map can be constructed only on CPU and not on GPU. It requires allocation of additional memory on the host system (CPU) and if the target sparse matrix resides on GPU, the matrix elements must be copied to the GPU. This is the reason why this way of the sparse matrix setup is inefficient compared to other methods. + +#### Methods `setElement` and `addElement` + +Another way of setting the sparse matrix is by means of the methods `setElement` and `addElement` (\ref TNL::Matrices::SparseMatrix::setElement, \ref TNL::Matrices::SparseMatrix::addElement). The method can be called from both host (CPU) and device (GPU) if the matrix is allocated there. Note, however, that if the matrix is allocated on GPU and the methods are called from CPU there will be significant performance drop because the matrix elements will be transferred one-by-one separately. However, if the matrix elements setup is not a critical part of your algorithm this can be an easy way to do it. See the following example: + +\includelineno SparseMatrixViewExample_setElement.cpp + +We first allocate a matrix with five rows (given by the size of the [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list)) and columns, and we set the capacity of each row to one (line 12). The first for-loop (lines 17-19) runs on CPU no matter where the matrix is allocated. 
After printing the matrix (lines 21-22), we call the lambda function `f` (lines 24-26) with a help of `ParallelFor` (\ref TNL::Algorithms::ParallelFor , line 28) which is device sensitive and so it runs on CPU or GPU depending on where the matrix is allocated. The result looks as follows: + +\include SparseMatrixExample_setElement.out + +The method `addElement` (\ref TNL::Matrices::SparseMatrix::addElement) adds a value to specific matrix element. Otherwise, it behaves the same as `setElement`. See the following example: + +\includelineno SparseMatrixExample_addElement.cpp + +The result looks as follows: + +\include SparseMatrixExample_addElement.out + +#### Method `getRow` + +More efficient method, especially for GPUs, is to combine `getRow` (\ref TNL::Matrices::SparseMatrix::getRow) method with `ParallelFor` (\ref TNL::Algorithms::ParallelFor) and lambda function as the following example demonstrates: + +\includelineno SparseMatrixViewExample_getRow.cpp + +On the line 11, we create small matrix having five rows (number of rows is given by the size of the [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list) ) and columns (number of columns is given by the second parameter) and we set each row capacity to one (particular elements of the initializer list). On the line 22, we call `ParallelFor` (\ref TNL::Algorithms::ParallelFor) to iterate over all matrix elements. Each row is processed by the lambda function `f` (lines 14-17). In the lambda function, we first fetch a sparse matrix row (\ref TNL::Matrices::SparseMatrixRowView) which serves for accessing particular matrix rows. This object has a method `setElement` (\ref TNL::Matrices::SparseMatrixRowView::setElement) accepting three parameters: + +1. `localIdx` is a rank of the nonzero element in given matrix row. +2. `columnIdx` is the new column index of the matrix element. +3. `value` is the new value of the matrix element. 
+ +The result looks as follows: + +\include SparseMatrixViewExample_getRow.out + +#### Method `forRows` + +Finally, another efficient way of setting the nonzero matrix elements is the use of the method `forRows` (\ref TNL::Matrices::SparseMatrix::forRows). It requires indexes of the range of rows (`begin` and `end`) to be processed and a lambda function `function` which is called for each nonzero element. The lambda function provides the following data: + +* `rowIdx` is a row index of the matrix element. +* `localIdx` is an index of the nonzero matrix element within the matrix row. +* `columnIdx` is a column index of the matrix element. If the matrix element column index is supposed to be modified, this parameter can be a reference and so its value can be changed. +* `value` is a value of the matrix element. If the matrix element value is supposed to be modified, this parameter can be a reference as well and so the element value can be changed. +* `compute` is a bool reference. When it is set to `false` the rest of the row can be omitted. This is, however, only a hint and it depends on the underlying matrix format if it is taken into account. + +See the following example: + +\includelineno SparseMatrixExample_forRows.cpp + +On the line 9, we allocate a lower triangular matrix by setting the row capacities as `{1,2,3,4,5}`. On the line 11, we prepare a lambda function `f` which we execute on the line 22 just by calling the method `forRows` (\ref TNL::Matrices::SparseMatrix::forRows). This method takes the range of matrix rows as the first two parameters and the lambda function as the last parameter. The lambda function receives parameters mentioned above (see the line 11). We first check if the matrix element coordinates (`rowIdx` and `localIdx`) point to an element lying before the matrix diagonal or on the diagonal (line 12). 
In case of the lower triangular matrix in our example, the local index is in fact the same as the column index + +\f[ +\left( +\begin{array}{ccccc} +0 & . & . & . & . \\ +0 & 1 & . & . & . \\ +0 & 1 & 2 & . & . \\ +0 & 1 & 2 & 3 & . \\ +0 & 1 & 2 & 3 & 4 +\end{array} +\right) +\f] + +If we call the method `forRows` (\ref TNL::Matrices::SparseMatrix::forRows) to setup the matrix elements for the first time, the parameter `columnIdx` has no sense because the matrix elements and their column indexes were not set yet. Therefore it is important that the test on the line 12 reads as + +``` +if( rowIdx < localIdx ) +``` + +because + +``` +if( rowIdx < columnIdx ) +``` + +would not make sense. If we pass through this test, the matrix element lies in the lower triangular part of the matrix and we may set the matrix elements which is done on the lines 17 and 18. The column index (`columnIdx`) is set to local index (line 17) and `value` is set on the line 18. The result looks as follows: + +\include SparseMatrixExample_forRows.out + +### Tridiagonal matrices + +Tridiagonal matrix format serves for specific matrix pattern when the nonzero matrix elements can be placed only at the diagonal and immediately next to the diagonal. Here is an example: + +\f[ +\left( + \begin{array}{ccccccc} + 2 & -1 & . & . & . & . \\ + -1 & 2 & -1 & . & . & . \\ + . & -1 & 2 & -1 & . & . \\ + . & . & -1 & 2 & -1 & . \\ + . & . & . & -1 & 2 & -1 \\ + . & . & . & . & -1 & 2 + \end{array} + \right) +\f] + +An advantage is that we do not store the column indexes explicitly as it is in \ref TNL::Matrices::SparseMatrix. This can reduce significantly the memory requirements which also means better performance. See the following table for the storage requirements comparison between \ref TNL::Matrices::TridiagonalMatrix and \ref TNL::Matrices::SparseMatrix. 
+ + Real | Index | SparseMatrix | TridiagonalMatrix | Ratio + --------|------------|----------------------|---------------------|-------- + float | 32-bit int | 8 bytes per element | 4 bytes per element | 50% + double | 32-bit int | 12 bytes per element | 8 bytes per element | 75% + float | 64-bit int | 12 bytes per element | 4 bytes per element | 30% + double | 64-bit int | 16 bytes per element | 8 bytes per element | 50% + +Tridiagonal matrix is a templated class defined in the namespace \ref TNL::Matrices. It has five template parameters: + +* `Real` is a type of the matrix elements. It is `double` by default. +* `Device` is a device where the matrix shall be allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for GPU supporting CUDA. It is \ref TNL::Devices::Host by default. +* `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. +* `ElementsOrganization` defines the organization of the matrix elements in memory. It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. By default, it is the row-major order if the matrix is allocated in the host system and column major order if it is allocated on GPU. +* `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type -- see \ref TNL::Allocators::Default. + +For better alignment in the memory the tridiagonal format is organized as if there were three nonzero matrix elements in each row. This is not true for example in the first row where there is no matrix element on the left side of the diagonal. The same happens on the last row of the matrix. We have to add even the artificial matrix elements like this: + +\f[ +\begin{array}{c} +0 \\ +. \\ +. \\ +. \\ +. \\ +. 
+\end{array} +\left( + \begin{array}{ccccccc} + 2 & -1 & . & . & . & . \\ + -1 & 2 & -1 & . & . & . \\ + . & -1 & 2 & -1 & . & . \\ + . & . & -1 & 2 & -1 & . \\ + . & . & . & -1 & 2 & -1 \\ + . & . & . & . & -1 & 2 + \end{array} + \right) + \begin{array}{c} +. \\ +. \\ +. \\ +. \\ +. \\ +0 +\end{array} +\f] + +If the tridiagonal matrix has more rows then columns, we have to extend the last two rows with nonzero elements in this way + +\f[ +\left( + \begin{array}{ccccccc} + 2 & -1 & . & . & . & . \\ + -1 & 2 & -1 & . & . & . \\ + . & -1 & 2 & -1 & . & . \\ + . & . & -1 & 2 & -1 & . \\ + . & . & . & -1 & 2 & -1 \\ + . & . & . & . & -1 & 2 \\ + . & . & . & . & . & -1 + \end{array} + \right) +\rightarrow +\begin{array}{c} +0 \\ +. \\ +. \\ +. \\ +. \\ +. \\ +. +\end{array} +\left( + \begin{array}{ccccccc} + 2 & -1 & . & . & . & . \\ + -1 & 2 & -1 & . & . & . \\ + . & -1 & 2 & -1 & . & . \\ + . & . & -1 & 2 & -1 & . \\ + . & . & . & -1 & 2 & -1 \\ + . & . & . & . & -1 & 2 \\ + . & . & . & . & . & -1 + \end{array} + \right) + \begin{array}{cc} +. & . \\ +. & . \\ +. & . \\ +. & . \\ +. & . \\ +0 & . \\ +0 & 0 +\end{array} +\f] + +We also would like to remind the meaning of the local index (`localIdx`) of the matrix element within a matrix row. It is a rank of the nonzero matrix element in given row as we explained in section [Indexing of nonzero matrix elements in sparse matrices](#indexing-of-nonzero-matrix-elements-in-sparse-matrices). The values of the local index for tridiagonal matrix elements are as follows + +\f[ +\left( +\begin{array}{cccccc} +1 & 2 & & & & \\ +0 & 1 & 2 & & & \\ + & 0 & 1 & 2 & & \\ + & & 0 & 1 & 2 & \\ + & & & 0 & 1 & 2 \\ + & & & & 0 & 1 +\end{array} +\right) +\f] + + +In the following text we show different methods for setup of tridiagonal matrices. + +#### Initializer list + +The tridiagonal matrix can be initialized by the means of the constructor with [initializer list](https://en.cppreference.com/w/cpp/utility/initializer_list). 
The matrix from the beginning of this section can be constructed as the following example demonstrates: + +\includelineno TridiagonalMatrixExample_Constructor_init_list_1.cpp + +The matrix element values are defined on lines 39-44. Each matrix row is represented by an embedded initializer list. We set three values in each row including the padding zeros. + +The output of the example looks as follows: + +\include TridiagonalMatrixExample_Constructor_init_list_1.out + +#### Methods `setElement` and `addElement` + +A similar way of the tridiagonal matrix setup is offered by the method `setElements` (\ref TNL::Matrices::TridiagonalMatrix::setElements) as the following example demonstrates: + +\includelineno TridiagonalMatrixExample_setElements.cpp + +Here we create the matrix in two steps. Firstly, we set up the matrix dimensions by the appropriate constructor (line 24) and after that we set the matrix elements (line 25-45). The result looks the same as in the previous example: + +\include TridiagonalMatrixExample_setElements.out + +In the following example we create a tridiagonal matrix with 5 rows and 5 columns (line 12-14) by the means of a shared pointer (\ref TNL::Pointers::SharedPointer) to make this work even on GPU. We set numbers 0,...,4 on the diagonal (line 16) and we print the matrix (line 18). Next we use a lambda function (lines 21-27) combined with parallel for (\ref TNL::Algorithms::ParallelFor) (line 35), to modify the matrix. The offdiagonal elements are set to 1 (lines 23 and 26) and for the diagonal elements, we change the sign (line 24). 
+ +\includelineno TridiagonalMatrixExample_setElement.cpp + +The result looks as follows: + +\include TridiagonalMatrixExample_setElement.out + +#### Method `getRow` + + A slightly different way to do the same is to use the tridiagonal matrix view and the method `getRow` (\ref TNL::Matrices::TridiagonalMatrixView::getRow) as the following example demonstrates: + +\includelineno TridiagonalMatrixViewExample_getRow.cpp + +We create a matrix with the same size (line 10-15). Next, we fetch the tridiagonal matrix view (\ref TNL::Matrices::TridiagonalMatrixView, line 16) which we use in the lambda function for matrix elements modification (lines 18-26). Inside the lambda function, we first get a matrix row by calling the method `getRow` (\ref TNL::Matrices::TridiagonalMatrixView::getRow) using which we can access the matrix elements (lines 21-25). We would like to stress that the method `setElement` addresses the matrix elements with the `localIdx` parameter which is a rank of the nonzero element in the matrix row - see [Indexing of nonzero matrix elements in sparse matrices](#indexing-of-nonzero-matrix-elements-in-sparse-matrices). The lambda function is called by the `ParallelFor` (\ref TNL::Algorithms::ParallelFor). + +The result looks as follows: + +\include TridiagonalMatrixViewExample_getRow.out + +#### Method `forRows` + +Finally, an even simpler way of manipulating the matrix elements with the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) is demonstrated in the following example: + +\includelineno TridiagonalMatrixViewExample_forRows.cpp + +On the line 41, we call the method `forRows` (\ref TNL::Matrices::TridiagonalMatrix::forRows) instead of parallel for (\ref TNL::Algorithms::ParallelFor). This method iterates over all matrix rows and all nonzero matrix elements. 
The lambda function on the line 24 therefore does not receive only the matrix row index but also the local index of the matrix element (`localIdx`) which is a rank of the nonzero matrix element in given row - see [Indexing of nonzero matrix elements in sparse matrices](#indexing-of-nonzero-matrix-elements-in-sparse-matrices). The next parameter received by the lambda function, `columnIdx`, is the column index of the matrix element. The fourth parameter `value` is a reference to the matrix element which we use for its modification. If the last parameter `compute` is set to false, the iteration over the rest of the matrix row is terminated. + +The result looks as follows: + +\include TridiagonalMatrixViewExample_forRows.out + +### Multidiagonal matrices + +Multidiagonal matrices are a generalization of the tridiagonal ones. It is a special type of sparse matrices with a specific pattern of the nonzero matrix elements which are positioned only along lines parallel to the diagonal. See the following example: + +\f[ + \left( + \begin{array}{ccccccc} + 4 & -1 & . & -1 & . & . \\ + -1 & 4 & -1 & . & -1 & . \\ + . & -1 & 4 & -1 & . & -1 \\ + -1 & . & -1 & 4 & -1 & . \\ + . & -1 & . & -1 & 4 & -1 \\ + . & . & -1 & . & -1 & 4 + \end{array} + \right) + \f] + + We can see that the matrix elements lie on lines parallel to the main diagonal. Such lines can be characterized by their offsets from the main diagonal. On the following figure, each such line is depicted in a different color: + + \f[ +\begin{array}{ccc} +\color{green}{-3} & . & \color{cyan}{-1} \\ +\hline + \color{green}{*} & . & \color{cyan}{*} \\ + . & \color{green}{*} & . \\ + . & . & \color{green}{*} \\ + . & . & . \\ + . & . & . \\ + . & . & . +\end{array} +\left( + \begin{array}{ccccccc} + \color{blue}{0} & \color{magenta}{1} & . & \color{red}{3} & . & . \\ + \hline + \color{blue}{4} & \color{magenta}{-1} & . & \color{red}{-1} & . & . \\ + \color{cyan}{-1} & \color{blue}{4} & \color{magenta}{-1} & . & \color{red}{-1} & . \\ + . 
& \color{cyan}{-1} & \color{blue}{4} & \color{magenta}{-1} & . & \color{red}{-1} \\ + \color{green}{-1} & . & \color{cyan}{-1} & \color{blue}{4} & \color{magenta}{-1} & . \\ + . & \color{green}{-1} & . & \color{cyan}{-1} & \color{blue}{4} & \color{magenta}{-1} \\ + . & . & \color{green}{-1} & . & \color{cyan}{-1} & \color{blue}{4} + \end{array} + \right) + \f] + + In this matrix, the offsets reads as \f$\{-3, -1, 0, +1, +3\}\f$. It also means that the column indexes on \f$i-\f$th row are \f$\{i-3, i-1, i, i+1, i+3\}\f$ (where we accept only nonnegative indexes smaller than the number of matrix columns). An advantage is that, similar to the tridiagonal matrix (\ref TNL::Matrices::TridiagonalMatrix), we do not store the column indexes explicitly as it is in \ref TNL::Matrices::SparseMatrix. This can significantly reduce the memory requirements which also means better performance. See the following table for the storage requirements comparison between multidiagonal matrix (\ref TNL::Matrices::MultidiagonalMatrix) and general sparse matrix (\ref TNL::Matrices::SparseMatrix). 
+ + Real | Index | SparseMatrix | MultidiagonalMatrix | Ratio + --------|-----------|----------------------|---------------------|-------- + float | 32-bit int| 8 bytes per element | 4 bytes per element | 50% + double | 32-bit int| 12 bytes per element | 8 bytes per element | 75% + float | 64-bit int| 12 bytes per element | 4 bytes per element | 30% + double | 64-bit int| 16 bytes per element | 8 bytes per element | 50% + + For the sake of better memory alignment and faster access to the matrix elements, we store all subdiagonals in complete form including the elements which are outside the matrix as depicted on the following figure where zeros stand for the padding artificial zero matrix elements + +\f[ +\begin{array}{cccc} +0 & & & 0 \\ + & 0 & & \\ + & & 0 & \\ + & & & 0 \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & +\end{array} +\left( +\begin{array}{cccccccccccccccc} +1 & 0 & & & 0 & & & & & & & & & & & \\ +0 & 1 & 0 & & & 0 & & & & & & & & & & \\ + & 0 & 1 & 0 & & & 0 & & & & & & & & & \\ + & & 0 & 1 & 0 & & & 0 & & & & & & & & \\ +0 & & & 0 & 1 & 0 & & & 0 & & & & & & & \\ + & -1 & & & -1 & 1 & -1 & & & -1 & & & & & & \\ + & & -1 & & & -1 & 1 & -1 & & & -1 & & & & & \\ + & & & 0 & & & 0 & 1 & 0 & & & 0 & & & & \\ + & & & & 0 & & & 0 & 1 & 0 & & & 0 & & & \\ + & & & & & -1 & & & -1 & 1 & -1 & & & -1 & & \\ + & & & & & & -1 & & & -1 & 1 & -1 & & & -1 & \\ + & & & & & & & 0 & & & 0 & 1 & 0 & & & 0 \\ + & & & & & & & & 0 & & & 0 & 1 & 0 & & \\ + & & & & & & & & & 0 & & & 0 & 1 & 0 & \\ + & & & & & & & & & & 0 & & & 0 & 1 & 0 \\ + & & & & & & & & & & & 0 & & & 0 & 1 +\end{array} +\right) +\begin{array} + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ +0 & & & \\ + & 0 & & \\ + & & 0 & \\ +0 & & & 0 +\end{array} +\f] + +Multidiagonal matrix is a templated class defined in the namespace \ref 
TNL::Matrices. It has six template parameters: + +* `Real` is a type of the matrix elements. It is `double` by default. +* `Device` is a device where the matrix shall be allocated. Currently it can be either \ref TNL::Devices::Host for CPU or \ref TNL::Devices::Cuda for CUDA supporting GPUs. It is \ref TNL::Devices::Host by default. +* `Index` is a type to be used for indexing of the matrix elements. It is `int` by default. +* `ElementsOrganization` defines the organization of the matrix elements in memory. It can be \ref TNL::Algorithms::Segments::ColumnMajorOrder or \ref TNL::Algorithms::Segments::RowMajorOrder for column-major and row-major organization respectively. By default, it is the row-major order if the matrix is allocated in the host system and column major order if it is allocated on GPU. +* `RealAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements. By default, it is the default allocator for given `Real` type and `Device` type -- see \ref TNL::Allocators::Default. +* `IndexAllocator` is a memory allocator (one from \ref TNL::Allocators) which shall be used for allocation of the matrix elements offsets. By default, it is the default allocator for given `Index` type and `Device` type -- see \ref TNL::Allocators::Default. + +In the following text we show different methods for setting up multidiagonal matrices. + +#### Initializer list + +Smaller multidiagonal matrices can be created using the constructor with initializer lists (\ref std::initializer_list) as we demonstrate in the following example: + +\includelineno MultidiagonalMatrixExample_Constructor_init_list_2.cpp + +Here, we create a matrix which looks as + +\f[ +\left( +\begin{array}{cccccc} +4 & -1 & & -1 & & \\ +-1 & 4 & -1 & & -1 & \\ + & -1 & 4 & -1 & & -1 \\ +-1 & & -1 & 4 & -1 & \\ + & -1 & & -1 & 4 & -1 \\ + & & -1 & & -1 & 4 \\ +\end{array} +\right). 
+\f] + +On the lines 25-46, we call the constructor which, in addition to matrix dimensions and subdiagonals offsets, accepts also initializer list of initializer lists with matrix elements values. Each embedded list corresponds to one matrix row and it contains values of matrix elements on particular subdiagonals including those which lies out of the matrix. The result looks as follows: + +\include MultidiagonalMatrixExample_Constructor_init_list_2.out + +#### Methods `setElement` and `addElement` + +Another and more efficient way of setting the matrix elements is by means of the method `setElement` (\ref TNL::Matrices::MultidiagonalMatrix::setElement). It is demonstrated in the following example: + +\includelineno MultidiagonalMatrixViewExample_setElement.cpp + +This examples shows that the method `setElement` can be used both on the host (CPU) (line 19) as well as in the lambda functions that can be called from GPU kernels (lines 25-29). For this purpose, we fetch a matrix view on the line 16. The result looks as follows: + +\include MultidiagonalMatrixViewExample_setElement.out + +#### Method `getRow` + +Slightly more efficient way of the multidiagonal matrix setup is offered by the method `getRow` (\ref TNL::Matrices::MultidiagonalMatrix::getRow). We will use it to create a matrix of the following form: + +\f[ +\left( +\begin{array}{cccccccccccccccc} +1 & . & & & . & & & & & & & & & & & \\ +. & 1 & . & & & . & & & & & & & & & & \\ + & . & 1 & . & & & . & & & & & & & & & \\ + & & . & 1 & . & & & . & & & & & & & & \\ +. & & & . & 1 & . & & & . & & & & & & & \\ + & -1 & & & -1 & -4 & -1 & & & -1 & & & & & & \\ + & & -1 & & & -1 & -4 & -1 & & & -1 & & & & & \\ + & & & . & & & . & 1 & . & & & . & & & & \\ + & & & & . & & & . & 1 & . & & & . & & & \\ + & & & & & -1 & & & -1 & -4 & -1 & & & -1 & & \\ + & & & & & & -1 & & & -1 & -4 & -1 & & & -1 & \\ + & & & & & & & . & & & . & 1 & . & & & . \\ + & & & & & & & & . & & & . & 1 & . & & \\ + & & & & & & & & & . & & & . 
& 1 & . & \\ + & & & & & & & & & & . & & & . & 1 & . \\ + & & & & & & & & & & & . & & & . & 1 +\end{array} +\right) +\f] + +The matrices of this form arise from a discretization of the [Laplace operator in 2D](https://en.wikipedia.org/wiki/Discrete_Laplace_operator) by the [finite difference method](https://en.wikipedia.org/wiki/Discrete_Poisson_equation). We use this example because it is a frequent numerical problem and the multidiagonal format is very suitable for such matrices. If the reader, however, is not familiar with the finite difference method, please, do not be scared, we will just create the matrix mentioned above. The code based on use of method `getRow` reads as: + +\includelineno MultidiagonalMatrixExample_Constructor.cpp + +We first compute the matrix size (`matrixSize`) based on the numerical grid dimensions on the line 16. The subdiagonals offsets are defined by the numerical grid size and since it is four in this example the offsets read as \f$\left\{-4,-1,0,1,4 \right\} \f$ or `{ -gridSize, -1, 0, 1, gridSize}` (line 17). Here we store the offsets in a vector (\ref TNL::Containers::Vector) called `offsets`. Next we use a constructor with matrix dimensions and offsets passed via TNL vector (line 18) and we fetch a matrix view (\ref TNL::Matrices::MultidiagonalMatrixView, line 19). + +The matrix is constructed by iterating over particular nodes of the numerical grid. Each node corresponds to one matrix row. This is why the lambda function `f` (lines 20-35) takes two indexes `i` and `j` (line 20). Their values are coordinates of the two-dimensional numerical grid. Based on these coordinates we compute the index (`elementIdx`) of the corresponding matrix row (line 21). We fetch the matrix row (`row`) by calling the `getRow` method (\ref TNL::Matrices::MultidiagonalMatrix::getRow) (line 22). 
Depending on the grid node coordinates we set either the boundary conditions (lines 23-26) for the boundary nodes (those laying on the boundary of the grid and so their coordinates fulfil the condition `i == 0 || j == 0 || i == gridSize - 1 || j == gridSize - 1` ) for which se set only the diagonal element to 1. The inner nodes of the numerical grid are handled on the lines 29-33 where we set coefficients approximating the Laplace operator. We use the method `setElement` of the matrix row (\ref TNL::Matrices::MultidiagonalMatrixRow::setElement) which takes the local index of the nonzero matrix element as the first parameter (see [Indexing of nonzero matrix elements in sparse matrices](#indexing-of-nonzero-matrix-elements-in-sparse-matrices)) and the new value of the element as the second parameter. The local indexes, in fact, refer to particular subdiagonals as depicted on the following figure (in blue): + +\f[ +\begin{array}{cccc} +\color{blue}{-4} & & & \color{blue}{-1} \\ +\hline +. & & & . \\ + & . & & \\ + & & . & \\ + & & & . \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & \\ + & & & +\end{array} +\left( +\begin{array}{cccccccccccccccc} +\color{blue}{0} & \color{blue}{1} & & & \color{blue}{4} & & & & & & & & & & & \\ +\hline +1 & . & & & . & & & & & & & & & & & \\ +. & 1 & . & & & . & & & & & & & & & & \\ + & . & 1 & . & & & . & & & & & & & & & \\ + & & . & 1 & . & & & . & & & & & & & & \\ +. & & & . & 1 & . & & & . & & & & & & & \\ + & -1 & & & -1 & 1 & -1 & & & -1 & & & & & & \\ + & & -1 & & & -1 & 1 & -1 & & & -1 & & & & & \\ + & & & . & & & . & 1 & . & & & . & & & & \\ + & & & & . & & & . & 1 & . & & & . & & & \\ + & & & & & -1 & & & -1 & 1 & -1 & & & -1 & & \\ + & & & & & & -1 & & & -1 & 1 & -1 & & & -1 & \\ + & & & & & & & . & & & . & 1 & . & & & . \\ + & & & & & & & & . & & & . & 1 & . & & \\ + & & & & & & & & & . & & & . & 1 & . & \\ + & & & & & & & & & & . & & & . & 1 & . 
\\ + & & & & & & & & & & & . & & & . & 1 +\end{array} +\right) +\f] + +We use `ParallelFor2D` (\ref TNL::Algorithms::ParallelFor2D) to iterate over all nodes of the numerical grid (line 36) and apply the lambda function. The result looks as follows: + +\include MultidiagonalMatrixExample_Constructor.out + +#### Method `forRows` + +Similar and even a bit simpler way of setting the matrix elements is offered by the method `forRows` (\ref TNL::Matrices::MultidiagonalMatrix::forRows, \ref TNL::Matrices::MultidiagonalMatrixView::forRows) as demonstrated in the following example: + +\includelineno MultidiagonalMatrixViewExample_forRows.cpp + +In this case, we need to provide a lambda function `f` (lines 27-43) which is called for each matrix row just by the method `forRows` (line 44). The lambda function `f` provides the following parameters: + +* `rowIdx` is an index of the matrix row. +* `localIdx` is an index of the matrix subdiagonal. +* `columnIdx` is a column index of the matrix element. +* `value` is a reference to the matrix element value. It can be used even for changing the value. +* `compute` is a reference to a boolean. If it is set to false, the iteration over the matrix row can be stopped. + +In this example, the matrix element value depends only on the subdiagonal index `localIdx` (see [Indexing of nonzero matrix elements in sparse matrices](#indexing-of-nonzero-matrix-elements-in-sparse-matrices)) as we can see on the line 42. The result looks as follows: + +\include MultidiagonalMatrixExample_forRows.out + +### Lambda matrices + +Lambda matrix (\ref TNL::Matrices::LambdaMatrix) is a special type of matrix which could be also called *matrix-free matrix*. The matrix elements are not stored in memory explicitly but they are evaluated **on-the-fly** by means of user defined lambda functions. 
If the matrix elements can be expressed by a computationally inexpensive formula, we can significantly reduce the memory consumption which is appreciated especially on GPUs. Since the memory accesses are quite expensive on both CPU and GPU, we can get, at the end, even much faster code. + +The lambda matrix (\ref TNL::Matrices::LambdaMatrix) is a templated class with the following template parameters: + +* `MatrixElementsLambda` is a lambda function which evaluates the matrix elements values and column indexes. +* `CompressedRowLengthsLambda` is a lambda function telling how many nonzero elements are there in given matrix row. +* `Real` is a type of matrix elements values. +* `Device` is a device on which the lambda functions mentioned above will be evaluated. +* `Index` is a type to be used for indexing. + +The lambda function `MatrixElementsLambda` is supposed to have the following declaration: + +\includelineno snippet_MatrixElementsLambda_declaration.cpp + +where the particular parameters have the following meaning: + +* `rows` tells the number of matrix rows. +* `columns` tells the number of matrix columns. +* `rowIdx` is an index of the matrix row in which we are supposed to evaluate the matrix element. +* `localIdx` is a rank of the nonzero matrix element, see [Indexing of nonzero matrix elements in sparse matrices](#indexing-of-nonzero-matrix-elements-in-sparse-matrices). +* `columnIdx` is a reference to a variable where we are supposed to store the matrix element column index. +* `value` is a reference to a variable where we are supposed to store the matrix element value. + +The lambda function `CompressedRowLengthsLambda` (by compressed row length we mean the number of matrix elements in a row after ignoring/compressing the zero elements) is supposed to be declared like this: + +\includelineno snippet_CompressedRowLengthsLambda_declaration.cpp + +where the parameters can be described as follows: + +* `rows` tells the number of matrix rows. 
+* `columns` tells the number of matrix columns. +* `rowIdx` is an index of the matrix row for which we are supposed to evaluate the number of nonzero matrix elements. + +The lambda function is supposed to return just the number of the nonzero matrix elements in given matrix row. + +#### Lambda matrix initialization + +How to put the lambda functions together with the lambda matrix is demonstrated in the following example: + +\includelineno LambdaMatrixExample_Constructor.cpp + +Here we create two simple diagonal matrices. Therefore they share the same lambda function `compressedRowLengths` telling the number of nonzero matrix elements in particular matrix rows which is always one (line 9). The first matrix, defined by the lambda function `matrixElements1`, is the identity matrix and so each of its diagonal elements equals one. We set the matrix element value to `1.0` (line 12) and the column index equals the row index (line 15). The second matrix, defined by the lambda function `matrixElements2`, is also diagonal but not the identity matrix. The values of the diagonal elements equal the row index (line 16). + +With the same lambda functions we can define matrices with different dimensions. In this example, we set the matrix size to five (line 19). It could be quite difficult to express the lambda matrix type because it depends on the types of the lambda functions. To make this easier, one may use the lambda-matrix factory (\ref TNL::Matrices::LambdaMatrixFactory). Using `decltype` one can deduce even the matrix type (line 24) followed by calling lambda matrix constructor with matrix dimensions and instances of the lambda functions (line 25). Or one can just simply employ the keyword `auto` (line 30) followed by setting the matrix dimensions (line 31). + +The result looks as follows: + +\include LambdaMatrixExample_Constructor.out + +#### Method `forRows` + +The lambda matrix has the same interface as other matrix types except for the method `getRow`. 
The following example demonstrates the use of the method `forRows` (\ref TNL::Matrices::LambdaMatrix::forRows) to copy the lambda matrix into the dense matrix: + +\includelineno LambdaMatrixExample_forRows.cpp + +Here, we treat the lambda matrix as if it was dense matrix and so the lambda function `compressedRowLengths` returns the number of the nonzero elements equal to the number of matrix columns (line 13). However, the lambda function `matrixElements` (lines 14-17), sets nonzero values only to lower triangular part of the matrix. The elements in the upper part are equal to zero (line 16). Next we create an instance of the lambda matrix with a help of the lambda matrix factory (\ref TNL::Matrices::LambdaMatrixFactory) (lines 19-20) and an instance of the dense matrix (\ref TNL::Matrices::DenseMatrix) (lines 22-23). + +Next we call the lambda function `f` by the method `forRows` (\ref TNL::Matrices::LambdaMatrix::forRows) to set the matrix elements of the dense matrix `denseMatrix` (line 26) via the dense matrix view (`denseView`) (\ref TNL::Matrices::DenseMatrixView). Note, that in the lambda function `f` we get the matrix element value already evaluated in the variable `value` as we are used to from other matrix types. So in fact, the same lambda function `f` would do the same job even for sparse matrix or any other. Also note, that in this case we iterate even over all zero matrix elements because the lambda function `compressedRowLengths` (line 13) tells so. 
The result looks as follows: + +\include LambdaMatrixExample_forRows.out + +At the end of this part, we show two more examples of how to express a matrix approximating the Laplace operator: + +\includelineno LambdaMatrixExample_Laplace.cpp + +The following is another way of doing the same but with precomputed supporting vectors: + +\includelineno LambdaMatrixExample_Laplace_2.cpp + +The result of both examples looks as follows: + +\include LambdaMatrixExample_Laplace.out + +### Distributed matrices + +TODO: Write documentation on distributed matrices. + +## Flexible reduction in matrix rows + +Flexible reduction in matrix rows is a powerful tool for many different matrix operations. It is represented by the method `rowsReduction` (\ref TNL::Matrices::DenseMatrix::rowsReduction, +\ref TNL::Matrices::SparseMatrix::rowsReduction, \ref TNL::Matrices::TridiagonalMatrix::rowsReduction, \ref TNL::Matrices::MultidiagonalMatrix::rowsReduction, \ref TNL::Matrices::LambdaMatrix::rowsReduction) and similar to the method `forRows` it iterates over particular matrix rows. However, it performs *flexible parallel reduction* in addition. For example, the matrix-vector product can be seen as a reduction of products of matrix elements with the input vector in particular matrix rows. The first element of the result vector is obtained as: + +\f[ +y_1 = a_{11} x_1 + a_{12} x_2 + \ldots + a_{1n} x_n = \sum_{j=1}^n a_{1j}x_j +\f] + +and in general i-th element of the result vector is computed as + +\f[ +y_i = a_{i1} x_1 + a_{i2} x_2 + \ldots + a_{in} x_n = \sum_{j=1}^n a_{ij}x_j. +\f] + +We see that in i-th matrix row we have to compute the sum \f$\sum_{j=1}^n a_{ij}x_j\f$ which is a reduction of the products \f$ a_{ij}x_j\f$. Similar to flexible parallel reduction (\ref TNL::Algorithms::Reduction) we just need to design proper lambda functions. There are three of them. + +1. `fetch` reads and preprocesses data entering the flexible parallel reduction. +2. 
`reduce` performs the reduction operation. +3. `keep` stores the results from each matrix row. + +#### Lambda function fetch + +This lambda function has the same purpose as the lambda function `fetch` in flexible parallel reduction for arrays and vectors (see [Flexible Parallel Reduction](tutorial_ReductionAndScan.html#flexible_parallel_reduction)). It is supposed to be declared as follows: + +\includelineno snippet_rows_reduction_fetch_declaration.cpp + +The meaning of the particular parameters is as follows: + +1. `rowIdx` is the row index of the matrix element. +2. `columnIdx` is the column index of the matrix element. +3. `value` is the value of the matrix element. + +The lambda function returns a value of type `Real` based on the input data. + +#### Lambda function reduce + +The lambda function `reduce` expresses the reduction operation (sum, product, minimum, maximum etc.) which is supposed to be done during the flexible reduction. + +\includelineno snippet_rows_reduction_reduce_declaration.cpp + +The meaning of the particular parameters is as follows: + +1. `a` is the first operand for the reduction operation. +2. `b` is the second operand for the reduction operation. + +#### Lambda function keep + +The lambda function `keep` is a new one compared to the flexible reduction for arrays, vectors or other linear structures. The reason is that the result consists of as many numbers as there are matrix rows. Result obtained for each matrix row is processed by this lambda function. It is declared as follows: + +\includelineno snippet_rows_reduction_keep_declaration.cpp + +The meaning of the particular parameters is as follows: + +1. `rowIdx` is an index of the matrix row related to given result of flexible reduction. +2. `value` is the result of the flexible reduction in given matrix row. 
+ +The method `rowsReduction` (\ref TNL::Matrices::DenseMatrix::rowsReduction, \ref TNL::Matrices::SparseMatrix::rowsReduction, \ref TNL::Matrices::TridiagonalMatrix::rowsReduction, \ref TNL::Matrices::MultidiagonalMatrix::rowsReduction, \ref TNL::Matrices::LambdaMatrix::rowsReduction) accepts the following arguments: + +1. `begin` is the beginning of the matrix rows range on which the reduction will be performed. +2. `end` is the end of the matrix rows range on which the reduction will be performed. The last matrix row which is going to be processed has index `end-1`. +3. `fetch` is the lambda function for data fetching. +4. `reduce` is the lambda function performing the reduction. +5. `keep` is the lambda function responsible for processing the results from particular matrix rows. +6. `zero` is the "zero" element of given reduction operation also known as *idempotent*. + +Though the interface is the same for all matrix types, in the following part we will show several examples for different matrix types to better demonstrate possible ways of use of the flexible reduction for matrices. + +### Dense matrices example + +The following example demonstrates implementation of the dense matrix-vector product \f$ {\bf y} = A \vec {\bf x}\f$, i.e. + +\f[ + y_i = \sum_{j=0}^{columns - 1} a_{ij} x_j \text{ for } i = 0, \ldots, rows-1. +\f] + +\includelineno DenseMatrixExample_rowsReduction_vectorProduct.cpp + +We set the following lambda functions: + +* `fetch` lambda function computes the product \f$ a_{ij}x_j\f$ where \f$ a_{ij} \f$ is represented by `value` and \f$x_j \f$ is represented by `xView[columnIdx]` (line 40). +* `reduce` - reduction is just sum of particular products and it is represented by \ref std::plus (line 53). +* `keep` is responsible for storing the results of reduction in each matrix row (which is represented by the variable `value`) into the output vector `y`. 
+ +The result looks as: + +\include DenseMatrixExample_rowsReduction_vectorProduct.out + +We will show one more example which is a computation of maximal absolute value in each matrix row. The results will be stored in a vector: + +\f[ +y_i = \max_{j=1,\ldots,n} |a_{ij}|. +\f] + +See the following example: + +\includelineno DenseMatrixExample_rowsReduction_maxNorm.cpp + +The lambda functions are: + +* `fetch` lambda function just returns absolute value of \f$a_{ij} \f$ which is represented again by the variable `value`. +* `reduce` lambda function returns larger of given values. +* `keep` stores the results to the output vector the same way as in the previous example. + +Note, that the idempotent value for the reduction is \ref std::numeric_limits< double >::lowest. Of course, if we compute the maximum of all output vector elements, we get some kind of maximal matrix norm. The output looks as: + +\include DenseMatrixExample_rowsReduction_maxNorm.out + +### Sparse matrices example + +The following example demonstrates sparse matrix-vector product: + +\includelineno SparseMatrixExample_rowsReduction_vectorProduct.cpp + +On the lines 11-16 we set the following matrix: + +\f[ +\left( +\begin{array}{ccccc} +1 & . & . & . & . \\ +1 & 2 & . & . & . \\ +. & 1 & 8 & . & . \\ +. & . & 1 & 9 & . \\ +. & . & . & . & 1 +\end{array} +\right) +\f] + +The lambda functions on the lines 39-48 are the same as in the example with the dense matrix. The result looks as follows: + +\include SparseMatrixExample_rowsReduction_vectorProduct.out + +### Tridiagonal matrices example + +In this example, we will compute maximal absolute value in each row of the following tridiagonal matrix: + +\f[ +\left( +\begin{array}{ccccc} +1 & 3 & & & & \\ +2 & 1 & 3 & & & \\ + & 2 & 1 & 3 & & \\ + & & 2 & 1 & 3 & \\ + & & & 2 & 1 & 3 +\end{array} +\right). 
+\f] + +The source code reads as follows: + +\includelineno TridiagonalMatrixExample_rowsReduction.cpp + +Here we first set the tridiagonal matrix (lines 10-27). Next we allocate the vector `rowMax` where we will store the results (line 32). The lambda function are: + +* `fetch` (lines 42-44) is responsible for reading the matrix elements. In our example, the only thing this function has to do, is to compute the absolute value of each matrix element represented by variable `value`. +* `reduce` (lines 49-51), performs reduction operation. In this case, it returns maximum of two input values `a` and `b`. +* `keep` (lines 56-58) takes the result of the reduction in variable `value` in each row and stores it into the vector `rowMax` via related vector view `rowMaxView`. + +Note, that the idempotent value for the reduction is \ref std::numeric_limits< double >::lowest. The results looks as follows: + +\include TridiagonalMatrixExample_rowsReduction.out + +### Multidiagonal matrices example + +The next example computes again the maximal absolute value in each row. Now, we do it for multidiagonal matrix the following form: + +\f[ +\left( +\begin{array}{ccccc} +1 & & & & \\ +2 & 1 & & & \\ +3 & 2 & 1 & & \\ + & 3 & 2 & 1 & \\ + & & 3 & 2 & 1 +\end{array} +\right) +\f] + +We first create vector `rowMax` into which we will store the results and fetch it view `rowMaxView` (line 39). Next we prepare necessary lambda functions: + +* `fetch` (lines 44-46) is responsible for reading the matrix element value which is stored in the constant reference `value` and for returning its absolute value. The other parameters `rowIdx` and `columnIdx` correspond to row and column indexes respectively and they are omitted in our example. +* `reduce` (lines 51-53) returns maximum value of the two input values `a` and `b`. +* `keep` (line 58-60) stores the input `value` at the corresponding position, given by the row index `rowIdx`, in the output vector view `rowMaxView`. 
+ +Finally, we call the method `rowsReduction` (\ref TNL::Matrices::MultidiagonalMatrix::rowsReduction) with parameters telling the interval of rows to be processed (the first and second parameter), the lambda functions `fetch`, `reduce` and `keep`, and the idempotent element for the reduction operation which is the lowest number of given type (\ref std::numeric_limits< double >::lowest ). The result looks as follows: + +\include MultidiagonalMatrixExample_rowsReduction.out + +### Lambda matrices example + +The reduction of matrix rows is available for the lambda matrices as well. See the following example: + +\includelineno LambdaMatrixExample_rowsReduction.cpp + +On the lines 14-21, we create the lower triangular lambda matrix which looks as follows: + +\f[ +\left( +\begin{array}{ccccc} +1 & & & & \\ +2 & 1 & & & \\ +3 & 2 & 1 & & \\ +4 & 3 & 2 & 1 & \\ +5 & 4 & 3 & 2 & 1 +\end{array} +\right) +\f] + +We want to compute maximal absolute value of matrix elements in each row. For this purpose we define well known lambda functions: + +* `fetch` takes the value of the lambda matrix element and returns its absolute value. +* `reduce` computes maximum value of two input variables. +* `keep` stores the results into output vector `rowMax`. + +Note that the interface of the lambda functions is the same as for other matrix types. The result looks as follows: + +\include LambdaMatrixExample_rowsReduction.out + +## Matrix-vector product + +One of the most important matrix operation is the matrix-vector multiplication. It is represented by a method `vectorProduct` (\ref TNL::Matrices::DenseMatrix::vectorProduct, \ref TNL::Matrices::SparseMatrix::vectorProduct, \ref TNL::Matrices::TridiagonalMatrix::vectorProduct, \ref TNL::Matrices::MultidiagonalMatrix::vectorProduct, \ref TNL::Matrices::LambdaMatrix::vectorProduct). It is templated method with two template parameters `InVector` and `OutVector` telling the types of input and output vector respectively. 
Usually one will substitute some of \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or \ref TNL::Containers::VectorView for these types. The method accepts the following parameters: + +1. `inVector` is the input vector having the same number of elements as the number of matrix columns. +2. `outVector` is the output vector having the same number of elements as the number of matrix rows. +3. `matrixMultiplicator` is a number by which the result of matrix-vector product is multiplied. +4. `outVectorMultiplicator` is a number by which the output vector is multiplied before added to the result of matrix-vector product. +5. `begin` is the beginning of the matrix rows range on which we compute the matrix-vector product. +6. `end` is the end of the matrix rows range on which the matrix-vector product will be evaluated. The last matrix row which is going to be processed has index `end-1`. + +Note that the output vector dimension must be the same as the number of matrix rows no matter how we set `begin` and `end` parameters. These parameters just say that some matrix rows and the output vector elements are omitted. + +To summarize, this method computes the following formula: + +`outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector.` + +## Matrix I/O operations + +All matrices can be saved to a file using a method `save` (\ref TNL::Matrices::DenseMatrix::save, \ref TNL::Matrices::SparseMatrix::save, \ref TNL::Matrices::TridiagonalMatrix::save, \ref TNL::Matrices::MultidiagonalMatrix::save, \ref TNL::Matrices::LambdaMatrix::save) and restored with a method `load` (\ref TNL::Matrices::DenseMatrix::load, \ref TNL::Matrices::SparseMatrix::load, \ref TNL::Matrices::TridiagonalMatrix::load, \ref TNL::Matrices::MultidiagonalMatrix::load, \ref TNL::Matrices::LambdaMatrix::load). 
To print the matrix, there is a method `print` (\ref TNL::Matrices::DenseMatrix::print, \ref TNL::Matrices::SparseMatrix::print, \ref TNL::Matrices::TridiagonalMatrix::print, \ref TNL::Matrices::MultidiagonalMatrix::print, \ref TNL::Matrices::LambdaMatrix::print) can be used. TNL also offers matrix reader (\ref TNL::Matrices::MatrixReader) for import of matrices. We describe it in the following sections. + +### Matrix reader + +### Benchmark of dense matrix setup + +\includelineno DenseMatrixSetup_Benchmark.cpp + +### Benchmark of sparse matrix setup + +\includelineno SparseMatrixSetup_Benchmark.cpp + +### Benchmark of multidiagonal matrix setup + +\includelineno MultidiagonalMatrixSetup_Benchmark.cpp + + + diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md index 132f30799c7cdee942ee4c26dd3ed721fa70b4a3..56a51cc2234f964866bdb7ae3d2f07e03851ebea 100644 --- a/Documentation/Tutorials/index.md +++ b/Documentation/Tutorials/index.md @@ -2,8 +2,11 @@ ## Tutorials -1. [Arrays](tutorial_Arrays.html) -2. [Vectors](tutorial_Vectors.html) -3. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html) -4. [For loops](tutorial_ForLoops.html) -5. [Cross-device pointers](tutorial_Pointers.html) +1. [Building applications with TNL](tutorial_building_applications_with_tnl.html) +2. [General concepts](tutorial_GeneralConcepts.html) +3. [Arrays](tutorial_Arrays.html) +4. [Vectors](tutorial_Vectors.html) +5. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html) +6. [For loops](tutorial_ForLoops.html) +7. [Cross-device pointers](tutorial_Pointers.html) +8. 
[Matrices](tutorial_Matrices.html) diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h index d77b821be92a38807bbb9fd0668f7eda7d08c689..32c4678d045bb3a7892f2f5fdc6c39b90b22ba40 100644 --- a/src/TNL/Matrices/DenseMatrix.h +++ b/src/TNL/Matrices/DenseMatrix.h @@ -616,6 +616,10 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator > * is computed. It is zero by default. * \param end is the end of the rows range for which the vector product * is computed. It is number if the matrix rows by default. + * + * Note that the ouput vector dimension must be the same as the number of matrix rows + * no matter how we set `begin` and `end` parameters. These parameters just say that + * some matrix rows and the output vector elements are omitted. */ template< typename InVector, typename OutVector > void vectorProduct( const InVector& inVector, diff --git a/src/TNL/Matrices/DenseMatrixRowView.h b/src/TNL/Matrices/DenseMatrixRowView.h index 498ec84f113a67e0f805b83c5597ed014d1606a9..49774949b0d31ac2d7a50c8a795d12c21a8d178d 100644 --- a/src/TNL/Matrices/DenseMatrixRowView.h +++ b/src/TNL/Matrices/DenseMatrixRowView.h @@ -15,17 +15,17 @@ namespace TNL { /** * \brief RowView is a simple structure for accessing rows of dense matrix. - * + * * \tparam SegmentView is a segment view of segments representing the matrix format. * \tparam ValuesView is a vector view storing the matrix elements values. - * + * * See \ref DenseMatrix and \ref DenseMatrixView. - * + * * \par Example * \include Matrices/DenseMatrix/DenseMatrixExample_getRow.cpp * \par Output * \include DenseMatrixExample_getRow.out - * + * * \par Example * \include Matrices/DenseMatrix/DenseMatrixViewExample_getRow.cpp * \par Output @@ -59,7 +59,7 @@ class DenseMatrixRowView /** * \brief Constructor with \e segmentView and \e values - * + * * \param segmentView instance of SegmentViewType representing matrix row. * \param values is a container view for storing the matrix elements values. 
*/ @@ -69,7 +69,7 @@ class DenseMatrixRowView /** * \brief Returns size of the matrix row, i.e. number of matrix elements in this row. - * + * * \return Size of the matrix row. */ __cuda_callable__ @@ -77,9 +77,9 @@ class DenseMatrixRowView /** * \brief Returns constants reference to an element with given column index. - * + * * \param column is column index of the matrix element. - * + * * \return constant reference to the matrix element. */ __cuda_callable__ @@ -87,9 +87,9 @@ class DenseMatrixRowView /** * \brief Returns non-constants reference to an element with given column index. - * + * * \param column is a column index of the matrix element. - * + * * \return non-constant reference to the matrix element. */ __cuda_callable__ @@ -104,6 +104,20 @@ class DenseMatrixRowView __cuda_callable__ void setElement( const IndexType column, const RealType& value ); + + /** + * \brief Sets value of matrix element with given column index + * + * The \e localIdx parameter is here only for compatibility with + * the sparse matrices and it is omitted. + * + * \param column is a column index of the matrix element. + * \param value is a value the matrix element will be set to. 
+ */ + __cuda_callable__ + void setElement( const IndexType localIdx, + const IndexType column, + const RealType& value ); protected: SegmentViewType segmentView; diff --git a/src/TNL/Matrices/DenseMatrixRowView.hpp b/src/TNL/Matrices/DenseMatrixRowView.hpp index 9ca725396d2c4a0524f9b268cbce900d8e660daf..1c7af4adf97ee607163c83468ce0f067d7f46018 100644 --- a/src/TNL/Matrices/DenseMatrixRowView.hpp +++ b/src/TNL/Matrices/DenseMatrixRowView.hpp @@ -56,7 +56,7 @@ getElement( const IndexType column ) -> RealType& template< typename SegmentView, typename ValuesView > -__cuda_callable__ void +__cuda_callable__ void DenseMatrixRowView< SegmentView, ValuesView >:: setElement( const IndexType column, const RealType& value ) @@ -66,6 +66,18 @@ setElement( const IndexType column, values[ globalIdx ] = value; } +template< typename SegmentView, + typename ValuesView > +__cuda_callable__ void +DenseMatrixRowView< SegmentView, ValuesView >:: +setElement( const IndexType localIdx, + const IndexType column, + const RealType& value ) +{ + TNL_ASSERT_LT( column, this->getSize(), "Column index exceeds matrix row size." ); + const IndexType globalIdx = segmentView.getGlobalIndex( column ); + values[ globalIdx ] = value; +} } // namespace Matrices } // namespace TNL diff --git a/src/TNL/Matrices/DenseMatrixView.h b/src/TNL/Matrices/DenseMatrixView.h index b28817a20e168b09cf1f3130313bd8ef0a920bf7..2cf97177123734dafb8608be1f0c665f2e45b536 100644 --- a/src/TNL/Matrices/DenseMatrixView.h +++ b/src/TNL/Matrices/DenseMatrixView.h @@ -29,8 +29,9 @@ namespace Matrices { * \tparam Real is a type of matrix elements. * \tparam Device is a device where the matrix is allocated. * \tparam Index is a type for indexing of the matrix elements. - * \tparam MatrixElementsOrganization tells the ordering of matrix elements. It is either RowMajorOrder - * or ColumnMajorOrder. + * \tparam MatrixElementsOrganization tells the ordering of matrix elements in memory. 
It is either + * \ref TNL::Algorithms::Segments::RowMajorOrder + * or \ref TNL::Algorithms::Segments::ColumnMajorOrder. * * See \ref DenseMatrix. */ @@ -564,6 +565,10 @@ class DenseMatrixView : public MatrixView< Real, Device, Index > * is computed. It is zero by default. * \param end is the end of the rows range for which the vector product * is computed. It is number if the matrix rows by default. + * + * Note that the ouput vector dimension must be the same as the number of matrix rows + * no matter how we set `begin` and `end` parameters. These parameters just say that + * some matrix rows and the output vector elements are omitted. */ template< typename InVector, typename OutVector > void vectorProduct( const InVector& inVector, diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp index d7a781e20fc93054ac4f1eb80677741b4c0ab7ae..c8645b13b5262fc4de238e3fe59c07126bf220df 100644 --- a/src/TNL/Matrices/DenseMatrixView.hpp +++ b/src/TNL/Matrices/DenseMatrixView.hpp @@ -678,7 +678,8 @@ void DenseMatrixView< Real, Device, Index, Organization >::print( std::ostream& str_ << std::setw( 4 ) << std::right << column << ":" << std::setw( 4 ) << std::left << this->getElement( row, column ); str << std::setw( 10 ) << str_.str(); } - str << std::endl; + if( row < this->getRows() - 1 ) + str << std::endl; } } diff --git a/src/TNL/Matrices/LambdaMatrix.h b/src/TNL/Matrices/LambdaMatrix.h index 1692510e70ecfeb0aa3e5365d7535db47c221599..27ba94cea3fa926e56565bfb63dd57fa1505558b 100644 --- a/src/TNL/Matrices/LambdaMatrix.h +++ b/src/TNL/Matrices/LambdaMatrix.h @@ -10,6 +10,7 @@ #pragma once +#include #include #include @@ -17,31 +18,31 @@ namespace TNL { namespace Matrices { /** - * \brief "Matrix-free" matrix based on lambda functions. - * + * \brief "Matrix-free matrix" based on lambda functions. + * * The elements of this matrix are not stored explicitly in memory but * implicitly on a form of lambda functions. 
- * + * * \tparam MatrixElementsLambda is a lambda function returning matrix elements values and positions. - * + * * It has the following form: - * - * `matrixElements( IndexType rows, IndexType columns, IndexType row, IndexType localIdx, IndexType& elementColumn, RealType& elementValue )` - * - * where \e rows is the number of matrix rows, \e columns is the number of matrix columns, \e row is the index of matrix row being queried, - * \e localIdx is the rank of the non-zero element in given row, \e elementColumn is a column index of the matrix element computed by - * this lambda and \e elementValue is a value of the matrix element computed by this lambda. + * + * `matrixElements( Index rows, Index columns, Index rowIdx, Index localIdx, Index& columnIdx, Real& value )` + * + * where \e rows is the number of matrix rows, \e columns is the number of matrix columns, \e rowIdx is the index of matrix row being queried, + * \e localIdx is the rank of the non-zero element in given row, \e columnIdx is a column index of the matrix element computed by + * this lambda and \e value is a value of the matrix element computed by this lambda. * \tparam CompressedRowLengthsLambda is a lambda function returning a number of non-zero elements in each row. - * + * * It has the following form: - * - * `rowLengths( IndexType rows, IndexType columns, IndexType row ) -> IndexType` - * - * where \e rows is the number of matrix rows, \e columns is the number of matrix columns and \e row is an index of the row being queried. + * + * `rowLengths( Index rows, Index columns, Index rowIdx ) -> IndexType` + * + * where \e rows is the number of matrix rows, \e columns is the number of matrix columns and \e rowIdx is an index of the row being queried. * * \tparam Real is a type of matrix elements values. - * \tparam Device is a device on which the lambda functions will be evaluated. - * \ẗparam Index is a type used for indexing. 
+ * \tparam Device is a device on which the lambda functions will be evaluated. + * \ẗparam Index is a type to be used for indexing. */ template< typename MatrixElementsLambda, typename CompressedRowLengthsLambda, @@ -72,13 +73,13 @@ class LambdaMatrix /** * \brief Constructor with lambda functions defining the matrix elements. - * + * * Note: It might be difficult to express the types of the lambdas. For easier creation of * \e LambdaMatrix you may use \ref LambdaMatrixFactory. - * + * * \param matrixElements is a lambda function giving matrix elements position and value. * \param compressedRowLentghs is a lambda function returning how many non-zero matrix elements are in given row. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp * \par Output @@ -89,15 +90,15 @@ class LambdaMatrix /** * \brief Constructor with matrix dimensions and lambda functions defining the matrix elements. - * + * * Note: It might be difficult to express the types of the lambdas. For easier creation of * \e LambdaMatrix you may use \ref LambdaMatrixFactory. - * + * * \param rows is a number of the matrix rows. * \param columns is a number of the matrix columns. * \param matrixElements is a lambda function giving matrix elements position and value. * \param compressedRowLentghs is a lambda function returning how many non-zero matrix elements are in given row. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp * \par Output @@ -110,21 +111,21 @@ class LambdaMatrix /** * \brief Copy constructor. - * + * * \param matrix is input matrix. */ LambdaMatrix( const LambdaMatrix& matrix ) = default; /** * \brief Move constructor. - * + * * \param matrix is input matrix. */ LambdaMatrix( LambdaMatrix&& matrix ) = default; /** * \brief Set number of rows and columns of this matrix. - * + * * \param rows is the number of matrix rows. * \param columns is the number of matrix columns. 
*/ @@ -133,7 +134,7 @@ class LambdaMatrix /** * \brief Returns a number of matrix rows. - * + * * \return number of matrix rows. */ __cuda_callable__ @@ -141,7 +142,7 @@ class LambdaMatrix /** * \brief Returns a number of matrix columns. - * + * * \return number of matrix columns. */ __cuda_callable__ @@ -149,10 +150,10 @@ class LambdaMatrix /** * \brief Computes number of non-zeros in each row. - * + * * \param rowLengths is a vector into which the number of non-zeros in each row * will be stored. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_getCompressedRowLengths.cpp * \par Output @@ -163,9 +164,9 @@ class LambdaMatrix /** * \brief Returns number of non-zero matrix elements. - * + * * \return number of all non-zero matrix elements. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_getElementsCount.cpp * \par Output @@ -175,10 +176,10 @@ class LambdaMatrix /** * \brief Returns value of matrix element at position given by its row and column index. - * + * * \param row is a row index of the matrix element. * \param column i a column index of the matrix element. - * + * * \return value of given matrix element. */ RealType getElement( const IndexType row, @@ -186,7 +187,7 @@ class LambdaMatrix /** * \brief Method for performing general reduction on matrix rows. - * + * * \tparam Fetch is a type of lambda function for data fetch declared as * `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`. * The return type of this lambda can be any non void. @@ -195,14 +196,14 @@ class LambdaMatrix * \tparam Keep is a type of lambda function for storing results of reduction in each row. * It is declared as `keep( const IndexType rowIdx, const double& value )`. * \tparam FetchValue is type returned by the Fetch lambda function. - * + * * \param begin defines beginning of the range [begin,end) of rows to be processed. 
* \param end defines ending of the range [begin,end) of rows to be processed. * \param fetch is an instance of lambda function for data fetch. * \param reduce is an instance of lambda function for reduction. * \param keep in an instance of lambda function for storing results. * \param zero is zero of given reduction operation also known as idempotent element. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_rowsReduction.cpp * \par Output @@ -213,7 +214,7 @@ class LambdaMatrix /** * \brief Method for performing general reduction on ALL matrix rows. - * + * * \tparam Fetch is a type of lambda function for data fetch declared as * `fetch( IndexType rowIdx, IndexType columnIdx, RealType elementValue ) -> FetchValue`. * The return type of this lambda can be any non void. @@ -222,12 +223,12 @@ class LambdaMatrix * \tparam Keep is a type of lambda function for storing results of reduction in each row. * It is declared as `keep( const IndexType rowIdx, const double& value )`. * \tparam FetchValue is type returned by the Fetch lambda function. - * + * * \param fetch is an instance of lambda function for data fetch. * \param reduce is an instance of lambda function for reduction. * \param keep in an instance of lambda function for storing results. * \param zero is zero of given reduction operation also known as idempotent element. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_allRowsReduction.cpp * \par Output @@ -238,18 +239,18 @@ class LambdaMatrix /** * \brief Method for iteration over all matrix rows for constant instances. - * + * * \tparam Function is type of lambda function that will operate on matrix elements. * It is should have form like * `function( IndexType rowIdx, IndexType columnIdx, IndexType columnIdx_, const RealType& value, bool& compute )`. - * The column index repeats twice only for compatibility with sparse matrices. 
- * If the 'compute' variable is set to false the iteration over the row can + * The column index repeats twice only for compatibility with sparse matrices. + * If the 'compute' variable is set to false the iteration over the row can * be interrupted. - * + * * \param begin defines beginning of the range [begin,end) of rows to be processed. * \param end defines ending of the range [begin,end) of rows to be processed. * \param function is an instance of the lambda function to be called in each row. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_forRows.cpp * \par Output @@ -260,12 +261,12 @@ class LambdaMatrix /** * \brief This method calls \e forRows for all matrix rows (for constant instances). - * + * * See \ref LambdaMatrix::forRows. - * + * * \tparam Function is a type of lambda function that will operate on matrix elements. * \param function is an instance of the lambda function to be called in each row. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_forAllRows.cpp * \par Output @@ -276,16 +277,16 @@ class LambdaMatrix /** * \brief Computes product of matrix and vector. - * + * * More precisely, it computes: - * + * * `outVector = matrixMultiplicator * ( *this ) * inVector + outVectorMultiplicator * outVector` - * + * * \tparam InVector is type of input vector. It can be \ref Vector, * \ref VectorView, \ref Array, \ref ArraView or similar container. * \tparam OutVector is type of output vector. It can be \ref Vector, * \ref VectorView, \ref Array, \ref ArraView or similar container. - * + * * \param inVector is input vector. * \param outVector is output vector. * \param matrixMultiplicator is a factor by which the matrix is multiplied. It is one by default. @@ -314,7 +315,7 @@ class LambdaMatrix /** * \brief Method for printing the matrix to output stream. - * + * * \param str is the output stream. 
*/ void print( std::ostream& str ) const; @@ -330,7 +331,7 @@ class LambdaMatrix /** * \brief Insertion operator for dense matrix and output stream. - * + * * \param str is the output stream. * \param matrix is the lambda matrix. * \return reference to the stream. @@ -344,9 +345,9 @@ std::ostream& operator<< ( std::ostream& str, const LambdaMatrix< MatrixElements /** * \brief Helper class for creating instances of LambdaMatrix. - * + * * See \ref LambdaMatrix. - * + * * \param matrixElementsLambda * \param compressedRowLengthsLambda */ @@ -360,12 +361,12 @@ struct LambdaMatrixFactory /** * \brief Creates lambda matrix with given lambda functions. - * + * * \param matrixElementsLambda is a lambda function evaluating matrix elements. * \param compressedRowLengthsLambda is a lambda function returning number of * non-zero matrix elements in given \e row. * \return instance of LambdaMatrix. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp * \par Output @@ -377,6 +378,15 @@ struct LambdaMatrixFactory CompressedRowLengthsLambda& compressedRowLengthsLambda ) -> LambdaMatrix< MatrixElementsLambda, CompressedRowLengthsLambda, Real, Device, Index > { + // TODO: fix the following asserts, they do not work in fact + static_assert( std::is_same< + std::enable_if_t< true, decltype(matrixElementsLambda( Index(), Index(), Index(), Index(), std::declval< Index& >(), std::declval< Real& >() ) ) >, + void >::value, + "Wong type of MatrixElementsLambda, it should be - matrixElementsLambda( Index rows, Index columns, Index rowIdx, Index localIdx, Index& columnIdx, Real& value )" ); + static_assert( std::is_integral< + std::enable_if_t< true, decltype(compressedRowLengthsLambda( Index(), Index(), Index() ) ) > + >::value , + "Wong type of CompressedRowLengthsLambda, it should be - matrixElementsLambda( Index rows, Index columns, Index rowIdx )" ); return LambdaMatrix< MatrixElementsLambda, CompressedRowLengthsLambda, Real, Device, Index >( 
matrixElementsLambda, compressedRowLengthsLambda ); @@ -384,14 +394,14 @@ struct LambdaMatrixFactory /** * \brief Creates lambda matrix with given dimensions and lambda functions. - * + * * \param rows is number of matrix rows. * \param columns is number of matrix columns. * \param matrixElementsLambda is a lambda function evaluating matrix elements. * \param compressedRowLengthsLambda is a lambda function returning number of * non-zero matrix elements in given \e row. * \return instance of LambdaMatrix. - * + * * \par Example * \include Matrices/LambdaMatrix/LambdaMatrixExample_Constructor.cpp * \par Output @@ -405,6 +415,16 @@ struct LambdaMatrixFactory CompressedRowLengthsLambda& compressedRowLengthsLambda ) -> LambdaMatrix< MatrixElementsLambda, CompressedRowLengthsLambda, Real, Device, Index > { + // TODO: fix the following asserts, they do not work in fact + static_assert( std::is_same< + std::enable_if_t< true, decltype(matrixElementsLambda( Index(), Index(), Index(), Index(), std::declval< Index& >(), std::declval< Real& >() ) ) >, + void >::value, + "Wong type of MatrixElementsLambda, it should be - matrixElementsLambda( Index rows, Index columns, Index rowIdx, Index localIdx, Index& columnIdx, Real& value )" ); + static_assert( std::is_integral< + std::enable_if_t< true, decltype(compressedRowLengthsLambda( Index(), Index(), Index() ) ) > + >::value , + "Wong type of CompressedRowLengthsLambda, it should be - matrixElementsLambda( Index rows, Index columns, Index rowIdx )" ); + return LambdaMatrix< MatrixElementsLambda, CompressedRowLengthsLambda, Real, Device, Index >( rows, columns, matrixElementsLambda, diff --git a/src/TNL/Matrices/MultidiagonalMatrix.h b/src/TNL/Matrices/MultidiagonalMatrix.h index fa612e9f03da5039ddf76f12ee6d6518de9c5741..797d16a3fd3340687b0b566a41eadbbc005d8d52 100644 --- a/src/TNL/Matrices/MultidiagonalMatrix.h +++ b/src/TNL/Matrices/MultidiagonalMatrix.h @@ -47,14 +47,15 @@ namespace Matrices { * are \f$\{-3,-1,0,1,3\}\f$. 
Advantage is that we do not store the column indexes * explicitly as it is in \ref SparseMatrix. This can reduce significantly the * memory requirements which also means better performance. See the following table - * for the storage requirements comparison between \ref MultidiagonalMatrix and \ref SparseMatrix. + * for the storage requirements comparison between \ref TNL::Matrices::MultidiagonalMatrix + * and \ref TNL::Matrices::SparseMatrix. * - * Data types | SparseMatrix | MultidiagonalMatrix | Ratio - * --------------------|----------------------|---------------------|-------- - * float + 32-bit int | 8 bytes per element | 4 bytes per element | 50% - * double + 32-bit int| 12 bytes per element | 8 bytes per element | 75% - * float + 64-bit int | 12 bytes per element | 4 bytes per element | 30% - * double + 64-bit int| 16 bytes per element | 8 bytes per element | 50% + * Real | Index | SparseMatrix | MultidiagonalMatrix | Ratio + * --------|-----------|----------------------|---------------------|------- + * float | 32-bit int| 8 bytes per element | 4 bytes per element | 50% + * double | 32-bit int| 12 bytes per element | 8 bytes per element | 75% + * float | 64-bit int| 12 bytes per element | 4 bytes per element | 30% + * double | 64-bit int| 16 bytes per element | 8 bytes per element | 50% * * \tparam Real is a type of matrix elements. * \tparam Device is a device where the matrix is allocated. @@ -296,6 +297,28 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator > const IndexType columns, const Vector& diagonalsOffsets ); + /** + * @brief Set the diagonals offsets by means of vector-like container. + * + * This method deletes current matrix elements. + * + * @tparam Vector is a type of vector-like container holding the subdiagonals offsets. + * @param diagonalsOffsets is a vector-like container holding the subdiagonals offsets. 
+ */ + template< typename Vector > + void setDiagonalsOffsets( const Vector& diagonalsOffsets ); + + /** + * @brief Set the diagonals offsets by means of initializer list. + * + * This method deletes current matrix elements. + * + * @tparam ListIndex is type of indexes used for the subdiagonals offsets definition. + * @param diagonalsOffsets is a initializer list with subdiagonals offsets. + */ + template< typename ListIndex > + void setDiagonalsOffsets( const std::initializer_list< ListIndex > diagonalsOffsets ); + /** * \brief This method is for compatibility with \ref SparseMatrix. * @@ -311,6 +334,20 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator > template< typename RowCapacitiesVector > void setRowCapacities( const RowCapacitiesVector& rowCapacities ); + /** + * \brief Returns number of diagonals. + * + * \return Number of diagonals. + */ + const IndexType& getDiagonalsCount() const; + + /** + * \brief Returns vector with diagonals offsets. + * + * \return vector with diagonals offsets. + */ + const DiagonalsOffsetsType& getDiagonalsOffsets() const; + /** * \brief Set matrix elements from an initializer list. * @@ -329,20 +366,6 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator > template< typename ListReal > void setElements( const std::initializer_list< std::initializer_list< ListReal > >& data ); - /** - * \brief Returns number of diagonals. - * - * \return Number of diagonals. - */ - const IndexType& getDiagonalsCount() const; - - /** - * \brief Returns vector with diagonals offsets. - * - * \return vector with diagonals offsets. - */ - const DiagonalsOffsetsType& getDiagonalsOffsets() const; - /** * \brief Computes number of non-zeros in each row. * @@ -666,9 +689,21 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator > * * \tparam Function is type of lambda function that will operate on matrix elements. 
* It is should have form like - * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`. - * The \e localIdx parameter is a rank of the non-zero element in given row. - * If the 'compute' variable is set to false the iteration over the row can + * + * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`, + * + * where + * + * \e rowIdx is an index of the matrix row. + * + * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact, + * index of the matrix subdiagonal. + * + * \e columnIdx is a column index of the matrx element. + * + * \e value is the matrix element value. + * + * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can * be interrupted. * * \param begin defines beginning of the range [begin,end) of rows to be processed. @@ -688,9 +723,21 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator > * * \tparam Function is type of lambda function that will operate on matrix elements. * It is should have form like - * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`. - * The \e localIdx parameter is a rank of the non-zero element in given row. - * If the 'compute' variable is set to false the iteration over the row can + * + * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`, + * + * where + * + * \e rowIdx is an index of the matrix row. + * + * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact, + * index of the matrix subdiagonal. + * + * \e columnIdx is a column index of the matrix element. + * + * \e value is a reference to the matrix element value. It can be used even for changing the matrix element value. + * + * \e compute is a reference to a boolen variable. 
If it is set to false the iteration over the row can * be interrupted. * * \param begin defines beginning of the range [begin,end) of rows to be processed. diff --git a/src/TNL/Matrices/MultidiagonalMatrix.hpp b/src/TNL/Matrices/MultidiagonalMatrix.hpp index 1adad89ea48b186a683b60661b4b9af6d0202fa0..99cd518bc9680424f1b2f02c652bf83354b665bd 100644 --- a/src/TNL/Matrices/MultidiagonalMatrix.hpp +++ b/src/TNL/Matrices/MultidiagonalMatrix.hpp @@ -41,7 +41,7 @@ MultidiagonalMatrix( const IndexType rows, const IndexType columns, const Vector& diagonalsOffsets ) { - TNL_ASSERT_GT( diagonalsOffsets.getSize(), 0, "Cannot construct mutltidiagonal matrix with no diagonals shifts." ); + TNL_ASSERT_GT( diagonalsOffsets.getSize(), 0, "Cannot construct multidiagonal matrix with no diagonals offsets." ); this->setDimensions( rows, columns, diagonalsOffsets ); } @@ -57,9 +57,9 @@ MultidiagonalMatrix( const IndexType rows, const IndexType columns, const std::initializer_list< ListIndex > diagonalsOffsets ) { - Containers::Vector< IndexType, DeviceType, IndexType > shifts( diagonalsOffsets ); - TNL_ASSERT_GT( shifts.getSize(), 0, "Cannot construct multidiagonal matrix with no diagonals shifts." ); - this->setDimensions( rows, columns, shifts ); + Containers::Vector< IndexType, DeviceType, IndexType > offsets( diagonalsOffsets ); + TNL_ASSERT_GT( offsets.getSize(), 0, "Cannot construct multidiagonal matrix with no diagonals offsets." ); + this->setDimensions( rows, columns, offsets ); } template< typename Real, @@ -74,9 +74,9 @@ MultidiagonalMatrix( const IndexType columns, const std::initializer_list< ListIndex > diagonalsOffsets, const std::initializer_list< std::initializer_list< ListReal > >& data ) { - Containers::Vector< IndexType, DeviceType, IndexType > shifts( diagonalsOffsets ); - TNL_ASSERT_GT( shifts.getSize(), 0, "Cannot construct multidiagonal matrix with no diagonals shifts." 
); - this->setDimensions( data.size(), columns, shifts ); + Containers::Vector< IndexType, DeviceType, IndexType > offsets( diagonalsOffsets ); + TNL_ASSERT_GT( offsets.getSize(), 0, "Cannot construct multidiagonal matrix with no diagonals offsets." ); + this->setDimensions( data.size(), columns, offsets ); this->setElements( data ); } @@ -149,6 +149,37 @@ setDimensions( const IndexType rows, this->view = this->getView(); } +template< typename Real, + typename Device, + typename Index, + ElementsOrganization Organization, + typename RealAllocator, + typename IndexAllocator > + template< typename Vector > +void +MultidiagonalMatrix< Real, Device, Index, Organization, RealAllocator, IndexAllocator >:: +setDiagonalsOffsets( const Vector& diagonalsOffsets ) +{ + TNL_ASSERT_GT( diagonalsOffsets.getSize(), 0, "Cannot construct multidiagonal matrix with no diagonals offsets." ); + this->setDimensions( this->getRows(), this->getColumns(), diagonalsOffsets ); +} + +template< typename Real, + typename Device, + typename Index, + ElementsOrganization Organization, + typename RealAllocator, + typename IndexAllocator > + template< typename ListIndex > +void +MultidiagonalMatrix< Real, Device, Index, Organization, RealAllocator, IndexAllocator >:: +setDiagonalsOffsets( const std::initializer_list< ListIndex > diagonalsOffsets ) +{ + Containers::Vector< IndexType, DeviceType, IndexType > offsets( diagonalsOffsets ); + TNL_ASSERT_GT( offsets.getSize(), 0, "Cannot construct multidiagonal matrix with no diagonals offsets." 
); + this->setDimensions( this->getRows(), this->getColumns(), offsets ); +} + template< typename Real, typename Device, typename Index, diff --git a/src/TNL/Matrices/MultidiagonalMatrixView.h b/src/TNL/Matrices/MultidiagonalMatrixView.h index a058e643ed51a13abdd72469a9f6fe900e536bc8..a26251a3b243e0d92362e208cbd786ba8b02f27c 100644 --- a/src/TNL/Matrices/MultidiagonalMatrixView.h +++ b/src/TNL/Matrices/MultidiagonalMatrixView.h @@ -427,7 +427,7 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index > /** * \brief Method for performing general reduction on all matrix rows. - * + * * \tparam Fetch is a type of lambda function for data fetch declared as * `fetch( IndexType rowIdx, IndexType& columnIdx, RealType& elementValue ) -> FetchValue`. * The return type of this lambda can be any non void. @@ -436,12 +436,12 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index > * \tparam Keep is a type of lambda function for storing results of reduction in each row. * It is declared as `keep( const IndexType rowIdx, const double& value )`. * \tparam FetchValue is type returned by the Fetch lambda function. - * + * * \param fetch is an instance of lambda function for data fetch. * \param reduce is an instance of lambda function for reduction. * \param keep in an instance of lambda function for storing results. * \param zero is zero of given reduction operation also known as idempotent element. - * + * * \par Example * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_allRowsReduction.cpp * \par Output @@ -455,15 +455,27 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index > * * \tparam Function is type of lambda function that will operate on matrix elements. * It is should have form like - * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`. - * The \e localIdx parameter is a rank of the non-zero element in given row. 
- * If the 'compute' variable is set to false the iteration over the row can + * + * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`, + * + * where + * + * \e rowIdx is an index of the matrix row. + * + * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact, + * index of the matrix subdiagonal. + * + * \e columnIdx is a column index of the matrx element. + * + * \e value is the matrix element value. + * + * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can * be interrupted. - * + * * \param begin defines beginning of the range [begin,end) of rows to be processed. * \param end defines ending of the range [begin,end) of rows to be processed. * \param function is an instance of the lambda function to be called in each row. - * + * * \par Example * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forRows.cpp * \par Output @@ -477,15 +489,27 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index > * * \tparam Function is type of lambda function that will operate on matrix elements. * It is should have form like - * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`. - * The \e localIdx parameter is a rank of the non-zero element in given row. - * If the 'compute' variable is set to false the iteration over the row can - * be interrupted. * + * `function( IndexType rowIdx, IndexType localIdx, IndexType columnIdx, const RealType& value, bool& compute )`, + * + * where + * + * \e rowIdx is an index of the matrix row. + * + * \e localIdx parameter is a rank of the non-zero element in given row. It is also, in fact, + * index of the matrix subdiagonal. + * + * \e columnIdx is a column index of the matrx element. + * + * \e value is a reference to the matrix element value. It can be used even for changing the matrix element value. 
+ * + * \e compute is a reference to a boolen variable. If it is set to false the iteration over the row can + * be interrupted. + * * \param begin defines beginning of the range [begin,end) of rows to be processed. * \param end defines ending of the range [begin,end) of rows to be processed. * \param function is an instance of the lambda function to be called in each row. - * + * * \par Example * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_forRows.cpp * \par Output diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h index 3bb7a3e583f6b778938557aba9fb07d2cc1bc00b..6d068f370f3be3a2d8b6eea20386054d6c984776 100644 --- a/src/TNL/Matrices/SparseMatrix.h +++ b/src/TNL/Matrices/SparseMatrix.h @@ -41,7 +41,7 @@ namespace Matrices { * \tparam RealAllocator is allocator for the matrix elements values. * \tparam IndexAllocator is allocator for the matrix elements column indexes. */ -template< typename Real, +template< typename Real = double, typename Device = Devices::Host, typename Index = int, typename MatrixType = GeneralMatrix, @@ -202,14 +202,20 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator > * \param realAllocator is used for allocation of matrix elements values. * \param indexAllocator is used for allocation of matrix elements column indexes. */ - SparseMatrix( const IndexType rows, - const IndexType columns, + template< typename Index_t, std::enable_if_t< std::is_integral< Index_t >::value, int > = 0 > + SparseMatrix( const Index_t rows, + const Index_t columns, const RealAllocatorType& realAllocator = RealAllocatorType(), const IndexAllocatorType& indexAllocator = IndexAllocatorType() ); /** +<<<<<<< HEAD * \brief Constructor with matrix rows capacities and number of columns. * +======= + * \brief Constructor with matrix rows capacities given as an initializer list and a number of columns. + * +>>>>>>> Added SparseMatrix constructor with row capacities vector. 
* The number of matrix rows is given by the size of \e rowCapacities list. * * \tparam ListIndex is the initializer list values type. @@ -230,6 +236,31 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator > const RealAllocatorType& realAllocator = RealAllocatorType(), const IndexAllocatorType& indexAllocator = IndexAllocatorType() ); + /** + * \brief Constructor with matrix rows capacities given as a vector and number of columns. + * + * The number of matrix rows is given by the size of \e rowCapacities vector. + * + * \tparam RowCapacitiesVector is the row capacities vector type. Usually it is some of + * \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or + * \ref TNL::Containers::VectorView. + * \param rowCapacities is a vector telling how many matrix elements must be + * allocated in each row. + * \param columns is the number of matrix columns. + * \param realAllocator is used for allocation of matrix elements values. + * \param indexAllocator is used for allocation of matrix elements column indexes. + * + * \par Example + * \include Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp + * \par Output + * \include SparseMatrixExample_Constructor_rowCapacities_vector.out + */ + template< typename RowCapacitiesVector, std::enable_if_t< TNL::IsArrayType< RowCapacitiesVector >::value, int > = 0 > + explicit SparseMatrix( const RowCapacitiesVector& rowCapacities, + const IndexType columns, + const RealAllocatorType& realAllocator = RealAllocatorType(), + const IndexAllocatorType& indexAllocator = IndexAllocatorType() ); + /** * \brief Constructor with matrix dimensions and data in initializer list. 
* diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp index 1371fc27b4ef850c6300045ec6b52102c8112e70..e2086d0eb2ed0da9ed09ef42f090313853bc87fe 100644 --- a/src/TNL/Matrices/SparseMatrix.hpp +++ b/src/TNL/Matrices/SparseMatrix.hpp @@ -41,9 +41,10 @@ template< typename Real, typename ComputeReal, typename RealAllocator, typename IndexAllocator > + template< typename Index_t, std::enable_if_t< std::is_integral< Index_t >::value, int > > SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >:: -SparseMatrix( const IndexType rows, - const IndexType columns, +SparseMatrix( const Index_t rows, + const Index_t columns, const RealAllocatorType& realAllocator, const IndexAllocatorType& indexAllocator ) : BaseType( rows, columns, realAllocator ), columnIndexes( indexAllocator ), @@ -71,6 +72,25 @@ SparseMatrix( const std::initializer_list< ListIndex >& rowCapacities, this->setRowCapacities( RowsCapacitiesType( rowCapacities ) ); } +template< typename Real, + typename Device, + typename Index, + typename MatrixType, + template< typename, typename, typename > class Segments, + typename ComputeReal, + typename RealAllocator, + typename IndexAllocator > + template< typename RowCapacitiesVector, std::enable_if_t< TNL::IsArrayType< RowCapacitiesVector >::value, int > > +SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >:: +SparseMatrix( const RowCapacitiesVector& rowCapacities, + const IndexType columns, + const RealAllocatorType& realAllocator, + const IndexAllocatorType& indexAllocator ) +: BaseType( rowCapacities.getSize(), columns, realAllocator ), columnIndexes( indexAllocator ) +{ + this->setRowCapacities( rowCapacities ); +} + template< typename Real, typename Device, typename Index, diff --git a/src/TNL/Matrices/TridiagonalMatrix.h b/src/TNL/Matrices/TridiagonalMatrix.h index 
e5c9ee24a4e2db301f720741b240218cd943b16d..426fa2e74a5929d76628b896082a9f6d674b5a4a 100644 --- a/src/TNL/Matrices/TridiagonalMatrix.h +++ b/src/TNL/Matrices/TridiagonalMatrix.h @@ -44,12 +44,12 @@ namespace Matrices { * memory requirements which also means better performance. See the following table * for the storage requirements comparison between \ref TridiagonalMatrix and \ref SparseMatrix. * - * Data types | SparseMatrix | TridiagonalMatrix | Ratio - * --------------------|----------------------|---------------------|-------- - * float + 32-bit int | 8 bytes per element | 4 bytes per element | 50% - * double + 32-bit int| 12 bytes per element | 8 bytes per element | 75% - * float + 64-bit int | 12 bytes per element | 4 bytes per element | 30% - * double + 64-bit int| 16 bytes per element | 8 bytes per element | 50% + * Real | Index | SparseMatrix | TridiagonalMatrix | Ratio + * --------|------------|----------------------|---------------------|------- + * float | 32-bit int | 8 bytes per element | 4 bytes per element | 50% + * double | 32-bit int | 12 bytes per element | 8 bytes per element | 75% + * float | 64-bit int | 12 bytes per element | 4 bytes per element | 30% + * double | 64-bit int | 16 bytes per element | 8 bytes per element | 50% * * \tparam Real is a type of matrix elements. * \tparam Device is a device where the matrix is allocated. 
diff --git a/src/TNL/Matrices/TridiagonalMatrixView.hpp b/src/TNL/Matrices/TridiagonalMatrixView.hpp index b84c63f9b9aa8418212e12ea34dd87632b1f219a..30afaa93877c4e6b4b6c4925b566f40ace38c2f7 100644 --- a/src/TNL/Matrices/TridiagonalMatrixView.hpp +++ b/src/TNL/Matrices/TridiagonalMatrixView.hpp @@ -10,6 +10,7 @@ #pragma once +#include #include #include #include @@ -98,8 +99,8 @@ getCompressedRowLengths( Vector& rowLengths ) const auto fetch = [] __cuda_callable__ ( IndexType row, IndexType column, const RealType& value ) -> IndexType { return ( value != 0.0 ); }; - auto reduce = [] __cuda_callable__ ( IndexType& aux, const IndexType a ) { - aux += a; + auto reduce = [] __cuda_callable__ ( const IndexType& aux, const IndexType a ) -> IndexType { + return aux + a; }; auto keep = [=] __cuda_callable__ ( const IndexType rowIdx, const IndexType value ) mutable { rowLengths_view[ rowIdx ] = value; @@ -275,23 +276,23 @@ rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Ke Real_ sum( zero ); if( rowIdx == 0 ) { - reduce( sum, fetch( 0, 1, values_view[ indexer.getGlobalIndex( 0, 1 ) ] ) ); - reduce( sum, fetch( 0, 2, values_view[ indexer.getGlobalIndex( 0, 2 ) ] ) ); + sum = reduce( sum, fetch( 0, 1, values_view[ indexer.getGlobalIndex( 0, 1 ) ] ) ); + sum = reduce( sum, fetch( 0, 2, values_view[ indexer.getGlobalIndex( 0, 2 ) ] ) ); keep( 0, sum ); return; } if( rowIdx + 1 < indexer.getColumns() ) { - reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); - reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); - reduce( sum, fetch( rowIdx, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx + 1, values_view[ 
indexer.getGlobalIndex( rowIdx, 2 ) ] ) ); keep( rowIdx, sum ); return; } if( rowIdx < indexer.getColumns() ) { - reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); - reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); keep( rowIdx, sum ); } else @@ -319,23 +320,23 @@ rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Ke Real_ sum( zero ); if( rowIdx == 0 ) { - reduce( sum, fetch( 0, 1, values_view[ indexer.getGlobalIndex( 0, 1 ) ] ) ); - reduce( sum, fetch( 0, 2, values_view[ indexer.getGlobalIndex( 0, 2 ) ] ) ); + sum = reduce( sum, fetch( 0, 1, values_view[ indexer.getGlobalIndex( 0, 1 ) ] ) ); + sum = reduce( sum, fetch( 0, 2, values_view[ indexer.getGlobalIndex( 0, 2 ) ] ) ); keep( 0, sum ); return; } if( rowIdx + 1 < indexer.getColumns() ) { - reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); - reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); - reduce( sum, fetch( rowIdx, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx + 1, values_view[ indexer.getGlobalIndex( rowIdx, 2 ) ] ) ); keep( rowIdx, sum ); return; } if( rowIdx < indexer.getColumns() ) { - reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); - reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx - 1, values_view[ 
indexer.getGlobalIndex( rowIdx, 0 ) ] ) ); + sum = reduce( sum, fetch( rowIdx, rowIdx, values_view[ indexer.getGlobalIndex( rowIdx, 1 ) ] ) ); keep( rowIdx, sum ); } else @@ -487,8 +488,8 @@ vectorProduct( const InVector& inVector, auto fetch = [=] __cuda_callable__ ( const IndexType& row, const IndexType& column, const RealType& value ) -> RealType { return value * inVectorView[ column ]; }; - auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) { - sum += value; + auto reduction = [] __cuda_callable__ ( const RealType& sum, const RealType& value ) -> RealType { + return sum + value; }; auto keeper1 = [=] __cuda_callable__ ( IndexType row, const RealType& value ) mutable { outVectorView[ row ] = value; @@ -693,9 +694,13 @@ void TridiagonalMatrixView< Real, Device, Index, Organization >::print( std::ost for( IndexType column = row - 1; column < row + 2; column++ ) if( column >= 0 && column < this->columns ) { - auto v = this->getElement( row, column ); - if( v ) - str << " Col:" << column << "->" << v << "\t"; + auto value = this->getElement( row, column ); + if( value ) + { + std::stringstream str_; + str_ << std::setw( 4 ) << std::right << column << ":" << std::setw( 4 ) << std::left << value; + str << std::setw( 10 ) << str_.str(); + } } str << std::endl; } diff --git a/src/UnitTests/Matrices/MultidiagonalMatrixTest.h b/src/UnitTests/Matrices/MultidiagonalMatrixTest.h index dd721dd895190097fd03891115ffa518883d425e..8051f039ff00f2134d0eefe51c053a267dd4b608 100644 --- a/src/UnitTests/Matrices/MultidiagonalMatrixTest.h +++ b/src/UnitTests/Matrices/MultidiagonalMatrixTest.h @@ -64,6 +64,44 @@ void test_SetDimensions() EXPECT_EQ( m.getColumns(), 8 ); } +template< typename Matrix > +void test_SetDiagonalsOffsets() +{ + using RealType = typename Matrix::RealType; + using DeviceType = typename Matrix::DeviceType; + using IndexType = typename Matrix::IndexType; + using DiagonalsOffsetsType = typename Matrix::DiagonalsOffsetsType; + + const 
IndexType rows = 9; + const IndexType cols = 8; + const DiagonalsOffsetsType diagonalsOffsets{ -3, -1, 0, 2, 4 }; + + Matrix m; + m.setDimensions( rows, cols ); + m.setDiagonalsOffsets( diagonalsOffsets ); + + EXPECT_EQ( m.getRows(), 9 ); + EXPECT_EQ( m.getColumns(), 8 ); +} + +template< typename Matrix > +void test_SetDiagonalsOffsets_initalizer_list() +{ + using RealType = typename Matrix::RealType; + using DeviceType = typename Matrix::DeviceType; + using IndexType = typename Matrix::IndexType; + using DiagonalsOffsetsType = typename Matrix::DiagonalsOffsetsType; + + const IndexType rows = 9; + const IndexType cols = 8; + + Matrix m; + m.setDimensions( rows, cols ); + m.setDiagonalsOffsets( { -3, -1, 0, 2, 4 } ); + + EXPECT_EQ( m.getRows(), 9 ); + EXPECT_EQ( m.getColumns(), 8 ); +} template< typename Matrix1, typename Matrix2 > void test_SetLike() diff --git a/src/UnitTests/Matrices/TridiagonalMatrixTest.h b/src/UnitTests/Matrices/TridiagonalMatrixTest.h index 500ed99388a17d1f6d58e1f016857493f7c2b2e0..3b68f7490950d4035869fc67f3b6b75be2d5bdec 100644 --- a/src/UnitTests/Matrices/TridiagonalMatrixTest.h +++ b/src/UnitTests/Matrices/TridiagonalMatrixTest.h @@ -1264,54 +1264,6 @@ void test_SaveAndLoad() EXPECT_EQ( savedMatrix.getElement( 3, 3 ), 16 ); } -template< typename Matrix > -void test_Print() -{ - using RealType = typename Matrix::RealType; - using DeviceType = typename Matrix::DeviceType; - using IndexType = typename Matrix::IndexType; - - /* - * Sets up the following 5x4 sparse matrix: - * - * / 1 2 0 0 \ - * | 5 6 7 0 | - * | 0 10 11 12 | - * | 0 0 15 16 | - * \ 0 0 0 20 / - */ - const IndexType rows = 5; - const IndexType cols = 4; - - Matrix m( rows, cols ); - - RealType value = 1; - for( IndexType i = 0; i < rows; i++) - for( IndexType j = 0; j < cols; j++) - { - if( abs( i - j ) <= 1 ) - m.setElement( i, j, value ); - value++; - } - - std::stringstream printed; - std::stringstream couted; - - //change the underlying buffer and save the old buffer - 
auto old_buf = std::cout.rdbuf(printed.rdbuf()); - - m.print( std::cout ); //all the std::cout goes to ss - - std::cout.rdbuf(old_buf); //reset - couted << "Row: 0 -> Col:0->1\t Col:1->2\t\n" - "Row: 1 -> Col:0->5\t Col:1->6\t Col:2->7\t\n" - "Row: 2 -> Col:1->10\t Col:2->11\t Col:3->12\t\n" - "Row: 3 -> Col:2->15\t Col:3->16\t\n" - "Row: 4 -> Col:3->20\t\n"; - - EXPECT_EQ( printed.str(), couted.str() ); -} - // test fixture for typed tests template< typename Matrix > class MatrixTest : public ::testing::Test @@ -1478,13 +1430,6 @@ TYPED_TEST( MatrixTest, saveAndLoadTest ) test_SaveAndLoad< MatrixType >(); } -TYPED_TEST( MatrixTest, printTest ) -{ - using MatrixType = typename TestFixture::MatrixType; - - test_Print< MatrixType >(); -} - //// test_getType is not general enough yet. DO NOT TEST IT YET. //TEST( TridiagonalMatrixTest, Tridiagonal_GetTypeTest_Host )